/src/php-src/ext/pcre/pcre2lib/pcre2_script_run.c
Line | Count | Source (jump to first uncovered line) |
1 | | /************************************************* |
2 | | * Perl-Compatible Regular Expressions * |
3 | | *************************************************/ |
4 | | |
5 | | /* PCRE is a library of functions to support regular expressions whose syntax |
6 | | and semantics are as close as possible to those of the Perl 5 language. |
7 | | |
8 | | Written by Philip Hazel |
9 | | Original API code Copyright (c) 1997-2012 University of Cambridge |
10 | | New API code Copyright (c) 2016-2021 University of Cambridge |
11 | | |
12 | | ----------------------------------------------------------------------------- |
13 | | Redistribution and use in source and binary forms, with or without |
14 | | modification, are permitted provided that the following conditions are met: |
15 | | |
16 | | * Redistributions of source code must retain the above copyright notice, |
17 | | this list of conditions and the following disclaimer. |
18 | | |
19 | | * Redistributions in binary form must reproduce the above copyright |
20 | | notice, this list of conditions and the following disclaimer in the |
21 | | documentation and/or other materials provided with the distribution. |
22 | | |
23 | | * Neither the name of the University of Cambridge nor the names of its |
24 | | contributors may be used to endorse or promote products derived from |
25 | | this software without specific prior written permission. |
26 | | |
27 | | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
28 | | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
29 | | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
30 | | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
31 | | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
32 | | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
33 | | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
34 | | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
35 | | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
36 | | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
37 | | POSSIBILITY OF SUCH DAMAGE. |
38 | | ----------------------------------------------------------------------------- |
39 | | */ |
40 | | |
41 | | /* This module contains the function for checking a script run. */ |
42 | | |
43 | | #ifdef HAVE_CONFIG_H |
44 | | #include "config.h" |
45 | | #endif |
46 | | |
47 | | #include "pcre2_internal.h" |
48 | | |
49 | | |
50 | | /************************************************* |
51 | | * Check script run * |
52 | | *************************************************/ |
53 | | |
54 | | /* A script run is conceptually a sequence of characters all in the same |
55 | | Unicode script. However, it isn't quite that simple. There are special rules |
56 | | for scripts that are commonly used together, and also special rules for digits. |
57 | | This function implements the appropriate checks, which is possible only when |
58 | | PCRE2 is compiled with Unicode support. The function returns TRUE if there is |
59 | | no Unicode support; however, it should never be called in that circumstance |
60 | | because an error is given by pcre2_compile() if a script run is called for in a |
61 | | version of PCRE2 compiled without Unicode support. |
62 | | |
63 | | Arguments: |
64 | | ptr points to the first character |
65 | | endptr points after the last character |
66 | | utf TRUE if in UTF mode |
67 | | |
68 | | Returns: TRUE if this is a valid script run |
69 | | */ |
70 | | |
71 | | /* These are states in the checking process. */ |
72 | | |
73 | | enum { SCRIPT_UNSET, /* Requirement as yet unknown */ |
74 | | SCRIPT_MAP, /* Bitmap contains acceptable scripts */ |
75 | | SCRIPT_HANPENDING, /* Have had only Han characters */ |
76 | | SCRIPT_HANHIRAKATA, /* Expect Han, Hiragana, or Katakana */ |
77 | | SCRIPT_HANBOPOMOFO, /* Expect Han or Bopomofo */ |
78 | | SCRIPT_HANHANGUL /* Expect Han or Hangul */ |
79 | | }; |
80 | | |
81 | 0 | #define UCD_MAPSIZE (ucp_Unknown/32 + 1) /* 32-bit words copied from a ucd_script_sets map (scripts below ucp_Unknown) */ |
82 | 0 | #define FULL_MAPSIZE (ucp_Script_Count/32 + 1) /* 32-bit words in a bitmap with a bit for every script */ |
83 | | |
84 | | BOOL |
85 | | PRIV(script_run)(PCRE2_SPTR ptr, PCRE2_SPTR endptr, BOOL utf) |
86 | 0 | { |
87 | 0 | #ifdef SUPPORT_UNICODE |
88 | 0 | uint32_t require_state = SCRIPT_UNSET; |
89 | 0 | uint32_t require_map[FULL_MAPSIZE]; |
90 | 0 | uint32_t map[FULL_MAPSIZE]; |
91 | 0 | uint32_t require_digitset = 0; |
92 | 0 | uint32_t c; |
93 | |
|
94 | | #if PCRE2_CODE_UNIT_WIDTH == 32 |
95 | | (void)utf; /* Avoid compiler warning */ |
96 | | #endif |
97 | | |
98 | | /* Any string containing fewer than 2 characters is a valid script run. */ |
99 | |
|
100 | 0 | if (ptr >= endptr) return TRUE; /* Empty string */ |
101 | 0 | GETCHARINCTEST(c, ptr); |
102 | 0 | if (ptr >= endptr) return TRUE; /* Only one character */ |
103 | | |
104 | | /* Initialize the require map. This is a full-size bitmap that has a bit for |
105 | | every script, as opposed to the maps in ucd_script_sets, which only have bits |
106 | | for scripts less than ucp_Unknown - those that appear in script extension |
107 | | lists. */ |
108 | | |
109 | 0 | for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] = 0; |
110 | | |
111 | | /* Scan strings of two or more characters, checking the Unicode characteristics |
112 | | of each code point. There is special code for scripts that can be combined with |
113 | | characters from the Han Chinese script. This may be used in conjunction with |
114 | | four other scripts in these combinations: |
115 | | |
116 | | . Han with Hiragana and Katakana is allowed (for Japanese). |
117 | | . Han with Bopomofo is allowed (for Taiwanese Mandarin). |
118 | | . Han with Hangul is allowed (for Korean). |
119 | | |
120 | | If the first significant character's script is one of the four, the required |
121 | | script type is immediately known. However, if the first significant |
122 | | character's script is Han, we have to keep checking for a non-Han character. |
123 | | Hence the SCRIPT_HANPENDING state. */ |
124 | |
|
125 | 0 | for (;;) |
126 | 0 | { |
127 | 0 | const ucd_record *ucd = GET_UCD(c); |
128 | 0 | uint32_t script = ucd->script; |
129 | | |
130 | | /* If the script is Unknown, the string is not a valid script run. Such |
131 | | characters can only form script runs of length one (see test above). */ |
132 | |
|
133 | 0 | if (script == ucp_Unknown) return FALSE; |
134 | | |
135 | | /* A character without any script extensions whose script is Inherited or |
136 | | Common is always accepted with any script. If there are extensions, the |
137 | | following processing happens for all scripts. */ |
138 | | |
139 | 0 | if (UCD_SCRIPTX_PROP(ucd) != 0 || (script != ucp_Inherited && script != ucp_Common)) |
140 | 0 | { |
141 | 0 | BOOL OK; |
142 | | |
143 | | /* Set up a full-sized map for this character that can include bits for all |
144 | | scripts. Copy the scriptx map for this character (which covers those |
145 | | scripts that appear in script extension lists), set the remaining values to |
146 | | zero, and then, except for Common or Inherited, add this script's bit to |
147 | | the map. */ |
148 | |
|
149 | 0 | memcpy(map, PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(ucd), UCD_MAPSIZE * sizeof(uint32_t)); |
150 | 0 | memset(map + UCD_MAPSIZE, 0, (FULL_MAPSIZE - UCD_MAPSIZE) * sizeof(uint32_t)); |
151 | 0 | if (script != ucp_Common && script != ucp_Inherited) MAPSET(map, script); |
152 | | |
153 | | /* Handle the different checking states */ |
154 | |
|
155 | 0 | switch(require_state) |
156 | 0 | { |
157 | | /* First significant character - it might follow Common or Inherited |
158 | | characters that do not have any script extensions. */ |
159 | | |
160 | 0 | case SCRIPT_UNSET: |
161 | 0 | switch(script) |
162 | 0 | { |
163 | 0 | case ucp_Han: |
164 | 0 | require_state = SCRIPT_HANPENDING; |
165 | 0 | break; |
166 | | |
167 | 0 | case ucp_Hiragana: |
168 | 0 | case ucp_Katakana: |
169 | 0 | require_state = SCRIPT_HANHIRAKATA; |
170 | 0 | break; |
171 | | |
172 | 0 | case ucp_Bopomofo: |
173 | 0 | require_state = SCRIPT_HANBOPOMOFO; |
174 | 0 | break; |
175 | | |
176 | 0 | case ucp_Hangul: |
177 | 0 | require_state = SCRIPT_HANHANGUL; |
178 | 0 | break; |
179 | | |
180 | 0 | default: |
181 | 0 | memcpy(require_map, map, FULL_MAPSIZE * sizeof(uint32_t)); |
182 | 0 | require_state = SCRIPT_MAP; |
183 | 0 | break; |
184 | 0 | } |
185 | 0 | break; |
186 | | |
187 | | /* The first significant character was Han. An inspection of the Unicode |
188 | | 11.0.0 files shows that there are the following types of Script Extension |
189 | | list that involve the Han, Bopomofo, Hiragana, Katakana, and Hangul |
190 | | scripts: |
191 | | |
192 | | . Bopomofo + Han |
193 | | . Han + Hiragana + Katakana |
194 | | . Hiragana + Katakana |
195 | | . Bopomofo + Hangul + Han + Hiragana + Katakana |
196 | | |
197 | | The following code tries to make sense of this. */ |
198 | | |
199 | 0 | #define FOUND_BOPOMOFO 1 |
200 | 0 | #define FOUND_HIRAGANA 2 |
201 | 0 | #define FOUND_KATAKANA 4 |
202 | 0 | #define FOUND_HANGUL 8 |
203 | | |
204 | 0 | case SCRIPT_HANPENDING: |
205 | 0 | if (script != ucp_Han) /* Another Han does nothing */ |
206 | 0 | { |
207 | 0 | uint32_t chspecial = 0; |
208 | |
|
209 | 0 | if (MAPBIT(map, ucp_Bopomofo) != 0) chspecial |= FOUND_BOPOMOFO; |
210 | 0 | if (MAPBIT(map, ucp_Hiragana) != 0) chspecial |= FOUND_HIRAGANA; |
211 | 0 | if (MAPBIT(map, ucp_Katakana) != 0) chspecial |= FOUND_KATAKANA; |
212 | 0 | if (MAPBIT(map, ucp_Hangul) != 0) chspecial |= FOUND_HANGUL; |
213 | |
|
214 | 0 | if (chspecial == 0) return FALSE; /* Not allowed with Han */ |
215 | | |
216 | 0 | if (chspecial == FOUND_BOPOMOFO) |
217 | 0 | require_state = SCRIPT_HANBOPOMOFO; |
218 | 0 | else if (chspecial == (FOUND_HIRAGANA|FOUND_KATAKANA)) |
219 | 0 | require_state = SCRIPT_HANHIRAKATA; |
220 | | |
221 | | /* Otherwise this character must be allowed with all of them, so remain |
222 | | in the pending state. NOTE(review): a map holding only Hangul, or only one of Hiragana/Katakana, also reaches here and stays pending - confirm that is intended. */ |
223 | 0 | } |
224 | 0 | break; |
225 | | |
226 | | /* Previously encountered one of the "with Han" scripts. Check that |
227 | | this character is appropriate. */ |
228 | | |
229 | 0 | case SCRIPT_HANHIRAKATA: |
230 | 0 | if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hiragana) + |
231 | 0 | MAPBIT(map, ucp_Katakana) == 0) return FALSE; |
232 | 0 | break; |
233 | | |
234 | 0 | case SCRIPT_HANBOPOMOFO: |
235 | 0 | if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Bopomofo) == 0) return FALSE; |
236 | 0 | break; |
237 | | |
238 | 0 | case SCRIPT_HANHANGUL: |
239 | 0 | if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hangul) == 0) return FALSE; |
240 | 0 | break; |
241 | | |
242 | | /* Previously encountered one or more characters that are allowed with a |
243 | | list of scripts. */ |
244 | | |
245 | 0 | case SCRIPT_MAP: |
246 | 0 | OK = FALSE; |
247 | |
|
248 | 0 | for (int i = 0; i < FULL_MAPSIZE; i++) |
249 | 0 | { |
250 | 0 | if ((require_map[i] & map[i]) != 0) |
251 | 0 | { |
252 | 0 | OK = TRUE; |
253 | 0 | break; |
254 | 0 | } |
255 | 0 | } |
256 | |
|
257 | 0 | if (!OK) return FALSE; |
258 | | |
259 | | /* The rest of the string must be in this script, but we have to |
260 | | allow for the Han complications. */ |
261 | | |
262 | 0 | switch(script) |
263 | 0 | { |
264 | 0 | case ucp_Han: |
265 | 0 | require_state = SCRIPT_HANPENDING; |
266 | 0 | break; |
267 | | |
268 | 0 | case ucp_Hiragana: |
269 | 0 | case ucp_Katakana: |
270 | 0 | require_state = SCRIPT_HANHIRAKATA; |
271 | 0 | break; |
272 | | |
273 | 0 | case ucp_Bopomofo: |
274 | 0 | require_state = SCRIPT_HANBOPOMOFO; |
275 | 0 | break; |
276 | | |
277 | 0 | case ucp_Hangul: |
278 | 0 | require_state = SCRIPT_HANHANGUL; |
279 | 0 | break; |
280 | | |
281 | | /* Compute the intersection of the required list of scripts and the |
282 | | allowed scripts for this character. */ |
283 | | |
284 | 0 | default: |
285 | 0 | for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] &= map[i]; |
286 | 0 | break; |
287 | 0 | } |
288 | | |
289 | 0 | break; |
290 | 0 | } |
291 | 0 | } /* End checking character's script and extensions. */ |
292 | | |
293 | | /* The character is in an acceptable script. We must now ensure that all |
294 | | decimal digits in the string come from the same set. Some scripts (e.g. |
295 | | Common, Arabic) have more than one set of decimal digits. This code does |
296 | | not allow mixing sets, even within the same script. The vector called |
297 | | PRIV(ucd_digit_sets)[] contains, in its first element, the number of |
298 | | following elements, and then, in ascending order, the code points of the |
299 | | '9' characters in every set of 10 digits. Each set is identified by the |
300 | | offset in the vector of its '9' character. An initial check of the first |
301 | | value picks up ASCII digits quickly. Otherwise, a binary chop is used. */ |
302 | | |
303 | 0 | if (ucd->chartype == ucp_Nd) |
304 | 0 | { |
305 | 0 | uint32_t digitset; |
306 | |
|
307 | 0 | if (c <= PRIV(ucd_digit_sets)[1]) digitset = 1; else /* Quick path: first set (ASCII digits) */ |
308 | 0 | { |
309 | 0 | int mid; |
310 | 0 | int bot = 1; |
311 | 0 | int top = PRIV(ucd_digit_sets)[0]; |
312 | 0 | for (;;) |
313 | 0 | { |
314 | 0 | if (top <= bot + 1) /* <= rather than == is paranoia */ |
315 | 0 | { |
316 | 0 | digitset = top; |
317 | 0 | break; |
318 | 0 | } |
319 | 0 | mid = (top + bot) / 2; |
320 | 0 | if (c <= PRIV(ucd_digit_sets)[mid]) top = mid; else bot = mid; |
321 | 0 | } |
322 | 0 | } |
323 | | |
324 | | /* A required value of 0 means "unset". */ |
325 | |
|
326 | 0 | if (require_digitset == 0) require_digitset = digitset; |
327 | 0 | else if (digitset != require_digitset) return FALSE; |
328 | 0 | } /* End digit handling */ |
329 | | |
330 | | /* If we haven't yet got to the end, pick up the next character. */ |
331 | | |
332 | 0 | if (ptr >= endptr) return TRUE; |
333 | 0 | GETCHARINCTEST(c, ptr); |
334 | 0 | } /* End checking loop */ |
335 | |
|
336 | | #else /* NOT SUPPORT_UNICODE */ |
337 | | (void)ptr; |
338 | | (void)endptr; |
339 | | (void)utf; |
340 | | return TRUE; |
341 | | #endif /* SUPPORT_UNICODE */ |
342 | 0 | } |
343 | | |
344 | | /* End of pcre2_script_run.c */ |