/src/pcre2/src/pcre2_compile_class.c
Line | Count | Source (jump to first uncovered line) |
1 | | /************************************************* |
2 | | * Perl-Compatible Regular Expressions * |
3 | | *************************************************/ |
4 | | |
5 | | /* PCRE is a library of functions to support regular expressions whose syntax |
6 | | and semantics are as close as possible to those of the Perl 5 language. |
7 | | |
8 | | Written by Philip Hazel |
9 | | Original API code Copyright (c) 1997-2012 University of Cambridge |
10 | | New API code Copyright (c) 2016-2024 University of Cambridge |
11 | | |
12 | | ----------------------------------------------------------------------------- |
13 | | Redistribution and use in source and binary forms, with or without |
14 | | modification, are permitted provided that the following conditions are met: |
15 | | |
16 | | * Redistributions of source code must retain the above copyright notice, |
17 | | this list of conditions and the following disclaimer. |
18 | | |
19 | | * Redistributions in binary form must reproduce the above copyright |
20 | | notice, this list of conditions and the following disclaimer in the |
21 | | documentation and/or other materials provided with the distribution. |
22 | | |
23 | | * Neither the name of the University of Cambridge nor the names of its |
24 | | contributors may be used to endorse or promote products derived from |
25 | | this software without specific prior written permission. |
26 | | |
27 | | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
28 | | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
29 | | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
30 | | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
31 | | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
32 | | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
33 | | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
34 | | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
35 | | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
36 | | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
37 | | POSSIBILITY OF SUCH DAMAGE. |
38 | | ----------------------------------------------------------------------------- |
39 | | */ |
40 | | |
41 | | |
42 | | #include "pcre2_compile.h" |
43 | | |
44 | | |
45 | | |
46 | | typedef struct { |
47 | | /* Option bits for eclass. */ |
48 | | uint32_t options; |
49 | | uint32_t xoptions; |
50 | | /* Rarely used members. */ |
51 | | int *errorcodeptr; |
52 | | compile_block *cb; |
53 | | /* Bitmap is needed. */ |
54 | | BOOL needs_bitmap; |
55 | | } eclass_context; |
56 | | |
57 | | /* Checks the allowed tokens at the end of a class structure in debug mode. |
58 | | When a new token is not processed by all loops, and the token is equals to |
59 | | a) one of the cases here: |
60 | | the compiler will complain about a duplicated case value. |
61 | | b) none of the cases here: |
62 | | the loop without the handler will stop with an assertion failure. */ |
63 | | |
64 | | #ifdef PCRE2_DEBUG |
65 | | #define CLASS_END_CASES(meta) \ |
66 | | default: \ |
67 | | PCRE2_ASSERT((meta) <= META_END); \ |
68 | | /* Fall through */ \ |
69 | | case META_CLASS: \ |
70 | | case META_CLASS_NOT: \ |
71 | | case META_CLASS_EMPTY: \ |
72 | | case META_CLASS_EMPTY_NOT: \ |
73 | | case META_CLASS_END: \ |
74 | | case META_ECLASS_AND: \ |
75 | | case META_ECLASS_OR: \ |
76 | | case META_ECLASS_SUB: \ |
77 | | case META_ECLASS_XOR: \ |
78 | | case META_ECLASS_NOT: |
79 | | #else |
80 | | #define CLASS_END_CASES(meta) \ |
81 | 4.80M | default: |
82 | | #endif |
83 | | |
84 | | #ifdef SUPPORT_WIDE_CHARS |
85 | | |
86 | | /* Heapsort algorithm. */ |
87 | | |
88 | | static void do_heapify(uint32_t *buffer, size_t size, size_t i) |
89 | 2.08M | { |
90 | 2.08M | size_t max; |
91 | 2.08M | size_t left; |
92 | 2.08M | size_t right; |
93 | 2.08M | uint32_t tmp1, tmp2; |
94 | | |
95 | 11.6M | while (TRUE) |
96 | 11.6M | { |
97 | 11.6M | max = i; |
98 | 11.6M | left = (i << 1) + 2; |
99 | 11.6M | right = left + 2; |
100 | | |
101 | 11.6M | if (left < size && buffer[left] > buffer[max]) max = left; |
102 | 11.6M | if (right < size && buffer[right] > buffer[max]) max = right; |
103 | 11.6M | if (i == max) return; |
104 | | |
105 | | /* Swap items. */ |
106 | 9.51M | tmp1 = buffer[i]; |
107 | 9.51M | tmp2 = buffer[i + 1]; |
108 | 9.51M | buffer[i] = buffer[max]; |
109 | 9.51M | buffer[i + 1] = buffer[max + 1]; |
110 | 9.51M | buffer[max] = tmp1; |
111 | 9.51M | buffer[max + 1] = tmp2; |
112 | 9.51M | i = max; |
113 | 9.51M | } |
114 | 2.08M | } |
115 | | |
116 | | #ifdef SUPPORT_UNICODE |
117 | | |
118 | 39.0k | #define PARSE_CLASS_UTF 0x1 |
119 | 1.14M | #define PARSE_CLASS_CASELESS_UTF 0x2 |
120 | 2.58M | #define PARSE_CLASS_RESTRICTED_UTF 0x4 |
121 | 6.49M | #define PARSE_CLASS_TURKISH_UTF 0x8 |
122 | | |
123 | | /* Get the range of nocase characters which includes the |
124 | | 'c' character passed as argument, or directly follows 'c'. */ |
125 | | |
126 | | static const uint32_t* |
127 | | get_nocase_range(uint32_t c) |
128 | 798k | { |
129 | 798k | uint32_t left = 0; |
130 | 798k | uint32_t right = PRIV(ucd_nocase_ranges_size); |
131 | 798k | uint32_t middle; |
132 | | |
133 | 798k | if (c > MAX_UTF_CODE_POINT) return PRIV(ucd_nocase_ranges) + right; |
134 | | |
135 | 4.08M | while (TRUE) |
136 | 4.08M | { |
137 | | /* Range end of the middle element. */ |
138 | 4.08M | middle = ((left + right) >> 1) | 0x1; |
139 | | |
140 | 4.08M | if (PRIV(ucd_nocase_ranges)[middle] <= c) |
141 | 16.6k | left = middle + 1; |
142 | 4.06M | else if (middle > 1 && PRIV(ucd_nocase_ranges)[middle - 2] > c) |
143 | 3.26M | right = middle - 1; |
144 | 798k | else |
145 | 798k | return PRIV(ucd_nocase_ranges) + (middle - 1); |
146 | 4.08M | } |
147 | 798k | } |
148 | | |
149 | | /* Get the list of othercase characters, which belongs to the passed range. |
150 | | Create ranges from these characters, and append them to the buffer argument. */ |
151 | | |
152 | | static size_t |
153 | | utf_caseless_extend(uint32_t start, uint32_t end, uint32_t options, |
154 | | uint32_t *buffer) |
155 | 798k | { |
156 | 798k | uint32_t new_start = start; |
157 | 798k | uint32_t new_end = end; |
158 | 798k | uint32_t c = start; |
159 | 798k | const uint32_t *list; |
160 | 798k | uint32_t tmp[3]; |
161 | 798k | size_t result = 2; |
162 | 798k | const uint32_t *skip_range = get_nocase_range(c); |
163 | 798k | uint32_t skip_start = skip_range[0]; |
164 | | |
165 | 798k | #if PCRE2_CODE_UNIT_WIDTH == 8 |
166 | 798k | PCRE2_ASSERT(options & PARSE_CLASS_UTF); |
167 | 798k | #endif |
168 | | |
169 | | #if PCRE2_CODE_UNIT_WIDTH == 32 |
170 | | if (end > MAX_UTF_CODE_POINT) end = MAX_UTF_CODE_POINT; |
171 | | #endif |
172 | | |
173 | 3.05M | while (c <= end) |
174 | 2.26M | { |
175 | 2.26M | uint32_t co; |
176 | | |
177 | 2.26M | if (c > skip_start) |
178 | 94.4k | { |
179 | 94.4k | c = skip_range[1]; |
180 | 94.4k | skip_range += 2; |
181 | 94.4k | skip_start = skip_range[0]; |
182 | 94.4k | continue; |
183 | 94.4k | } |
184 | | |
185 | | /* Compute caseless set. */ |
186 | | |
187 | 2.16M | if ((options & (PARSE_CLASS_TURKISH_UTF|PARSE_CLASS_RESTRICTED_UTF)) == |
188 | 2.16M | PARSE_CLASS_TURKISH_UTF && |
189 | 2.16M | UCD_ANY_I(c)) |
190 | 0 | { |
191 | 0 | co = PRIV(ucd_turkish_dotted_i_caseset) + (UCD_DOTTED_I(c)? 0 : 3); |
192 | 0 | } |
193 | 2.16M | else if ((co = UCD_CASESET(c)) != 0 && |
194 | 2.16M | (options & PARSE_CLASS_RESTRICTED_UTF) != 0 && |
195 | 2.16M | PRIV(ucd_caseless_sets)[co] < 128) |
196 | 372 | { |
197 | 372 | co = 0; /* Ignore the caseless set if it's restricted. */ |
198 | 372 | } |
199 | | |
200 | 2.16M | if (co != 0) |
201 | 417k | list = PRIV(ucd_caseless_sets) + co; |
202 | 1.74M | else |
203 | 1.74M | { |
204 | 1.74M | co = UCD_OTHERCASE(c); |
205 | 1.74M | list = tmp; |
206 | 1.74M | tmp[0] = c; |
207 | 1.74M | tmp[1] = NOTACHAR; |
208 | | |
209 | 1.74M | if (co != c) |
210 | 1.60M | { |
211 | 1.60M | tmp[1] = co; |
212 | 1.60M | tmp[2] = NOTACHAR; |
213 | 1.60M | } |
214 | 1.74M | } |
215 | 2.16M | c++; |
216 | | |
217 | | /* Add characters. */ |
218 | 2.16M | do |
219 | 4.61M | { |
220 | | #if PCRE2_CODE_UNIT_WIDTH == 16 |
221 | | if (!(options & PARSE_CLASS_UTF) && *list > 0xffff) continue; |
222 | | #endif |
223 | | |
224 | 4.61M | if (*list < new_start) |
225 | 724k | { |
226 | 724k | if (*list + 1 == new_start) |
227 | 722 | { |
228 | 722 | new_start--; |
229 | 722 | continue; |
230 | 722 | } |
231 | 724k | } |
232 | 3.88M | else if (*list > new_end) |
233 | 461k | { |
234 | 461k | if (*list - 1 == new_end) |
235 | 15.9k | { |
236 | 15.9k | new_end++; |
237 | 15.9k | continue; |
238 | 15.9k | } |
239 | 461k | } |
240 | 3.42M | else continue; |
241 | | |
242 | 1.16M | result += 2; |
243 | 1.16M | if (buffer != NULL) |
244 | 584k | { |
245 | 584k | buffer[0] = *list; |
246 | 584k | buffer[1] = *list; |
247 | 584k | buffer += 2; |
248 | 584k | } |
249 | 1.16M | } |
250 | 4.61M | while (*(++list) != NOTACHAR); |
251 | 2.16M | } |
252 | | |
253 | 798k | if (buffer != NULL) |
254 | 399k | { |
255 | 399k | buffer[0] = new_start; |
256 | 399k | buffer[1] = new_end; |
257 | 399k | buffer += 2; |
258 | 399k | (void)buffer; |
259 | 399k | } |
260 | 798k | return result; |
261 | 798k | } |
262 | | |
263 | | #endif |
264 | | |
265 | | /* Add a character list to a buffer. */ |
266 | | |
267 | | static size_t |
268 | | append_char_list(const uint32_t *p, uint32_t *buffer) |
269 | 41.5k | { |
270 | 41.5k | const uint32_t *n; |
271 | 41.5k | size_t result = 0; |
272 | | |
273 | 402k | while (*p != NOTACHAR) |
274 | 361k | { |
275 | 361k | n = p; |
276 | 764k | while (n[0] == n[1] - 1) n++; |
277 | | |
278 | 361k | PCRE2_ASSERT(*p < 0xffff); |
279 | | |
280 | 361k | if (buffer != NULL) |
281 | 180k | { |
282 | 180k | buffer[0] = *p; |
283 | 180k | buffer[1] = *n; |
284 | 180k | buffer += 2; |
285 | 180k | } |
286 | | |
287 | 361k | result += 2; |
288 | 361k | p = n + 1; |
289 | 361k | } |
290 | | |
291 | 41.5k | return result; |
292 | 41.5k | } |
293 | | |
294 | | static uint32_t |
295 | | get_highest_char(uint32_t options) |
296 | 9.21k | { |
297 | 9.21k | (void)options; /* Avoid compiler warning. */ |
298 | | |
299 | 9.21k | #if PCRE2_CODE_UNIT_WIDTH == 8 |
300 | 9.21k | return MAX_UTF_CODE_POINT; |
301 | | #else |
302 | | #ifdef SUPPORT_UNICODE |
303 | | return GET_MAX_CHAR_VALUE((options & PARSE_CLASS_UTF) != 0); |
304 | | #else |
305 | | return MAX_UCHAR_VALUE; |
306 | | #endif |
307 | | #endif |
308 | 9.21k | } |
309 | | |
310 | | /* Add a negated character list to a buffer. */ |
311 | | static size_t |
312 | | append_negated_char_list(const uint32_t *p, uint32_t options, uint32_t *buffer) |
313 | 15.0k | { |
314 | 15.0k | const uint32_t *n; |
315 | 15.0k | uint32_t start = 0; |
316 | 15.0k | size_t result = 2; |
317 | | |
318 | 15.0k | PCRE2_ASSERT(*p > 0); |
319 | | |
320 | 139k | while (*p != NOTACHAR) |
321 | 124k | { |
322 | 124k | n = p; |
323 | 264k | while (n[0] == n[1] - 1) n++; |
324 | | |
325 | 124k | PCRE2_ASSERT(*p < 0xffff); |
326 | | |
327 | 124k | if (buffer != NULL) |
328 | 62.4k | { |
329 | 62.4k | buffer[0] = start; |
330 | 62.4k | buffer[1] = *p - 1; |
331 | 62.4k | buffer += 2; |
332 | 62.4k | } |
333 | | |
334 | 124k | result += 2; |
335 | 124k | start = *n + 1; |
336 | 124k | p = n + 1; |
337 | 124k | } |
338 | | |
339 | 15.0k | if (buffer != NULL) |
340 | 7.50k | { |
341 | 7.50k | buffer[0] = start; |
342 | 7.50k | buffer[1] = get_highest_char(options); |
343 | 7.50k | buffer += 2; |
344 | 7.50k | (void)buffer; |
345 | 7.50k | } |
346 | | |
347 | 15.0k | return result; |
348 | 15.0k | } |
349 | | |
350 | | static uint32_t * |
351 | | append_non_ascii_range(uint32_t options, uint32_t *buffer) |
352 | 2.90k | { |
353 | 2.90k | if (buffer == NULL) return NULL; |
354 | | |
355 | 1.45k | buffer[0] = 0x100; |
356 | 1.45k | buffer[1] = get_highest_char(options); |
357 | 1.45k | return buffer + 2; |
358 | 2.90k | } |
359 | | |
360 | | static size_t |
361 | | parse_class(uint32_t *ptr, uint32_t options, uint32_t *buffer) |
362 | 77.0k | { |
363 | 77.0k | size_t total_size = 0; |
364 | 77.0k | size_t size; |
365 | 77.0k | uint32_t meta_arg; |
366 | 77.0k | uint32_t start_char; |
367 | | |
368 | 1.28M | while (TRUE) |
369 | 1.28M | { |
370 | 1.28M | switch (META_CODE(*ptr)) |
371 | 1.28M | { |
372 | 72.4k | case META_ESCAPE: |
373 | 72.4k | meta_arg = META_DATA(*ptr); |
374 | 72.4k | switch (meta_arg) |
375 | 72.4k | { |
376 | 784 | case ESC_D: |
377 | 2.00k | case ESC_W: |
378 | 2.77k | case ESC_S: |
379 | 2.77k | buffer = append_non_ascii_range(options, buffer); |
380 | 2.77k | total_size += 2; |
381 | 2.77k | break; |
382 | | |
383 | 39.4k | case ESC_h: |
384 | 39.4k | size = append_char_list(PRIV(hspace_list), buffer); |
385 | 39.4k | total_size += size; |
386 | 39.4k | if (buffer != NULL) buffer += size; |
387 | 39.4k | break; |
388 | | |
389 | 13.3k | case ESC_H: |
390 | 13.3k | size = append_negated_char_list(PRIV(hspace_list), options, buffer); |
391 | 13.3k | total_size += size; |
392 | 13.3k | if (buffer != NULL) buffer += size; |
393 | 13.3k | break; |
394 | | |
395 | 2.09k | case ESC_v: |
396 | 2.09k | size = append_char_list(PRIV(vspace_list), buffer); |
397 | 2.09k | total_size += size; |
398 | 2.09k | if (buffer != NULL) buffer += size; |
399 | 2.09k | break; |
400 | | |
401 | 1.68k | case ESC_V: |
402 | 1.68k | size = append_negated_char_list(PRIV(vspace_list), options, buffer); |
403 | 1.68k | total_size += size; |
404 | 1.68k | if (buffer != NULL) buffer += size; |
405 | 1.68k | break; |
406 | | |
407 | 6.79k | case ESC_p: |
408 | 11.9k | case ESC_P: |
409 | 11.9k | ptr++; |
410 | 11.9k | if (meta_arg == ESC_p && (*ptr >> 16) == PT_ANY) |
411 | 518 | { |
412 | 518 | if (buffer != NULL) |
413 | 259 | { |
414 | 259 | buffer[0] = 0; |
415 | 259 | buffer[1] = get_highest_char(options); |
416 | 259 | buffer += 2; |
417 | 259 | } |
418 | 518 | total_size += 2; |
419 | 518 | } |
420 | 11.9k | break; |
421 | 72.4k | } |
422 | 72.4k | ptr++; |
423 | 72.4k | continue; |
424 | 132 | case META_POSIX_NEG: |
425 | 132 | buffer = append_non_ascii_range(options, buffer); |
426 | 132 | total_size += 2; |
427 | 132 | ptr += 2; |
428 | 132 | continue; |
429 | 945 | case META_POSIX: |
430 | 945 | ptr += 2; |
431 | 945 | continue; |
432 | 0 | case META_BIGVALUE: |
433 | | /* Character literal */ |
434 | 0 | ptr++; |
435 | 0 | break; |
436 | 1.20M | CLASS_END_CASES(*ptr) |
437 | 1.20M | if (*ptr >= META_END) return total_size; |
438 | 1.13M | break; |
439 | 1.28M | } |
440 | | |
441 | 1.13M | start_char = *ptr; |
442 | | |
443 | 1.13M | if (ptr[1] == META_RANGE_LITERAL || ptr[1] == META_RANGE_ESCAPED) |
444 | 6.87k | { |
445 | 6.87k | ptr += 2; |
446 | 6.87k | PCRE2_ASSERT(*ptr < META_END || *ptr == META_BIGVALUE); |
447 | | |
448 | 6.87k | if (*ptr == META_BIGVALUE) ptr++; |
449 | | |
450 | | #ifdef EBCDIC |
451 | | #error "Missing EBCDIC support" |
452 | | #endif |
453 | 6.87k | } |
454 | | |
455 | 1.13M | #ifdef SUPPORT_UNICODE |
456 | 1.13M | if (options & PARSE_CLASS_CASELESS_UTF) |
457 | 798k | { |
458 | 798k | size = utf_caseless_extend(start_char, *ptr++, options, buffer); |
459 | 798k | if (buffer != NULL) buffer += size; |
460 | 798k | total_size += size; |
461 | 798k | continue; |
462 | 798k | } |
463 | 332k | #endif |
464 | | |
465 | 332k | if (buffer != NULL) |
466 | 166k | { |
467 | 166k | buffer[0] = start_char; |
468 | 166k | buffer[1] = *ptr; |
469 | 166k | buffer += 2; |
470 | 166k | } |
471 | | |
472 | 332k | ptr++; |
473 | 332k | total_size += 2; |
474 | 332k | } |
475 | | |
476 | 0 | return total_size; |
477 | 77.0k | } |
478 | | |
479 | | /* Extra uint32_t values for storing the lengths of range lists in |
480 | | the worst case. Two uint32_t lengths and a range end for a range |
481 | | starting before 255 */ |
482 | 38.0k | #define CHAR_LIST_EXTRA_SIZE 3 |
483 | | |
484 | | /* Starting character values for each character list. */ |
485 | | |
486 | | static const uint32_t char_list_starts[] = { |
487 | | #if PCRE2_CODE_UNIT_WIDTH == 32 |
488 | | XCL_CHAR_LIST_HIGH_32_START, |
489 | | #endif |
490 | | #if PCRE2_CODE_UNIT_WIDTH == 32 || defined SUPPORT_UNICODE |
491 | | XCL_CHAR_LIST_LOW_32_START, |
492 | | #endif |
493 | | XCL_CHAR_LIST_HIGH_16_START, |
494 | | /* Must be terminated by XCL_CHAR_LIST_LOW_16_START, |
495 | | which also represents the end of the bitset. */ |
496 | | XCL_CHAR_LIST_LOW_16_START, |
497 | | }; |
498 | | |
499 | | static class_ranges * |
500 | | compile_optimize_class(uint32_t *start_ptr, uint32_t options, |
501 | | uint32_t xoptions, compile_block *cb) |
502 | 39.0k | { |
503 | 39.0k | class_ranges* cranges; |
504 | 39.0k | uint32_t *ptr; |
505 | 39.0k | uint32_t *buffer; |
506 | 39.0k | uint32_t *dst; |
507 | 39.0k | uint32_t class_options = 0; |
508 | 39.0k | size_t range_list_size = 0, total_size, i; |
509 | 39.0k | uint32_t tmp1, tmp2; |
510 | 39.0k | const uint32_t *char_list_next; |
511 | 39.0k | uint16_t *next_char; |
512 | 39.0k | uint32_t char_list_start, char_list_end; |
513 | 39.0k | uint32_t range_start, range_end; |
514 | | |
515 | 39.0k | #ifdef SUPPORT_UNICODE |
516 | 39.0k | if (options & PCRE2_UTF) |
517 | 39.0k | class_options |= PARSE_CLASS_UTF; |
518 | | |
519 | 39.0k | if ((options & PCRE2_CASELESS) && (options & (PCRE2_UTF|PCRE2_UCP))) |
520 | 13.4k | class_options |= PARSE_CLASS_CASELESS_UTF; |
521 | | |
522 | 39.0k | if (xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) |
523 | 309 | class_options |= PARSE_CLASS_RESTRICTED_UTF; |
524 | | |
525 | 39.0k | if (xoptions & PCRE2_EXTRA_TURKISH_CASING) |
526 | 0 | class_options |= PARSE_CLASS_TURKISH_UTF; |
527 | | #else |
528 | | (void)options; /* Avoid compiler warning. */ |
529 | | (void)xoptions; /* Avoid compiler warning. */ |
530 | | #endif |
531 | | |
532 | | /* Compute required space for the range. */ |
533 | | |
534 | 39.0k | range_list_size = parse_class(start_ptr, class_options, NULL); |
535 | 39.0k | PCRE2_ASSERT((range_list_size & 0x1) == 0); |
536 | | |
537 | | /* Allocate buffer. The total_size also represents the end of the buffer. */ |
538 | | |
539 | 39.0k | total_size = range_list_size + |
540 | 39.0k | ((range_list_size >= 2) ? CHAR_LIST_EXTRA_SIZE : 0); |
541 | | |
542 | 39.0k | cranges = cb->cx->memctl.malloc( |
543 | 39.0k | sizeof(class_ranges) + total_size * sizeof(uint32_t), |
544 | 39.0k | cb->cx->memctl.memory_data); |
545 | | |
546 | 39.0k | if (cranges == NULL) return NULL; |
547 | | |
548 | 39.0k | cranges->header.next = NULL; |
549 | | #ifdef PCRE2_DEBUG |
550 | | cranges->header.type = CDATA_CRANGE; |
551 | | #endif |
552 | 39.0k | cranges->range_list_size = (uint16_t)range_list_size; |
553 | 39.0k | cranges->char_lists_types = 0; |
554 | 39.0k | cranges->char_lists_size = 0; |
555 | 39.0k | cranges->char_lists_start = 0; |
556 | | |
557 | 39.0k | if (range_list_size == 0) return cranges; |
558 | | |
559 | 38.0k | buffer = (uint32_t*)(cranges + 1); |
560 | 38.0k | parse_class(start_ptr, class_options, buffer); |
561 | | |
562 | | /* Using <= instead of == to help static analysis. */ |
563 | 38.0k | if (range_list_size <= 2) return cranges; |
564 | | |
565 | | /* In-place sorting of ranges. */ |
566 | | |
567 | 31.6k | i = (((range_list_size >> 2) - 1) << 1); |
568 | 688k | while (TRUE) |
569 | 688k | { |
570 | 688k | do_heapify(buffer, range_list_size, i); |
571 | 688k | if (i == 0) break; |
572 | 657k | i -= 2; |
573 | 657k | } |
574 | | |
575 | 31.6k | i = range_list_size - 2; |
576 | 1.39M | while (TRUE) |
577 | 1.39M | { |
578 | 1.39M | tmp1 = buffer[i]; |
579 | 1.39M | tmp2 = buffer[i + 1]; |
580 | 1.39M | buffer[i] = buffer[0]; |
581 | 1.39M | buffer[i + 1] = buffer[1]; |
582 | 1.39M | buffer[0] = tmp1; |
583 | 1.39M | buffer[1] = tmp2; |
584 | | |
585 | 1.39M | do_heapify(buffer, i, 0); |
586 | 1.39M | if (i == 0) break; |
587 | 1.36M | i -= 2; |
588 | 1.36M | } |
589 | | |
590 | | /* Merge ranges whenever possible. */ |
591 | 31.6k | dst = buffer; |
592 | 31.6k | ptr = buffer + 2; |
593 | 31.6k | range_list_size -= 2; |
594 | | |
595 | | /* The second condition is a very rare corner case, where the end of the last |
596 | | range is the maximum character. This range cannot be extended further. */ |
597 | | |
598 | 1.39M | while (range_list_size > 0 && dst[1] != ~(uint32_t)0) |
599 | 1.36M | { |
600 | 1.36M | if (dst[1] + 1 < ptr[0]) |
601 | 202k | { |
602 | 202k | dst += 2; |
603 | 202k | dst[0] = ptr[0]; |
604 | 202k | dst[1] = ptr[1]; |
605 | 202k | } |
606 | 1.16M | else if (dst[1] < ptr[1]) dst[1] = ptr[1]; |
607 | | |
608 | 1.36M | ptr += 2; |
609 | 1.36M | range_list_size -= 2; |
610 | 1.36M | } |
611 | | |
612 | 31.6k | PCRE2_ASSERT(dst[1] <= get_highest_char(class_options)); |
613 | | |
614 | | /* When the number of ranges are less than six, |
615 | | they are not converted to range lists. */ |
616 | | |
617 | 31.6k | ptr = buffer; |
618 | 153k | while (ptr < dst && ptr[1] < 0x100) ptr += 2; |
619 | 31.6k | if (dst - ptr < (2 * (6 - 1))) |
620 | 17.8k | { |
621 | 17.8k | cranges->range_list_size = (uint16_t)(dst + 2 - buffer); |
622 | 17.8k | return cranges; |
623 | 17.8k | } |
624 | | |
625 | | /* Compute character lists structures. */ |
626 | | |
627 | 13.7k | char_list_next = char_list_starts; |
628 | 13.7k | char_list_start = *char_list_next++; |
629 | | #if PCRE2_CODE_UNIT_WIDTH == 32 |
630 | | char_list_end = XCL_CHAR_LIST_HIGH_32_END; |
631 | | #elif defined SUPPORT_UNICODE |
632 | 13.7k | char_list_end = XCL_CHAR_LIST_LOW_32_END; |
633 | | #else |
634 | | char_list_end = XCL_CHAR_LIST_HIGH_16_END; |
635 | | #endif |
636 | 13.7k | next_char = (uint16_t*)(buffer + total_size); |
637 | | |
638 | 13.7k | tmp1 = 0; |
639 | 13.7k | tmp2 = ((sizeof(char_list_starts) / sizeof(uint32_t)) - 1) * XCL_TYPE_BIT_LEN; |
640 | 13.7k | PCRE2_ASSERT(tmp2 <= 3 * XCL_TYPE_BIT_LEN && tmp2 >= XCL_TYPE_BIT_LEN); |
641 | 13.7k | range_start = dst[0]; |
642 | 13.7k | range_end = dst[1]; |
643 | | |
644 | 130k | while (TRUE) |
645 | 130k | { |
646 | 130k | if (range_start >= char_list_start) |
647 | 89.1k | { |
648 | 89.1k | if (range_start == range_end || range_end < char_list_end) |
649 | 86.7k | { |
650 | 86.7k | tmp1++; |
651 | 86.7k | next_char--; |
652 | | |
653 | 86.7k | if (char_list_start < XCL_CHAR_LIST_LOW_32_START) |
654 | 84.4k | *next_char = (uint16_t)((range_end << XCL_CHAR_SHIFT) | XCL_CHAR_END); |
655 | 2.25k | else |
656 | 2.25k | *(uint32_t*)(--next_char) = |
657 | 2.25k | (range_end << XCL_CHAR_SHIFT) | XCL_CHAR_END; |
658 | 86.7k | } |
659 | | |
660 | 89.1k | if (range_start < range_end) |
661 | 26.8k | { |
662 | 26.8k | if (range_start > char_list_start) |
663 | 26.7k | { |
664 | 26.7k | tmp1++; |
665 | 26.7k | next_char--; |
666 | | |
667 | 26.7k | if (char_list_start < XCL_CHAR_LIST_LOW_32_START) |
668 | 26.6k | *next_char = (uint16_t)(range_start << XCL_CHAR_SHIFT); |
669 | 121 | else |
670 | 121 | *(uint32_t*)(--next_char) = (range_start << XCL_CHAR_SHIFT); |
671 | 26.7k | } |
672 | 66 | else |
673 | 66 | cranges->char_lists_types |= XCL_BEGIN_WITH_RANGE << tmp2; |
674 | 26.8k | } |
675 | | |
676 | 89.1k | PCRE2_ASSERT((uint32_t*)next_char >= dst + 2); |
677 | | |
678 | 89.1k | if (dst > buffer) |
679 | 89.0k | { |
680 | 89.0k | dst -= 2; |
681 | 89.0k | range_start = dst[0]; |
682 | 89.0k | range_end = dst[1]; |
683 | 89.0k | continue; |
684 | 89.0k | } |
685 | | |
686 | 132 | range_start = 0; |
687 | 132 | range_end = 0; |
688 | 132 | } |
689 | | |
690 | 41.0k | if (range_end >= char_list_start) |
691 | 7.19k | { |
692 | 7.19k | PCRE2_ASSERT(range_start < char_list_start); |
693 | | |
694 | 7.19k | if (range_end < char_list_end) |
695 | 4.81k | { |
696 | 4.81k | tmp1++; |
697 | 4.81k | next_char--; |
698 | | |
699 | 4.81k | if (char_list_start < XCL_CHAR_LIST_LOW_32_START) |
700 | 2.40k | *next_char = (uint16_t)((range_end << XCL_CHAR_SHIFT) | XCL_CHAR_END); |
701 | 2.40k | else |
702 | 2.40k | *(uint32_t*)(--next_char) = |
703 | 2.40k | (range_end << XCL_CHAR_SHIFT) | XCL_CHAR_END; |
704 | | |
705 | 4.81k | PCRE2_ASSERT((uint32_t*)next_char >= dst + 2); |
706 | 4.81k | } |
707 | | |
708 | 7.19k | cranges->char_lists_types |= XCL_BEGIN_WITH_RANGE << tmp2; |
709 | 7.19k | } |
710 | | |
711 | 41.0k | if (tmp1 >= XCL_ITEM_COUNT_MASK) |
712 | 13.9k | { |
713 | 13.9k | cranges->char_lists_types |= XCL_ITEM_COUNT_MASK << tmp2; |
714 | 13.9k | next_char--; |
715 | | |
716 | 13.9k | if (char_list_start < XCL_CHAR_LIST_LOW_32_START) |
717 | 13.7k | *next_char = (uint16_t)tmp1; |
718 | 184 | else |
719 | 184 | *(uint32_t*)(--next_char) = tmp1; |
720 | 13.9k | } |
721 | 27.1k | else |
722 | 27.1k | cranges->char_lists_types |= tmp1 << tmp2; |
723 | | |
724 | 41.0k | if (range_start < XCL_CHAR_LIST_LOW_16_START) break; |
725 | | |
726 | 27.3k | PCRE2_ASSERT(tmp2 >= XCL_TYPE_BIT_LEN); |
727 | 27.3k | char_list_end = char_list_start - 1; |
728 | 27.3k | char_list_start = *char_list_next++; |
729 | 27.3k | tmp1 = 0; |
730 | 27.3k | tmp2 -= XCL_TYPE_BIT_LEN; |
731 | 27.3k | } |
732 | | |
733 | 13.7k | if (dst[0] < XCL_CHAR_LIST_LOW_16_START) dst += 2; |
734 | 13.7k | PCRE2_ASSERT((uint16_t*)dst <= next_char); |
735 | | |
736 | 13.7k | cranges->char_lists_size = |
737 | 13.7k | (size_t)((uint8_t*)(buffer + total_size) - (uint8_t*)next_char); |
738 | 13.7k | cranges->char_lists_start = (size_t)((uint8_t*)next_char - (uint8_t*)buffer); |
739 | 13.7k | cranges->range_list_size = (uint16_t)(dst - buffer); |
740 | 13.7k | return cranges; |
741 | 31.6k | } |
742 | | |
743 | | #endif /* SUPPORT_WIDE_CHARS */ |
744 | | |
745 | | #ifdef SUPPORT_UNICODE |
746 | | |
747 | | void PRIV(update_classbits)(uint32_t ptype, uint32_t pdata, BOOL negated, |
748 | | uint8_t *classbits) |
749 | 40.4k | { |
750 | | /* Update PRIV(xclass) when this function is changed. */ |
751 | 40.4k | int c, chartype; |
752 | 40.4k | const ucd_record *prop; |
753 | 40.4k | uint32_t gentype; |
754 | 40.4k | BOOL set_bit; |
755 | | |
756 | 40.4k | if (ptype == PT_ANY) |
757 | 0 | { |
758 | 0 | if (!negated) memset(classbits, 0xff, 32); |
759 | 0 | return; |
760 | 0 | } |
761 | | |
762 | 10.3M | for (c = 0; c < 256; c++) |
763 | 10.3M | { |
764 | 10.3M | prop = GET_UCD(c); |
765 | 10.3M | set_bit = FALSE; |
766 | 10.3M | (void)set_bit; |
767 | | |
768 | 10.3M | switch (ptype) |
769 | 10.3M | { |
770 | 553k | case PT_LAMP: |
771 | 553k | chartype = prop->chartype; |
772 | 553k | set_bit = (chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt); |
773 | 553k | break; |
774 | | |
775 | 834k | case PT_GC: |
776 | 834k | set_bit = (PRIV(ucp_gentype)[prop->chartype] == pdata); |
777 | 834k | break; |
778 | | |
779 | 1.41M | case PT_PC: |
780 | 1.41M | set_bit = (prop->chartype == pdata); |
781 | 1.41M | break; |
782 | | |
783 | 27.6k | case PT_SC: |
784 | 27.6k | set_bit = (prop->script == pdata); |
785 | 27.6k | break; |
786 | | |
787 | 108k | case PT_SCX: |
788 | 108k | set_bit = (prop->script == pdata || |
789 | 108k | MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), pdata) != 0); |
790 | 108k | break; |
791 | | |
792 | 188k | case PT_ALNUM: |
793 | 188k | gentype = PRIV(ucp_gentype)[prop->chartype]; |
794 | 188k | set_bit = (gentype == ucp_L || gentype == ucp_N); |
795 | 188k | break; |
796 | | |
797 | 1.90M | case PT_SPACE: /* Perl space */ |
798 | 1.99M | case PT_PXSPACE: /* POSIX space */ |
799 | 1.99M | switch(c) |
800 | 1.99M | { |
801 | 46.7k | HSPACE_BYTE_CASES: |
802 | 62.2k | VSPACE_BYTE_CASES: |
803 | 62.2k | set_bit = TRUE; |
804 | 62.2k | break; |
805 | | |
806 | 1.93M | default: |
807 | 1.93M | set_bit = (PRIV(ucp_gentype)[prop->chartype] == ucp_Z); |
808 | 1.93M | break; |
809 | 1.99M | } |
810 | 1.99M | break; |
811 | | |
812 | 2.88M | case PT_WORD: |
813 | 2.88M | chartype = prop->chartype; |
814 | 2.88M | gentype = PRIV(ucp_gentype)[chartype]; |
815 | 2.88M | set_bit = (gentype == ucp_L || gentype == ucp_N || |
816 | 2.88M | chartype == ucp_Mn || chartype == ucp_Pc); |
817 | 2.88M | break; |
818 | | |
819 | 495k | case PT_UCNC: |
820 | 495k | set_bit = (c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || |
821 | 495k | c == CHAR_GRAVE_ACCENT || c >= 0xa0); |
822 | 495k | break; |
823 | | |
824 | 95.2k | case PT_BIDICL: |
825 | 95.2k | set_bit = (UCD_BIDICLASS_PROP(prop) == pdata); |
826 | 95.2k | break; |
827 | | |
828 | 40.9k | case PT_BOOL: |
829 | 40.9k | set_bit = MAPBIT(PRIV(ucd_boolprop_sets) + |
830 | 40.9k | UCD_BPROPS_PROP(prop), pdata) != 0; |
831 | 40.9k | break; |
832 | | |
833 | 234k | case PT_PXGRAPH: |
834 | 234k | chartype = prop->chartype; |
835 | 234k | gentype = PRIV(ucp_gentype)[chartype]; |
836 | 234k | set_bit = (gentype != ucp_Z && (gentype != ucp_C || chartype == ucp_Cf)); |
837 | 234k | break; |
838 | | |
839 | 180k | case PT_PXPRINT: |
840 | 180k | chartype = prop->chartype; |
841 | 180k | set_bit = (chartype != ucp_Zl && chartype != ucp_Zp && |
842 | 180k | (PRIV(ucp_gentype)[chartype] != ucp_C || chartype == ucp_Cf)); |
843 | 180k | break; |
844 | | |
845 | 511k | case PT_PXPUNCT: |
846 | 511k | gentype = PRIV(ucp_gentype)[prop->chartype]; |
847 | 511k | set_bit = (gentype == ucp_P || (c < 128 && gentype == ucp_S)); |
848 | 511k | break; |
849 | | |
850 | 782k | default: |
851 | 782k | PCRE2_ASSERT(ptype == PT_PXXDIGIT); |
852 | 782k | set_bit = (c >= CHAR_0 && c <= CHAR_9) || |
853 | 782k | (c >= CHAR_A && c <= CHAR_F) || |
854 | 782k | (c >= CHAR_a && c <= CHAR_f); |
855 | 782k | break; |
856 | 10.3M | } |
857 | | |
858 | 10.3M | if (negated) set_bit = !set_bit; |
859 | 10.3M | if (set_bit) *classbits |= (uint8_t)(1 << (c & 0x7)); |
860 | 10.3M | if ((c & 0x7) == 0x7) classbits++; |
861 | 10.3M | } |
862 | 40.4k | } |
863 | | |
864 | | #endif /* SUPPORT_UNICODE */ |
865 | | |
866 | | |
867 | | |
868 | | #ifdef SUPPORT_WIDE_CHARS |
869 | | |
870 | | /************************************************* |
871 | | * XClass related properties * |
872 | | *************************************************/ |
873 | | |
874 | | /* XClass needs to be generated. */ |
875 | 276k | #define XCLASS_REQUIRED 0x1 |
876 | | /* XClass has 8 bit character. */ |
877 | 2.60M | #define XCLASS_HAS_8BIT_CHARS 0x2 |
878 | | /* XClass has properties. */ |
879 | 94.8k | #define XCLASS_HAS_PROPS 0x4 |
880 | | /* XClass has character lists. */ |
881 | 131k | #define XCLASS_HAS_CHAR_LISTS 0x8 |
882 | | /* XClass matches to all >= 256 characters. */ |
883 | 84.5k | #define XCLASS_HIGH_ANY 0x10 |
884 | | |
885 | | #endif |
886 | | |
887 | | |
888 | | /************************************************* |
889 | | * Internal entry point for add range to class * |
890 | | *************************************************/ |
891 | | |
892 | | /* This function sets the overall range for characters < 256. |
893 | | It also handles non-utf case folding. |
894 | | |
895 | | Arguments: |
896 | | options the options bits |
897 | | xoptions the extra options bits |
898 | | cb compile data |
899 | | start start of range character |
900 | | end end of range character |
901 | | |
902 | | Returns: cb->classbits is updated |
903 | | */ |
904 | | |
905 | | static void |
906 | | add_to_class(uint32_t options, uint32_t xoptions, compile_block *cb, |
907 | | uint32_t start, uint32_t end) |
908 | 2.65M | { |
909 | 2.65M | uint8_t *classbits = cb->classbits.classbits; |
910 | 2.65M | uint32_t c, byte_start, byte_end; |
911 | 2.65M | uint32_t classbits_end = (end <= 0xff ? end : 0xff); |
912 | | |
913 | | #ifndef SUPPORT_UNICODE |
914 | | (void)xoptions; /* Avoid compiler warning. */ |
915 | | #endif |
916 | | |
917 | | /* If caseless matching is required, scan the range and process alternate |
918 | | cases. In Unicode, there are 8-bit characters that have alternate cases that |
919 | | are greater than 255 and vice-versa (though these may be ignored if caseless |
920 | | restriction is in force). Sometimes we can just extend the original range. */ |
921 | | |
922 | 2.65M | if ((options & PCRE2_CASELESS) != 0) |
923 | 361k | { |
924 | 361k | #ifdef SUPPORT_UNICODE |
925 | | /* UTF mode. This branch is taken if we don't support wide characters (e.g. |
926 | | 8-bit library, without UTF), but we do treat those characters as Unicode |
927 | | (if UCP flag is set). In this case, we only need to expand the character class |
928 | | set to include the case pairs which are in the 0-255 codepoint range. */ |
929 | 361k | if ((options & (PCRE2_UTF|PCRE2_UCP)) != 0) |
930 | 270k | { |
931 | 270k | BOOL turkish_i = (xoptions & (PCRE2_EXTRA_TURKISH_CASING|PCRE2_EXTRA_CASELESS_RESTRICT)) == |
932 | 270k | PCRE2_EXTRA_TURKISH_CASING; |
933 | 270k | if (start < 128) |
934 | 202k | { |
935 | 202k | uint32_t lo_end = (classbits_end < 127 ? classbits_end : 127); |
936 | 456k | for (c = start; c <= lo_end; c++) |
937 | 253k | { |
938 | 253k | if (turkish_i && UCD_ANY_I(c)) continue; |
939 | 253k | SETBIT(classbits, cb->fcc[c]); |
940 | 253k | } |
941 | 202k | } |
942 | 270k | if (classbits_end >= 128) |
943 | 67.6k | { |
944 | 67.6k | uint32_t hi_start = (start > 128 ? start : 128); |
945 | 158k | for (c = hi_start; c <= classbits_end; c++) |
946 | 90.4k | { |
947 | 90.4k | uint32_t co = UCD_OTHERCASE(c); |
948 | 90.4k | if (co <= 0xff) SETBIT(classbits, co); |
949 | 90.4k | } |
950 | 67.6k | } |
951 | 270k | } |
952 | | |
953 | 91.1k | else |
954 | 91.1k | #endif /* SUPPORT_UNICODE */ |
955 | | |
956 | | /* Not UTF mode */ |
957 | 91.1k | { |
958 | 207k | for (c = start; c <= classbits_end; c++) |
959 | 115k | SETBIT(classbits, cb->fcc[c]); |
960 | 91.1k | } |
961 | 361k | } |
962 | | |
963 | | /* Use the bitmap for characters < 256. Otherwise use extra data. */ |
964 | | |
965 | 2.65M | byte_start = (start + 7) >> 3; |
966 | 2.65M | byte_end = (classbits_end + 1) >> 3; |
967 | | |
968 | 2.65M | if (byte_start >= byte_end) |
969 | 2.57M | { |
970 | 5.17M | for (c = start; c <= classbits_end; c++) |
971 | | /* Regardless of start, c will always be <= 255. */ |
972 | 2.60M | SETBIT(classbits, c); |
973 | 2.57M | return; |
974 | 2.57M | } |
975 | | |
976 | 755k | for (c = byte_start; c < byte_end; c++) |
977 | 669k | classbits[c] = 0xff; |
978 | | |
979 | 85.7k | byte_start <<= 3; |
980 | 85.7k | byte_end <<= 3; |
981 | | |
982 | 441k | for (c = start; c < byte_start; c++) |
983 | 355k | SETBIT(classbits, c); |
984 | | |
985 | 159k | for (c = byte_end; c <= classbits_end; c++) |
986 | 73.4k | SETBIT(classbits, c); |
987 | 85.7k | } |
988 | | |
989 | | |
990 | | #if PCRE2_CODE_UNIT_WIDTH == 8 |
991 | | /************************************************* |
992 | | * Internal entry point for add list to class * |
993 | | *************************************************/ |
994 | | |
995 | | /* This function is used for adding a list of horizontal or vertical whitespace |
996 | | characters to a class. The list must be in order so that ranges of characters |
997 | | can be detected and handled appropriately. This function sets the overall range |
998 | | so that the internal functions can try to avoid duplication when handling |
999 | | case-independence. |
1000 | | |
1001 | | Arguments: |
1002 | | options the options bits |
1003 | | xoptions the extra options bits |
1004 | | cb contains pointers to tables etc. |
1005 | | p points to row of 32-bit values, terminated by NOTACHAR |
1006 | | |
1007 | | Returns: cb->classbits is updated |
1008 | | */ |
1009 | | |
1010 | | static void |
1011 | | add_list_to_class(uint32_t options, uint32_t xoptions, compile_block *cb, |
1012 | | const uint32_t *p) |
1013 | 14.2k | { |
1014 | 54.4k | while (p[0] < 256) |
1015 | 40.1k | { |
1016 | 40.1k | unsigned int n = 0; |
1017 | | |
1018 | 48.1k | while(p[n+1] == p[0] + n + 1) n++; |
1019 | 40.1k | add_to_class(options, xoptions, cb, p[0], p[n]); |
1020 | | |
1021 | 40.1k | p += n + 1; |
1022 | 40.1k | } |
1023 | 14.2k | } |
1024 | | |
1025 | | |
1026 | | |
1027 | | /************************************************* |
1028 | | * Add characters not in a list to a class * |
1029 | | *************************************************/ |
1030 | | |
1031 | | /* This function is used for adding the complement of a list of horizontal or |
1032 | | vertical whitespace to a class. The list must be in order. |
1033 | | |
1034 | | Arguments: |
1035 | | options the options bits |
1036 | | xoptions the extra options bits |
1037 | | cb contains pointers to tables etc. |
1038 | | p points to row of 32-bit values, terminated by NOTACHAR |
1039 | | |
1040 | | Returns: cb->classbits is updated |
1041 | | */ |
1042 | | |
1043 | | static void |
1044 | | add_not_list_to_class(uint32_t options, uint32_t xoptions, compile_block *cb, |
1045 | | const uint32_t *p) |
1046 | 15.9k | { |
1047 | 15.9k | if (p[0] > 0) |
1048 | 15.9k | add_to_class(options, xoptions, cb, 0, p[0] - 1); |
1049 | 61.5k | while (p[0] < 256) |
1050 | 45.6k | { |
1051 | 52.0k | while (p[1] == p[0] + 1) p++; |
1052 | 45.6k | add_to_class(options, xoptions, cb, p[0] + 1, (p[1] > 255) ? 255 : p[1] - 1); |
1053 | 45.6k | p++; |
1054 | 45.6k | } |
1055 | 15.9k | } |
1056 | | #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */ |
1057 | | |
1058 | | |
1059 | | |
1060 | | /************************************************* |
1061 | | * Main entry-point to compile a character class * |
1062 | | *************************************************/ |
1063 | | |
1064 | | /* This function consumes a "leaf", which is a set of characters that will |
1065 | | become a single OP_CLASS OP_NCLASS, OP_XCLASS, or OP_ALLANY. */ |
1066 | | |
1067 | | uint32_t * |
1068 | | PRIV(compile_class_not_nested)(uint32_t options, uint32_t xoptions, |
1069 | | uint32_t *start_ptr, PCRE2_UCHAR **pcode, BOOL negate_class, BOOL* has_bitmap, |
1070 | | int *errorcodeptr, compile_block *cb, PCRE2_SIZE *lengthptr) |
1071 | 199k | { |
1072 | 199k | uint32_t *pptr = start_ptr; |
1073 | 199k | PCRE2_UCHAR *code = *pcode; |
1074 | 199k | BOOL should_flip_negation; |
1075 | 199k | const uint8_t *cbits = cb->cbits; |
1076 | | /* Some functions such as add_to_class() or eclass processing |
1077 | | expects that the bitset is stored in cb->classbits.classbits. */ |
1078 | 199k | uint8_t *const classbits = cb->classbits.classbits; |
1079 | | |
1080 | 199k | #ifdef SUPPORT_UNICODE |
1081 | 199k | BOOL utf = (options & PCRE2_UTF) != 0; |
1082 | | #else /* No Unicode support */ |
1083 | | BOOL utf = FALSE; |
1084 | | #endif |
1085 | | |
1086 | | /* Helper variables for OP_XCLASS opcode (for characters > 255). */ |
1087 | | |
1088 | 199k | #ifdef SUPPORT_WIDE_CHARS |
1089 | 199k | uint32_t xclass_props; |
1090 | 199k | PCRE2_UCHAR *class_uchardata; |
1091 | 199k | class_ranges* cranges; |
1092 | | #else |
1093 | | (void)has_bitmap; /* Avoid compiler warning. */ |
1094 | | (void)errorcodeptr; /* Avoid compiler warning. */ |
1095 | | (void)lengthptr; /* Avoid compiler warning. */ |
1096 | | #endif |
1097 | | |
1098 | | /* If an XClass contains a negative special such as \S, we need to flip the |
1099 | | negation flag at the end, so that support for characters > 255 works correctly |
1100 | | (they are all included in the class). An XClass may need to insert specific |
1101 | | matching or non-matching code for wide characters. |
1102 | | */ |
1103 | | |
1104 | 199k | should_flip_negation = FALSE; |
1105 | | |
1106 | | /* XClass will be used when characters > 255 might match. */ |
1107 | | |
1108 | 199k | #ifdef SUPPORT_WIDE_CHARS |
1109 | 199k | xclass_props = 0; |
1110 | | |
1111 | 199k | #if PCRE2_CODE_UNIT_WIDTH == 8 |
1112 | 199k | cranges = NULL; |
1113 | | |
1114 | 199k | if (utf) |
1115 | 60.2k | #endif |
1116 | 60.2k | { |
1117 | 60.2k | if (lengthptr != NULL) |
1118 | 39.0k | { |
1119 | 39.0k | cranges = compile_optimize_class(pptr, options, xoptions, cb); |
1120 | | |
1121 | 39.0k | if (cranges == NULL) |
1122 | 0 | { |
1123 | 0 | *errorcodeptr = ERR21; |
1124 | 0 | return NULL; |
1125 | 0 | } |
1126 | | |
1127 | | /* Caching the pre-processed character ranges. */ |
1128 | 39.0k | if (cb->last_data != NULL) |
1129 | 35.3k | cb->last_data->next = &cranges->header; |
1130 | 3.68k | else |
1131 | 3.68k | cb->first_data = &cranges->header; |
1132 | | |
1133 | 39.0k | cb->last_data = &cranges->header; |
1134 | 39.0k | } |
1135 | 21.1k | else |
1136 | 21.1k | { |
1137 | | /* Reuse the pre-processed character ranges. */ |
1138 | 21.1k | cranges = (class_ranges*)cb->first_data; |
1139 | 21.1k | PCRE2_ASSERT(cranges != NULL && cranges->header.type == CDATA_CRANGE); |
1140 | 21.1k | cb->first_data = cranges->header.next; |
1141 | 21.1k | } |
1142 | | |
1143 | 60.2k | if (cranges->range_list_size > 0) |
1144 | 57.9k | { |
1145 | 57.9k | const uint32_t *ranges = (const uint32_t*)(cranges + 1); |
1146 | | |
1147 | 57.9k | if (ranges[0] <= 255) |
1148 | 53.5k | xclass_props |= XCLASS_HAS_8BIT_CHARS; |
1149 | | |
1150 | 57.9k | if (ranges[cranges->range_list_size - 1] == GET_MAX_CHAR_VALUE(utf) && |
1151 | 57.9k | ranges[cranges->range_list_size - 2] <= 256) |
1152 | 1.79k | xclass_props |= XCLASS_HIGH_ANY; |
1153 | 57.9k | } |
1154 | 60.2k | } |
1155 | | |
1156 | 199k | class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */ |
1157 | 199k | #endif /* SUPPORT_WIDE_CHARS */ |
1158 | | |
1159 | | /* Initialize the 256-bit (32-byte) bit map to all zeros. We build the map |
1160 | | in a temporary bit of memory, in case the class contains fewer than two |
1161 | | 8-bit characters because in that case the compiled code doesn't use the bit |
1162 | | map. */ |
1163 | | |
1164 | 199k | memset(classbits, 0, 32); |
1165 | | |
1166 | | /* Process items until end_ptr is reached. */ |
1167 | | |
1168 | 3.76M | while (TRUE) |
1169 | 3.76M | { |
1170 | 3.76M | uint32_t meta = *(pptr++); |
1171 | 3.76M | BOOL local_negate; |
1172 | 3.76M | int posix_class; |
1173 | 3.76M | int taboffset, tabopt; |
1174 | 3.76M | class_bits_storage pbits; |
1175 | 3.76M | uint32_t escape, c; |
1176 | | |
1177 | | /* Handle POSIX classes such as [:alpha:] etc. */ |
1178 | 3.76M | switch (META_CODE(meta)) |
1179 | 3.76M | { |
1180 | 14.1k | case META_POSIX: |
1181 | 16.0k | case META_POSIX_NEG: |
1182 | | |
1183 | 16.0k | local_negate = (meta == META_POSIX_NEG); |
1184 | 16.0k | posix_class = *(pptr++); |
1185 | | |
1186 | 16.0k | if (local_negate) should_flip_negation = TRUE; /* Note negative special */ |
1187 | | |
1188 | | /* If matching is caseless, upper and lower are converted to alpha. |
1189 | | This relies on the fact that the class table starts with alpha, |
1190 | | lower, upper as the first 3 entries. */ |
1191 | | |
1192 | 16.0k | if ((options & PCRE2_CASELESS) != 0 && posix_class <= 2) |
1193 | 634 | posix_class = 0; |
1194 | | |
1195 | | /* When PCRE2_UCP is set, some of the POSIX classes are converted to |
1196 | | different escape sequences that use Unicode properties \p or \P. |
1197 | | Others that are not available via \p or \P have to generate |
1198 | | XCL_PROP/XCL_NOTPROP directly, which is done here. */ |
1199 | | |
1200 | 16.0k | #ifdef SUPPORT_UNICODE |
1201 | | /* TODO This entire block of code here appears to be unreachable!? I simply |
1202 | | can't see how it can be hit, given that the frontend parser doesn't emit |
1203 | | META_POSIX for GRAPH/PRINT/PUNCT when UCP is set. */ |
1204 | 16.0k | if ((options & PCRE2_UCP) != 0 && |
1205 | 16.0k | (xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0) |
1206 | 1.18k | { |
1207 | 1.18k | uint32_t ptype; |
1208 | | |
1209 | 1.18k | switch(posix_class) |
1210 | 1.18k | { |
1211 | 0 | case PC_GRAPH: |
1212 | 0 | case PC_PRINT: |
1213 | 0 | case PC_PUNCT: |
1214 | 0 | ptype = (posix_class == PC_GRAPH)? PT_PXGRAPH : |
1215 | 0 | (posix_class == PC_PRINT)? PT_PXPRINT : PT_PXPUNCT; |
1216 | |
|
1217 | 0 | PRIV(update_classbits)(ptype, 0, local_negate, classbits); |
1218 | |
|
1219 | 0 | if ((xclass_props & XCLASS_HIGH_ANY) == 0) |
1220 | 0 | { |
1221 | 0 | if (lengthptr != NULL) |
1222 | 0 | *lengthptr += 3; |
1223 | 0 | else |
1224 | 0 | { |
1225 | 0 | *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP; |
1226 | 0 | *class_uchardata++ = (PCRE2_UCHAR)ptype; |
1227 | 0 | *class_uchardata++ = 0; |
1228 | 0 | } |
1229 | 0 | xclass_props |= XCLASS_REQUIRED | XCLASS_HAS_PROPS; |
1230 | 0 | } |
1231 | 0 | continue; |
1232 | | |
1233 | | /* For the other POSIX classes (ex: ascii) we are going to |
1234 | | fall through to the non-UCP case and build a bit map for |
1235 | | characters with code points less than 256. However, if we are in |
1236 | | a negated POSIX class, characters with code points greater than |
1237 | | 255 must either all match or all not match, depending on whether |
1238 | | the whole class is not or is negated. For example, for |
1239 | | [[:^ascii:]... they must all match, whereas for [^[:^ascii:]... |
1240 | | they must not. |
1241 | | |
1242 | | In the special case where there are no xclass items, this is |
1243 | | automatically handled by the use of OP_CLASS or OP_NCLASS, but an |
1244 | | explicit range is needed for OP_XCLASS. Setting a flag here |
1245 | | causes the range to be generated later when it is known that |
1246 | | OP_XCLASS is required. In the 8-bit library this is relevant only in |
1247 | | utf mode, since no wide characters can exist otherwise. */ |
1248 | | |
1249 | 1.18k | default: |
1250 | 1.18k | break; |
1251 | 1.18k | } |
1252 | 1.18k | } |
1253 | 16.0k | #endif /* SUPPORT_UNICODE */ |
1254 | | |
1255 | | /* In the non-UCP case, or when UCP makes no difference, we build the |
1256 | | bit map for the POSIX class in a chunk of local store because we may |
1257 | | be adding and subtracting from it, and we don't want to subtract bits |
1258 | | that may be in the main map already. At the end we or the result into |
1259 | | the bit map that is being built. */ |
1260 | | |
1261 | 16.0k | posix_class *= 3; |
1262 | | |
1263 | | /* Copy in the first table (always present) */ |
1264 | | |
1265 | 16.0k | memcpy(pbits.classbits, cbits + PRIV(posix_class_maps)[posix_class], 32); |
1266 | | |
1267 | | /* If there is a second table, add or remove it as required. */ |
1268 | | |
1269 | 16.0k | taboffset = PRIV(posix_class_maps)[posix_class + 1]; |
1270 | 16.0k | tabopt = PRIV(posix_class_maps)[posix_class + 2]; |
1271 | | |
1272 | 16.0k | if (taboffset >= 0) |
1273 | 5.17k | { |
1274 | 5.17k | if (tabopt >= 0) |
1275 | 108k | for (int i = 0; i < 32; i++) |
1276 | 104k | pbits.classbits[i] |= cbits[i + taboffset]; |
1277 | 1.90k | else |
1278 | 62.7k | for (int i = 0; i < 32; i++) |
1279 | 60.8k | pbits.classbits[i] &= (uint8_t)(~cbits[i + taboffset]); |
1280 | 5.17k | } |
1281 | | |
1282 | | /* Now see if we need to remove any special characters. An option |
1283 | | value of 1 removes vertical space and 2 removes underscore. */ |
1284 | | |
1285 | 16.0k | if (tabopt < 0) tabopt = -tabopt; |
1286 | | #ifdef EBCDIC |
1287 | | { |
1288 | | uint8_t posix_vertical[4] = { CHAR_LF, CHAR_VT, CHAR_FF, CHAR_CR }; |
1289 | | uint8_t posix_underscore = CHAR_UNDERSCORE; |
1290 | | uint8_t *chars = NULL; |
1291 | | int n = 0; |
1292 | | |
1293 | | if (tabopt == 1) { chars = posix_vertical; n = 4; } |
1294 | | else if (tabopt == 2) { chars = &posix_underscore; n = 1; } |
1295 | | |
1296 | | for (; n > 0; ++chars, --n) |
1297 | | pbits.classbits[*chars/8] &= ~(1u << (*chars&7)); |
1298 | | } |
1299 | | #else |
1300 | 16.0k | if (tabopt == 1) pbits.classbits[1] &= ~0x3c; |
1301 | 15.1k | else if (tabopt == 2) pbits.classbits[11] &= 0x7f; |
1302 | 16.0k | #endif |
1303 | | |
1304 | | /* Add the POSIX table or its complement into the main table that is |
1305 | | being built and we are done. */ |
1306 | | |
1307 | 16.0k | { |
1308 | 16.0k | uint32_t *classwords = cb->classbits.classwords; |
1309 | | |
1310 | 16.0k | if (local_negate) |
1311 | 16.9k | for (int i = 0; i < 8; i++) |
1312 | 15.0k | classwords[i] |= (uint32_t)(~pbits.classwords[i]); |
1313 | 14.1k | else |
1314 | 127k | for (int i = 0; i < 8; i++) |
1315 | 113k | classwords[i] |= pbits.classwords[i]; |
1316 | 16.0k | } |
1317 | | |
1318 | 16.0k | #ifdef SUPPORT_WIDE_CHARS |
1319 | | /* Every class contains at least one < 256 character. */ |
1320 | 16.0k | xclass_props |= XCLASS_HAS_8BIT_CHARS; |
1321 | 16.0k | #endif |
1322 | 16.0k | continue; /* End of POSIX handling */ |
1323 | | |
1324 | | /* Other than POSIX classes, the only items we should encounter are |
1325 | | \d-type escapes and literal characters (possibly as ranges). */ |
1326 | 0 | case META_BIGVALUE: |
1327 | 0 | meta = *(pptr++); |
1328 | 0 | break; |
1329 | | |
1330 | 151k | case META_ESCAPE: |
1331 | 151k | escape = META_DATA(meta); |
1332 | | |
1333 | 151k | switch(escape) |
1334 | 151k | { |
1335 | 4.32k | case ESC_d: |
1336 | 142k | for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_digit]; |
1337 | 4.32k | break; |
1338 | | |
1339 | 7.65k | case ESC_D: |
1340 | 7.65k | should_flip_negation = TRUE; |
1341 | 252k | for (int i = 0; i < 32; i++) |
1342 | 244k | classbits[i] |= (uint8_t)(~cbits[i+cbit_digit]); |
1343 | 7.65k | break; |
1344 | | |
1345 | 4.38k | case ESC_w: |
1346 | 144k | for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_word]; |
1347 | 4.38k | break; |
1348 | | |
1349 | 9.29k | case ESC_W: |
1350 | 9.29k | should_flip_negation = TRUE; |
1351 | 306k | for (int i = 0; i < 32; i++) |
1352 | 297k | classbits[i] |= (uint8_t)(~cbits[i+cbit_word]); |
1353 | 9.29k | break; |
1354 | | |
1355 | | /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl |
1356 | | 5.18. Before PCRE 8.34, we had to preserve the VT bit if it was |
1357 | | previously set by something earlier in the character class. |
1358 | | Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so |
1359 | | we could just adjust the appropriate bit. From PCRE 8.34 we no |
1360 | | longer treat \s and \S specially. */ |
1361 | | |
1362 | 3.33k | case ESC_s: |
1363 | 110k | for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_space]; |
1364 | 3.33k | break; |
1365 | | |
1366 | 7.37k | case ESC_S: |
1367 | 7.37k | should_flip_negation = TRUE; |
1368 | 243k | for (int i = 0; i < 32; i++) |
1369 | 236k | classbits[i] |= (uint8_t)(~cbits[i+cbit_space]); |
1370 | 7.37k | break; |
1371 | | |
1372 | | /* When adding the horizontal or vertical space lists to a class, or |
1373 | | their complements, disable PCRE2_CASELESS, because it justs wastes |
1374 | | time, and in the "not-x" UTF cases can create unwanted duplicates in |
1375 | | the XCLASS list (provoked by characters that have more than one other |
1376 | | case and by both cases being in the same "not-x" sublist). */ |
1377 | | |
1378 | 38.7k | case ESC_h: |
1379 | 38.7k | #if PCRE2_CODE_UNIT_WIDTH == 8 |
1380 | 38.7k | #ifdef SUPPORT_UNICODE |
1381 | 38.7k | if (cranges != NULL) break; |
1382 | 11.5k | #endif |
1383 | 11.5k | add_list_to_class(options & ~PCRE2_CASELESS, xoptions, |
1384 | 11.5k | cb, PRIV(hspace_list)); |
1385 | | #else |
1386 | | PCRE2_ASSERT(cranges != NULL); |
1387 | | #endif |
1388 | 11.5k | break; |
1389 | | |
1390 | 25.7k | case ESC_H: |
1391 | 25.7k | #if PCRE2_CODE_UNIT_WIDTH == 8 |
1392 | 25.7k | #ifdef SUPPORT_UNICODE |
1393 | 25.7k | if (cranges != NULL) break; |
1394 | 13.7k | #endif |
1395 | 13.7k | add_not_list_to_class(options & ~PCRE2_CASELESS, xoptions, |
1396 | 13.7k | cb, PRIV(hspace_list)); |
1397 | | #else |
1398 | | PCRE2_ASSERT(cranges != NULL); |
1399 | | #endif |
1400 | 13.7k | break; |
1401 | | |
1402 | 4.73k | case ESC_v: |
1403 | 4.73k | #if PCRE2_CODE_UNIT_WIDTH == 8 |
1404 | 4.73k | #ifdef SUPPORT_UNICODE |
1405 | 4.73k | if (cranges != NULL) break; |
1406 | 2.68k | #endif |
1407 | 2.68k | add_list_to_class(options & ~PCRE2_CASELESS, xoptions, |
1408 | 2.68k | cb, PRIV(vspace_list)); |
1409 | | #else |
1410 | | PCRE2_ASSERT(cranges != NULL); |
1411 | | #endif |
1412 | 2.68k | break; |
1413 | | |
1414 | 3.64k | case ESC_V: |
1415 | 3.64k | #if PCRE2_CODE_UNIT_WIDTH == 8 |
1416 | 3.64k | #ifdef SUPPORT_UNICODE |
1417 | 3.64k | if (cranges != NULL) break; |
1418 | 2.14k | #endif |
1419 | 2.14k | add_not_list_to_class(options & ~PCRE2_CASELESS, xoptions, |
1420 | 2.14k | cb, PRIV(vspace_list)); |
1421 | | #else |
1422 | | PCRE2_ASSERT(cranges != NULL); |
1423 | | #endif |
1424 | 2.14k | break; |
1425 | | |
1426 | | /* If Unicode is not supported, \P and \p are not allowed and are |
1427 | | faulted at parse time, so will never appear here. */ |
1428 | | |
1429 | 0 | #ifdef SUPPORT_UNICODE |
1430 | 22.3k | case ESC_p: |
1431 | 41.8k | case ESC_P: |
1432 | 41.8k | { |
1433 | 41.8k | uint32_t ptype = *pptr >> 16; |
1434 | 41.8k | uint32_t pdata = *(pptr++) & 0xffff; |
1435 | | |
1436 | | /* The "Any" is processed by PRIV(update_classbits)(). */ |
1437 | 41.8k | if (ptype == PT_ANY) |
1438 | 1.39k | { |
1439 | 1.39k | #if PCRE2_CODE_UNIT_WIDTH == 8 |
1440 | 1.39k | if (!utf && escape == ESC_p) memset(classbits, 0xff, 32); |
1441 | 1.39k | #endif |
1442 | 1.39k | continue; |
1443 | 1.39k | } |
1444 | | |
1445 | 40.4k | PRIV(update_classbits)(ptype, pdata, (escape == ESC_P), classbits); |
1446 | | |
1447 | 40.4k | if ((xclass_props & XCLASS_HIGH_ANY) == 0) |
1448 | 37.8k | { |
1449 | 37.8k | if (lengthptr != NULL) |
1450 | 23.8k | *lengthptr += 3; |
1451 | 13.9k | else |
1452 | 13.9k | { |
1453 | 13.9k | *class_uchardata++ = (escape == ESC_p)? XCL_PROP : XCL_NOTPROP; |
1454 | 13.9k | *class_uchardata++ = ptype; |
1455 | 13.9k | *class_uchardata++ = pdata; |
1456 | 13.9k | } |
1457 | 37.8k | xclass_props |= XCLASS_REQUIRED | XCLASS_HAS_PROPS; |
1458 | 37.8k | } |
1459 | 40.4k | } |
1460 | 0 | continue; |
1461 | 151k | #endif |
1462 | 151k | } |
1463 | | |
1464 | 109k | #ifdef SUPPORT_WIDE_CHARS |
1465 | | /* Every non-property class contains at least one < 256 character. */ |
1466 | 109k | xclass_props |= XCLASS_HAS_8BIT_CHARS; |
1467 | 109k | #endif |
1468 | | /* End handling \d-type escapes */ |
1469 | 109k | continue; |
1470 | | |
1471 | 3.59M | CLASS_END_CASES(meta) |
1472 | | /* Literals. */ |
1473 | 3.59M | if (meta < META_END) break; |
1474 | | /* Non-literals: end of class contents. */ |
1475 | 199k | goto END_PROCESSING; |
1476 | 3.76M | } |
1477 | | |
1478 | | /* A literal character may be followed by a range meta. At parse time |
1479 | | there are checks for out-of-order characters, for ranges where the two |
1480 | | characters are equal, and for hyphens that cannot indicate a range. At |
1481 | | this point, therefore, no checking is needed. */ |
1482 | | |
1483 | 3.39M | c = meta; |
1484 | | |
1485 | | /* Remember if \r or \n were explicitly used */ |
1486 | | |
1487 | 3.39M | if (c == CHAR_CR || c == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF; |
1488 | | |
1489 | | /* Process a character range */ |
1490 | | |
1491 | 3.39M | if (*pptr == META_RANGE_LITERAL || *pptr == META_RANGE_ESCAPED) |
1492 | 16.8k | { |
1493 | 16.8k | uint32_t d; |
1494 | | |
1495 | | #ifdef EBCDIC |
1496 | | BOOL range_is_literal = (*pptr == META_RANGE_LITERAL); |
1497 | | #endif |
1498 | 16.8k | ++pptr; |
1499 | 16.8k | d = *(pptr++); |
1500 | 16.8k | if (d == META_BIGVALUE) d = *(pptr++); |
1501 | | |
1502 | | /* Remember an explicit \r or \n, and add the range to the class. */ |
1503 | | |
1504 | 16.8k | if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF; |
1505 | | |
1506 | 16.8k | #if PCRE2_CODE_UNIT_WIDTH == 8 |
1507 | 16.8k | #ifdef SUPPORT_UNICODE |
1508 | 16.8k | if (cranges != NULL) continue; |
1509 | 10.6k | xclass_props |= XCLASS_HAS_8BIT_CHARS; |
1510 | 10.6k | #endif |
1511 | | |
1512 | | /* In an EBCDIC environment, Perl treats alphabetic ranges specially |
1513 | | because there are holes in the encoding, and simply using the range |
1514 | | A-Z (for example) would include the characters in the holes. This |
1515 | | applies only to literal ranges; [\xC1-\xE9] is different to [A-Z]. */ |
1516 | | |
1517 | | #ifdef EBCDIC |
1518 | | if (range_is_literal && |
1519 | | (cb->ctypes[c] & ctype_letter) != 0 && |
1520 | | (cb->ctypes[d] & ctype_letter) != 0 && |
1521 | | (c <= CHAR_z) == (d <= CHAR_z)) |
1522 | | { |
1523 | | uint32_t uc = (d <= CHAR_z)? 0 : 64; |
1524 | | uint32_t C = c - uc; |
1525 | | uint32_t D = d - uc; |
1526 | | |
1527 | | if (C <= CHAR_i) |
1528 | | { |
1529 | | add_to_class(options, xoptions, cb, C + uc, |
1530 | | ((D < CHAR_i)? D : CHAR_i) + uc); |
1531 | | C = CHAR_j; |
1532 | | } |
1533 | | |
1534 | | if (C <= D && C <= CHAR_r) |
1535 | | { |
1536 | | add_to_class(options, xoptions, cb, C + uc, |
1537 | | ((D < CHAR_r)? D : CHAR_r) + uc); |
1538 | | C = CHAR_s; |
1539 | | } |
1540 | | |
1541 | | if (C <= D) |
1542 | | add_to_class(options, xoptions, cb, C + uc, D + uc); |
1543 | | } |
1544 | | else |
1545 | | #endif |
1546 | | /* Not an EBCDIC special range */ |
1547 | | |
1548 | 10.6k | add_to_class(options, xoptions, cb, c, d); |
1549 | | #else |
1550 | | PCRE2_ASSERT(cranges != NULL); |
1551 | | #endif |
1552 | 10.6k | continue; |
1553 | 16.8k | } /* End of range handling */ |
1554 | | |
1555 | | /* Character ranges are ignored when class_ranges is present. */ |
1556 | 3.38M | #if PCRE2_CODE_UNIT_WIDTH == 8 |
1557 | 3.38M | #ifdef SUPPORT_UNICODE |
1558 | 3.38M | if (cranges != NULL) continue; |
1559 | 2.33M | xclass_props |= XCLASS_HAS_8BIT_CHARS; |
1560 | 2.33M | #endif |
1561 | | /* Handle a single character. */ |
1562 | | |
1563 | 2.33M | add_to_class(options, xoptions, cb, meta, meta); |
1564 | | #else |
1565 | | PCRE2_ASSERT(cranges != NULL); |
1566 | | #endif |
1567 | 2.33M | } /* End of main class-processing loop */ |
1568 | | |
1569 | 199k | END_PROCESSING: |
1570 | | |
1571 | 199k | #ifdef SUPPORT_WIDE_CHARS |
1572 | 199k | PCRE2_ASSERT((xclass_props & XCLASS_HAS_PROPS) == 0 || |
1573 | 199k | (xclass_props & XCLASS_HIGH_ANY) == 0); |
1574 | | |
1575 | 199k | if (cranges != NULL) |
1576 | 60.2k | { |
1577 | 60.2k | uint32_t *range = (uint32_t*)(cranges + 1); |
1578 | 60.2k | uint32_t *end = range + cranges->range_list_size; |
1579 | | |
1580 | 265k | while (range < end && range[0] < 256) |
1581 | 209k | { |
1582 | 209k | PCRE2_ASSERT((xclass_props & XCLASS_HAS_8BIT_CHARS) != 0); |
1583 | | /* Add range to bitset. If we are in UTF or UCP mode, then clear the |
1584 | | caseless bit, because the cranges handle caselessness (only) in this |
1585 | | condition; see the condition for PARSE_CLASS_CASELESS_UTF in |
1586 | | compile_optimize_class(). */ |
1587 | 209k | add_to_class(((options & (PCRE2_UTF|PCRE2_UCP)) != 0)? |
1588 | 209k | (options & ~PCRE2_CASELESS) : options, xoptions, cb, range[0], range[1]); |
1589 | | |
1590 | 209k | if (range[1] > 255) break; |
1591 | 205k | range += 2; |
1592 | 205k | } |
1593 | | |
1594 | 60.2k | if (cranges->char_lists_size > 0) |
1595 | 17.8k | { |
1596 | | /* The cranges structure is still used and freed later. */ |
1597 | 17.8k | PCRE2_ASSERT((xclass_props & XCLASS_HIGH_ANY) == 0); |
1598 | 17.8k | xclass_props |= XCLASS_REQUIRED | XCLASS_HAS_CHAR_LISTS; |
1599 | 17.8k | } |
1600 | 42.3k | else |
1601 | 42.3k | { |
1602 | 42.3k | if ((xclass_props & XCLASS_HIGH_ANY) != 0) |
1603 | 1.79k | { |
1604 | 1.79k | PCRE2_ASSERT(range + 2 == end && range[0] <= 256 && |
1605 | 1.79k | range[1] >= GET_MAX_CHAR_VALUE(utf)); |
1606 | 1.79k | should_flip_negation = TRUE; |
1607 | 1.79k | range = end; |
1608 | 1.79k | } |
1609 | | |
1610 | 63.6k | while (range < end) |
1611 | 21.2k | { |
1612 | 21.2k | uint32_t range_start = range[0]; |
1613 | 21.2k | uint32_t range_end = range[1]; |
1614 | | |
1615 | 21.2k | range += 2; |
1616 | 21.2k | xclass_props |= XCLASS_REQUIRED; |
1617 | | |
1618 | 21.2k | if (range_start < 256) range_start = 256; |
1619 | | |
1620 | 21.2k | if (lengthptr != NULL) |
1621 | 12.7k | { |
1622 | 12.7k | #ifdef SUPPORT_UNICODE |
1623 | 12.7k | if (utf) |
1624 | 12.7k | { |
1625 | 12.7k | *lengthptr += 1; |
1626 | | |
1627 | 12.7k | if (range_start < range_end) |
1628 | 1.98k | *lengthptr += PRIV(ord2utf)(range_start, class_uchardata); |
1629 | | |
1630 | 12.7k | *lengthptr += PRIV(ord2utf)(range_end, class_uchardata); |
1631 | 12.7k | continue; |
1632 | 12.7k | } |
1633 | 0 | #endif /* SUPPORT_UNICODE */ |
1634 | | |
1635 | 0 | *lengthptr += range_start < range_end ? 3 : 2; |
1636 | 0 | continue; |
1637 | 12.7k | } |
1638 | | |
1639 | 8.56k | #ifdef SUPPORT_UNICODE |
1640 | 8.56k | if (utf) |
1641 | 8.56k | { |
1642 | 8.56k | if (range_start < range_end) |
1643 | 1.30k | { |
1644 | 1.30k | *class_uchardata++ = XCL_RANGE; |
1645 | 1.30k | class_uchardata += PRIV(ord2utf)(range_start, class_uchardata); |
1646 | 1.30k | } |
1647 | 7.26k | else |
1648 | 7.26k | *class_uchardata++ = XCL_SINGLE; |
1649 | | |
1650 | 8.56k | class_uchardata += PRIV(ord2utf)(range_end, class_uchardata); |
1651 | 8.56k | continue; |
1652 | 8.56k | } |
1653 | 8.56k | #endif /* SUPPORT_UNICODE */ |
1654 | | |
1655 | | /* Without UTF support, character values are constrained |
1656 | | by the bit length, and can only be > 256 for 16-bit and |
1657 | | 32-bit libraries. */ |
1658 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
1659 | | if (range_start < range_end) |
1660 | | { |
1661 | | *class_uchardata++ = XCL_RANGE; |
1662 | | *class_uchardata++ = range_start; |
1663 | | } |
1664 | | else |
1665 | | *class_uchardata++ = XCL_SINGLE; |
1666 | | |
1667 | | *class_uchardata++ = range_end; |
1668 | | #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */ |
1669 | 8.56k | } |
1670 | | |
1671 | 42.3k | if (lengthptr == NULL) |
1672 | 17.0k | cb->cx->memctl.free(cranges, cb->cx->memctl.memory_data); |
1673 | 42.3k | } |
1674 | 60.2k | } |
1675 | 199k | #endif /* SUPPORT_WIDE_CHARS */ |
1676 | | |
1677 | | /* If there are characters with values > 255, or Unicode property settings |
1678 | | (\p or \P), we have to compile an extended class, with its own opcode, |
1679 | | unless there were no property settings and there was a negated special such |
1680 | | as \S in the class, and PCRE2_UCP is not set, because in that case all |
1681 | | characters > 255 are in or not in the class, so any that were explicitly |
1682 | | given as well can be ignored. |
1683 | | |
1684 | | In the UCP case, if certain negated POSIX classes (ex: [:^ascii:]) were |
1685 | | were present in a class, we either have to match or not match all wide |
1686 | | characters (depending on whether the whole class is or is not negated). |
1687 | | This requirement is indicated by match_all_or_no_wide_chars being true. |
1688 | | We do this by including an explicit range, which works in both cases. |
1689 | | This applies only in UTF and 16-bit and 32-bit non-UTF modes, since there |
1690 | | cannot be any wide characters in 8-bit non-UTF mode. |
1691 | | |
1692 | | When there *are* properties in a positive UTF-8 or any 16-bit or 32_bit |
1693 | | class where \S etc is present without PCRE2_UCP, causing an extended class |
1694 | | to be compiled, we make sure that all characters > 255 are included by |
1695 | | forcing match_all_or_no_wide_chars to be true. |
1696 | | |
1697 | | If, when generating an xclass, there are no characters < 256, we can omit |
1698 | | the bitmap in the actual compiled code. */ |
1699 | | |
1700 | 199k | #ifdef SUPPORT_WIDE_CHARS /* Defined for 16/32 bits, or 8-bit with Unicode */ |
1701 | 199k | if ((xclass_props & XCLASS_REQUIRED) != 0) |
1702 | 57.0k | { |
1703 | 57.0k | PCRE2_UCHAR *previous = code; |
1704 | | |
1705 | 57.0k | if ((xclass_props & XCLASS_HAS_CHAR_LISTS) == 0) |
1706 | 39.1k | *class_uchardata++ = XCL_END; /* Marks the end of extra data */ |
1707 | 57.0k | *code++ = OP_XCLASS; |
1708 | 57.0k | code += LINK_SIZE; |
1709 | 57.0k | *code = negate_class? XCL_NOT:0; |
1710 | 57.0k | if ((xclass_props & XCLASS_HAS_PROPS) != 0) *code |= XCL_HASPROP; |
1711 | | |
1712 | | /* If the map is required, move up the extra data to make room for it; |
1713 | | otherwise just move the code pointer to the end of the extra data. */ |
1714 | | |
1715 | 57.0k | if ((xclass_props & XCLASS_HAS_8BIT_CHARS) != 0 || has_bitmap != NULL) |
1716 | 49.6k | { |
1717 | 49.6k | if (negate_class) |
1718 | 10.6k | { |
1719 | 10.6k | uint32_t *classwords = cb->classbits.classwords; |
1720 | 95.5k | for (int i = 0; i < 8; i++) classwords[i] = ~classwords[i]; |
1721 | 10.6k | } |
1722 | | |
1723 | 49.6k | if (has_bitmap == NULL) |
1724 | 24.3k | { |
1725 | 24.3k | *code++ |= XCL_MAP; |
1726 | 24.3k | (void)memmove(code + (32 / sizeof(PCRE2_UCHAR)), code, |
1727 | 24.3k | CU2BYTES(class_uchardata - code)); |
1728 | 24.3k | memcpy(code, classbits, 32); |
1729 | 24.3k | code = class_uchardata + (32 / sizeof(PCRE2_UCHAR)); |
1730 | 24.3k | } |
1731 | 25.3k | else |
1732 | 25.3k | { |
1733 | 25.3k | code = class_uchardata; |
1734 | 25.3k | if ((xclass_props & XCLASS_HAS_8BIT_CHARS) != 0) |
1735 | 19.7k | *has_bitmap = TRUE; |
1736 | 25.3k | } |
1737 | 49.6k | } |
1738 | 7.36k | else code = class_uchardata; |
1739 | | |
1740 | 57.0k | if ((xclass_props & XCLASS_HAS_CHAR_LISTS) != 0) |
1741 | 17.8k | { |
1742 | | /* Char lists size is an even number, because all items are 16 or 32 |
1743 | | bit values. The character list data is always aligned to 32 bits. */ |
1744 | 17.8k | size_t char_lists_size = cranges->char_lists_size; |
1745 | 17.8k | PCRE2_ASSERT((char_lists_size & 0x1) == 0 && |
1746 | 17.8k | (cb->char_lists_size & 0x3) == 0); |
1747 | | |
1748 | 17.8k | if (lengthptr != NULL) |
1749 | 13.7k | { |
1750 | 13.7k | char_lists_size = CLIST_ALIGN_TO(char_lists_size, sizeof(uint32_t)); |
1751 | | |
1752 | 13.7k | #if PCRE2_CODE_UNIT_WIDTH == 8 |
1753 | 13.7k | *lengthptr += 2 + LINK_SIZE; |
1754 | | #else |
1755 | | *lengthptr += 1 + LINK_SIZE; |
1756 | | #endif |
1757 | | |
1758 | 13.7k | cb->char_lists_size += char_lists_size; |
1759 | | |
1760 | 13.7k | char_lists_size /= sizeof(PCRE2_UCHAR); |
1761 | | |
1762 | | /* Storage space for character lists is included |
1763 | | in the maximum pattern size. */ |
1764 | 13.7k | if (*lengthptr > MAX_PATTERN_SIZE || |
1765 | 13.7k | MAX_PATTERN_SIZE - *lengthptr < char_lists_size) |
1766 | 18 | { |
1767 | 18 | *errorcodeptr = ERR20; /* Pattern is too large */ |
1768 | 18 | return NULL; |
1769 | 18 | } |
1770 | 13.7k | } |
1771 | 4.12k | else |
1772 | 4.12k | { |
1773 | 4.12k | uint8_t *data; |
1774 | | |
1775 | 4.12k | PCRE2_ASSERT(cranges->char_lists_types <= XCL_TYPE_MASK); |
1776 | 4.12k | #if PCRE2_CODE_UNIT_WIDTH == 8 |
1777 | | /* Encode as high / low bytes. */ |
1778 | 4.12k | code[0] = (uint8_t)(XCL_LIST | |
1779 | 4.12k | (cranges->char_lists_types >> 8)); |
1780 | 4.12k | code[1] = (uint8_t)cranges->char_lists_types; |
1781 | 4.12k | code += 2; |
1782 | | #else |
1783 | | *code++ = (PCRE2_UCHAR)(XCL_LIST | cranges->char_lists_types); |
1784 | | #endif |
1785 | | |
1786 | | /* Character lists are stored in backwards direction from |
1787 | | byte code start. The non-dfa/dfa matchers can access these |
1788 | | lists using the byte code start stored in match blocks. |
1789 | | Each list is aligned to 32 bit with an optional unused |
1790 | | 16 bit value at the beginning of the character list. */ |
1791 | | |
1792 | 4.12k | cb->char_lists_size += char_lists_size; |
1793 | 4.12k | data = (uint8_t*)cb->start_code - cb->char_lists_size; |
1794 | | |
1795 | 4.12k | memcpy(data, (uint8_t*)(cranges + 1) + cranges->char_lists_start, |
1796 | 4.12k | char_lists_size); |
1797 | | |
1798 | | /* Since character lists total size is less than MAX_PATTERN_SIZE, |
1799 | | their starting offset fits into a value which size is LINK_SIZE. */ |
1800 | | |
1801 | 4.12k | char_lists_size = cb->char_lists_size; |
1802 | 4.12k | PUT(code, 0, (uint32_t)(char_lists_size >> 1)); |
1803 | 4.12k | code += LINK_SIZE; |
1804 | | |
1805 | | #if defined PCRE2_DEBUG || defined SUPPORT_VALGRIND |
1806 | | if ((char_lists_size & 0x2) != 0) |
1807 | | { |
1808 | | /* In debug the unused 16 bit value is set |
1809 | | to a fixed value and marked unused. */ |
1810 | | ((uint16_t*)data)[-1] = 0x5555; |
1811 | | #ifdef SUPPORT_VALGRIND |
1812 | | VALGRIND_MAKE_MEM_NOACCESS(data - 2, 2); |
1813 | | #endif |
1814 | | } |
1815 | | #endif |
1816 | | |
1817 | 4.12k | cb->char_lists_size = |
1818 | 4.12k | CLIST_ALIGN_TO(char_lists_size, sizeof(uint32_t)); |
1819 | | |
1820 | 4.12k | cb->cx->memctl.free(cranges, cb->cx->memctl.memory_data); |
1821 | 4.12k | } |
1822 | 17.8k | } |
1823 | | |
1824 | | /* Now fill in the complete length of the item */ |
1825 | | |
1826 | 57.0k | PUT(previous, 1, (int)(code - previous)); |
1827 | 57.0k | goto DONE; /* End of class handling */ |
1828 | 57.0k | } |
1829 | 142k | #endif /* SUPPORT_WIDE_CHARS */ |
1830 | | |
1831 | | /* If there are no characters > 255, or they are all to be included or |
1832 | | excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the |
1833 | | whole class was negated and whether there were negative specials such as \S |
1834 | | (non-UCP) in the class. Then copy the 32-byte map into the code vector, |
1835 | | negating it if necessary. */ |
1836 | | |
1837 | 142k | if (negate_class) |
1838 | 55.4k | { |
1839 | 55.4k | uint32_t *classwords = cb->classbits.classwords; |
1840 | | |
1841 | 498k | for (int i = 0; i < 8; i++) classwords[i] = ~classwords[i]; |
1842 | 55.4k | } |
1843 | | |
1844 | 142k | if ((SELECT_VALUE8(!utf, 0) || negate_class != should_flip_negation) && |
1845 | 142k | cb->classbits.classwords[0] == ~(uint32_t)0) |
1846 | 39.6k | { |
1847 | 39.6k | const uint32_t *classwords = cb->classbits.classwords; |
1848 | 39.6k | int i; |
1849 | | |
1850 | 151k | for (i = 0; i < 8; i++) |
1851 | 150k | if (classwords[i] != ~(uint32_t)0) break; |
1852 | | |
1853 | 39.6k | if (i == 8) |
1854 | 1.32k | { |
1855 | 1.32k | *code++ = OP_ALLANY; |
1856 | 1.32k | goto DONE; /* End of class handling */ |
1857 | 1.32k | } |
1858 | 39.6k | } |
1859 | | |
1860 | 141k | *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS; |
1861 | 141k | memcpy(code, classbits, 32); |
1862 | 141k | code += 32 / sizeof(PCRE2_UCHAR); |
1863 | | |
1864 | 199k | DONE: |
1865 | 199k | *pcode = code; |
1866 | 199k | return pptr - 1; |
1867 | 141k | } |
1868 | | |
1869 | | |
1870 | | |
1871 | | /* ===================================================================*/ |
1872 | | /* Here follows a block of ECLASS-compiling functions. You may well want to |
1873 | | read them from top to bottom; they are ordered from leafmost (at the top) to |
1874 | | outermost parser (at the bottom of the file). */ |
1875 | | |
1876 | | /* This function folds one operand using the negation operator. |
1877 | | The new, combined chunk of stack code is written out to *pop_info. */ |
1878 | | |
1879 | | static void |
1880 | | fold_negation(eclass_op_info *pop_info, PCRE2_SIZE *lengthptr, |
1881 | | BOOL preserve_classbits) |
1882 | 11.4k | { |
1883 | | /* If the chunk of stack code is already composed of multiple ops, we won't |
1884 | | descend in and try and propagate the negation down the tree. (That would lead |
1885 | | to O(n^2) compile-time, which could be exploitable with a malicious regex - |
1886 | | although maybe that's not really too much of a worry in a library that offers |
1887 | | an exponential-time matching function!) */ |
1888 | | |
1889 | 11.4k | if (pop_info->op_single_type == 0) |
1890 | 4.38k | { |
1891 | 4.38k | if (lengthptr != NULL) |
1892 | 2.34k | *lengthptr += 1; |
1893 | 2.04k | else |
1894 | 2.04k | pop_info->code_start[pop_info->length] = ECL_NOT; |
1895 | 4.38k | pop_info->length += 1; |
1896 | 4.38k | } |
1897 | | |
1898 | | /* Otherwise, it's a nice single-op item, so we can easily fold in the negation |
1899 | | without needing to produce an ECL_NOT. */ |
1900 | | |
1901 | 7.10k | else if (pop_info->op_single_type == ECL_ANY || |
1902 | 7.10k | pop_info->op_single_type == ECL_NONE) |
1903 | 3.40k | { |
1904 | 3.40k | pop_info->op_single_type = (pop_info->op_single_type == ECL_NONE)? |
1905 | 2.20k | ECL_ANY : ECL_NONE; |
1906 | 3.40k | if (lengthptr == NULL) |
1907 | 1.35k | *(pop_info->code_start) = pop_info->op_single_type; |
1908 | 3.40k | } |
1909 | 3.69k | else |
1910 | 3.69k | { |
1911 | 3.69k | PCRE2_ASSERT(pop_info->op_single_type == ECL_XCLASS && |
1912 | 3.69k | pop_info->length >= 1 + LINK_SIZE + 1); |
1913 | 3.69k | if (lengthptr == NULL) |
1914 | 1.78k | pop_info->code_start[1 + LINK_SIZE] ^= XCL_NOT; |
1915 | 3.69k | } |
1916 | | |
1917 | 11.4k | if (!preserve_classbits) |
1918 | 6.79k | { |
1919 | 61.1k | for (int i = 0; i < 8; i++) |
1920 | 54.3k | pop_info->bits.classwords[i] = ~pop_info->bits.classwords[i]; |
1921 | 6.79k | } |
1922 | 11.4k | } |
1923 | | |
1924 | | |
1925 | | |
1926 | | /* This function folds together two operands using a binary operator. |
1927 | | The new, combined chunk of stack code is written out to *lhs_op_info. */ |
1928 | | |
1929 | | static void |
1930 | | fold_binary(int op, eclass_op_info *lhs_op_info, eclass_op_info *rhs_op_info, |
1931 | | PCRE2_SIZE *lengthptr) |
1932 | 57.1k | { |
1933 | 57.1k | switch (op) |
1934 | 57.1k | { |
1935 | | /* ECL_AND truth table: |
1936 | | |
1937 | | LHS RHS RESULT |
1938 | | ---------------- |
1939 | | ANY * RHS |
1940 | | * ANY LHS |
1941 | | NONE * NONE |
1942 | | * NONE NONE |
1943 | | X Y X & Y |
1944 | | */ |
1945 | | |
1946 | 14.7k | case ECL_AND: |
1947 | 14.7k | if (rhs_op_info->op_single_type == ECL_ANY) |
1948 | 7.26k | { |
1949 | | /* no-op: drop the RHS */ |
1950 | 7.26k | } |
1951 | 7.48k | else if (lhs_op_info->op_single_type == ECL_ANY) |
1952 | 1.93k | { |
1953 | | /* no-op: drop the LHS, and memmove the RHS into its place */ |
1954 | 1.93k | if (lengthptr == NULL) |
1955 | 862 | memmove(lhs_op_info->code_start, rhs_op_info->code_start, |
1956 | 862 | CU2BYTES(rhs_op_info->length)); |
1957 | 1.93k | lhs_op_info->length = rhs_op_info->length; |
1958 | 1.93k | lhs_op_info->op_single_type = rhs_op_info->op_single_type; |
1959 | 1.93k | } |
1960 | 5.55k | else if (rhs_op_info->op_single_type == ECL_NONE) |
1961 | 2.21k | { |
1962 | | /* the result is ECL_NONE: write into the LHS */ |
1963 | 2.21k | if (lengthptr == NULL) |
1964 | 1.03k | lhs_op_info->code_start[0] = ECL_NONE; |
1965 | 2.21k | lhs_op_info->length = 1; |
1966 | 2.21k | lhs_op_info->op_single_type = ECL_NONE; |
1967 | 2.21k | } |
1968 | 3.33k | else if (lhs_op_info->op_single_type == ECL_NONE) |
1969 | 809 | { |
1970 | | /* the result is ECL_NONE: drop the RHS */ |
1971 | 809 | } |
1972 | 2.52k | else |
1973 | 2.52k | { |
1974 | | /* Both of LHS & RHS are either ECL_XCLASS, or compound operations. */ |
1975 | 2.52k | if (lengthptr != NULL) |
1976 | 1.63k | *lengthptr += 1; |
1977 | 897 | else |
1978 | 897 | { |
1979 | 897 | PCRE2_ASSERT(rhs_op_info->code_start == |
1980 | 897 | lhs_op_info->code_start + lhs_op_info->length); |
1981 | 897 | rhs_op_info->code_start[rhs_op_info->length] = ECL_AND; |
1982 | 897 | } |
1983 | 2.52k | lhs_op_info->length += rhs_op_info->length + 1; |
1984 | 2.52k | lhs_op_info->op_single_type = 0; |
1985 | 2.52k | } |
1986 | | |
1987 | 132k | for (int i = 0; i < 8; i++) |
1988 | 118k | lhs_op_info->bits.classwords[i] &= rhs_op_info->bits.classwords[i]; |
1989 | 14.7k | break; |
1990 | | |
1991 | | /* ECL_OR truth table: |
1992 | | |
1993 | | LHS RHS RESULT |
1994 | | ---------------- |
1995 | | ANY * ANY |
1996 | | * ANY ANY |
1997 | | NONE * RHS |
1998 | | * NONE LHS |
1999 | | X Y X | Y |
2000 | | */ |
2001 | | |
2002 | 34.3k | case ECL_OR: |
2003 | 34.3k | if (rhs_op_info->op_single_type == ECL_NONE) |
2004 | 15.6k | { |
2005 | | /* no-op: drop the RHS */ |
2006 | 15.6k | } |
2007 | 18.6k | else if (lhs_op_info->op_single_type == ECL_NONE) |
2008 | 5.09k | { |
2009 | | /* no-op: drop the LHS, and memmove the RHS into its place */ |
2010 | 5.09k | if (lengthptr == NULL) |
2011 | 1.73k | memmove(lhs_op_info->code_start, rhs_op_info->code_start, |
2012 | 1.73k | CU2BYTES(rhs_op_info->length)); |
2013 | 5.09k | lhs_op_info->length = rhs_op_info->length; |
2014 | 5.09k | lhs_op_info->op_single_type = rhs_op_info->op_single_type; |
2015 | 5.09k | } |
2016 | 13.5k | else if (rhs_op_info->op_single_type == ECL_ANY) |
2017 | 1.76k | { |
2018 | | /* the result is ECL_ANY: write into the LHS */ |
2019 | 1.76k | if (lengthptr == NULL) |
2020 | 793 | lhs_op_info->code_start[0] = ECL_ANY; |
2021 | 1.76k | lhs_op_info->length = 1; |
2022 | 1.76k | lhs_op_info->op_single_type = ECL_ANY; |
2023 | 1.76k | } |
2024 | 11.8k | else if (lhs_op_info->op_single_type == ECL_ANY) |
2025 | 313 | { |
2026 | | /* the result is ECL_ANY: drop the RHS */ |
2027 | 313 | } |
2028 | 11.4k | else |
2029 | 11.4k | { |
2030 | | /* Both of LHS & RHS are either ECL_XCLASS, or compound operations. */ |
2031 | 11.4k | if (lengthptr != NULL) |
2032 | 8.12k | *lengthptr += 1; |
2033 | 3.36k | else |
2034 | 3.36k | { |
2035 | 3.36k | PCRE2_ASSERT(rhs_op_info->code_start == |
2036 | 3.36k | lhs_op_info->code_start + lhs_op_info->length); |
2037 | 3.36k | rhs_op_info->code_start[rhs_op_info->length] = ECL_OR; |
2038 | 3.36k | } |
2039 | 11.4k | lhs_op_info->length += rhs_op_info->length + 1; |
2040 | 11.4k | lhs_op_info->op_single_type = 0; |
2041 | 11.4k | } |
2042 | | |
2043 | 308k | for (int i = 0; i < 8; i++) |
2044 | 274k | lhs_op_info->bits.classwords[i] |= rhs_op_info->bits.classwords[i]; |
2045 | 34.3k | break; |
2046 | | |
2047 | | /* ECL_XOR truth table: |
2048 | | |
2049 | | LHS RHS RESULT |
2050 | | ---------------- |
2051 | | ANY * !RHS |
2052 | | * ANY !LHS |
2053 | | NONE * RHS |
2054 | | * NONE LHS |
2055 | | X Y X ^ Y |
2056 | | */ |
2057 | | |
2058 | 8.07k | case ECL_XOR: |
2059 | 8.07k | if (rhs_op_info->op_single_type == ECL_NONE) |
2060 | 685 | { |
2061 | | /* no-op: drop the RHS */ |
2062 | 685 | } |
2063 | 7.39k | else if (lhs_op_info->op_single_type == ECL_NONE) |
2064 | 1.01k | { |
2065 | | /* no-op: drop the LHS, and memmove the RHS into its place */ |
2066 | 1.01k | if (lengthptr == NULL) |
2067 | 474 | memmove(lhs_op_info->code_start, rhs_op_info->code_start, |
2068 | 474 | CU2BYTES(rhs_op_info->length)); |
2069 | 1.01k | lhs_op_info->length = rhs_op_info->length; |
2070 | 1.01k | lhs_op_info->op_single_type = rhs_op_info->op_single_type; |
2071 | 1.01k | } |
2072 | 6.38k | else if (rhs_op_info->op_single_type == ECL_ANY) |
2073 | 3.21k | { |
2074 | | /* the result is !LHS: fold in the negation, and drop the RHS */ |
2075 | | /* Preserve the classbits, because we promise to deal with them later. */ |
2076 | 3.21k | fold_negation(lhs_op_info, lengthptr, TRUE); |
2077 | 3.21k | } |
2078 | 3.16k | else if (lhs_op_info->op_single_type == ECL_ANY) |
2079 | 1.48k | { |
2080 | | /* the result is !RHS: drop the LHS, memmove the RHS into its place, and |
2081 | | fold in the negation */ |
2082 | 1.48k | if (lengthptr == NULL) |
2083 | 727 | memmove(lhs_op_info->code_start, rhs_op_info->code_start, |
2084 | 727 | CU2BYTES(rhs_op_info->length)); |
2085 | 1.48k | lhs_op_info->length = rhs_op_info->length; |
2086 | 1.48k | lhs_op_info->op_single_type = rhs_op_info->op_single_type; |
2087 | | |
2088 | | /* Preserve the classbits, because we promise to deal with them later. */ |
2089 | 1.48k | fold_negation(lhs_op_info, lengthptr, TRUE); |
2090 | 1.48k | } |
2091 | 1.68k | else |
2092 | 1.68k | { |
2093 | | /* Both of LHS & RHS are either ECL_XCLASS, or compound operations. */ |
2094 | 1.68k | if (lengthptr != NULL) |
2095 | 1.03k | *lengthptr += 1; |
2096 | 643 | else |
2097 | 643 | { |
2098 | 643 | PCRE2_ASSERT(rhs_op_info->code_start == |
2099 | 643 | lhs_op_info->code_start + lhs_op_info->length); |
2100 | 643 | rhs_op_info->code_start[rhs_op_info->length] = ECL_XOR; |
2101 | 643 | } |
2102 | 1.68k | lhs_op_info->length += rhs_op_info->length + 1; |
2103 | 1.68k | lhs_op_info->op_single_type = 0; |
2104 | 1.68k | } |
2105 | | |
2106 | 72.7k | for (int i = 0; i < 8; i++) |
2107 | 64.6k | lhs_op_info->bits.classwords[i] ^= rhs_op_info->bits.classwords[i]; |
2108 | 8.07k | break; |
2109 | | |
2110 | 0 | default: |
2111 | 0 | PCRE2_DEBUG_UNREACHABLE(); |
2112 | 0 | break; |
2113 | 57.1k | } |
2114 | 57.1k | } |
2115 | | |
2116 | | |
2117 | | |
2118 | | static BOOL |
2119 | | compile_eclass_nested(eclass_context *context, BOOL negated, |
2120 | | uint32_t **pptr, PCRE2_UCHAR **pcode, |
2121 | | eclass_op_info *pop_info, PCRE2_SIZE *lengthptr); |
2122 | | |
2123 | | /* This function consumes a group of implicitly-unioned class elements. |
2124 | | These can be characters, ranges, properties, or nested classes, as long |
2125 | | as they are all joined by being placed adjacently. */ |
2126 | | |
2127 | | static BOOL |
2128 | | compile_class_operand(eclass_context *context, BOOL negated, |
2129 | | uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info, |
2130 | | PCRE2_SIZE *lengthptr) |
2131 | 79.5k | { |
2132 | 79.5k | uint32_t *ptr = *pptr; |
2133 | 79.5k | uint32_t *prev_ptr; |
2134 | 79.5k | PCRE2_UCHAR *code = *pcode; |
2135 | 79.5k | PCRE2_UCHAR *code_start = code; |
2136 | 79.5k | PCRE2_SIZE prev_length = (lengthptr != NULL)? *lengthptr : 0; |
2137 | 79.5k | PCRE2_SIZE extra_length; |
2138 | 79.5k | uint32_t meta = META_CODE(*ptr); |
2139 | | |
2140 | 79.5k | switch (meta) |
2141 | 79.5k | { |
2142 | 443 | case META_CLASS_EMPTY_NOT: |
2143 | 4.02k | case META_CLASS_EMPTY: |
2144 | 4.02k | ++ptr; |
2145 | 4.02k | pop_info->length = 1; |
2146 | 4.02k | if ((meta == META_CLASS_EMPTY) == negated) |
2147 | 1.45k | { |
2148 | 1.45k | *code++ = pop_info->op_single_type = ECL_ANY; |
2149 | 1.45k | memset(pop_info->bits.classbits, 0xff, 32); |
2150 | 1.45k | } |
2151 | 2.56k | else |
2152 | 2.56k | { |
2153 | 2.56k | *code++ = pop_info->op_single_type = ECL_NONE; |
2154 | 2.56k | memset(pop_info->bits.classbits, 0, 32); |
2155 | 2.56k | } |
2156 | 4.02k | break; |
2157 | | |
2158 | 21.1k | case META_CLASS: |
2159 | 27.3k | case META_CLASS_NOT: |
2160 | 27.3k | if ((*ptr & CLASS_IS_ECLASS) != 0) |
2161 | 9.02k | { |
2162 | 9.02k | if (!compile_eclass_nested(context, negated, &ptr, &code, |
2163 | 9.02k | pop_info, lengthptr)) |
2164 | 35 | return FALSE; |
2165 | | |
2166 | 8.99k | PCRE2_ASSERT(*ptr == META_CLASS_END); |
2167 | 8.99k | ptr++; |
2168 | 8.99k | goto DONE; |
2169 | 9.02k | } |
2170 | | |
2171 | 18.2k | ptr++; |
2172 | | /* Fall through */ |
2173 | | |
2174 | 66.5k | default: |
2175 | | /* Scan forward characters, ranges, and properties. |
2176 | | For example: inside [a-z_ -- m] we don't have brackets around "a-z_" but |
2177 | | we still need to collect that fragment up into a "leaf" OP_CLASS. */ |
2178 | | |
2179 | 66.5k | prev_ptr = ptr; |
2180 | 66.5k | ptr = PRIV(compile_class_not_nested)( |
2181 | 66.5k | context->options, context->xoptions, ptr, &code, |
2182 | 66.5k | (meta != META_CLASS_NOT) == negated, &context->needs_bitmap, |
2183 | 66.5k | context->errorcodeptr, context->cb, lengthptr); |
2184 | 66.5k | if (ptr == NULL) return FALSE; |
2185 | | |
2186 | | /* We must have a 100% guarantee that ptr increases when |
2187 | | compile_class_operand() returns, even on Release builds, so that we can |
2188 | | statically prove our loops terminate. */ |
2189 | 66.5k | if (ptr <= prev_ptr) |
2190 | 0 | { |
2191 | 0 | PCRE2_DEBUG_UNREACHABLE(); |
2192 | 0 | return FALSE; |
2193 | 0 | } |
2194 | | |
2195 | | /* If we fell through above, consume the closing ']'. */ |
2196 | 66.5k | if (meta == META_CLASS || meta == META_CLASS_NOT) |
2197 | 18.2k | { |
2198 | 18.2k | PCRE2_ASSERT(*ptr == META_CLASS_END); |
2199 | 18.2k | ptr++; |
2200 | 18.2k | } |
2201 | | |
2202 | | /* Regardless of whether (lengthptr == NULL), some data will still be written |
2203 | | out to *pcode, which we need: we have to peek at it, to transform the opcode |
2204 | | into the ECLASS version (since we need to hoist up the bitmaps). */ |
2205 | 66.5k | PCRE2_ASSERT(code > code_start); |
2206 | 66.5k | extra_length = (lengthptr != NULL)? *lengthptr - prev_length : 0; |
2207 | | |
2208 | | /* Easiest case: convert OP_ALLANY to ECL_ANY */ |
2209 | | |
2210 | 66.5k | if (*code_start == OP_ALLANY) |
2211 | 247 | { |
2212 | 247 | PCRE2_ASSERT(code - code_start == 1 && extra_length == 0); |
2213 | 247 | pop_info->length = 1; |
2214 | 247 | *code_start = pop_info->op_single_type = ECL_ANY; |
2215 | 247 | memset(pop_info->bits.classbits, 0xff, 32); |
2216 | 247 | } |
2217 | | |
2218 | | /* For OP_CLASS and OP_NCLASS, we hoist out the bitmap and convert to |
2219 | | ECL_NONE / ECL_ANY respectively. */ |
2220 | | |
2221 | 66.2k | else if (*code_start == OP_CLASS || *code_start == OP_NCLASS) |
2222 | 40.9k | { |
2223 | 40.9k | PCRE2_ASSERT(code - code_start == 1 + 32 / sizeof(PCRE2_UCHAR) && |
2224 | 40.9k | extra_length == 0); |
2225 | 40.9k | pop_info->length = 1; |
2226 | 40.9k | *code_start = pop_info->op_single_type = |
2227 | 40.9k | (*code_start == OP_CLASS)? ECL_NONE : ECL_ANY; |
2228 | 40.9k | memcpy(pop_info->bits.classbits, code_start + 1, 32); |
2229 | | /* Rewind the code pointer, but make sure we adjust *lengthptr, because we |
2230 | | do need to reserve that space (even though we only use it temporarily). */ |
2231 | 40.9k | if (lengthptr != NULL) |
2232 | 25.3k | *lengthptr += code - (code_start + 1); |
2233 | 40.9k | code = code_start + 1; |
2234 | | |
2235 | 40.9k | if (!context->needs_bitmap && *code_start == ECL_NONE) |
2236 | 4.84k | { |
2237 | 4.84k | uint32_t *classwords = pop_info->bits.classwords; |
2238 | | |
2239 | 14.6k | for (int i = 0; i < 8; i++) |
2240 | 14.3k | if (classwords[i] != 0) |
2241 | 4.49k | { |
2242 | 4.49k | context->needs_bitmap = TRUE; |
2243 | 4.49k | break; |
2244 | 4.49k | } |
2245 | 4.84k | } |
2246 | 36.0k | else |
2247 | 36.0k | context->needs_bitmap = TRUE; |
2248 | 40.9k | } |
2249 | | |
2250 | | /* Finally, for OP_XCLASS we hoist out the bitmap (if any), and convert to |
2251 | | ECL_XCLASS. */ |
2252 | | |
2253 | 25.3k | else |
2254 | 25.3k | { |
2255 | 25.3k | PCRE2_ASSERT(*code_start == OP_XCLASS); |
2256 | 25.3k | *code_start = pop_info->op_single_type = ECL_XCLASS; |
2257 | | |
2258 | 25.3k | PCRE2_ASSERT(code - code_start >= 1 + LINK_SIZE + 1); |
2259 | | |
2260 | 25.3k | memcpy(pop_info->bits.classbits, context->cb->classbits.classbits, 32); |
2261 | 25.3k | pop_info->length = (code - code_start) + extra_length; |
2262 | 25.3k | } |
2263 | | |
2264 | 66.5k | break; |
2265 | 79.5k | } /* End of switch(meta) */ |
2266 | | |
2267 | 70.5k | pop_info->code_start = (lengthptr == NULL)? code_start : NULL; |
2268 | | |
2269 | 70.5k | if (lengthptr != NULL) |
2270 | 43.9k | { |
2271 | 43.9k | *lengthptr += code - code_start; |
2272 | 43.9k | code = code_start; |
2273 | 43.9k | } |
2274 | | |
2275 | 79.5k | DONE: |
2276 | 79.5k | PCRE2_ASSERT(lengthptr == NULL || (code == code_start)); |
2277 | | |
2278 | 79.5k | *pptr = ptr; |
2279 | 79.5k | *pcode = code; |
2280 | 79.5k | return TRUE; |
2281 | 70.5k | } |
2282 | | |
2283 | | |
2284 | | |
2285 | | /* This function consumes a group of implicitly-unioned class elements. |
2286 | | These can be characters, ranges, properties, or nested classes, as long |
2287 | | as they are all joined by being placed adjacently. */ |
2288 | | |
2289 | | static BOOL |
2290 | | compile_class_juxtaposition(eclass_context *context, BOOL negated, |
2291 | | uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info, |
2292 | | PCRE2_SIZE *lengthptr) |
2293 | 34.4k | { |
2294 | 34.4k | uint32_t *ptr = *pptr; |
2295 | 34.4k | PCRE2_UCHAR *code = *pcode; |
2296 | | #ifdef PCRE2_DEBUG |
2297 | | PCRE2_UCHAR *start_code = *pcode; |
2298 | | #endif |
2299 | | |
2300 | | /* See compile_class_binary_loose() for comments on compile-time folding of |
2301 | | the "negated" flag. */ |
2302 | | |
2303 | | /* Because it's a non-empty class, there must be an operand at the start. */ |
2304 | 34.4k | if (!compile_class_operand(context, negated, &ptr, &code, pop_info, lengthptr)) |
2305 | 12 | return FALSE; |
2306 | | |
2307 | 79.5k | while (*ptr != META_CLASS_END && |
2308 | 79.5k | !(*ptr >= META_ECLASS_AND && *ptr <= META_ECLASS_NOT)) |
2309 | 45.1k | { |
2310 | 45.1k | uint32_t op; |
2311 | 45.1k | BOOL rhs_negated; |
2312 | 45.1k | eclass_op_info rhs_op_info; |
2313 | | |
2314 | 45.1k | if (negated) |
2315 | 13.2k | { |
2316 | | /* !(A juxtapose B) -> !A && !B */ |
2317 | 13.2k | op = ECL_AND; |
2318 | 13.2k | rhs_negated = TRUE; |
2319 | 13.2k | } |
2320 | 31.8k | else |
2321 | 31.8k | { |
2322 | | /* A juxtapose B -> A || B */ |
2323 | 31.8k | op = ECL_OR; |
2324 | 31.8k | rhs_negated = FALSE; |
2325 | 31.8k | } |
2326 | | |
2327 | | /* An operand must follow the operator. */ |
2328 | 45.1k | if (!compile_class_operand(context, rhs_negated, &ptr, &code, |
2329 | 45.1k | &rhs_op_info, lengthptr)) |
2330 | 36 | return FALSE; |
2331 | | |
2332 | | /* Convert infix to postfix (RPN). */ |
2333 | 45.0k | fold_binary(op, pop_info, &rhs_op_info, lengthptr); |
2334 | 45.0k | if (lengthptr == NULL) |
2335 | 16.0k | code = pop_info->code_start + pop_info->length; |
2336 | 45.0k | } |
2337 | | |
2338 | 34.3k | PCRE2_ASSERT(lengthptr == NULL || code == start_code); |
2339 | | |
2340 | 34.3k | *pptr = ptr; |
2341 | 34.3k | *pcode = code; |
2342 | 34.3k | return TRUE; |
2343 | 34.4k | } |
2344 | | |
2345 | | |
2346 | | |
2347 | | /* This function consumes unary prefix operators. */ |
2348 | | |
2349 | | static BOOL |
2350 | | compile_class_unary(eclass_context *context, BOOL negated, |
2351 | | uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info, |
2352 | | PCRE2_SIZE *lengthptr) |
2353 | 34.4k | { |
2354 | 34.4k | uint32_t *ptr = *pptr; |
2355 | | #ifdef PCRE2_DEBUG |
2356 | | PCRE2_UCHAR *start_code = *pcode; |
2357 | | #endif |
2358 | | |
2359 | 35.6k | while (*ptr == META_ECLASS_NOT) |
2360 | 1.15k | { |
2361 | 1.15k | ++ptr; |
2362 | 1.15k | negated = !negated; |
2363 | 1.15k | } |
2364 | | |
2365 | 34.4k | *pptr = ptr; |
2366 | | /* Because it's a non-empty class, there must be an operand. */ |
2367 | 34.4k | if (!compile_class_juxtaposition(context, negated, pptr, pcode, |
2368 | 34.4k | pop_info, lengthptr)) |
2369 | 48 | return FALSE; |
2370 | | |
2371 | 34.3k | PCRE2_ASSERT(lengthptr == NULL || *pcode == start_code); |
2372 | 34.3k | return TRUE; |
2373 | 34.4k | } |
2374 | | |
2375 | | |
2376 | | |
2377 | | /* This function consumes tightly-binding binary operators. */ |
2378 | | |
2379 | | static BOOL |
2380 | | compile_class_binary_tight(eclass_context *context, BOOL negated, |
2381 | | uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info, |
2382 | | PCRE2_SIZE *lengthptr) |
2383 | 33.5k | { |
2384 | 33.5k | uint32_t *ptr = *pptr; |
2385 | 33.5k | PCRE2_UCHAR *code = *pcode; |
2386 | | #ifdef PCRE2_DEBUG |
2387 | | PCRE2_UCHAR *start_code = *pcode; |
2388 | | #endif |
2389 | | |
2390 | | /* See compile_class_binary_loose() for comments on compile-time folding of |
2391 | | the "negated" flag. */ |
2392 | | |
2393 | | /* Because it's a non-empty class, there must be an operand at the start. */ |
2394 | 33.5k | if (!compile_class_unary(context, negated, &ptr, &code, pop_info, lengthptr)) |
2395 | 48 | return FALSE; |
2396 | | |
2397 | 34.3k | while (*ptr == META_ECLASS_AND) |
2398 | 882 | { |
2399 | 882 | uint32_t op; |
2400 | 882 | BOOL rhs_negated; |
2401 | 882 | eclass_op_info rhs_op_info; |
2402 | | |
2403 | 882 | if (negated) |
2404 | 236 | { |
2405 | | /* !(A && B) -> !A || !B */ |
2406 | 236 | op = ECL_OR; |
2407 | 236 | rhs_negated = TRUE; |
2408 | 236 | } |
2409 | 646 | else |
2410 | 646 | { |
2411 | | /* A && B -> A && B */ |
2412 | 646 | op = ECL_AND; |
2413 | 646 | rhs_negated = FALSE; |
2414 | 646 | } |
2415 | | |
2416 | 882 | ++ptr; |
2417 | | |
2418 | | /* An operand must follow the operator. */ |
2419 | 882 | if (!compile_class_unary(context, rhs_negated, &ptr, &code, |
2420 | 882 | &rhs_op_info, lengthptr)) |
2421 | 0 | return FALSE; |
2422 | | |
2423 | | /* Convert infix to postfix (RPN). */ |
2424 | 882 | fold_binary(op, pop_info, &rhs_op_info, lengthptr); |
2425 | 882 | if (lengthptr == NULL) |
2426 | 432 | code = pop_info->code_start + pop_info->length; |
2427 | 882 | } |
2428 | | |
2429 | 33.5k | PCRE2_ASSERT(lengthptr == NULL || code == start_code); |
2430 | | |
2431 | 33.5k | *pptr = ptr; |
2432 | 33.5k | *pcode = code; |
2433 | 33.5k | return TRUE; |
2434 | 33.5k | } |
2435 | | |
2436 | | |
2437 | | |
2438 | | /* This function consumes loosely-binding binary operators. */ |
2439 | | |
2440 | | static BOOL |
2441 | | compile_class_binary_loose(eclass_context *context, BOOL negated, |
2442 | | uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info, |
2443 | | PCRE2_SIZE *lengthptr) |
2444 | 22.3k | { |
2445 | 22.3k | uint32_t *ptr = *pptr; |
2446 | 22.3k | PCRE2_UCHAR *code = *pcode; |
2447 | | #ifdef PCRE2_DEBUG |
2448 | | PCRE2_UCHAR *start_code = *pcode; |
2449 | | #endif |
2450 | | |
2451 | | /* We really want to fold the negation operator, if at all possible, so that |
2452 | | simple cases can be reduced down. In particular, in 8-bit no-UTF mode, we want |
2453 | | to produce a fully-folded expression, so that we can guarantee not to emit any |
2454 | | OP_ECLASS codes (in the same way that we never emit OP_XCLASS in this mode). |
2455 | | |
2456 | | This has the consequence that with a little ingenuity, we can in fact avoid |
2457 | | emitting (nearly...) all cases of the "NOT" operator. Imagine that we have: |
2458 | | !(A ... |
2459 | | We have parsed the preceding "!", and we are about to parse the "A" operand. We |
2460 | | don't know yet whether there will even be a following binary operand! Both of |
2461 | | these are possibilities for what follows: |
2462 | | !(A && B) |
2463 | | !(A) |
2464 | | However, we can still fold the "!" into the "A" operand, because no matter what |
2465 | | the following binary operator will be, we can produce an expression which is |
2466 | | equivalent. */ |
2467 | | |
2468 | | /* Because it's a non-empty class, there must be an operand at the start. */ |
2469 | 22.3k | if (!compile_class_binary_tight(context, negated, &ptr, &code, |
2470 | 22.3k | pop_info, lengthptr)) |
2471 | 20 | return FALSE; |
2472 | | |
2473 | 33.5k | while (*ptr >= META_ECLASS_OR && *ptr <= META_ECLASS_XOR) |
2474 | 11.1k | { |
2475 | 11.1k | uint32_t op; |
2476 | 11.1k | BOOL op_neg; |
2477 | 11.1k | BOOL rhs_negated; |
2478 | 11.1k | eclass_op_info rhs_op_info; |
2479 | | |
2480 | 11.1k | if (negated) |
2481 | 7.44k | { |
2482 | | /* The whole expression is being negated; we respond by unconditionally |
2483 | | negating the LHS A, before seeing what follows. And hooray! We can recover, |
2484 | | no matter what follows. */ |
2485 | | /* !(A || B) -> !A && !B */ |
2486 | | /* !(A -- B) -> !(A && !B) -> !A || B */ |
2487 | | /* !(A XOR B) -> !(!A XOR !B) -> !A XNOR !B */ |
2488 | 7.44k | op = (*ptr == META_ECLASS_OR )? ECL_AND : |
2489 | 7.44k | (*ptr == META_ECLASS_SUB)? ECL_OR : |
2490 | 6.95k | /*ptr == META_ECLASS_XOR*/ ECL_XOR; |
2491 | 7.44k | op_neg = (*ptr == META_ECLASS_XOR); |
2492 | 7.44k | rhs_negated = *ptr != META_ECLASS_SUB; |
2493 | 7.44k | } |
2494 | 3.74k | else |
2495 | 3.74k | { |
2496 | | /* A || B -> A || B */ |
2497 | | /* A -- B -> A && !B */ |
2498 | | /* A XOR B -> A XOR B */ |
2499 | 3.74k | op = (*ptr == META_ECLASS_OR )? ECL_OR : |
2500 | 3.74k | (*ptr == META_ECLASS_SUB)? ECL_AND : |
2501 | 1.64k | /*ptr == META_ECLASS_XOR*/ ECL_XOR; |
2502 | 3.74k | op_neg = FALSE; |
2503 | 3.74k | rhs_negated = *ptr == META_ECLASS_SUB; |
2504 | 3.74k | } |
2505 | | |
2506 | 11.1k | ++ptr; |
2507 | | |
2508 | | /* An operand must follow the operator. */ |
2509 | 11.1k | if (!compile_class_binary_tight(context, rhs_negated, &ptr, &code, |
2510 | 11.1k | &rhs_op_info, lengthptr)) |
2511 | 28 | return FALSE; |
2512 | | |
2513 | | /* Convert infix to postfix (RPN). */ |
2514 | 11.1k | fold_binary(op, pop_info, &rhs_op_info, lengthptr); |
2515 | 11.1k | if (op_neg) fold_negation(pop_info, lengthptr, FALSE); |
2516 | 11.1k | if (lengthptr == NULL) |
2517 | 4.36k | code = pop_info->code_start + pop_info->length; |
2518 | 11.1k | } |
2519 | | |
2520 | 22.3k | PCRE2_ASSERT(lengthptr == NULL || code == start_code); |
2521 | | |
2522 | 22.3k | *pptr = ptr; |
2523 | 22.3k | *pcode = code; |
2524 | 22.3k | return TRUE; |
2525 | 22.3k | } |
2526 | | |
2527 | | |
2528 | | |
2529 | | /* This function converts the META codes in pptr into opcodes written to |
2530 | | pcode. The pptr must start at a META_CLASS or META_CLASS_NOT. |
2531 | | |
2532 | | The class is compiled as a left-associative sequence of operator |
2533 | | applications. |
2534 | | |
2535 | | The pptr will be left pointing at the matching META_CLASS_END. */ |
2536 | | |
2537 | | static BOOL |
2538 | | compile_eclass_nested(eclass_context *context, BOOL negated, |
2539 | | uint32_t **pptr, PCRE2_UCHAR **pcode, |
2540 | | eclass_op_info *pop_info, PCRE2_SIZE *lengthptr) |
2541 | 22.3k | { |
2542 | 22.3k | uint32_t *ptr = *pptr; |
2543 | | #ifdef PCRE2_DEBUG |
2544 | | PCRE2_UCHAR *start_code = *pcode; |
2545 | | #endif |
2546 | | |
2547 | | /* The CLASS_IS_ECLASS bit must be set since it is a nested class. */ |
2548 | 22.3k | PCRE2_ASSERT(*ptr == (META_CLASS | CLASS_IS_ECLASS) || |
2549 | 22.3k | *ptr == (META_CLASS_NOT | CLASS_IS_ECLASS)); |
2550 | | |
2551 | 22.3k | if (*ptr++ == (META_CLASS_NOT | CLASS_IS_ECLASS)) |
2552 | 5.57k | negated = !negated; |
2553 | | |
2554 | 22.3k | (*pptr)++; |
2555 | | |
2556 | | /* Because it's a non-empty class, there must be an operand at the start. */ |
2557 | 22.3k | if (!compile_class_binary_loose(context, negated, pptr, pcode, |
2558 | 22.3k | pop_info, lengthptr)) |
2559 | 48 | return FALSE; |
2560 | | |
2561 | 22.3k | PCRE2_ASSERT(**pptr == META_CLASS_END); |
2562 | 22.3k | PCRE2_ASSERT(lengthptr == NULL || *pcode == start_code); |
2563 | 22.3k | return TRUE; |
2564 | 22.3k | } |
2565 | | |
2566 | | BOOL |
2567 | | PRIV(compile_class_nested)(uint32_t options, uint32_t xoptions, |
2568 | | uint32_t **pptr, PCRE2_UCHAR **pcode, int *errorcodeptr, |
2569 | | compile_block *cb, PCRE2_SIZE *lengthptr) |
2570 | 13.3k | { |
2571 | 13.3k | eclass_context context; |
2572 | 13.3k | eclass_op_info op_info; |
2573 | 13.3k | PCRE2_SIZE previous_length = (lengthptr != NULL)? *lengthptr : 0; |
2574 | 13.3k | PCRE2_UCHAR *code = *pcode; |
2575 | 13.3k | PCRE2_UCHAR *previous; |
2576 | 13.3k | BOOL allbitsone = TRUE; |
2577 | | |
2578 | 13.3k | context.needs_bitmap = FALSE; |
2579 | 13.3k | context.options = options; |
2580 | 13.3k | context.xoptions = xoptions; |
2581 | 13.3k | context.errorcodeptr = errorcodeptr; |
2582 | 13.3k | context.cb = cb; |
2583 | | |
2584 | 13.3k | previous = code; |
2585 | 13.3k | *code++ = OP_ECLASS; |
2586 | 13.3k | code += LINK_SIZE; |
2587 | 13.3k | *code++ = 0; /* Flags, currently zero. */ |
2588 | 13.3k | if (!compile_eclass_nested(&context, FALSE, pptr, &code, &op_info, lengthptr)) |
2589 | 13 | return FALSE; |
2590 | | |
2591 | 13.3k | if (lengthptr != NULL) |
2592 | 7.60k | { |
2593 | 7.60k | *lengthptr += code - previous; |
2594 | 7.60k | code = previous; |
2595 | | /* (*lengthptr - previous_length) now holds the amount of buffer that |
2596 | | we require to make the call to compile_class_nested() with |
2597 | | lengthptr = NULL, and including the (1+LINK_SIZE+1) that we write out |
2598 | | before that call. */ |
2599 | 7.60k | } |
2600 | | |
2601 | | /* Do some useful counting of what's in the bitmap. */ |
2602 | 31.2k | for (int i = 0; i < 8; i++) |
2603 | 29.5k | if (op_info.bits.classwords[i] != 0xffffffff) |
2604 | 11.6k | { |
2605 | 11.6k | allbitsone = FALSE; |
2606 | 11.6k | break; |
2607 | 11.6k | } |
2608 | | |
2609 | | /* After constant-folding the extended class syntax, it may turn out to be |
2610 | | a simple class after all. In that case, we can unwrap it from the |
2611 | | OP_ECLASS container - and in fact, we must do so, because in 8-bit |
2612 | | no-Unicode mode the matcher is compiled without support for OP_ECLASS. */ |
2613 | | |
2614 | | #ifndef SUPPORT_WIDE_CHARS |
2615 | | PCRE2_ASSERT(op_info.op_single_type != 0); |
2616 | | #else |
2617 | 13.3k | if (op_info.op_single_type != 0) |
2618 | 7.99k | #endif |
2619 | 7.99k | { |
2620 | | /* Rewind back over the OP_ECLASS. */ |
2621 | 7.99k | code = previous; |
2622 | | |
2623 | | /* If the bits are all ones, and the "high characters" are all matched |
2624 | | too, we use a special-cased encoding of OP_ALLANY. */ |
2625 | | |
2626 | 7.99k | if (op_info.op_single_type == ECL_ANY && allbitsone) |
2627 | 901 | { |
2628 | | /* Advancing code means rewinding lengthptr, at this point. */ |
2629 | 901 | if (lengthptr != NULL) *lengthptr -= 1; |
2630 | 901 | *code++ = OP_ALLANY; |
2631 | 901 | } |
2632 | | |
2633 | | /* If the high bits are all matched / all not-matched, then we emit an |
2634 | | OP_NCLASS/OP_CLASS respectively. */ |
2635 | | |
2636 | 7.09k | else if (op_info.op_single_type == ECL_ANY || |
2637 | 7.09k | op_info.op_single_type == ECL_NONE) |
2638 | 4.22k | { |
2639 | 4.22k | PCRE2_SIZE required_len = 1 + (32 / sizeof(PCRE2_UCHAR)); |
2640 | | |
2641 | 4.22k | if (lengthptr != NULL) |
2642 | 2.30k | { |
2643 | 2.30k | if (required_len > (*lengthptr - previous_length)) |
2644 | 225 | *lengthptr = previous_length + required_len; |
2645 | 2.30k | } |
2646 | | |
2647 | | /* Advancing code means rewinding lengthptr, at this point. */ |
2648 | 4.22k | if (lengthptr != NULL) *lengthptr -= required_len; |
2649 | 4.22k | *code++ = (op_info.op_single_type == ECL_ANY)? OP_NCLASS : OP_CLASS; |
2650 | 4.22k | memcpy(code, op_info.bits.classbits, 32); |
2651 | 4.22k | code += 32 / sizeof(PCRE2_UCHAR); |
2652 | 4.22k | } |
2653 | | |
2654 | | /* Otherwise, we have an ECL_XCLASS, so we have the OP_XCLASS data |
2655 | | there, but, we pulled out its bitmap into op_info, so now we have to |
2656 | | put that back into the OP_XCLASS. */ |
2657 | | |
2658 | 2.86k | else |
2659 | 2.86k | { |
2660 | | #ifndef SUPPORT_WIDE_CHARS |
2661 | | PCRE2_DEBUG_UNREACHABLE(); |
2662 | | #else |
2663 | 2.86k | BOOL need_map = context.needs_bitmap; |
2664 | 2.86k | PCRE2_SIZE required_len; |
2665 | | |
2666 | 2.86k | PCRE2_ASSERT(op_info.op_single_type == ECL_XCLASS); |
2667 | 2.86k | required_len = op_info.length + (need_map? 32/sizeof(PCRE2_UCHAR) : 0); |
2668 | | |
2669 | 2.86k | if (lengthptr != NULL) |
2670 | 1.70k | { |
2671 | | /* Don't unconditionally request all the space we need - we may |
2672 | | already have asked for more during processing of the ECLASS. */ |
2673 | 1.70k | if (required_len > (*lengthptr - previous_length)) |
2674 | 245 | *lengthptr = previous_length + required_len; |
2675 | | |
2676 | | /* The code we write out here won't be ignored, even during the |
2677 | | (lengthptr != NULL) phase, because if there's a following quantifier |
2678 | | it will peek backwards. So we do have to write out a (truncated) |
2679 | | OP_XCLASS, even on this branch. */ |
2680 | 1.70k | *lengthptr -= 1 + LINK_SIZE + 1; |
2681 | 1.70k | *code++ = OP_XCLASS; |
2682 | 1.70k | PUT(code, 0, 1 + LINK_SIZE + 1); |
2683 | 1.70k | code += LINK_SIZE; |
2684 | 1.70k | *code++ = 0; |
2685 | 1.70k | } |
2686 | 1.16k | else |
2687 | 1.16k | { |
2688 | 1.16k | PCRE2_UCHAR *rest; |
2689 | 1.16k | PCRE2_SIZE rest_len; |
2690 | 1.16k | PCRE2_UCHAR flags; |
2691 | | |
2692 | | /* 1 unit: OP_XCLASS | LINK_SIZE units | 1 unit: flags | ...rest */ |
2693 | 1.16k | PCRE2_ASSERT(op_info.length >= 1 + LINK_SIZE + 1); |
2694 | 1.16k | rest = op_info.code_start + 1 + LINK_SIZE + 1; |
2695 | 1.16k | rest_len = (op_info.code_start + op_info.length) - rest; |
2696 | | |
2697 | | /* First read any data we use, before memmove splats it. */ |
2698 | 1.16k | flags = op_info.code_start[1 + LINK_SIZE]; |
2699 | 1.16k | PCRE2_ASSERT((flags & XCL_MAP) == 0); |
2700 | | |
2701 | | /* Next do the memmove before any writes. */ |
2702 | 1.16k | memmove(code + 1 + LINK_SIZE + 1 + (need_map? 32/sizeof(PCRE2_UCHAR) : 0), |
2703 | 1.16k | rest, CU2BYTES(rest_len)); |
2704 | | |
2705 | | /* Finally write the header data. */ |
2706 | 1.16k | *code++ = OP_XCLASS; |
2707 | 1.16k | PUT(code, 0, (int)required_len); |
2708 | 1.16k | code += LINK_SIZE; |
2709 | 1.16k | *code++ = flags | (need_map? XCL_MAP : 0); |
2710 | 1.16k | if (need_map) |
2711 | 908 | { |
2712 | 908 | memcpy(code, op_info.bits.classbits, 32); |
2713 | 908 | code += 32 / sizeof(PCRE2_UCHAR); |
2714 | 908 | } |
2715 | 1.16k | code += rest_len; |
2716 | 1.16k | } |
2717 | 2.86k | #endif /* SUPPORT_WIDE_CHARS */ |
2718 | 2.86k | } |
2719 | 7.99k | } |
2720 | | |
2721 | | /* Otherwise, we're going to keep the OP_ECLASS. However, again we need |
2722 | | to do some adjustment to insert the bitmap if we have one. */ |
2723 | | |
2724 | 5.33k | #ifdef SUPPORT_WIDE_CHARS |
2725 | 5.33k | else |
2726 | 5.33k | { |
2727 | 5.33k | BOOL need_map = context.needs_bitmap; |
2728 | 5.33k | PCRE2_SIZE required_len = |
2729 | 5.33k | 1 + LINK_SIZE + 1 + (need_map? 32/sizeof(PCRE2_UCHAR) : 0) + op_info.length; |
2730 | | |
2731 | 5.33k | if (lengthptr != NULL) |
2732 | 3.11k | { |
2733 | 3.11k | if (required_len > (*lengthptr - previous_length)) |
2734 | 1.01k | *lengthptr = previous_length + required_len; |
2735 | | |
2736 | | /* As for the XCLASS branch above, we do have to write out a dummy |
2737 | | OP_ECLASS, because of the backwards peek by the quantifier code. Write |
2738 | | out a (truncated) OP_ECLASS, even on this branch. */ |
2739 | 3.11k | *lengthptr -= 1 + LINK_SIZE + 1; |
2740 | 3.11k | *code++ = OP_ECLASS; |
2741 | 3.11k | PUT(code, 0, 1 + LINK_SIZE + 1); |
2742 | 3.11k | code += LINK_SIZE; |
2743 | 3.11k | *code++ = 0; |
2744 | 3.11k | } |
2745 | 2.22k | else |
2746 | 2.22k | { |
2747 | 2.22k | if (need_map) |
2748 | 1.98k | { |
2749 | 1.98k | PCRE2_UCHAR *map_start = previous + 1 + LINK_SIZE + 1; |
2750 | 1.98k | previous[1 + LINK_SIZE] |= ECL_MAP; |
2751 | 1.98k | memmove(map_start + 32/sizeof(PCRE2_UCHAR), map_start, |
2752 | 1.98k | CU2BYTES(code - map_start)); |
2753 | 1.98k | memcpy(map_start, op_info.bits.classbits, 32); |
2754 | 1.98k | code += 32 / sizeof(PCRE2_UCHAR); |
2755 | 1.98k | } |
2756 | 2.22k | PUT(previous, 1, (int)(code - previous)); |
2757 | 2.22k | } |
2758 | 5.33k | } |
2759 | 13.3k | #endif /* SUPPORT_WIDE_CHARS */ |
2760 | | |
2761 | 13.3k | *pcode = code; |
2762 | 13.3k | return TRUE; |
2763 | 13.3k | } |
2764 | | |
2765 | | /* End of pcre2_compile_class.c */ |