/src/php-src/ext/pcre/pcre2lib/pcre2_compile_class.c
Line | Count | Source (jump to first uncovered line) |
1 | | /************************************************* |
2 | | * Perl-Compatible Regular Expressions * |
3 | | *************************************************/ |
4 | | |
5 | | /* PCRE is a library of functions to support regular expressions whose syntax |
6 | | and semantics are as close as possible to those of the Perl 5 language. |
7 | | |
8 | | Written by Philip Hazel |
9 | | Original API code Copyright (c) 1997-2012 University of Cambridge |
10 | | New API code Copyright (c) 2016-2024 University of Cambridge |
11 | | |
12 | | ----------------------------------------------------------------------------- |
13 | | Redistribution and use in source and binary forms, with or without |
14 | | modification, are permitted provided that the following conditions are met: |
15 | | |
16 | | * Redistributions of source code must retain the above copyright notice, |
17 | | this list of conditions and the following disclaimer. |
18 | | |
19 | | * Redistributions in binary form must reproduce the above copyright |
20 | | notice, this list of conditions and the following disclaimer in the |
21 | | documentation and/or other materials provided with the distribution. |
22 | | |
23 | | * Neither the name of the University of Cambridge nor the names of its |
24 | | contributors may be used to endorse or promote products derived from |
25 | | this software without specific prior written permission. |
26 | | |
27 | | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
28 | | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
29 | | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
30 | | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
31 | | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
32 | | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
33 | | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
34 | | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
35 | | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
36 | | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
37 | | POSSIBILITY OF SUCH DAMAGE. |
38 | | ----------------------------------------------------------------------------- |
39 | | */ |
40 | | |
41 | | #ifdef HAVE_CONFIG_H |
42 | | #include "config.h" |
43 | | #endif |
44 | | |
45 | | #include "pcre2_compile.h" |
46 | | |
47 | | typedef struct { |
48 | | /* Option bits for eclass. */ |
49 | | uint32_t options; |
50 | | uint32_t xoptions; |
51 | | /* Rarely used members. */ |
52 | | int *errorcodeptr; |
53 | | compile_block *cb; |
54 | | /* Bitmap is needed. */ |
55 | | BOOL needs_bitmap; |
56 | | } eclass_context; |
57 | | |
58 | | /* Checks the allowed tokens at the end of a class structure in debug mode. |
59 | | When a new token is not processed by all loops, and the token is equals to |
60 | | a) one of the cases here: |
61 | | the compiler will complain about a duplicated case value. |
62 | | b) none of the cases here: |
63 | | the loop without the handler will stop with an assertion failure. */ |
64 | | |
65 | | #ifdef PCRE2_DEBUG |
66 | | #define CLASS_END_CASES(meta) \ |
67 | | default: \ |
68 | | PCRE2_ASSERT((meta) <= META_END); \ |
69 | | /* Fall through */ \ |
70 | | case META_CLASS: \ |
71 | | case META_CLASS_NOT: \ |
72 | | case META_CLASS_EMPTY: \ |
73 | | case META_CLASS_EMPTY_NOT: \ |
74 | | case META_CLASS_END: \ |
75 | | case META_ECLASS_AND: \ |
76 | | case META_ECLASS_OR: \ |
77 | | case META_ECLASS_SUB: \ |
78 | | case META_ECLASS_XOR: \ |
79 | | case META_ECLASS_NOT: |
80 | | #else |
81 | | #define CLASS_END_CASES(meta) \ |
82 | 499k | default: |
83 | | #endif |
84 | | |
85 | | #ifdef SUPPORT_WIDE_CHARS |
86 | | |
87 | | /* Heapsort algorithm. */ |
88 | | |
89 | | static void do_heapify(uint32_t *buffer, size_t size, size_t i) |
90 | 62.8k | { |
91 | 62.8k | size_t max; |
92 | 62.8k | size_t left; |
93 | 62.8k | size_t right; |
94 | 62.8k | uint32_t tmp1, tmp2; |
95 | | |
96 | 187k | while (TRUE) |
97 | 187k | { |
98 | 187k | max = i; |
99 | 187k | left = (i << 1) + 2; |
100 | 187k | right = left + 2; |
101 | | |
102 | 187k | if (left < size && buffer[left] > buffer[max]) max = left; |
103 | 187k | if (right < size && buffer[right] > buffer[max]) max = right; |
104 | 187k | if (i == max) return; |
105 | | |
106 | | /* Swap items. */ |
107 | 124k | tmp1 = buffer[i]; |
108 | 124k | tmp2 = buffer[i + 1]; |
109 | 124k | buffer[i] = buffer[max]; |
110 | 124k | buffer[i + 1] = buffer[max + 1]; |
111 | 124k | buffer[max] = tmp1; |
112 | 124k | buffer[max + 1] = tmp2; |
113 | 124k | i = max; |
114 | 124k | } |
115 | 62.8k | } |
116 | | |
117 | | #ifdef SUPPORT_UNICODE |
118 | | |
119 | 1.71k | #define PARSE_CLASS_UTF 0x1 |
120 | 54.0k | #define PARSE_CLASS_CASELESS_UTF 0x2 |
121 | 46.2k | #define PARSE_CLASS_RESTRICTED_UTF 0x4 |
122 | 131k | #define PARSE_CLASS_TURKISH_UTF 0x8 |
123 | | |
124 | | /* Get the range of nocase characters which includes the |
125 | | 'c' character passed as argument, or directly follows 'c'. */ |
126 | | |
127 | | static const uint32_t* |
128 | | get_nocase_range(uint32_t c) |
129 | 51.2k | { |
130 | 51.2k | uint32_t left = 0; |
131 | 51.2k | uint32_t right = PRIV(ucd_nocase_ranges_size); |
132 | 51.2k | uint32_t middle; |
133 | | |
134 | 51.2k | if (c > MAX_UTF_CODE_POINT) return PRIV(ucd_nocase_ranges) + right; |
135 | | |
136 | 284k | while (TRUE) |
137 | 284k | { |
138 | | /* Range end of the middle element. */ |
139 | 284k | middle = ((left + right) >> 1) | 0x1; |
140 | | |
141 | 284k | if (PRIV(ucd_nocase_ranges)[middle] <= c) |
142 | 220 | left = middle + 1; |
143 | 284k | else if (middle > 1 && PRIV(ucd_nocase_ranges)[middle - 2] > c) |
144 | 233k | right = middle - 1; |
145 | 51.2k | else |
146 | 51.2k | return PRIV(ucd_nocase_ranges) + (middle - 1); |
147 | 284k | } |
148 | 51.2k | } |
149 | | |
150 | | /* Get the list of othercase characters, which belongs to the passed range. |
151 | | Create ranges from these characters, and append them to the buffer argument. */ |
152 | | |
153 | | static size_t |
154 | | utf_caseless_extend(uint32_t start, uint32_t end, uint32_t options, |
155 | | uint32_t *buffer) |
156 | 51.2k | { |
157 | 51.2k | uint32_t new_start = start; |
158 | 51.2k | uint32_t new_end = end; |
159 | 51.2k | uint32_t c = start; |
160 | 51.2k | const uint32_t *list; |
161 | 51.2k | uint32_t tmp[3]; |
162 | 51.2k | size_t result = 2; |
163 | 51.2k | const uint32_t *skip_range = get_nocase_range(c); |
164 | 51.2k | uint32_t skip_start = skip_range[0]; |
165 | | |
166 | 51.2k | #if PCRE2_CODE_UNIT_WIDTH == 8 |
167 | 51.2k | PCRE2_ASSERT(options & PARSE_CLASS_UTF); |
168 | 51.2k | #endif |
169 | | |
170 | | #if PCRE2_CODE_UNIT_WIDTH == 32 |
171 | | if (end > MAX_UTF_CODE_POINT) end = MAX_UTF_CODE_POINT; |
172 | | #endif |
173 | | |
174 | 117k | while (c <= end) |
175 | 66.2k | { |
176 | 66.2k | uint32_t co; |
177 | | |
178 | 66.2k | if (c > skip_start) |
179 | 22.4k | { |
180 | 22.4k | c = skip_range[1]; |
181 | 22.4k | skip_range += 2; |
182 | 22.4k | skip_start = skip_range[0]; |
183 | 22.4k | continue; |
184 | 22.4k | } |
185 | | |
186 | | /* Compute caseless set. */ |
187 | | |
188 | 43.7k | if ((options & (PARSE_CLASS_TURKISH_UTF|PARSE_CLASS_RESTRICTED_UTF)) == |
189 | 43.7k | PARSE_CLASS_TURKISH_UTF && |
190 | 43.7k | UCD_ANY_I(c)) |
191 | 0 | { |
192 | 0 | co = PRIV(ucd_turkish_dotted_i_caseset) + (UCD_DOTTED_I(c)? 0 : 3); |
193 | 0 | } |
194 | 43.7k | else if ((co = UCD_CASESET(c)) != 0 && |
195 | 43.7k | (options & PARSE_CLASS_RESTRICTED_UTF) != 0 && |
196 | 43.7k | PRIV(ucd_caseless_sets)[co] < 128) |
197 | 0 | { |
198 | 0 | co = 0; /* Ignore the caseless set if it's restricted. */ |
199 | 0 | } |
200 | | |
201 | 43.7k | if (co != 0) |
202 | 2.45k | list = PRIV(ucd_caseless_sets) + co; |
203 | 41.2k | else |
204 | 41.2k | { |
205 | 41.2k | co = UCD_OTHERCASE(c); |
206 | 41.2k | list = tmp; |
207 | 41.2k | tmp[0] = c; |
208 | 41.2k | tmp[1] = NOTACHAR; |
209 | | |
210 | 41.2k | if (co != c) |
211 | 28.0k | { |
212 | 28.0k | tmp[1] = co; |
213 | 28.0k | tmp[2] = NOTACHAR; |
214 | 28.0k | } |
215 | 41.2k | } |
216 | 43.7k | c++; |
217 | | |
218 | | /* Add characters. */ |
219 | 43.7k | do |
220 | 76.6k | { |
221 | | #if PCRE2_CODE_UNIT_WIDTH == 16 |
222 | | if (!(options & PARSE_CLASS_UTF) && *list > 0xffff) continue; |
223 | | #endif |
224 | | |
225 | 76.6k | if (*list < new_start) |
226 | 13.2k | { |
227 | 13.2k | if (*list + 1 == new_start) |
228 | 0 | { |
229 | 0 | new_start--; |
230 | 0 | continue; |
231 | 0 | } |
232 | 13.2k | } |
233 | 63.4k | else if (*list > new_end) |
234 | 19.2k | { |
235 | 19.2k | if (*list - 1 == new_end) |
236 | 482 | { |
237 | 482 | new_end++; |
238 | 482 | continue; |
239 | 482 | } |
240 | 19.2k | } |
241 | 44.2k | else continue; |
242 | | |
243 | 31.9k | result += 2; |
244 | 31.9k | if (buffer != NULL) |
245 | 15.9k | { |
246 | 15.9k | buffer[0] = *list; |
247 | 15.9k | buffer[1] = *list; |
248 | 15.9k | buffer += 2; |
249 | 15.9k | } |
250 | 31.9k | } |
251 | 76.6k | while (*(++list) != NOTACHAR); |
252 | 43.7k | } |
253 | | |
254 | 51.2k | if (buffer != NULL) |
255 | 25.6k | { |
256 | 25.6k | buffer[0] = new_start; |
257 | 25.6k | buffer[1] = new_end; |
258 | 25.6k | buffer += 2; |
259 | 25.6k | (void)buffer; |
260 | 25.6k | } |
261 | 51.2k | return result; |
262 | 51.2k | } |
263 | | |
264 | | #endif |
265 | | |
266 | | /* Add a character list to a buffer. */ |
267 | | |
268 | | static size_t |
269 | | append_char_list(const uint32_t *p, uint32_t *buffer) |
270 | 0 | { |
271 | 0 | const uint32_t *n; |
272 | 0 | size_t result = 0; |
273 | |
|
274 | 0 | while (*p != NOTACHAR) |
275 | 0 | { |
276 | 0 | n = p; |
277 | 0 | while (n[0] == n[1] - 1) n++; |
278 | |
|
279 | 0 | PCRE2_ASSERT(*p < 0xffff); |
280 | |
|
281 | 0 | if (buffer != NULL) |
282 | 0 | { |
283 | 0 | buffer[0] = *p; |
284 | 0 | buffer[1] = *n; |
285 | 0 | buffer += 2; |
286 | 0 | } |
287 | |
|
288 | 0 | result += 2; |
289 | 0 | p = n + 1; |
290 | 0 | } |
291 | |
|
292 | 0 | return result; |
293 | 0 | } |
294 | | |
295 | | static uint32_t |
296 | | get_highest_char(uint32_t options) |
297 | 0 | { |
298 | 0 | (void)options; /* Avoid compiler warning. */ |
299 | |
|
300 | 0 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
301 | 0 | return MAX_UTF_CODE_POINT; |
302 | | #else |
303 | | #ifdef SUPPORT_UNICODE |
304 | | return GET_MAX_CHAR_VALUE((options & PARSE_CLASS_UTF) != 0); |
305 | | #else |
306 | | return MAX_UCHAR_VALUE; |
307 | | #endif |
308 | | #endif |
309 | 0 | } |
310 | | |
311 | | /* Add a negated character list to a buffer. */ |
312 | | static size_t |
313 | | append_negated_char_list(const uint32_t *p, uint32_t options, uint32_t *buffer) |
314 | 0 | { |
315 | 0 | const uint32_t *n; |
316 | 0 | uint32_t start = 0; |
317 | 0 | size_t result = 2; |
318 | |
|
319 | 0 | PCRE2_ASSERT(*p > 0); |
320 | |
|
321 | 0 | while (*p != NOTACHAR) |
322 | 0 | { |
323 | 0 | n = p; |
324 | 0 | while (n[0] == n[1] - 1) n++; |
325 | |
|
326 | 0 | PCRE2_ASSERT(*p < 0xffff); |
327 | |
|
328 | 0 | if (buffer != NULL) |
329 | 0 | { |
330 | 0 | buffer[0] = start; |
331 | 0 | buffer[1] = *p - 1; |
332 | 0 | buffer += 2; |
333 | 0 | } |
334 | |
|
335 | 0 | result += 2; |
336 | 0 | start = *n + 1; |
337 | 0 | p = n + 1; |
338 | 0 | } |
339 | |
|
340 | 0 | if (buffer != NULL) |
341 | 0 | { |
342 | 0 | buffer[0] = start; |
343 | 0 | buffer[1] = get_highest_char(options); |
344 | 0 | buffer += 2; |
345 | 0 | (void)buffer; |
346 | 0 | } |
347 | |
|
348 | 0 | return result; |
349 | 0 | } |
350 | | |
351 | | static uint32_t * |
352 | | append_non_ascii_range(uint32_t options, uint32_t *buffer) |
353 | 0 | { |
354 | 0 | if (buffer == NULL) return NULL; |
355 | | |
356 | 0 | buffer[0] = 0x100; |
357 | 0 | buffer[1] = get_highest_char(options); |
358 | 0 | return buffer + 2; |
359 | 0 | } |
360 | | |
361 | | static size_t |
362 | | parse_class(uint32_t *ptr, uint32_t options, uint32_t *buffer) |
363 | 3.43k | { |
364 | 3.43k | size_t total_size = 0; |
365 | 3.43k | size_t size; |
366 | 3.43k | uint32_t meta_arg; |
367 | 3.43k | uint32_t start_char; |
368 | | |
369 | 56.1k | while (TRUE) |
370 | 56.1k | { |
371 | 56.1k | switch (META_CODE(*ptr)) |
372 | 56.1k | { |
373 | 380 | case META_ESCAPE: |
374 | 380 | meta_arg = META_DATA(*ptr); |
375 | 380 | switch (meta_arg) |
376 | 380 | { |
377 | 0 | case ESC_D: |
378 | 0 | case ESC_W: |
379 | 0 | case ESC_S: |
380 | 0 | buffer = append_non_ascii_range(options, buffer); |
381 | 0 | total_size += 2; |
382 | 0 | break; |
383 | | |
384 | 0 | case ESC_h: |
385 | 0 | size = append_char_list(PRIV(hspace_list), buffer); |
386 | 0 | total_size += size; |
387 | 0 | if (buffer != NULL) buffer += size; |
388 | 0 | break; |
389 | | |
390 | 0 | case ESC_H: |
391 | 0 | size = append_negated_char_list(PRIV(hspace_list), options, buffer); |
392 | 0 | total_size += size; |
393 | 0 | if (buffer != NULL) buffer += size; |
394 | 0 | break; |
395 | | |
396 | 0 | case ESC_v: |
397 | 0 | size = append_char_list(PRIV(vspace_list), buffer); |
398 | 0 | total_size += size; |
399 | 0 | if (buffer != NULL) buffer += size; |
400 | 0 | break; |
401 | | |
402 | 0 | case ESC_V: |
403 | 0 | size = append_negated_char_list(PRIV(vspace_list), options, buffer); |
404 | 0 | total_size += size; |
405 | 0 | if (buffer != NULL) buffer += size; |
406 | 0 | break; |
407 | | |
408 | 380 | case ESC_p: |
409 | 380 | case ESC_P: |
410 | 380 | ptr++; |
411 | 380 | if (meta_arg == ESC_p && (*ptr >> 16) == PT_ANY) |
412 | 0 | { |
413 | 0 | if (buffer != NULL) |
414 | 0 | { |
415 | 0 | buffer[0] = 0; |
416 | 0 | buffer[1] = get_highest_char(options); |
417 | 0 | buffer += 2; |
418 | 0 | } |
419 | 0 | total_size += 2; |
420 | 0 | } |
421 | 380 | break; |
422 | 380 | } |
423 | 380 | ptr++; |
424 | 380 | continue; |
425 | 0 | case META_POSIX_NEG: |
426 | 0 | buffer = append_non_ascii_range(options, buffer); |
427 | 0 | total_size += 2; |
428 | 0 | ptr += 2; |
429 | 0 | continue; |
430 | 0 | case META_POSIX: |
431 | 0 | ptr += 2; |
432 | 0 | continue; |
433 | 0 | case META_BIGVALUE: |
434 | | /* Character literal */ |
435 | 0 | ptr++; |
436 | 0 | break; |
437 | 55.7k | CLASS_END_CASES(*ptr) |
438 | 55.7k | if (*ptr >= META_END) return total_size; |
439 | 52.3k | break; |
440 | 56.1k | } |
441 | | |
442 | 52.3k | start_char = *ptr; |
443 | | |
444 | 52.3k | if (ptr[1] == META_RANGE_LITERAL || ptr[1] == META_RANGE_ESCAPED) |
445 | 566 | { |
446 | 566 | ptr += 2; |
447 | 566 | PCRE2_ASSERT(*ptr < META_END || *ptr == META_BIGVALUE); |
448 | | |
449 | 566 | if (*ptr == META_BIGVALUE) ptr++; |
450 | | |
451 | | #ifdef EBCDIC |
452 | | #error "Missing EBCDIC support" |
453 | | #endif |
454 | 566 | } |
455 | | |
456 | 52.3k | #ifdef SUPPORT_UNICODE |
457 | 52.3k | if (options & PARSE_CLASS_CASELESS_UTF) |
458 | 51.2k | { |
459 | 51.2k | size = utf_caseless_extend(start_char, *ptr++, options, buffer); |
460 | 51.2k | if (buffer != NULL) buffer += size; |
461 | 51.2k | total_size += size; |
462 | 51.2k | continue; |
463 | 51.2k | } |
464 | 1.13k | #endif |
465 | | |
466 | 1.13k | if (buffer != NULL) |
467 | 568 | { |
468 | 568 | buffer[0] = start_char; |
469 | 568 | buffer[1] = *ptr; |
470 | 568 | buffer += 2; |
471 | 568 | } |
472 | | |
473 | 1.13k | ptr++; |
474 | 1.13k | total_size += 2; |
475 | 1.13k | } |
476 | | |
477 | 0 | return total_size; |
478 | 3.43k | } |
479 | | |
480 | | /* Extra uint32_t values for storing the lengths of range lists in |
481 | | the worst case. Two uint32_t lengths and a range end for a range |
482 | | starting before 255 */ |
483 | 1.71k | #define CHAR_LIST_EXTRA_SIZE 3 |
484 | | |
485 | | /* Starting character values for each character list. */ |
486 | | |
487 | | static const uint32_t char_list_starts[] = { |
488 | | #if PCRE2_CODE_UNIT_WIDTH == 32 |
489 | | XCL_CHAR_LIST_HIGH_32_START, |
490 | | #endif |
491 | | #if PCRE2_CODE_UNIT_WIDTH == 32 || defined SUPPORT_UNICODE |
492 | | XCL_CHAR_LIST_LOW_32_START, |
493 | | #endif |
494 | | XCL_CHAR_LIST_HIGH_16_START, |
495 | | /* Must be terminated by XCL_CHAR_LIST_LOW_16_START, |
496 | | which also represents the end of the bitset. */ |
497 | | XCL_CHAR_LIST_LOW_16_START, |
498 | | }; |
499 | | |
500 | | static class_ranges * |
501 | | compile_optimize_class(uint32_t *start_ptr, uint32_t options, |
502 | | uint32_t xoptions, compile_block *cb) |
503 | 1.71k | { |
504 | 1.71k | class_ranges* cranges; |
505 | 1.71k | uint32_t *ptr; |
506 | 1.71k | uint32_t *buffer; |
507 | 1.71k | uint32_t *dst; |
508 | 1.71k | uint32_t class_options = 0; |
509 | 1.71k | size_t range_list_size = 0, total_size, i; |
510 | 1.71k | uint32_t tmp1, tmp2; |
511 | 1.71k | const uint32_t *char_list_next; |
512 | 1.71k | uint16_t *next_char; |
513 | 1.71k | uint32_t char_list_start, char_list_end; |
514 | 1.71k | uint32_t range_start, range_end; |
515 | | |
516 | 1.71k | #ifdef SUPPORT_UNICODE |
517 | 1.71k | if (options & PCRE2_UTF) |
518 | 1.71k | class_options |= PARSE_CLASS_UTF; |
519 | | |
520 | 1.71k | if ((options & PCRE2_CASELESS) && (options & (PCRE2_UTF|PCRE2_UCP))) |
521 | 1.69k | class_options |= PARSE_CLASS_CASELESS_UTF; |
522 | | |
523 | 1.71k | if (xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) |
524 | 0 | class_options |= PARSE_CLASS_RESTRICTED_UTF; |
525 | | |
526 | 1.71k | if (xoptions & PCRE2_EXTRA_TURKISH_CASING) |
527 | 0 | class_options |= PARSE_CLASS_TURKISH_UTF; |
528 | 1.71k | #endif |
529 | | |
530 | | /* Compute required space for the range. */ |
531 | | |
532 | 1.71k | range_list_size = parse_class(start_ptr, class_options, NULL); |
533 | 1.71k | PCRE2_ASSERT((range_list_size & 0x1) == 0); |
534 | | |
535 | | /* Allocate buffer. The total_size also represents the end of the buffer. */ |
536 | | |
537 | 1.71k | total_size = range_list_size + |
538 | 1.71k | ((range_list_size >= 2) ? CHAR_LIST_EXTRA_SIZE : 0); |
539 | | |
540 | 1.71k | cranges = cb->cx->memctl.malloc( |
541 | 1.71k | sizeof(class_ranges) + total_size * sizeof(uint32_t), |
542 | 1.71k | cb->cx->memctl.memory_data); |
543 | | |
544 | 1.71k | if (cranges == NULL) return NULL; |
545 | | |
546 | 1.71k | cranges->next = NULL; |
547 | 1.71k | cranges->range_list_size = (uint16_t)range_list_size; |
548 | 1.71k | cranges->char_lists_types = 0; |
549 | 1.71k | cranges->char_lists_size = 0; |
550 | 1.71k | cranges->char_lists_start = 0; |
551 | | |
552 | 1.71k | if (range_list_size == 0) return cranges; |
553 | | |
554 | 1.71k | buffer = (uint32_t*)(cranges + 1); |
555 | 1.71k | parse_class(start_ptr, class_options, buffer); |
556 | | |
557 | | /* Using <= instead of == to help static analysis. */ |
558 | 1.71k | if (range_list_size <= 2) return cranges; |
559 | | |
560 | | /* In-place sorting of ranges. */ |
561 | | |
562 | 1.71k | i = (((range_list_size >> 2) - 1) << 1); |
563 | 20.7k | while (TRUE) |
564 | 20.7k | { |
565 | 20.7k | do_heapify(buffer, range_list_size, i); |
566 | 20.7k | if (i == 0) break; |
567 | 18.9k | i -= 2; |
568 | 18.9k | } |
569 | | |
570 | 1.71k | i = range_list_size - 2; |
571 | 42.1k | while (TRUE) |
572 | 42.1k | { |
573 | 42.1k | tmp1 = buffer[i]; |
574 | 42.1k | tmp2 = buffer[i + 1]; |
575 | 42.1k | buffer[i] = buffer[0]; |
576 | 42.1k | buffer[i + 1] = buffer[1]; |
577 | 42.1k | buffer[0] = tmp1; |
578 | 42.1k | buffer[1] = tmp2; |
579 | | |
580 | 42.1k | do_heapify(buffer, i, 0); |
581 | 42.1k | if (i == 0) break; |
582 | 40.4k | i -= 2; |
583 | 40.4k | } |
584 | | |
585 | | /* Merge ranges whenever possible. */ |
586 | 1.71k | dst = buffer; |
587 | 1.71k | ptr = buffer + 2; |
588 | 1.71k | range_list_size -= 2; |
589 | | |
590 | | /* The second condition is a very rare corner case, where the end of the last |
591 | | range is the maximum character. This range cannot be extended further. */ |
592 | | |
593 | 42.1k | while (range_list_size > 0 && dst[1] != ~(uint32_t)0) |
594 | 40.4k | { |
595 | 40.4k | if (dst[1] + 1 < ptr[0]) |
596 | 12.7k | { |
597 | 12.7k | dst += 2; |
598 | 12.7k | dst[0] = ptr[0]; |
599 | 12.7k | dst[1] = ptr[1]; |
600 | 12.7k | } |
601 | 27.6k | else if (dst[1] < ptr[1]) dst[1] = ptr[1]; |
602 | | |
603 | 40.4k | ptr += 2; |
604 | 40.4k | range_list_size -= 2; |
605 | 40.4k | } |
606 | | |
607 | 1.71k | PCRE2_ASSERT(dst[1] <= get_highest_char(class_options)); |
608 | | |
609 | | /* When the number of ranges are less than six, |
610 | | they are not converted to range lists. */ |
611 | | |
612 | 1.71k | ptr = buffer; |
613 | 14.1k | while (ptr < dst && ptr[1] < 0x100) ptr += 2; |
614 | 1.71k | if (dst - ptr < (2 * (6 - 1))) |
615 | 1.71k | { |
616 | 1.71k | cranges->range_list_size = (uint16_t)(dst + 2 - buffer); |
617 | 1.71k | return cranges; |
618 | 1.71k | } |
619 | | |
620 | | /* Compute character lists structures. */ |
621 | | |
622 | 0 | char_list_next = char_list_starts; |
623 | 0 | char_list_start = *char_list_next++; |
624 | | #if PCRE2_CODE_UNIT_WIDTH == 32 |
625 | | char_list_end = XCL_CHAR_LIST_HIGH_32_END; |
626 | | #elif defined SUPPORT_UNICODE |
627 | 0 | char_list_end = XCL_CHAR_LIST_LOW_32_END; |
628 | | #else |
629 | | char_list_end = XCL_CHAR_LIST_HIGH_16_END; |
630 | | #endif |
631 | 0 | next_char = (uint16_t*)(buffer + total_size); |
632 | |
|
633 | 0 | tmp1 = 0; |
634 | 0 | tmp2 = ((sizeof(char_list_starts) / sizeof(uint32_t)) - 1) * XCL_TYPE_BIT_LEN; |
635 | 0 | PCRE2_ASSERT(tmp2 <= 3 * XCL_TYPE_BIT_LEN && tmp2 >= XCL_TYPE_BIT_LEN); |
636 | 0 | range_start = dst[0]; |
637 | 0 | range_end = dst[1]; |
638 | |
|
639 | 0 | while (TRUE) |
640 | 0 | { |
641 | 0 | if (range_start >= char_list_start) |
642 | 0 | { |
643 | 0 | if (range_start == range_end || range_end < char_list_end) |
644 | 0 | { |
645 | 0 | tmp1++; |
646 | 0 | next_char--; |
647 | |
|
648 | 0 | if (char_list_start < XCL_CHAR_LIST_LOW_32_START) |
649 | 0 | *next_char = (uint16_t)((range_end << XCL_CHAR_SHIFT) | XCL_CHAR_END); |
650 | 0 | else |
651 | 0 | *(uint32_t*)(--next_char) = |
652 | 0 | (range_end << XCL_CHAR_SHIFT) | XCL_CHAR_END; |
653 | 0 | } |
654 | |
|
655 | 0 | if (range_start < range_end) |
656 | 0 | { |
657 | 0 | if (range_start > char_list_start) |
658 | 0 | { |
659 | 0 | tmp1++; |
660 | 0 | next_char--; |
661 | |
|
662 | 0 | if (char_list_start < XCL_CHAR_LIST_LOW_32_START) |
663 | 0 | *next_char = (uint16_t)(range_start << XCL_CHAR_SHIFT); |
664 | 0 | else |
665 | 0 | *(uint32_t*)(--next_char) = (range_start << XCL_CHAR_SHIFT); |
666 | 0 | } |
667 | 0 | else |
668 | 0 | cranges->char_lists_types |= XCL_BEGIN_WITH_RANGE << tmp2; |
669 | 0 | } |
670 | |
|
671 | 0 | PCRE2_ASSERT((uint32_t*)next_char >= dst + 2); |
672 | |
|
673 | 0 | if (dst > buffer) |
674 | 0 | { |
675 | 0 | dst -= 2; |
676 | 0 | range_start = dst[0]; |
677 | 0 | range_end = dst[1]; |
678 | 0 | continue; |
679 | 0 | } |
680 | | |
681 | 0 | range_start = 0; |
682 | 0 | range_end = 0; |
683 | 0 | } |
684 | | |
685 | 0 | if (range_end >= char_list_start) |
686 | 0 | { |
687 | 0 | PCRE2_ASSERT(range_start < char_list_start); |
688 | |
|
689 | 0 | if (range_end < char_list_end) |
690 | 0 | { |
691 | 0 | tmp1++; |
692 | 0 | next_char--; |
693 | |
|
694 | 0 | if (char_list_start < XCL_CHAR_LIST_LOW_32_START) |
695 | 0 | *next_char = (uint16_t)((range_end << XCL_CHAR_SHIFT) | XCL_CHAR_END); |
696 | 0 | else |
697 | 0 | *(uint32_t*)(--next_char) = |
698 | 0 | (range_end << XCL_CHAR_SHIFT) | XCL_CHAR_END; |
699 | |
|
700 | 0 | PCRE2_ASSERT((uint32_t*)next_char >= dst + 2); |
701 | 0 | } |
702 | |
|
703 | 0 | cranges->char_lists_types |= XCL_BEGIN_WITH_RANGE << tmp2; |
704 | 0 | } |
705 | |
|
706 | 0 | if (tmp1 >= XCL_ITEM_COUNT_MASK) |
707 | 0 | { |
708 | 0 | cranges->char_lists_types |= XCL_ITEM_COUNT_MASK << tmp2; |
709 | 0 | next_char--; |
710 | |
|
711 | 0 | if (char_list_start < XCL_CHAR_LIST_LOW_32_START) |
712 | 0 | *next_char = (uint16_t)tmp1; |
713 | 0 | else |
714 | 0 | *(uint32_t*)(--next_char) = tmp1; |
715 | 0 | } |
716 | 0 | else |
717 | 0 | cranges->char_lists_types |= tmp1 << tmp2; |
718 | |
|
719 | 0 | if (range_start < XCL_CHAR_LIST_LOW_16_START) break; |
720 | | |
721 | 0 | PCRE2_ASSERT(tmp2 >= XCL_TYPE_BIT_LEN); |
722 | 0 | char_list_end = char_list_start - 1; |
723 | 0 | char_list_start = *char_list_next++; |
724 | 0 | tmp1 = 0; |
725 | 0 | tmp2 -= XCL_TYPE_BIT_LEN; |
726 | 0 | } |
727 | |
|
728 | 0 | if (dst[0] < XCL_CHAR_LIST_LOW_16_START) dst += 2; |
729 | 0 | PCRE2_ASSERT((uint16_t*)dst <= next_char); |
730 | |
|
731 | 0 | cranges->char_lists_size = |
732 | 0 | (size_t)((uint8_t*)(buffer + total_size) - (uint8_t*)next_char); |
733 | 0 | cranges->char_lists_start = (size_t)((uint8_t*)next_char - (uint8_t*)buffer); |
734 | 0 | cranges->range_list_size = (uint16_t)(dst - buffer); |
735 | 0 | return cranges; |
736 | 1.71k | } |
737 | | |
738 | | #endif /* SUPPORT_WIDE_CHARS */ |
739 | | |
740 | | #ifdef SUPPORT_UNICODE |
741 | | |
742 | | void PRIV(update_classbits)(uint32_t ptype, uint32_t pdata, BOOL negated, |
743 | | uint8_t *classbits) |
744 | 2.52k | { |
745 | | /* Update PRIV(xclass) when this function is changed. */ |
746 | 2.52k | int c, chartype; |
747 | 2.52k | const ucd_record *prop; |
748 | 2.52k | uint32_t gentype; |
749 | 2.52k | BOOL set_bit; |
750 | | |
751 | 2.52k | if (ptype == PT_ANY) |
752 | 0 | { |
753 | 0 | if (!negated) memset(classbits, 0xff, 32); |
754 | 0 | return; |
755 | 0 | } |
756 | | |
757 | 648k | for (c = 0; c < 256; c++) |
758 | 646k | { |
759 | 646k | prop = GET_UCD(c); |
760 | 646k | set_bit = FALSE; |
761 | 646k | (void)set_bit; |
762 | | |
763 | 646k | switch (ptype) |
764 | 646k | { |
765 | 0 | case PT_LAMP: |
766 | 0 | chartype = prop->chartype; |
767 | 0 | set_bit = (chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt); |
768 | 0 | break; |
769 | | |
770 | 548k | case PT_GC: |
771 | 548k | set_bit = (PRIV(ucp_gentype)[prop->chartype] == pdata); |
772 | 548k | break; |
773 | | |
774 | 52.2k | case PT_PC: |
775 | 52.2k | set_bit = (prop->chartype == pdata); |
776 | 52.2k | break; |
777 | | |
778 | 0 | case PT_SC: |
779 | 0 | set_bit = (prop->script == pdata); |
780 | 0 | break; |
781 | | |
782 | 0 | case PT_SCX: |
783 | 0 | set_bit = (prop->script == pdata || |
784 | 0 | MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), pdata) != 0); |
785 | 0 | break; |
786 | | |
787 | 0 | case PT_ALNUM: |
788 | 0 | gentype = PRIV(ucp_gentype)[prop->chartype]; |
789 | 0 | set_bit = (gentype == ucp_L || gentype == ucp_N); |
790 | 0 | break; |
791 | | |
792 | 0 | case PT_SPACE: /* Perl space */ |
793 | 0 | case PT_PXSPACE: /* POSIX space */ |
794 | 0 | switch(c) |
795 | 0 | { |
796 | 0 | HSPACE_BYTE_CASES: |
797 | 0 | VSPACE_BYTE_CASES: |
798 | 0 | set_bit = TRUE; |
799 | 0 | break; |
800 | | |
801 | 0 | default: |
802 | 0 | set_bit = (PRIV(ucp_gentype)[prop->chartype] == ucp_Z); |
803 | 0 | break; |
804 | 0 | } |
805 | 0 | break; |
806 | | |
807 | 45.0k | case PT_WORD: |
808 | 45.0k | chartype = prop->chartype; |
809 | 45.0k | gentype = PRIV(ucp_gentype)[chartype]; |
810 | 45.0k | set_bit = (gentype == ucp_L || gentype == ucp_N || |
811 | 45.0k | chartype == ucp_Mn || chartype == ucp_Pc); |
812 | 45.0k | break; |
813 | | |
814 | 0 | case PT_UCNC: |
815 | 0 | set_bit = (c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || |
816 | 0 | c == CHAR_GRAVE_ACCENT || c >= 0xa0); |
817 | 0 | break; |
818 | | |
819 | 0 | case PT_BIDICL: |
820 | 0 | set_bit = (UCD_BIDICLASS_PROP(prop) == pdata); |
821 | 0 | break; |
822 | | |
823 | 0 | case PT_BOOL: |
824 | 0 | set_bit = MAPBIT(PRIV(ucd_boolprop_sets) + |
825 | 0 | UCD_BPROPS_PROP(prop), pdata) != 0; |
826 | 0 | break; |
827 | | |
828 | 0 | case PT_PXGRAPH: |
829 | 0 | chartype = prop->chartype; |
830 | 0 | gentype = PRIV(ucp_gentype)[chartype]; |
831 | 0 | set_bit = (gentype != ucp_Z && (gentype != ucp_C || chartype == ucp_Cf)); |
832 | 0 | break; |
833 | | |
834 | 0 | case PT_PXPRINT: |
835 | 0 | chartype = prop->chartype; |
836 | 0 | set_bit = (chartype != ucp_Zl && chartype != ucp_Zp && |
837 | 0 | (PRIV(ucp_gentype)[chartype] != ucp_C || chartype == ucp_Cf)); |
838 | 0 | break; |
839 | | |
840 | 0 | case PT_PXPUNCT: |
841 | 0 | gentype = PRIV(ucp_gentype)[prop->chartype]; |
842 | 0 | set_bit = (gentype == ucp_P || (c < 128 && gentype == ucp_S)); |
843 | 0 | break; |
844 | | |
845 | 0 | default: |
846 | 0 | PCRE2_ASSERT(ptype == PT_PXXDIGIT); |
847 | 0 | set_bit = (c >= CHAR_0 && c <= CHAR_9) || |
848 | 0 | (c >= CHAR_A && c <= CHAR_F) || |
849 | 0 | (c >= CHAR_a && c <= CHAR_f); |
850 | 0 | break; |
851 | 646k | } |
852 | | |
853 | 646k | if (negated) set_bit = !set_bit; |
854 | 646k | if (set_bit) *classbits |= (uint8_t)(1 << (c & 0x7)); |
855 | 646k | if ((c & 0x7) == 0x7) classbits++; |
856 | 646k | } |
857 | 2.52k | } |
858 | | |
859 | | #endif /* SUPPORT_UNICODE */ |
860 | | |
861 | | |
862 | | |
863 | | #ifdef SUPPORT_WIDE_CHARS |
864 | | |
865 | | /************************************************* |
866 | | * XClass related properties * |
867 | | *************************************************/ |
868 | | |
869 | | /* XClass needs to be generated. */ |
870 | 24.3k | #define XCLASS_REQUIRED 0x1 |
871 | | /* XClass has 8 bit character. */ |
872 | 383k | #define XCLASS_HAS_8BIT_CHARS 0x2 |
873 | | /* XClass has properties. */ |
874 | 5.34k | #define XCLASS_HAS_PROPS 0x4 |
875 | | /* XClass has character lists. */ |
876 | 5.63k | #define XCLASS_HAS_CHAR_LISTS 0x8 |
877 | | /* XClass matches to all >= 256 characters. */ |
878 | 5.91k | #define XCLASS_HIGH_ANY 0x10 |
879 | | |
880 | | #endif |
881 | | |
882 | | |
883 | | /************************************************* |
884 | | * Internal entry point for add range to class * |
885 | | *************************************************/ |
886 | | |
887 | | /* This function sets the overall range for characters < 256. |
888 | | It also handles non-utf case folding. |
889 | | |
890 | | Arguments: |
891 | | options the options bits |
892 | | xoptions the extra options bits |
893 | | cb compile data |
894 | | start start of range character |
895 | | end end of range character |
896 | | |
897 | | Returns: cb->classbits is updated |
898 | | */ |
899 | | |
900 | | static void |
901 | | add_to_class(uint32_t options, uint32_t xoptions, compile_block *cb, |
902 | | uint32_t start, uint32_t end) |
903 | 399k | { |
904 | 399k | uint8_t *classbits = cb->classbits.classbits; |
905 | 399k | uint32_t c, byte_start, byte_end; |
906 | 399k | uint32_t classbits_end = (end <= 0xff ? end : 0xff); |
907 | | |
908 | | /* If caseless matching is required, scan the range and process alternate |
909 | | cases. In Unicode, there are 8-bit characters that have alternate cases that |
910 | | are greater than 255 and vice-versa (though these may be ignored if caseless |
911 | | restriction is in force). Sometimes we can just extend the original range. */ |
912 | | |
913 | 399k | if ((options & PCRE2_CASELESS) != 0) |
914 | 238k | { |
915 | 238k | #ifdef SUPPORT_UNICODE |
916 | | /* UTF mode. This branch is taken if we don't support wide characters (e.g. |
917 | | 8-bit library, without UTF), but we do treat those characters as Unicode |
918 | | (if UCP flag is set). In this case, we only need to expand the character class |
919 | | set to include the case pairs which are in the 0-255 codepoint range. */ |
920 | 238k | if ((options & (PCRE2_UTF|PCRE2_UCP)) != 0) |
921 | 0 | { |
922 | 0 | BOOL turkish_i = (xoptions & (PCRE2_EXTRA_TURKISH_CASING|PCRE2_EXTRA_CASELESS_RESTRICT)) == |
923 | 0 | PCRE2_EXTRA_TURKISH_CASING; |
924 | 0 | if (start < 128) |
925 | 0 | { |
926 | 0 | uint32_t lo_end = (classbits_end < 127 ? classbits_end : 127); |
927 | 0 | for (c = start; c <= lo_end; c++) |
928 | 0 | { |
929 | 0 | if (turkish_i && UCD_ANY_I(c)) continue; |
930 | 0 | SETBIT(classbits, cb->fcc[c]); |
931 | 0 | } |
932 | 0 | } |
933 | 0 | if (classbits_end >= 128) |
934 | 0 | { |
935 | 0 | uint32_t hi_start = (start > 128 ? start : 128); |
936 | 0 | for (c = hi_start; c <= classbits_end; c++) |
937 | 0 | { |
938 | 0 | uint32_t co = UCD_OTHERCASE(c); |
939 | 0 | if (co <= 0xff) SETBIT(classbits, co); |
940 | 0 | } |
941 | 0 | } |
942 | 0 | } |
943 | | |
944 | 238k | else |
945 | 238k | #endif /* SUPPORT_UNICODE */ |
946 | | |
947 | | /* Not UTF mode */ |
948 | 238k | { |
949 | 491k | for (c = start; c <= classbits_end; c++) |
950 | 253k | SETBIT(classbits, cb->fcc[c]); |
951 | 238k | } |
952 | 238k | } |
953 | | |
954 | | /* Use the bitmap for characters < 256. Otherwise use extra data. */ |
955 | | |
956 | 399k | byte_start = (start + 7) >> 3; |
957 | 399k | byte_end = (classbits_end + 1) >> 3; |
958 | | |
959 | 399k | if (byte_start >= byte_end) |
960 | 397k | { |
961 | 799k | for (c = start; c <= classbits_end; c++) |
962 | | /* Regardless of start, c will always be <= 255. */ |
963 | 402k | SETBIT(classbits, c); |
964 | 397k | return; |
965 | 397k | } |
966 | | |
967 | 24.6k | for (c = byte_start; c < byte_end; c++) |
968 | 22.3k | classbits[c] = 0xff; |
969 | | |
970 | 2.32k | byte_start <<= 3; |
971 | 2.32k | byte_end <<= 3; |
972 | | |
973 | 9.17k | for (c = start; c < byte_start; c++) |
974 | 6.84k | SETBIT(classbits, c); |
975 | | |
976 | 9.18k | for (c = byte_end; c <= classbits_end; c++) |
977 | 6.85k | SETBIT(classbits, c); |
978 | 2.32k | } |
979 | | |
980 | | |
981 | | #if PCRE2_CODE_UNIT_WIDTH == 8 |
982 | | /************************************************* |
983 | | * Internal entry point for add list to class * |
984 | | *************************************************/ |
985 | | |
986 | | /* This function is used for adding a list of horizontal or vertical whitespace |
987 | | characters to a class. The list must be in order so that ranges of characters |
988 | | can be detected and handled appropriately. This function sets the overall range |
989 | | so that the internal functions can try to avoid duplication when handling |
990 | | case-independence. |
991 | | |
992 | | Arguments: |
993 | | options the options bits |
994 | | xoptions the extra options bits |
995 | | cb contains pointers to tables etc. |
996 | | p points to row of 32-bit values, terminated by NOTACHAR |
997 | | |
998 | | Returns: cb->classbits is updated |
999 | | */ |
1000 | | |
1001 | | static void |
1002 | | add_list_to_class(uint32_t options, uint32_t xoptions, compile_block *cb, |
1003 | | const uint32_t *p) |
1004 | 4 | { |
1005 | 16 | while (p[0] < 256) |
1006 | 12 | { |
1007 | 12 | unsigned int n = 0; |
1008 | | |
1009 | 12 | while(p[n+1] == p[0] + n + 1) n++; |
1010 | 12 | add_to_class(options, xoptions, cb, p[0], p[n]); |
1011 | | |
1012 | 12 | p += n + 1; |
1013 | 12 | } |
1014 | 4 | } |
1015 | | |
1016 | | |
1017 | | |
1018 | | /************************************************* |
1019 | | * Add characters not in a list to a class * |
1020 | | *************************************************/ |
1021 | | |
1022 | | /* This function is used for adding the complement of a list of horizontal or |
1023 | | vertical whitespace to a class. The list must be in order. |
1024 | | |
1025 | | Arguments: |
1026 | | options the options bits |
1027 | | xoptions the extra options bits |
1028 | | cb contains pointers to tables etc. |
1029 | | p points to row of 32-bit values, terminated by NOTACHAR |
1030 | | |
1031 | | Returns: cb->classbits is updated |
1032 | | */ |
1033 | | |
1034 | | static void |
1035 | | add_not_list_to_class(uint32_t options, uint32_t xoptions, compile_block *cb, |
1036 | | const uint32_t *p) |
1037 | 198 | { |
1038 | 198 | if (p[0] > 0) |
1039 | 198 | add_to_class(options, xoptions, cb, 0, p[0] - 1); |
1040 | 608 | while (p[0] < 256) |
1041 | 410 | { |
1042 | 962 | while (p[1] == p[0] + 1) p++; |
1043 | 410 | add_to_class(options, xoptions, cb, p[0] + 1, (p[1] > 255) ? 255 : p[1] - 1); |
1044 | 410 | p++; |
1045 | 410 | } |
1046 | 198 | } |
1047 | | #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */ |
1048 | | |
1049 | | |
1050 | | |
1051 | | /************************************************* |
1052 | | * Main entry-point to compile a character class * |
1053 | | *************************************************/ |
1054 | | |
1055 | | /* This function consumes a "leaf", which is a set of characters that will |
1056 | | become a single OP_CLASS OP_NCLASS, OP_XCLASS, or OP_ALLANY. */ |
1057 | | |
1058 | | uint32_t * |
1059 | | PRIV(compile_class_not_nested)(uint32_t options, uint32_t xoptions, |
1060 | | uint32_t *start_ptr, PCRE2_UCHAR **pcode, BOOL negate_class, BOOL* has_bitmap, |
1061 | | int *errorcodeptr, compile_block *cb, PCRE2_SIZE *lengthptr) |
1062 | 19.7k | { |
1063 | 19.7k | uint32_t *pptr = start_ptr; |
1064 | 19.7k | PCRE2_UCHAR *code = *pcode; |
1065 | 19.7k | BOOL should_flip_negation; |
1066 | 19.7k | const uint8_t *cbits = cb->cbits; |
1067 | | /* Some functions such as add_to_class() or eclass processing |
1068 | | expects that the bitset is stored in cb->classbits.classbits. */ |
1069 | 19.7k | uint8_t *const classbits = cb->classbits.classbits; |
1070 | | |
1071 | 19.7k | #ifdef SUPPORT_UNICODE |
1072 | 19.7k | BOOL utf = (options & PCRE2_UTF) != 0; |
1073 | | #else /* No Unicode support */ |
1074 | | BOOL utf = FALSE; |
1075 | | #endif |
1076 | | |
1077 | | /* Helper variables for OP_XCLASS opcode (for characters > 255). */ |
1078 | | |
1079 | 19.7k | #ifdef SUPPORT_WIDE_CHARS |
1080 | 19.7k | uint32_t xclass_props; |
1081 | 19.7k | PCRE2_UCHAR *class_uchardata; |
1082 | 19.7k | class_ranges* cranges; |
1083 | 19.7k | #endif |
1084 | | |
1085 | | /* If an XClass contains a negative special such as \S, we need to flip the |
1086 | | negation flag at the end, so that support for characters > 255 works correctly |
1087 | | (they are all included in the class). An XClass may need to insert specific |
1088 | | matching or non-matching code for wide characters. |
1089 | | */ |
1090 | | |
1091 | 19.7k | should_flip_negation = FALSE; |
1092 | | |
1093 | | /* XClass will be used when characters > 255 might match. */ |
1094 | | |
1095 | 19.7k | #ifdef SUPPORT_WIDE_CHARS |
1096 | 19.7k | xclass_props = 0; |
1097 | | |
1098 | 19.7k | #if PCRE2_CODE_UNIT_WIDTH == 8 |
1099 | 19.7k | cranges = NULL; |
1100 | | |
1101 | 19.7k | if (utf) |
1102 | 3.39k | #endif |
1103 | 3.39k | { |
1104 | 3.39k | if (lengthptr != NULL) |
1105 | 1.71k | { |
1106 | 1.71k | cranges = compile_optimize_class(pptr, options, xoptions, cb); |
1107 | | |
1108 | 1.71k | if (cranges == NULL) |
1109 | 0 | { |
1110 | 0 | *errorcodeptr = ERR21; |
1111 | 0 | return NULL; |
1112 | 0 | } |
1113 | | |
1114 | | /* Caching the pre-processed character ranges. */ |
1115 | 1.71k | if (cb->next_cranges != NULL) |
1116 | 1.53k | cb->next_cranges->next = cranges; |
1117 | 183 | else |
1118 | 183 | cb->cranges = cranges; |
1119 | | |
1120 | 1.71k | cb->next_cranges = cranges; |
1121 | 1.71k | } |
1122 | 1.67k | else |
1123 | 1.67k | { |
1124 | | /* Reuse the pre-processed character ranges. */ |
1125 | 1.67k | cranges = cb->cranges; |
1126 | 1.67k | PCRE2_ASSERT(cranges != NULL); |
1127 | 1.67k | cb->cranges = cranges->next; |
1128 | 1.67k | } |
1129 | | |
1130 | 3.39k | if (cranges->range_list_size > 0) |
1131 | 3.39k | { |
1132 | 3.39k | const uint32_t *ranges = (const uint32_t*)(cranges + 1); |
1133 | | |
1134 | 3.39k | if (ranges[0] <= 255) |
1135 | 3.39k | xclass_props |= XCLASS_HAS_8BIT_CHARS; |
1136 | | |
1137 | 3.39k | if (ranges[cranges->range_list_size - 1] == GET_MAX_CHAR_VALUE(utf) && |
1138 | 3.39k | ranges[cranges->range_list_size - 2] <= 256) |
1139 | 0 | xclass_props |= XCLASS_HIGH_ANY; |
1140 | 3.39k | } |
1141 | 3.39k | } |
1142 | | |
1143 | 19.7k | class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */ |
1144 | 19.7k | #endif /* SUPPORT_WIDE_CHARS */ |
1145 | | |
1146 | | /* Initialize the 256-bit (32-byte) bit map to all zeros. We build the map |
1147 | | in a temporary bit of memory, in case the class contains fewer than two |
1148 | | 8-bit characters because in that case the compiled code doesn't use the bit |
1149 | | map. */ |
1150 | | |
1151 | 19.7k | memset(classbits, 0, 32); |
1152 | | |
1153 | | /* Process items until end_ptr is reached. */ |
1154 | | |
1155 | 450k | while (TRUE) |
1156 | 450k | { |
1157 | 450k | uint32_t meta = *(pptr++); |
1158 | 450k | BOOL local_negate; |
1159 | 450k | int posix_class; |
1160 | 450k | int taboffset, tabopt; |
1161 | 450k | class_bits_storage pbits; |
1162 | 450k | uint32_t escape, c; |
1163 | | |
1164 | | /* Handle POSIX classes such as [:alpha:] etc. */ |
1165 | 450k | switch (META_CODE(meta)) |
1166 | 450k | { |
1167 | 0 | case META_POSIX: |
1168 | 0 | case META_POSIX_NEG: |
1169 | |
|
1170 | 0 | local_negate = (meta == META_POSIX_NEG); |
1171 | 0 | posix_class = *(pptr++); |
1172 | |
|
1173 | 0 | if (local_negate) should_flip_negation = TRUE; /* Note negative special */ |
1174 | | |
1175 | | /* If matching is caseless, upper and lower are converted to alpha. |
1176 | | This relies on the fact that the class table starts with alpha, |
1177 | | lower, upper as the first 3 entries. */ |
1178 | |
|
1179 | 0 | if ((options & PCRE2_CASELESS) != 0 && posix_class <= 2) |
1180 | 0 | posix_class = 0; |
1181 | | |
1182 | | /* When PCRE2_UCP is set, some of the POSIX classes are converted to |
1183 | | different escape sequences that use Unicode properties \p or \P. |
1184 | | Others that are not available via \p or \P have to generate |
1185 | | XCL_PROP/XCL_NOTPROP directly, which is done here. */ |
1186 | |
|
1187 | 0 | #ifdef SUPPORT_UNICODE |
1188 | | /* TODO This entire block of code here appears to be unreachable!? I simply |
1189 | | can't see how it can be hit, given that the frontend parser doesn't emit |
1190 | | META_POSIX for GRAPH/PRINT/PUNCT when UCP is set. */ |
1191 | 0 | if ((options & PCRE2_UCP) != 0 && |
1192 | 0 | (xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0) |
1193 | 0 | { |
1194 | 0 | uint32_t ptype; |
1195 | |
|
1196 | 0 | switch(posix_class) |
1197 | 0 | { |
1198 | 0 | case PC_GRAPH: |
1199 | 0 | case PC_PRINT: |
1200 | 0 | case PC_PUNCT: |
1201 | 0 | ptype = (posix_class == PC_GRAPH)? PT_PXGRAPH : |
1202 | 0 | (posix_class == PC_PRINT)? PT_PXPRINT : PT_PXPUNCT; |
1203 | |
|
1204 | 0 | PRIV(update_classbits)(ptype, 0, local_negate, classbits); |
1205 | |
|
1206 | 0 | if ((xclass_props & XCLASS_HIGH_ANY) == 0) |
1207 | 0 | { |
1208 | 0 | if (lengthptr != NULL) |
1209 | 0 | *lengthptr += 3; |
1210 | 0 | else |
1211 | 0 | { |
1212 | 0 | *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP; |
1213 | 0 | *class_uchardata++ = (PCRE2_UCHAR)ptype; |
1214 | 0 | *class_uchardata++ = 0; |
1215 | 0 | } |
1216 | 0 | xclass_props |= XCLASS_REQUIRED | XCLASS_HAS_PROPS; |
1217 | 0 | } |
1218 | 0 | continue; |
1219 | | |
1220 | | /* For the other POSIX classes (ex: ascii) we are going to |
1221 | | fall through to the non-UCP case and build a bit map for |
1222 | | characters with code points less than 256. However, if we are in |
1223 | | a negated POSIX class, characters with code points greater than |
1224 | | 255 must either all match or all not match, depending on whether |
1225 | | the whole class is not or is negated. For example, for |
1226 | | [[:^ascii:]... they must all match, whereas for [^[:^ascii:]... |
1227 | | they must not. |
1228 | | |
1229 | | In the special case where there are no xclass items, this is |
1230 | | automatically handled by the use of OP_CLASS or OP_NCLASS, but an |
1231 | | explicit range is needed for OP_XCLASS. Setting a flag here |
1232 | | causes the range to be generated later when it is known that |
1233 | | OP_XCLASS is required. In the 8-bit library this is relevant only in |
1234 | | utf mode, since no wide characters can exist otherwise. */ |
1235 | | |
1236 | 0 | default: |
1237 | 0 | break; |
1238 | 0 | } |
1239 | 0 | } |
1240 | 0 | #endif /* SUPPORT_UNICODE */ |
1241 | | |
1242 | | /* In the non-UCP case, or when UCP makes no difference, we build the |
1243 | | bit map for the POSIX class in a chunk of local store because we may |
1244 | | be adding and subtracting from it, and we don't want to subtract bits |
1245 | | that may be in the main map already. At the end we or the result into |
1246 | | the bit map that is being built. */ |
1247 | | |
1248 | 0 | posix_class *= 3; |
1249 | | |
1250 | | /* Copy in the first table (always present) */ |
1251 | |
|
1252 | 0 | memcpy(pbits.classbits, cbits + PRIV(posix_class_maps)[posix_class], 32); |
1253 | | |
1254 | | /* If there is a second table, add or remove it as required. */ |
1255 | |
|
1256 | 0 | taboffset = PRIV(posix_class_maps)[posix_class + 1]; |
1257 | 0 | tabopt = PRIV(posix_class_maps)[posix_class + 2]; |
1258 | |
|
1259 | 0 | if (taboffset >= 0) |
1260 | 0 | { |
1261 | 0 | if (tabopt >= 0) |
1262 | 0 | for (int i = 0; i < 32; i++) |
1263 | 0 | pbits.classbits[i] |= cbits[i + taboffset]; |
1264 | 0 | else |
1265 | 0 | for (int i = 0; i < 32; i++) |
1266 | 0 | pbits.classbits[i] &= (uint8_t)(~cbits[i + taboffset]); |
1267 | 0 | } |
1268 | | |
1269 | | /* Now see if we need to remove any special characters. An option |
1270 | | value of 1 removes vertical space and 2 removes underscore. */ |
1271 | |
|
1272 | 0 | if (tabopt < 0) tabopt = -tabopt; |
1273 | 0 | if (tabopt == 1) pbits.classbits[1] &= ~0x3c; |
1274 | 0 | else if (tabopt == 2) pbits.classbits[11] &= 0x7f; |
1275 | | |
1276 | | /* Add the POSIX table or its complement into the main table that is |
1277 | | being built and we are done. */ |
1278 | |
|
1279 | 0 | { |
1280 | 0 | uint32_t *classwords = cb->classbits.classwords; |
1281 | |
|
1282 | 0 | if (local_negate) |
1283 | 0 | for (int i = 0; i < 8; i++) |
1284 | 0 | classwords[i] |= (uint32_t)(~pbits.classwords[i]); |
1285 | 0 | else |
1286 | 0 | for (int i = 0; i < 8; i++) |
1287 | 0 | classwords[i] |= pbits.classwords[i]; |
1288 | 0 | } |
1289 | |
|
1290 | 0 | #ifdef SUPPORT_WIDE_CHARS |
1291 | | /* Every class contains at least one < 256 character. */ |
1292 | 0 | xclass_props |= XCLASS_HAS_8BIT_CHARS; |
1293 | 0 | #endif |
1294 | 0 | continue; /* End of POSIX handling */ |
1295 | | |
1296 | | /* Other than POSIX classes, the only items we should encounter are |
1297 | | \d-type escapes and literal characters (possibly as ranges). */ |
1298 | 0 | case META_BIGVALUE: |
1299 | 0 | meta = *(pptr++); |
1300 | 0 | break; |
1301 | | |
1302 | 7.02k | case META_ESCAPE: |
1303 | 7.02k | escape = META_DATA(meta); |
1304 | | |
1305 | 7.02k | switch(escape) |
1306 | 7.02k | { |
1307 | 946 | case ESC_d: |
1308 | 31.2k | for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_digit]; |
1309 | 946 | break; |
1310 | | |
1311 | 1.29k | case ESC_D: |
1312 | 1.29k | should_flip_negation = TRUE; |
1313 | 42.7k | for (int i = 0; i < 32; i++) |
1314 | 41.4k | classbits[i] |= (uint8_t)(~cbits[i+cbit_digit]); |
1315 | 1.29k | break; |
1316 | | |
1317 | 198 | case ESC_w: |
1318 | 6.53k | for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_word]; |
1319 | 198 | break; |
1320 | | |
1321 | 1.74k | case ESC_W: |
1322 | 1.74k | should_flip_negation = TRUE; |
1323 | 57.6k | for (int i = 0; i < 32; i++) |
1324 | 55.9k | classbits[i] |= (uint8_t)(~cbits[i+cbit_word]); |
1325 | 1.74k | break; |
1326 | | |
1327 | | /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl |
1328 | | 5.18. Before PCRE 8.34, we had to preserve the VT bit if it was |
1329 | | previously set by something earlier in the character class. |
1330 | | Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so |
1331 | | we could just adjust the appropriate bit. From PCRE 8.34 we no |
1332 | | longer treat \s and \S specially. */ |
1333 | | |
1334 | 42 | case ESC_s: |
1335 | 1.38k | for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_space]; |
1336 | 42 | break; |
1337 | | |
1338 | 72 | case ESC_S: |
1339 | 72 | should_flip_negation = TRUE; |
1340 | 2.37k | for (int i = 0; i < 32; i++) |
1341 | 2.30k | classbits[i] |= (uint8_t)(~cbits[i+cbit_space]); |
1342 | 72 | break; |
1343 | | |
1344 | | /* When adding the horizontal or vertical space lists to a class, or |
1345 | | their complements, disable PCRE2_CASELESS, because it justs wastes |
1346 | | time, and in the "not-x" UTF cases can create unwanted duplicates in |
1347 | | the XCLASS list (provoked by characters that have more than one other |
1348 | | case and by both cases being in the same "not-x" sublist). */ |
1349 | | |
1350 | 4 | case ESC_h: |
1351 | 4 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
1352 | 4 | #ifdef SUPPORT_UNICODE |
1353 | 4 | if (cranges != NULL) break; |
1354 | 4 | #endif |
1355 | 4 | add_list_to_class(options & ~PCRE2_CASELESS, xoptions, |
1356 | 4 | cb, PRIV(hspace_list)); |
1357 | | #else |
1358 | | PCRE2_ASSERT(cranges != NULL); |
1359 | | #endif |
1360 | 4 | break; |
1361 | | |
1362 | 14 | case ESC_H: |
1363 | 14 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
1364 | 14 | #ifdef SUPPORT_UNICODE |
1365 | 14 | if (cranges != NULL) break; |
1366 | 14 | #endif |
1367 | 14 | add_not_list_to_class(options & ~PCRE2_CASELESS, xoptions, |
1368 | 14 | cb, PRIV(hspace_list)); |
1369 | | #else |
1370 | | PCRE2_ASSERT(cranges != NULL); |
1371 | | #endif |
1372 | 14 | break; |
1373 | | |
1374 | 0 | case ESC_v: |
1375 | 0 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
1376 | 0 | #ifdef SUPPORT_UNICODE |
1377 | 0 | if (cranges != NULL) break; |
1378 | 0 | #endif |
1379 | 0 | add_list_to_class(options & ~PCRE2_CASELESS, xoptions, |
1380 | 0 | cb, PRIV(vspace_list)); |
1381 | | #else |
1382 | | PCRE2_ASSERT(cranges != NULL); |
1383 | | #endif |
1384 | 0 | break; |
1385 | | |
1386 | 184 | case ESC_V: |
1387 | 184 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
1388 | 184 | #ifdef SUPPORT_UNICODE |
1389 | 184 | if (cranges != NULL) break; |
1390 | 184 | #endif |
1391 | 184 | add_not_list_to_class(options & ~PCRE2_CASELESS, xoptions, |
1392 | 184 | cb, PRIV(vspace_list)); |
1393 | | #else |
1394 | | PCRE2_ASSERT(cranges != NULL); |
1395 | | #endif |
1396 | 184 | break; |
1397 | | |
1398 | | /* If Unicode is not supported, \P and \p are not allowed and are |
1399 | | faulted at parse time, so will never appear here. */ |
1400 | | |
1401 | 0 | #ifdef SUPPORT_UNICODE |
1402 | 1.83k | case ESC_p: |
1403 | 2.52k | case ESC_P: |
1404 | 2.52k | { |
1405 | 2.52k | uint32_t ptype = *pptr >> 16; |
1406 | 2.52k | uint32_t pdata = *(pptr++) & 0xffff; |
1407 | | |
1408 | | /* The "Any" is processed by PRIV(update_classbits)(). */ |
1409 | 2.52k | if (ptype == PT_ANY) |
1410 | 0 | { |
1411 | 0 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
1412 | 0 | if (!utf && escape == ESC_p) memset(classbits, 0xff, 32); |
1413 | 0 | #endif |
1414 | 0 | continue; |
1415 | 0 | } |
1416 | | |
1417 | 2.52k | PRIV(update_classbits)(ptype, pdata, (escape == ESC_P), classbits); |
1418 | | |
1419 | 2.52k | if ((xclass_props & XCLASS_HIGH_ANY) == 0) |
1420 | 2.52k | { |
1421 | 2.52k | if (lengthptr != NULL) |
1422 | 1.26k | *lengthptr += 3; |
1423 | 1.26k | else |
1424 | 1.26k | { |
1425 | 1.26k | *class_uchardata++ = (escape == ESC_p)? XCL_PROP : XCL_NOTPROP; |
1426 | 1.26k | *class_uchardata++ = ptype; |
1427 | 1.26k | *class_uchardata++ = pdata; |
1428 | 1.26k | } |
1429 | 2.52k | xclass_props |= XCLASS_REQUIRED | XCLASS_HAS_PROPS; |
1430 | 2.52k | } |
1431 | 2.52k | } |
1432 | 0 | continue; |
1433 | 7.02k | #endif |
1434 | 7.02k | } |
1435 | | |
1436 | 4.50k | #ifdef SUPPORT_WIDE_CHARS |
1437 | | /* Every non-property class contains at least one < 256 character. */ |
1438 | 4.50k | xclass_props |= XCLASS_HAS_8BIT_CHARS; |
1439 | 4.50k | #endif |
1440 | | /* End handling \d-type escapes */ |
1441 | 4.50k | continue; |
1442 | | |
1443 | 443k | CLASS_END_CASES(meta) |
1444 | | /* Literals. */ |
1445 | 443k | if (meta < META_END) break; |
1446 | | /* Non-literals: end of class contents. */ |
1447 | 19.7k | goto END_PROCESSING; |
1448 | 450k | } |
1449 | | |
1450 | | /* A literal character may be followed by a range meta. At parse time |
1451 | | there are checks for out-of-order characters, for ranges where the two |
1452 | | characters are equal, and for hyphens that cannot indicate a range. At |
1453 | | this point, therefore, no checking is needed. */ |
1454 | | |
1455 | 424k | c = meta; |
1456 | | |
1457 | | /* Remember if \r or \n were explicitly used */ |
1458 | | |
1459 | 424k | if (c == CHAR_CR || c == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF; |
1460 | | |
1461 | | /* Process a character range */ |
1462 | | |
1463 | 424k | if (*pptr == META_RANGE_LITERAL || *pptr == META_RANGE_ESCAPED) |
1464 | 1.25k | { |
1465 | 1.25k | uint32_t d; |
1466 | | |
1467 | | #ifdef EBCDIC |
1468 | | BOOL range_is_literal = (*pptr == META_RANGE_LITERAL); |
1469 | | #endif |
1470 | 1.25k | ++pptr; |
1471 | 1.25k | d = *(pptr++); |
1472 | 1.25k | if (d == META_BIGVALUE) d = *(pptr++); |
1473 | | |
1474 | | /* Remember an explicit \r or \n, and add the range to the class. */ |
1475 | | |
1476 | 1.25k | if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF; |
1477 | | |
1478 | 1.25k | #if PCRE2_CODE_UNIT_WIDTH == 8 |
1479 | 1.25k | #ifdef SUPPORT_UNICODE |
1480 | 1.25k | if (cranges != NULL) continue; |
1481 | 686 | xclass_props |= XCLASS_HAS_8BIT_CHARS; |
1482 | 686 | #endif |
1483 | | |
1484 | | /* In an EBCDIC environment, Perl treats alphabetic ranges specially |
1485 | | because there are holes in the encoding, and simply using the range |
1486 | | A-Z (for example) would include the characters in the holes. This |
1487 | | applies only to literal ranges; [\xC1-\xE9] is different to [A-Z]. */ |
1488 | | |
1489 | | #ifdef EBCDIC |
1490 | | if (range_is_literal && |
1491 | | (cb->ctypes[c] & ctype_letter) != 0 && |
1492 | | (cb->ctypes[d] & ctype_letter) != 0 && |
1493 | | (c <= CHAR_z) == (d <= CHAR_z)) |
1494 | | { |
1495 | | uint32_t uc = (d <= CHAR_z)? 0 : 64; |
1496 | | uint32_t C = c - uc; |
1497 | | uint32_t D = d - uc; |
1498 | | |
1499 | | if (C <= CHAR_i) |
1500 | | { |
1501 | | add_to_class(options, xoptions, cb, C + uc, |
1502 | | ((D < CHAR_i)? D : CHAR_i) + uc); |
1503 | | C = CHAR_j; |
1504 | | } |
1505 | | |
1506 | | if (C <= D && C <= CHAR_r) |
1507 | | { |
1508 | | add_to_class(options, xoptions, cb, C + uc, |
1509 | | ((D < CHAR_r)? D : CHAR_r) + uc); |
1510 | | C = CHAR_s; |
1511 | | } |
1512 | | |
1513 | | if (C <= D) |
1514 | | add_to_class(options, xoptions, cb, C + uc, D + uc); |
1515 | | } |
1516 | | else |
1517 | | #endif |
1518 | | /* Not an EBCDIC special range */ |
1519 | | |
1520 | 686 | add_to_class(options, xoptions, cb, c, d); |
1521 | | #else |
1522 | | PCRE2_ASSERT(cranges != NULL); |
1523 | | #endif |
1524 | 686 | continue; |
1525 | 1.25k | } /* End of range handling */ |
1526 | | |
1527 | | /* Character ranges are ignored when class_ranges is present. */ |
1528 | 422k | #if PCRE2_CODE_UNIT_WIDTH == 8 |
1529 | 422k | #ifdef SUPPORT_UNICODE |
1530 | 422k | if (cranges != NULL) continue; |
1531 | 371k | xclass_props |= XCLASS_HAS_8BIT_CHARS; |
1532 | 371k | #endif |
1533 | | /* Handle a single character. */ |
1534 | | |
1535 | 371k | add_to_class(options, xoptions, cb, meta, meta); |
1536 | | #else |
1537 | | PCRE2_ASSERT(cranges != NULL); |
1538 | | #endif |
1539 | 371k | } /* End of main class-processing loop */ |
1540 | | |
1541 | 19.7k | END_PROCESSING: |
1542 | | |
1543 | 19.7k | #ifdef SUPPORT_WIDE_CHARS |
1544 | 19.7k | PCRE2_ASSERT((xclass_props & XCLASS_HAS_PROPS) == 0 || |
1545 | 19.7k | (xclass_props & XCLASS_HIGH_ANY) == 0); |
1546 | | |
1547 | 19.7k | if (cranges != NULL) |
1548 | 3.39k | { |
1549 | 3.39k | uint32_t *range = (uint32_t*)(cranges + 1); |
1550 | 3.39k | uint32_t *end = range + cranges->range_list_size; |
1551 | | |
1552 | 29.9k | while (range < end && range[0] < 256) |
1553 | 26.5k | { |
1554 | 26.5k | PCRE2_ASSERT((xclass_props & XCLASS_HAS_8BIT_CHARS) != 0); |
1555 | | /* Add range to bitset. If we are in UTF or UCP mode, then clear the |
1556 | | caseless bit, because the cranges handle caselessness (only) in this |
1557 | | condition; see the condition for PARSE_CLASS_CASELESS_UTF in |
1558 | | compile_optimize_class(). */ |
1559 | 26.5k | add_to_class(((options & (PCRE2_UTF|PCRE2_UCP)) != 0)? |
1560 | 26.5k | (options & ~PCRE2_CASELESS) : options, xoptions, cb, range[0], range[1]); |
1561 | | |
1562 | 26.5k | if (range[1] > 255) break; |
1563 | 26.5k | range += 2; |
1564 | 26.5k | } |
1565 | | |
1566 | 3.39k | if (cranges->char_lists_size > 0) |
1567 | 0 | { |
1568 | | /* The cranges structure is still used and freed later. */ |
1569 | 0 | PCRE2_ASSERT((xclass_props & XCLASS_HIGH_ANY) == 0); |
1570 | 0 | xclass_props |= XCLASS_REQUIRED | XCLASS_HAS_CHAR_LISTS; |
1571 | 0 | } |
1572 | 3.39k | else |
1573 | 3.39k | { |
1574 | 3.39k | if ((xclass_props & XCLASS_HIGH_ANY) != 0) |
1575 | 0 | { |
1576 | 0 | PCRE2_ASSERT(range + 2 == end && range[0] <= 256 && |
1577 | 0 | range[1] >= GET_MAX_CHAR_VALUE(utf)); |
1578 | 0 | should_flip_negation = TRUE; |
1579 | 0 | range = end; |
1580 | 0 | } |
1581 | | |
1582 | 5.45k | while (range < end) |
1583 | 2.06k | { |
1584 | 2.06k | uint32_t range_start = range[0]; |
1585 | 2.06k | uint32_t range_end = range[1]; |
1586 | | |
1587 | 2.06k | range += 2; |
1588 | 2.06k | xclass_props |= XCLASS_REQUIRED; |
1589 | | |
1590 | 2.06k | if (range_start < 256) range_start = 256; |
1591 | | |
1592 | 2.06k | if (lengthptr != NULL) |
1593 | 1.03k | { |
1594 | 1.03k | #ifdef SUPPORT_UNICODE |
1595 | 1.03k | if (utf) |
1596 | 1.03k | { |
1597 | 1.03k | *lengthptr += 1; |
1598 | | |
1599 | 1.03k | if (range_start < range_end) |
1600 | 1 | *lengthptr += PRIV(ord2utf)(range_start, class_uchardata); |
1601 | | |
1602 | 1.03k | *lengthptr += PRIV(ord2utf)(range_end, class_uchardata); |
1603 | 1.03k | continue; |
1604 | 1.03k | } |
1605 | 0 | #endif /* SUPPORT_UNICODE */ |
1606 | | |
1607 | 0 | *lengthptr += range_start < range_end ? 3 : 2; |
1608 | 0 | continue; |
1609 | 1.03k | } |
1610 | | |
1611 | 1.03k | #ifdef SUPPORT_UNICODE |
1612 | 1.03k | if (utf) |
1613 | 1.03k | { |
1614 | 1.03k | if (range_start < range_end) |
1615 | 1 | { |
1616 | 1 | *class_uchardata++ = XCL_RANGE; |
1617 | 1 | class_uchardata += PRIV(ord2utf)(range_start, class_uchardata); |
1618 | 1 | } |
1619 | 1.02k | else |
1620 | 1.02k | *class_uchardata++ = XCL_SINGLE; |
1621 | | |
1622 | 1.03k | class_uchardata += PRIV(ord2utf)(range_end, class_uchardata); |
1623 | 1.03k | continue; |
1624 | 1.03k | } |
1625 | 1.03k | #endif /* SUPPORT_UNICODE */ |
1626 | | |
1627 | | /* Without UTF support, character values are constrained |
1628 | | by the bit length, and can only be > 256 for 16-bit and |
1629 | | 32-bit libraries. */ |
1630 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
1631 | | if (range_start < range_end) |
1632 | | { |
1633 | | *class_uchardata++ = XCL_RANGE; |
1634 | | *class_uchardata++ = range_start; |
1635 | | } |
1636 | | else |
1637 | | *class_uchardata++ = XCL_SINGLE; |
1638 | | |
1639 | | *class_uchardata++ = range_end; |
1640 | | #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */ |
1641 | 1.03k | } |
1642 | | |
1643 | 3.39k | if (lengthptr == NULL) |
1644 | 1.67k | cb->cx->memctl.free(cranges, cb->cx->memctl.memory_data); |
1645 | 3.39k | } |
1646 | 3.39k | } |
1647 | 19.7k | #endif /* SUPPORT_WIDE_CHARS */ |
1648 | | |
1649 | | /* If there are characters with values > 255, or Unicode property settings |
1650 | | (\p or \P), we have to compile an extended class, with its own opcode, |
1651 | | unless there were no property settings and there was a negated special such |
1652 | | as \S in the class, and PCRE2_UCP is not set, because in that case all |
1653 | | characters > 255 are in or not in the class, so any that were explicitly |
1654 | | given as well can be ignored. |
1655 | | |
1656 | | In the UCP case, if certain negated POSIX classes (ex: [:^ascii:]) were |
1657 | | were present in a class, we either have to match or not match all wide |
1658 | | characters (depending on whether the whole class is or is not negated). |
1659 | | This requirement is indicated by match_all_or_no_wide_chars being true. |
1660 | | We do this by including an explicit range, which works in both cases. |
1661 | | This applies only in UTF and 16-bit and 32-bit non-UTF modes, since there |
1662 | | cannot be any wide characters in 8-bit non-UTF mode. |
1663 | | |
1664 | | When there *are* properties in a positive UTF-8 or any 16-bit or 32_bit |
1665 | | class where \S etc is present without PCRE2_UCP, causing an extended class |
1666 | | to be compiled, we make sure that all characters > 255 are included by |
1667 | | forcing match_all_or_no_wide_chars to be true. |
1668 | | |
1669 | | If, when generating an xclass, there are no characters < 256, we can omit |
1670 | | the bitmap in the actual compiled code. */ |
1671 | | |
1672 | 19.7k | #ifdef SUPPORT_WIDE_CHARS /* Defined for 16/32 bits, or 8-bit with Unicode */ |
1673 | 19.7k | if ((xclass_props & XCLASS_REQUIRED) != 0) |
1674 | 2.81k | { |
1675 | 2.81k | PCRE2_UCHAR *previous = code; |
1676 | | |
1677 | 2.81k | if ((xclass_props & XCLASS_HAS_CHAR_LISTS) == 0) |
1678 | 2.81k | *class_uchardata++ = XCL_END; /* Marks the end of extra data */ |
1679 | 2.81k | *code++ = OP_XCLASS; |
1680 | 2.81k | code += LINK_SIZE; |
1681 | 2.81k | *code = negate_class? XCL_NOT:0; |
1682 | 2.81k | if ((xclass_props & XCLASS_HAS_PROPS) != 0) *code |= XCL_HASPROP; |
1683 | | |
1684 | | /* If the map is required, move up the extra data to make room for it; |
1685 | | otherwise just move the code pointer to the end of the extra data. */ |
1686 | | |
1687 | 2.81k | if ((xclass_props & XCLASS_HAS_8BIT_CHARS) != 0 || has_bitmap != NULL) |
1688 | 2.81k | { |
1689 | 2.81k | if (negate_class) |
1690 | 1.38k | { |
1691 | 1.38k | uint32_t *classwords = cb->classbits.classwords; |
1692 | 12.4k | for (int i = 0; i < 8; i++) classwords[i] = ~classwords[i]; |
1693 | 1.38k | } |
1694 | | |
1695 | 2.81k | if (has_bitmap == NULL) |
1696 | 2.81k | { |
1697 | 2.81k | *code++ |= XCL_MAP; |
1698 | 2.81k | (void)memmove(code + (32 / sizeof(PCRE2_UCHAR)), code, |
1699 | 2.81k | CU2BYTES(class_uchardata - code)); |
1700 | 2.81k | memcpy(code, classbits, 32); |
1701 | 2.81k | code = class_uchardata + (32 / sizeof(PCRE2_UCHAR)); |
1702 | 2.81k | } |
1703 | 0 | else |
1704 | 0 | { |
1705 | 0 | code = class_uchardata; |
1706 | 0 | if ((xclass_props & XCLASS_HAS_8BIT_CHARS) != 0) |
1707 | 0 | *has_bitmap = TRUE; |
1708 | 0 | } |
1709 | 2.81k | } |
1710 | 0 | else code = class_uchardata; |
1711 | | |
1712 | 2.81k | if ((xclass_props & XCLASS_HAS_CHAR_LISTS) != 0) |
1713 | 0 | { |
1714 | | /* Char lists size is an even number, because all items are 16 or 32 |
1715 | | bit values. The character list data is always aligned to 32 bits. */ |
1716 | 0 | size_t char_lists_size = cranges->char_lists_size; |
1717 | 0 | PCRE2_ASSERT((char_lists_size & 0x1) == 0 && |
1718 | 0 | (cb->char_lists_size & 0x3) == 0); |
1719 | |
|
1720 | 0 | if (lengthptr != NULL) |
1721 | 0 | { |
1722 | 0 | char_lists_size = CLIST_ALIGN_TO(char_lists_size, sizeof(uint32_t)); |
1723 | |
|
1724 | 0 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
1725 | 0 | *lengthptr += 2 + LINK_SIZE; |
1726 | | #else |
1727 | | *lengthptr += 1 + LINK_SIZE; |
1728 | | #endif |
1729 | |
|
1730 | 0 | cb->char_lists_size += char_lists_size; |
1731 | |
|
1732 | 0 | char_lists_size /= sizeof(PCRE2_UCHAR); |
1733 | | |
1734 | | /* Storage space for character lists is included |
1735 | | in the maximum pattern size. */ |
1736 | 0 | if (*lengthptr > MAX_PATTERN_SIZE || |
1737 | 0 | MAX_PATTERN_SIZE - *lengthptr < char_lists_size) |
1738 | 0 | { |
1739 | 0 | *errorcodeptr = ERR20; /* Pattern is too large */ |
1740 | 0 | return NULL; |
1741 | 0 | } |
1742 | 0 | } |
1743 | 0 | else |
1744 | 0 | { |
1745 | 0 | uint8_t *data; |
1746 | |
|
1747 | 0 | PCRE2_ASSERT(cranges->char_lists_types <= XCL_TYPE_MASK); |
1748 | 0 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
1749 | | /* Encode as high / low bytes. */ |
1750 | 0 | code[0] = (uint8_t)(XCL_LIST | |
1751 | 0 | (cranges->char_lists_types >> 8)); |
1752 | 0 | code[1] = (uint8_t)cranges->char_lists_types; |
1753 | 0 | code += 2; |
1754 | | #else |
1755 | | *code++ = (PCRE2_UCHAR)(XCL_LIST | cranges->char_lists_types); |
1756 | | #endif |
1757 | | |
1758 | | /* Character lists are stored in backwards direction from |
1759 | | byte code start. The non-dfa/dfa matchers can access these |
1760 | | lists using the byte code start stored in match blocks. |
1761 | | Each list is aligned to 32 bit with an optional unused |
1762 | | 16 bit value at the beginning of the character list. */ |
1763 | |
|
1764 | 0 | cb->char_lists_size += char_lists_size; |
1765 | 0 | data = (uint8_t*)cb->start_code - cb->char_lists_size; |
1766 | |
|
1767 | 0 | memcpy(data, (uint8_t*)(cranges + 1) + cranges->char_lists_start, |
1768 | 0 | char_lists_size); |
1769 | | |
1770 | | /* Since character lists total size is less than MAX_PATTERN_SIZE, |
1771 | | their starting offset fits into a value which size is LINK_SIZE. */ |
1772 | |
|
1773 | 0 | char_lists_size = cb->char_lists_size; |
1774 | 0 | PUT(code, 0, (uint32_t)(char_lists_size >> 1)); |
1775 | 0 | code += LINK_SIZE; |
1776 | |
|
1777 | | #if defined PCRE2_DEBUG || defined SUPPORT_VALGRIND |
1778 | | if ((char_lists_size & 0x2) != 0) |
1779 | | { |
1780 | | /* In debug the unused 16 bit value is set |
1781 | | to a fixed value and marked unused. */ |
1782 | | ((uint16_t*)data)[-1] = 0x5555; |
1783 | | #ifdef SUPPORT_VALGRIND |
1784 | | VALGRIND_MAKE_MEM_NOACCESS(data - 2, 2); |
1785 | | #endif |
1786 | | } |
1787 | | #endif |
1788 | |
|
1789 | 0 | cb->char_lists_size = |
1790 | 0 | CLIST_ALIGN_TO(char_lists_size, sizeof(uint32_t)); |
1791 | |
|
1792 | 0 | cb->cx->memctl.free(cranges, cb->cx->memctl.memory_data); |
1793 | 0 | } |
1794 | 0 | } |
1795 | | |
1796 | | /* Now fill in the complete length of the item */ |
1797 | | |
1798 | 2.81k | PUT(previous, 1, (int)(code - previous)); |
1799 | 2.81k | goto DONE; /* End of class handling */ |
1800 | 2.81k | } |
1801 | 16.9k | #endif /* SUPPORT_WIDE_CHARS */ |
1802 | | |
1803 | | /* If there are no characters > 255, or they are all to be included or |
1804 | | excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the |
1805 | | whole class was negated and whether there were negative specials such as \S |
1806 | | (non-UCP) in the class. Then copy the 32-byte map into the code vector, |
1807 | | negating it if necessary. */ |
1808 | | |
1809 | 16.9k | if (negate_class) |
1810 | 5.03k | { |
1811 | 5.03k | uint32_t *classwords = cb->classbits.classwords; |
1812 | | |
1813 | 45.2k | for (int i = 0; i < 8; i++) classwords[i] = ~classwords[i]; |
1814 | 5.03k | } |
1815 | | |
1816 | 16.9k | if ((SELECT_VALUE8(!utf, 0) || negate_class != should_flip_negation) && |
1817 | 16.9k | cb->classbits.classwords[0] == ~(uint32_t)0) |
1818 | 1.45k | { |
1819 | 1.45k | const uint32_t *classwords = cb->classbits.classwords; |
1820 | 1.45k | int i; |
1821 | | |
1822 | 4.28k | for (i = 0; i < 8; i++) |
1823 | 4.11k | if (classwords[i] != ~(uint32_t)0) break; |
1824 | | |
1825 | 1.45k | if (i == 8) |
1826 | 166 | { |
1827 | 166 | *code++ = OP_ALLANY; |
1828 | 166 | goto DONE; /* End of class handling */ |
1829 | 166 | } |
1830 | 1.45k | } |
1831 | | |
1832 | 16.7k | *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS; |
1833 | 16.7k | memcpy(code, classbits, 32); |
1834 | 16.7k | code += 32 / sizeof(PCRE2_UCHAR); |
1835 | | |
1836 | 19.7k | DONE: |
1837 | 19.7k | *pcode = code; |
1838 | 19.7k | return pptr - 1; |
1839 | 16.7k | } |
1840 | | |
1841 | | |
1842 | | |
1843 | | /* ===================================================================*/ |
1844 | | /* Here follows a block of ECLASS-compiling functions. You may well want to |
1845 | | read them from top to bottom; they are ordered from leafmost (at the top) to |
1846 | | outermost parser (at the bottom of the file). */ |
1847 | | |
1848 | | /* This function folds one operand using the negation operator. |
1849 | | The new, combined chunk of stack code is written out to *pop_info. */ |
1850 | | |
1851 | | static void |
1852 | | fold_negation(eclass_op_info *pop_info, PCRE2_SIZE *lengthptr, |
1853 | | BOOL preserve_classbits) |
1854 | 0 | { |
1855 | | /* If the chunk of stack code is already composed of multiple ops, we won't |
1856 | | descend in and try and propagate the negation down the tree. (That would lead |
1857 | | to O(n^2) compile-time, which could be exploitable with a malicious regex - |
1858 | | although maybe that's not really too much of a worry in a library that offers |
1859 | | an exponential-time matching function!) */ |
1860 | |
|
1861 | 0 | if (pop_info->op_single_type == 0) |
1862 | 0 | { |
1863 | 0 | if (lengthptr != NULL) |
1864 | 0 | *lengthptr += 1; |
1865 | 0 | else |
1866 | 0 | pop_info->code_start[pop_info->length] = ECL_NOT; |
1867 | 0 | pop_info->length += 1; |
1868 | 0 | } |
1869 | | |
1870 | | /* Otherwise, it's a nice single-op item, so we can easily fold in the negation |
1871 | | without needing to produce an ECL_NOT. */ |
1872 | | |
1873 | 0 | else if (pop_info->op_single_type == ECL_ANY || |
1874 | 0 | pop_info->op_single_type == ECL_NONE) |
1875 | 0 | { |
1876 | 0 | pop_info->op_single_type = (pop_info->op_single_type == ECL_NONE)? |
1877 | 0 | ECL_ANY : ECL_NONE; |
1878 | 0 | if (lengthptr == NULL) |
1879 | 0 | *(pop_info->code_start) = pop_info->op_single_type; |
1880 | 0 | } |
1881 | 0 | else |
1882 | 0 | { |
1883 | 0 | PCRE2_ASSERT(pop_info->op_single_type == ECL_XCLASS && |
1884 | 0 | pop_info->length >= 1 + LINK_SIZE + 1); |
1885 | 0 | if (lengthptr == NULL) |
1886 | 0 | pop_info->code_start[1 + LINK_SIZE] ^= XCL_NOT; |
1887 | 0 | } |
1888 | |
|
1889 | 0 | if (!preserve_classbits) |
1890 | 0 | { |
1891 | 0 | for (int i = 0; i < 8; i++) |
1892 | 0 | pop_info->bits.classwords[i] = ~pop_info->bits.classwords[i]; |
1893 | 0 | } |
1894 | 0 | } |
1895 | | |
1896 | | |
1897 | | |
1898 | | /* This function folds together two operands using a binary operator. |
1899 | | The new, combined chunk of stack code is written out to *lhs_op_info. */ |
1900 | | |
1901 | | static void |
1902 | | fold_binary(int op, eclass_op_info *lhs_op_info, eclass_op_info *rhs_op_info, |
1903 | | PCRE2_SIZE *lengthptr) |
1904 | 0 | { |
1905 | 0 | switch (op) |
1906 | 0 | { |
1907 | | /* ECL_AND truth table: |
1908 | | |
1909 | | LHS RHS RESULT |
1910 | | ---------------- |
1911 | | ANY * RHS |
1912 | | * ANY LHS |
1913 | | NONE * NONE |
1914 | | * NONE NONE |
1915 | | X Y X & Y |
1916 | | */ |
1917 | | |
1918 | 0 | case ECL_AND: |
1919 | 0 | if (rhs_op_info->op_single_type == ECL_ANY) |
1920 | 0 | { |
1921 | | /* no-op: drop the RHS */ |
1922 | 0 | } |
1923 | 0 | else if (lhs_op_info->op_single_type == ECL_ANY) |
1924 | 0 | { |
1925 | | /* no-op: drop the LHS, and memmove the RHS into its place */ |
1926 | 0 | if (lengthptr == NULL) |
1927 | 0 | memmove(lhs_op_info->code_start, rhs_op_info->code_start, |
1928 | 0 | CU2BYTES(rhs_op_info->length)); |
1929 | 0 | lhs_op_info->length = rhs_op_info->length; |
1930 | 0 | lhs_op_info->op_single_type = rhs_op_info->op_single_type; |
1931 | 0 | } |
1932 | 0 | else if (rhs_op_info->op_single_type == ECL_NONE) |
1933 | 0 | { |
1934 | | /* the result is ECL_NONE: write into the LHS */ |
1935 | 0 | if (lengthptr == NULL) |
1936 | 0 | lhs_op_info->code_start[0] = ECL_NONE; |
1937 | 0 | lhs_op_info->length = 1; |
1938 | 0 | lhs_op_info->op_single_type = ECL_NONE; |
1939 | 0 | } |
1940 | 0 | else if (lhs_op_info->op_single_type == ECL_NONE) |
1941 | 0 | { |
1942 | | /* the result is ECL_NONE: drop the RHS */ |
1943 | 0 | } |
1944 | 0 | else |
1945 | 0 | { |
1946 | | /* Both of LHS & RHS are either ECL_XCLASS, or compound operations. */ |
1947 | 0 | if (lengthptr != NULL) |
1948 | 0 | *lengthptr += 1; |
1949 | 0 | else |
1950 | 0 | { |
1951 | 0 | PCRE2_ASSERT(rhs_op_info->code_start == |
1952 | 0 | lhs_op_info->code_start + lhs_op_info->length); |
1953 | 0 | rhs_op_info->code_start[rhs_op_info->length] = ECL_AND; |
1954 | 0 | } |
1955 | 0 | lhs_op_info->length += rhs_op_info->length + 1; |
1956 | 0 | lhs_op_info->op_single_type = 0; |
1957 | 0 | } |
1958 | |
|
1959 | 0 | for (int i = 0; i < 8; i++) |
1960 | 0 | lhs_op_info->bits.classwords[i] &= rhs_op_info->bits.classwords[i]; |
1961 | 0 | break; |
1962 | | |
1963 | | /* ECL_OR truth table: |
1964 | | |
1965 | | LHS RHS RESULT |
1966 | | ---------------- |
1967 | | ANY * ANY |
1968 | | * ANY ANY |
1969 | | NONE * RHS |
1970 | | * NONE LHS |
1971 | | X Y X | Y |
1972 | | */ |
1973 | | |
1974 | 0 | case ECL_OR: |
1975 | 0 | if (rhs_op_info->op_single_type == ECL_NONE) |
1976 | 0 | { |
1977 | | /* no-op: drop the RHS */ |
1978 | 0 | } |
1979 | 0 | else if (lhs_op_info->op_single_type == ECL_NONE) |
1980 | 0 | { |
1981 | | /* no-op: drop the LHS, and memmove the RHS into its place */ |
1982 | 0 | if (lengthptr == NULL) |
1983 | 0 | memmove(lhs_op_info->code_start, rhs_op_info->code_start, |
1984 | 0 | CU2BYTES(rhs_op_info->length)); |
1985 | 0 | lhs_op_info->length = rhs_op_info->length; |
1986 | 0 | lhs_op_info->op_single_type = rhs_op_info->op_single_type; |
1987 | 0 | } |
1988 | 0 | else if (rhs_op_info->op_single_type == ECL_ANY) |
1989 | 0 | { |
1990 | | /* the result is ECL_ANY: write into the LHS */ |
1991 | 0 | if (lengthptr == NULL) |
1992 | 0 | lhs_op_info->code_start[0] = ECL_ANY; |
1993 | 0 | lhs_op_info->length = 1; |
1994 | 0 | lhs_op_info->op_single_type = ECL_ANY; |
1995 | 0 | } |
1996 | 0 | else if (lhs_op_info->op_single_type == ECL_ANY) |
1997 | 0 | { |
1998 | | /* the result is ECL_ANY: drop the RHS */ |
1999 | 0 | } |
2000 | 0 | else |
2001 | 0 | { |
2002 | | /* Both of LHS & RHS are either ECL_XCLASS, or compound operations. */ |
2003 | 0 | if (lengthptr != NULL) |
2004 | 0 | *lengthptr += 1; |
2005 | 0 | else |
2006 | 0 | { |
2007 | 0 | PCRE2_ASSERT(rhs_op_info->code_start == |
2008 | 0 | lhs_op_info->code_start + lhs_op_info->length); |
2009 | 0 | rhs_op_info->code_start[rhs_op_info->length] = ECL_OR; |
2010 | 0 | } |
2011 | 0 | lhs_op_info->length += rhs_op_info->length + 1; |
2012 | 0 | lhs_op_info->op_single_type = 0; |
2013 | 0 | } |
2014 | |
|
2015 | 0 | for (int i = 0; i < 8; i++) |
2016 | 0 | lhs_op_info->bits.classwords[i] |= rhs_op_info->bits.classwords[i]; |
2017 | 0 | break; |
2018 | | |
2019 | | /* ECL_XOR truth table: |
2020 | | |
2021 | | LHS RHS RESULT |
2022 | | ---------------- |
2023 | | ANY * !RHS |
2024 | | * ANY !LHS |
2025 | | NONE * RHS |
2026 | | * NONE LHS |
2027 | | X Y X ^ Y |
2028 | | */ |
2029 | | |
2030 | 0 | case ECL_XOR: |
2031 | 0 | if (rhs_op_info->op_single_type == ECL_NONE) |
2032 | 0 | { |
2033 | | /* no-op: drop the RHS */ |
2034 | 0 | } |
2035 | 0 | else if (lhs_op_info->op_single_type == ECL_NONE) |
2036 | 0 | { |
2037 | | /* no-op: drop the LHS, and memmove the RHS into its place */ |
2038 | 0 | if (lengthptr == NULL) |
2039 | 0 | memmove(lhs_op_info->code_start, rhs_op_info->code_start, |
2040 | 0 | CU2BYTES(rhs_op_info->length)); |
2041 | 0 | lhs_op_info->length = rhs_op_info->length; |
2042 | 0 | lhs_op_info->op_single_type = rhs_op_info->op_single_type; |
2043 | 0 | } |
2044 | 0 | else if (rhs_op_info->op_single_type == ECL_ANY) |
2045 | 0 | { |
2046 | | /* the result is !LHS: fold in the negation, and drop the RHS */ |
2047 | | /* Preserve the classbits, because we promise to deal with them later. */ |
2048 | 0 | fold_negation(lhs_op_info, lengthptr, TRUE); |
2049 | 0 | } |
2050 | 0 | else if (lhs_op_info->op_single_type == ECL_ANY) |
2051 | 0 | { |
2052 | | /* the result is !RHS: drop the LHS, memmove the RHS into its place, and |
2053 | | fold in the negation */ |
2054 | 0 | if (lengthptr == NULL) |
2055 | 0 | memmove(lhs_op_info->code_start, rhs_op_info->code_start, |
2056 | 0 | CU2BYTES(rhs_op_info->length)); |
2057 | 0 | lhs_op_info->length = rhs_op_info->length; |
2058 | 0 | lhs_op_info->op_single_type = rhs_op_info->op_single_type; |
2059 | | |
2060 | | /* Preserve the classbits, because we promise to deal with them later. */ |
2061 | 0 | fold_negation(lhs_op_info, lengthptr, TRUE); |
2062 | 0 | } |
2063 | 0 | else |
2064 | 0 | { |
2065 | | /* Both of LHS & RHS are either ECL_XCLASS, or compound operations. */ |
2066 | 0 | if (lengthptr != NULL) |
2067 | 0 | *lengthptr += 1; |
2068 | 0 | else |
2069 | 0 | { |
2070 | 0 | PCRE2_ASSERT(rhs_op_info->code_start == |
2071 | 0 | lhs_op_info->code_start + lhs_op_info->length); |
2072 | 0 | rhs_op_info->code_start[rhs_op_info->length] = ECL_XOR; |
2073 | 0 | } |
2074 | 0 | lhs_op_info->length += rhs_op_info->length + 1; |
2075 | 0 | lhs_op_info->op_single_type = 0; |
2076 | 0 | } |
2077 | |
|
2078 | 0 | for (int i = 0; i < 8; i++) |
2079 | 0 | lhs_op_info->bits.classwords[i] ^= rhs_op_info->bits.classwords[i]; |
2080 | 0 | break; |
2081 | | |
2082 | 0 | default: |
2083 | 0 | PCRE2_DEBUG_UNREACHABLE(); |
2084 | 0 | break; |
2085 | 0 | } |
2086 | 0 | } |
2087 | | |
2088 | | |
2089 | | |
2090 | | static BOOL |
2091 | | compile_eclass_nested(eclass_context *context, BOOL negated, |
2092 | | uint32_t **pptr, PCRE2_UCHAR **pcode, |
2093 | | eclass_op_info *pop_info, PCRE2_SIZE *lengthptr); |
2094 | | |
2095 | | /* This function consumes a group of implicitly-unioned class elements. |
2096 | | These can be characters, ranges, properties, or nested classes, as long |
2097 | | as they are all joined by being placed adjacently. */ |
2098 | | |
2099 | | static BOOL |
2100 | | compile_class_operand(eclass_context *context, BOOL negated, |
2101 | | uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info, |
2102 | | PCRE2_SIZE *lengthptr) |
2103 | 0 | { |
2104 | 0 | uint32_t *ptr = *pptr; |
2105 | 0 | uint32_t *prev_ptr; |
2106 | 0 | PCRE2_UCHAR *code = *pcode; |
2107 | 0 | PCRE2_UCHAR *code_start = code; |
2108 | 0 | PCRE2_SIZE prev_length = (lengthptr != NULL)? *lengthptr : 0; |
2109 | 0 | PCRE2_SIZE extra_length; |
2110 | 0 | uint32_t meta = META_CODE(*ptr); |
2111 | |
|
2112 | 0 | switch (meta) |
2113 | 0 | { |
2114 | 0 | case META_CLASS_EMPTY_NOT: |
2115 | 0 | case META_CLASS_EMPTY: |
2116 | 0 | ++ptr; |
2117 | 0 | pop_info->length = 1; |
2118 | 0 | if ((meta == META_CLASS_EMPTY) == negated) |
2119 | 0 | { |
2120 | 0 | *code++ = pop_info->op_single_type = ECL_ANY; |
2121 | 0 | memset(pop_info->bits.classbits, 0xff, 32); |
2122 | 0 | } |
2123 | 0 | else |
2124 | 0 | { |
2125 | 0 | *code++ = pop_info->op_single_type = ECL_NONE; |
2126 | 0 | memset(pop_info->bits.classbits, 0, 32); |
2127 | 0 | } |
2128 | 0 | break; |
2129 | | |
2130 | 0 | case META_CLASS: |
2131 | 0 | case META_CLASS_NOT: |
2132 | 0 | if ((*ptr & CLASS_IS_ECLASS) != 0) |
2133 | 0 | { |
2134 | 0 | if (!compile_eclass_nested(context, negated, &ptr, &code, |
2135 | 0 | pop_info, lengthptr)) |
2136 | 0 | return FALSE; |
2137 | | |
2138 | 0 | PCRE2_ASSERT(*ptr == META_CLASS_END); |
2139 | 0 | ptr++; |
2140 | 0 | goto DONE; |
2141 | 0 | } |
2142 | | |
2143 | 0 | ptr++; |
2144 | | /* Fall through */ |
2145 | |
|
2146 | 0 | default: |
2147 | | /* Scan forward characters, ranges, and properties. |
2148 | | For example: inside [a-z_ -- m] we don't have brackets around "a-z_" but |
2149 | | we still need to collect that fragment up into a "leaf" OP_CLASS. */ |
2150 | |
|
2151 | 0 | prev_ptr = ptr; |
2152 | 0 | ptr = PRIV(compile_class_not_nested)( |
2153 | 0 | context->options, context->xoptions, ptr, &code, |
2154 | 0 | (meta != META_CLASS_NOT) == negated, &context->needs_bitmap, |
2155 | 0 | context->errorcodeptr, context->cb, lengthptr); |
2156 | 0 | if (ptr == NULL) return FALSE; |
2157 | | |
2158 | | /* We must have a 100% guarantee that ptr increases when |
2159 | | compile_class_operand() returns, even on Release builds, so that we can |
2160 | | statically prove our loops terminate. */ |
2161 | 0 | if (ptr <= prev_ptr) |
2162 | 0 | { |
2163 | 0 | PCRE2_DEBUG_UNREACHABLE(); |
2164 | 0 | return FALSE; |
2165 | 0 | } |
2166 | | |
2167 | | /* If we fell through above, consume the closing ']'. */ |
2168 | 0 | if (meta == META_CLASS || meta == META_CLASS_NOT) |
2169 | 0 | { |
2170 | 0 | PCRE2_ASSERT(*ptr == META_CLASS_END); |
2171 | 0 | ptr++; |
2172 | 0 | } |
2173 | | |
2174 | | /* Regardless of whether (lengthptr == NULL), some data will still be written |
2175 | | out to *pcode, which we need: we have to peek at it, to transform the opcode |
2176 | | into the ECLASS version (since we need to hoist up the bitmaps). */ |
2177 | 0 | PCRE2_ASSERT(code > code_start); |
2178 | 0 | extra_length = (lengthptr != NULL)? *lengthptr - prev_length : 0; |
2179 | | |
2180 | | /* Easiest case: convert OP_ALLANY to ECL_ANY */ |
2181 | |
|
2182 | 0 | if (*code_start == OP_ALLANY) |
2183 | 0 | { |
2184 | 0 | PCRE2_ASSERT(code - code_start == 1 && extra_length == 0); |
2185 | 0 | pop_info->length = 1; |
2186 | 0 | *code_start = pop_info->op_single_type = ECL_ANY; |
2187 | 0 | memset(pop_info->bits.classbits, 0xff, 32); |
2188 | 0 | } |
2189 | | |
2190 | | /* For OP_CLASS and OP_NCLASS, we hoist out the bitmap and convert to |
2191 | | ECL_NONE / ECL_ANY respectively. */ |
2192 | | |
2193 | 0 | else if (*code_start == OP_CLASS || *code_start == OP_NCLASS) |
2194 | 0 | { |
2195 | 0 | PCRE2_ASSERT(code - code_start == 1 + 32 / sizeof(PCRE2_UCHAR) && |
2196 | 0 | extra_length == 0); |
2197 | 0 | pop_info->length = 1; |
2198 | 0 | *code_start = pop_info->op_single_type = |
2199 | 0 | (*code_start == OP_CLASS)? ECL_NONE : ECL_ANY; |
2200 | 0 | memcpy(pop_info->bits.classbits, code_start + 1, 32); |
2201 | | /* Rewind the code pointer, but make sure we adjust *lengthptr, because we |
2202 | | do need to reserve that space (even though we only use it temporarily). */ |
2203 | 0 | if (lengthptr != NULL) |
2204 | 0 | *lengthptr += code - (code_start + 1); |
2205 | 0 | code = code_start + 1; |
2206 | |
|
2207 | 0 | if (!context->needs_bitmap && *code_start == ECL_NONE) |
2208 | 0 | { |
2209 | 0 | uint32_t *classwords = pop_info->bits.classwords; |
2210 | |
|
2211 | 0 | for (int i = 0; i < 8; i++) |
2212 | 0 | if (classwords[i] != 0) |
2213 | 0 | { |
2214 | 0 | context->needs_bitmap = TRUE; |
2215 | 0 | break; |
2216 | 0 | } |
2217 | 0 | } |
2218 | 0 | else |
2219 | 0 | context->needs_bitmap = TRUE; |
2220 | 0 | } |
2221 | | |
2222 | | /* Finally, for OP_XCLASS we hoist out the bitmap (if any), and convert to |
2223 | | ECL_XCLASS. */ |
2224 | | |
2225 | 0 | else |
2226 | 0 | { |
2227 | 0 | PCRE2_ASSERT(*code_start == OP_XCLASS); |
2228 | 0 | *code_start = pop_info->op_single_type = ECL_XCLASS; |
2229 | |
|
2230 | 0 | PCRE2_ASSERT(code - code_start >= 1 + LINK_SIZE + 1); |
2231 | |
|
2232 | 0 | memcpy(pop_info->bits.classbits, context->cb->classbits.classbits, 32); |
2233 | 0 | pop_info->length = (code - code_start) + extra_length; |
2234 | 0 | } |
2235 | |
|
2236 | 0 | break; |
2237 | 0 | } /* End of switch(meta) */ |
2238 | | |
2239 | 0 | pop_info->code_start = (lengthptr == NULL)? code_start : NULL; |
2240 | |
|
2241 | 0 | if (lengthptr != NULL) |
2242 | 0 | { |
2243 | 0 | *lengthptr += code - code_start; |
2244 | 0 | code = code_start; |
2245 | 0 | } |
2246 | |
|
2247 | 0 | DONE: |
2248 | 0 | PCRE2_ASSERT(lengthptr == NULL || (code == code_start)); |
2249 | |
|
2250 | 0 | *pptr = ptr; |
2251 | 0 | *pcode = code; |
2252 | 0 | return TRUE; |
2253 | 0 | } |
2254 | | |
2255 | | |
2256 | | |
2257 | | /* This function consumes a group of implicitly-unioned class elements. |
2258 | | These can be characters, ranges, properties, or nested classes, as long |
2259 | | as they are all joined by being placed adjacently. */ |
2260 | | |
2261 | | static BOOL |
2262 | | compile_class_juxtaposition(eclass_context *context, BOOL negated, |
2263 | | uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info, |
2264 | | PCRE2_SIZE *lengthptr) |
2265 | 0 | { |
2266 | 0 | uint32_t *ptr = *pptr; |
2267 | 0 | PCRE2_UCHAR *code = *pcode; |
2268 | | #ifdef PCRE2_DEBUG |
2269 | | PCRE2_UCHAR *start_code = *pcode; |
2270 | | #endif |
2271 | | |
2272 | | /* See compile_class_binary_loose() for comments on compile-time folding of |
2273 | | the "negated" flag. */ |
2274 | | |
2275 | | /* Because it's a non-empty class, there must be an operand at the start. */ |
2276 | 0 | if (!compile_class_operand(context, negated, &ptr, &code, pop_info, lengthptr)) |
2277 | 0 | return FALSE; |
2278 | | |
2279 | 0 | while (*ptr != META_CLASS_END && |
2280 | 0 | !(*ptr >= META_ECLASS_AND && *ptr <= META_ECLASS_NOT)) |
2281 | 0 | { |
2282 | 0 | uint32_t op; |
2283 | 0 | BOOL rhs_negated; |
2284 | 0 | eclass_op_info rhs_op_info; |
2285 | |
|
2286 | 0 | if (negated) |
2287 | 0 | { |
2288 | | /* !(A juxtapose B) -> !A && !B */ |
2289 | 0 | op = ECL_AND; |
2290 | 0 | rhs_negated = TRUE; |
2291 | 0 | } |
2292 | 0 | else |
2293 | 0 | { |
2294 | | /* A juxtapose B -> A || B */ |
2295 | 0 | op = ECL_OR; |
2296 | 0 | rhs_negated = FALSE; |
2297 | 0 | } |
2298 | | |
2299 | | /* An operand must follow the operator. */ |
2300 | 0 | if (!compile_class_operand(context, rhs_negated, &ptr, &code, |
2301 | 0 | &rhs_op_info, lengthptr)) |
2302 | 0 | return FALSE; |
2303 | | |
2304 | | /* Convert infix to postfix (RPN). */ |
2305 | 0 | fold_binary(op, pop_info, &rhs_op_info, lengthptr); |
2306 | 0 | if (lengthptr == NULL) |
2307 | 0 | code = pop_info->code_start + pop_info->length; |
2308 | 0 | } |
2309 | | |
2310 | 0 | PCRE2_ASSERT(lengthptr == NULL || code == start_code); |
2311 | |
|
2312 | 0 | *pptr = ptr; |
2313 | 0 | *pcode = code; |
2314 | 0 | return TRUE; |
2315 | 0 | } |
2316 | | |
2317 | | |
2318 | | |
2319 | | /* This function consumes unary prefix operators. */ |
2320 | | |
2321 | | static BOOL |
2322 | | compile_class_unary(eclass_context *context, BOOL negated, |
2323 | | uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info, |
2324 | | PCRE2_SIZE *lengthptr) |
2325 | 0 | { |
2326 | 0 | uint32_t *ptr = *pptr; |
2327 | | #ifdef PCRE2_DEBUG |
2328 | | PCRE2_UCHAR *start_code = *pcode; |
2329 | | #endif |
2330 | |
|
2331 | 0 | while (*ptr == META_ECLASS_NOT) |
2332 | 0 | { |
2333 | 0 | ++ptr; |
2334 | 0 | negated = !negated; |
2335 | 0 | } |
2336 | |
|
2337 | 0 | *pptr = ptr; |
2338 | | /* Because it's a non-empty class, there must be an operand. */ |
2339 | 0 | if (!compile_class_juxtaposition(context, negated, pptr, pcode, |
2340 | 0 | pop_info, lengthptr)) |
2341 | 0 | return FALSE; |
2342 | | |
2343 | 0 | PCRE2_ASSERT(lengthptr == NULL || *pcode == start_code); |
2344 | 0 | return TRUE; |
2345 | 0 | } |
2346 | | |
2347 | | |
2348 | | |
2349 | | /* This function consumes tightly-binding binary operators. */ |
2350 | | |
2351 | | static BOOL |
2352 | | compile_class_binary_tight(eclass_context *context, BOOL negated, |
2353 | | uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info, |
2354 | | PCRE2_SIZE *lengthptr) |
2355 | 0 | { |
2356 | 0 | uint32_t *ptr = *pptr; |
2357 | 0 | PCRE2_UCHAR *code = *pcode; |
2358 | | #ifdef PCRE2_DEBUG |
2359 | | PCRE2_UCHAR *start_code = *pcode; |
2360 | | #endif |
2361 | | |
2362 | | /* See compile_class_binary_loose() for comments on compile-time folding of |
2363 | | the "negated" flag. */ |
2364 | | |
2365 | | /* Because it's a non-empty class, there must be an operand at the start. */ |
2366 | 0 | if (!compile_class_unary(context, negated, &ptr, &code, pop_info, lengthptr)) |
2367 | 0 | return FALSE; |
2368 | | |
2369 | 0 | while (*ptr == META_ECLASS_AND) |
2370 | 0 | { |
2371 | 0 | uint32_t op; |
2372 | 0 | BOOL rhs_negated; |
2373 | 0 | eclass_op_info rhs_op_info; |
2374 | |
|
2375 | 0 | if (negated) |
2376 | 0 | { |
2377 | | /* !(A && B) -> !A || !B */ |
2378 | 0 | op = ECL_OR; |
2379 | 0 | rhs_negated = TRUE; |
2380 | 0 | } |
2381 | 0 | else |
2382 | 0 | { |
2383 | | /* A && B -> A && B */ |
2384 | 0 | op = ECL_AND; |
2385 | 0 | rhs_negated = FALSE; |
2386 | 0 | } |
2387 | |
|
2388 | 0 | ++ptr; |
2389 | | |
2390 | | /* An operand must follow the operator. */ |
2391 | 0 | if (!compile_class_unary(context, rhs_negated, &ptr, &code, |
2392 | 0 | &rhs_op_info, lengthptr)) |
2393 | 0 | return FALSE; |
2394 | | |
2395 | | /* Convert infix to postfix (RPN). */ |
2396 | 0 | fold_binary(op, pop_info, &rhs_op_info, lengthptr); |
2397 | 0 | if (lengthptr == NULL) |
2398 | 0 | code = pop_info->code_start + pop_info->length; |
2399 | 0 | } |
2400 | | |
2401 | 0 | PCRE2_ASSERT(lengthptr == NULL || code == start_code); |
2402 | |
|
2403 | 0 | *pptr = ptr; |
2404 | 0 | *pcode = code; |
2405 | 0 | return TRUE; |
2406 | 0 | } |
2407 | | |
2408 | | |
2409 | | |
2410 | | /* This function consumes loosely-binding binary operators. */ |
2411 | | |
2412 | | static BOOL |
2413 | | compile_class_binary_loose(eclass_context *context, BOOL negated, |
2414 | | uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info, |
2415 | | PCRE2_SIZE *lengthptr) |
2416 | 0 | { |
2417 | 0 | uint32_t *ptr = *pptr; |
2418 | 0 | PCRE2_UCHAR *code = *pcode; |
2419 | | #ifdef PCRE2_DEBUG |
2420 | | PCRE2_UCHAR *start_code = *pcode; |
2421 | | #endif |
2422 | | |
2423 | | /* We really want to fold the negation operator, if at all possible, so that |
2424 | | simple cases can be reduced down. In particular, in 8-bit no-UTF mode, we want |
2425 | | to produce a fully-folded expression, so that we can guarantee not to emit any |
2426 | | OP_ECLASS codes (in the same way that we never emit OP_XCLASS in this mode). |
2427 | | |
2428 | | This has the consequence that with a little ingenuity, we can in fact avoid |
2429 | | emitting (nearly...) all cases of the "NOT" operator. Imagine that we have: |
2430 | | !(A ... |
2431 | | We have parsed the preceding "!", and we are about to parse the "A" operand. We |
2432 | | don't know yet whether there will even be a following binary operand! Both of |
2433 | | these are possibilities for what follows: |
2434 | | !(A && B) |
2435 | | !(A) |
2436 | | However, we can still fold the "!" into the "A" operand, because no matter what |
2437 | | the following binary operator will be, we can produce an expression which is |
2438 | | equivalent. */ |
2439 | | |
2440 | | /* Because it's a non-empty class, there must be an operand at the start. */ |
2441 | 0 | if (!compile_class_binary_tight(context, negated, &ptr, &code, |
2442 | 0 | pop_info, lengthptr)) |
2443 | 0 | return FALSE; |
2444 | | |
2445 | 0 | while (*ptr >= META_ECLASS_OR && *ptr <= META_ECLASS_XOR) |
2446 | 0 | { |
2447 | 0 | uint32_t op; |
2448 | 0 | BOOL op_neg; |
2449 | 0 | BOOL rhs_negated; |
2450 | 0 | eclass_op_info rhs_op_info; |
2451 | |
|
2452 | 0 | if (negated) |
2453 | 0 | { |
2454 | | /* The whole expression is being negated; we respond by unconditionally |
2455 | | negating the LHS A, before seeing what follows. And hooray! We can recover, |
2456 | | no matter what follows. */ |
2457 | | /* !(A || B) -> !A && !B */ |
2458 | | /* !(A -- B) -> !(A && !B) -> !A || B */ |
2459 | | /* !(A XOR B) -> !(!A XOR !B) -> !A XNOR !B */ |
2460 | 0 | op = (*ptr == META_ECLASS_OR )? ECL_AND : |
2461 | 0 | (*ptr == META_ECLASS_SUB)? ECL_OR : |
2462 | 0 | /*ptr == META_ECLASS_XOR*/ ECL_XOR; |
2463 | 0 | op_neg = (*ptr == META_ECLASS_XOR); |
2464 | 0 | rhs_negated = *ptr != META_ECLASS_SUB; |
2465 | 0 | } |
2466 | 0 | else |
2467 | 0 | { |
2468 | | /* A || B -> A || B */ |
2469 | | /* A -- B -> A && !B */ |
2470 | | /* A XOR B -> A XOR B */ |
2471 | 0 | op = (*ptr == META_ECLASS_OR )? ECL_OR : |
2472 | 0 | (*ptr == META_ECLASS_SUB)? ECL_AND : |
2473 | 0 | /*ptr == META_ECLASS_XOR*/ ECL_XOR; |
2474 | 0 | op_neg = FALSE; |
2475 | 0 | rhs_negated = *ptr == META_ECLASS_SUB; |
2476 | 0 | } |
2477 | |
|
2478 | 0 | ++ptr; |
2479 | | |
2480 | | /* An operand must follow the operator. */ |
2481 | 0 | if (!compile_class_binary_tight(context, rhs_negated, &ptr, &code, |
2482 | 0 | &rhs_op_info, lengthptr)) |
2483 | 0 | return FALSE; |
2484 | | |
2485 | | /* Convert infix to postfix (RPN). */ |
2486 | 0 | fold_binary(op, pop_info, &rhs_op_info, lengthptr); |
2487 | 0 | if (op_neg) fold_negation(pop_info, lengthptr, FALSE); |
2488 | 0 | if (lengthptr == NULL) |
2489 | 0 | code = pop_info->code_start + pop_info->length; |
2490 | 0 | } |
2491 | | |
2492 | 0 | PCRE2_ASSERT(lengthptr == NULL || code == start_code); |
2493 | |
|
2494 | 0 | *pptr = ptr; |
2495 | 0 | *pcode = code; |
2496 | 0 | return TRUE; |
2497 | 0 | } |
2498 | | |
2499 | | |
2500 | | |
2501 | | /* This function converts the META codes in pptr into opcodes written to |
2502 | | pcode. The pptr must start at a META_CLASS or META_CLASS_NOT. |
2503 | | |
2504 | | The class is compiled as a left-associative sequence of operator |
2505 | | applications. |
2506 | | |
2507 | | The pptr will be left pointing at the matching META_CLASS_END. */ |
2508 | | |
2509 | | static BOOL |
2510 | | compile_eclass_nested(eclass_context *context, BOOL negated, |
2511 | | uint32_t **pptr, PCRE2_UCHAR **pcode, |
2512 | | eclass_op_info *pop_info, PCRE2_SIZE *lengthptr) |
2513 | 0 | { |
2514 | 0 | uint32_t *ptr = *pptr; |
2515 | | #ifdef PCRE2_DEBUG |
2516 | | PCRE2_UCHAR *start_code = *pcode; |
2517 | | #endif |
2518 | | |
2519 | | /* The CLASS_IS_ECLASS bit must be set since it is a nested class. */ |
2520 | 0 | PCRE2_ASSERT(*ptr == (META_CLASS | CLASS_IS_ECLASS) || |
2521 | 0 | *ptr == (META_CLASS_NOT | CLASS_IS_ECLASS)); |
2522 | |
|
2523 | 0 | if (*ptr++ == (META_CLASS_NOT | CLASS_IS_ECLASS)) |
2524 | 0 | negated = !negated; |
2525 | |
|
2526 | 0 | (*pptr)++; |
2527 | | |
2528 | | /* Because it's a non-empty class, there must be an operand at the start. */ |
2529 | 0 | if (!compile_class_binary_loose(context, negated, pptr, pcode, |
2530 | 0 | pop_info, lengthptr)) |
2531 | 0 | return FALSE; |
2532 | | |
2533 | 0 | PCRE2_ASSERT(**pptr == META_CLASS_END); |
2534 | 0 | PCRE2_ASSERT(lengthptr == NULL || *pcode == start_code); |
2535 | 0 | return TRUE; |
2536 | 0 | } |
2537 | | |
2538 | | BOOL |
2539 | | PRIV(compile_class_nested)(uint32_t options, uint32_t xoptions, |
2540 | | uint32_t **pptr, PCRE2_UCHAR **pcode, int *errorcodeptr, |
2541 | | compile_block *cb, PCRE2_SIZE *lengthptr) |
2542 | 0 | { |
2543 | 0 | eclass_context context; |
2544 | 0 | eclass_op_info op_info; |
2545 | 0 | PCRE2_SIZE previous_length = (lengthptr != NULL)? *lengthptr : 0; |
2546 | 0 | PCRE2_UCHAR *code = *pcode; |
2547 | 0 | PCRE2_UCHAR *previous; |
2548 | 0 | BOOL allbitsone = TRUE; |
2549 | |
|
2550 | 0 | context.needs_bitmap = FALSE; |
2551 | 0 | context.options = options; |
2552 | 0 | context.xoptions = xoptions; |
2553 | 0 | context.errorcodeptr = errorcodeptr; |
2554 | 0 | context.cb = cb; |
2555 | |
|
2556 | 0 | previous = code; |
2557 | 0 | *code++ = OP_ECLASS; |
2558 | 0 | code += LINK_SIZE; |
2559 | 0 | *code++ = 0; /* Flags, currently zero. */ |
2560 | 0 | if (!compile_eclass_nested(&context, FALSE, pptr, &code, &op_info, lengthptr)) |
2561 | 0 | return FALSE; |
2562 | | |
2563 | 0 | if (lengthptr != NULL) |
2564 | 0 | { |
2565 | 0 | *lengthptr += code - previous; |
2566 | 0 | code = previous; |
2567 | | /* (*lengthptr - previous_length) now holds the amount of buffer that |
2568 | | we require to make the call to compile_class_nested() with |
2569 | | lengthptr = NULL, and including the (1+LINK_SIZE+1) that we write out |
2570 | | before that call. */ |
2571 | 0 | } |
2572 | | |
2573 | | /* Do some useful counting of what's in the bitmap. */ |
2574 | 0 | for (int i = 0; i < 8; i++) |
2575 | 0 | if (op_info.bits.classwords[i] != 0xffffffff) |
2576 | 0 | { |
2577 | 0 | allbitsone = FALSE; |
2578 | 0 | break; |
2579 | 0 | } |
2580 | | |
2581 | | /* After constant-folding the extended class syntax, it may turn out to be |
2582 | | a simple class after all. In that case, we can unwrap it from the |
2583 | | OP_ECLASS container - and in fact, we must do so, because in 8-bit |
2584 | | no-Unicode mode the matcher is compiled without support for OP_ECLASS. */ |
2585 | |
|
2586 | | #ifndef SUPPORT_WIDE_CHARS |
2587 | | PCRE2_ASSERT(op_info.op_single_type != 0); |
2588 | | #else |
2589 | 0 | if (op_info.op_single_type != 0) |
2590 | 0 | #endif |
2591 | 0 | { |
2592 | | /* Rewind back over the OP_ECLASS. */ |
2593 | 0 | code = previous; |
2594 | | |
2595 | | /* If the bits are all ones, and the "high characters" are all matched |
2596 | | too, we use a special-cased encoding of OP_ALLANY. */ |
2597 | |
|
2598 | 0 | if (op_info.op_single_type == ECL_ANY && allbitsone) |
2599 | 0 | { |
2600 | | /* Advancing code means rewinding lengthptr, at this point. */ |
2601 | 0 | if (lengthptr != NULL) *lengthptr -= 1; |
2602 | 0 | *code++ = OP_ALLANY; |
2603 | 0 | } |
2604 | | |
2605 | | /* If the high bits are all matched / all not-matched, then we emit an |
2606 | | OP_NCLASS/OP_CLASS respectively. */ |
2607 | | |
2608 | 0 | else if (op_info.op_single_type == ECL_ANY || |
2609 | 0 | op_info.op_single_type == ECL_NONE) |
2610 | 0 | { |
2611 | 0 | PCRE2_SIZE required_len = 1 + (32 / sizeof(PCRE2_UCHAR)); |
2612 | |
|
2613 | 0 | if (lengthptr != NULL) |
2614 | 0 | { |
2615 | 0 | if (required_len > (*lengthptr - previous_length)) |
2616 | 0 | *lengthptr = previous_length + required_len; |
2617 | 0 | } |
2618 | | |
2619 | | /* Advancing code means rewinding lengthptr, at this point. */ |
2620 | 0 | if (lengthptr != NULL) *lengthptr -= required_len; |
2621 | 0 | *code++ = (op_info.op_single_type == ECL_ANY)? OP_NCLASS : OP_CLASS; |
2622 | 0 | memcpy(code, op_info.bits.classbits, 32); |
2623 | 0 | code += 32 / sizeof(PCRE2_UCHAR); |
2624 | 0 | } |
2625 | | |
2626 | | /* Otherwise, we have an ECL_XCLASS, so we have the OP_XCLASS data |
2627 | | there, but, we pulled out its bitmap into op_info, so now we have to |
2628 | | put that back into the OP_XCLASS. */ |
2629 | | |
2630 | 0 | else |
2631 | 0 | { |
2632 | | #ifndef SUPPORT_WIDE_CHARS |
2633 | | PCRE2_DEBUG_UNREACHABLE(); |
2634 | | #else |
2635 | 0 | BOOL need_map = context.needs_bitmap; |
2636 | 0 | PCRE2_SIZE required_len; |
2637 | |
|
2638 | 0 | PCRE2_ASSERT(op_info.op_single_type == ECL_XCLASS); |
2639 | 0 | required_len = op_info.length + (need_map? 32/sizeof(PCRE2_UCHAR) : 0); |
2640 | |
|
2641 | 0 | if (lengthptr != NULL) |
2642 | 0 | { |
2643 | | /* Don't unconditionally request all the space we need - we may |
2644 | | already have asked for more during processing of the ECLASS. */ |
2645 | 0 | if (required_len > (*lengthptr - previous_length)) |
2646 | 0 | *lengthptr = previous_length + required_len; |
2647 | | |
2648 | | /* The code we write out here won't be ignored, even during the |
2649 | | (lengthptr != NULL) phase, because if there's a following quantifier |
2650 | | it will peek backwards. So we do have to write out a (truncated) |
2651 | | OP_XCLASS, even on this branch. */ |
2652 | 0 | *lengthptr -= 1 + LINK_SIZE + 1; |
2653 | 0 | *code++ = OP_XCLASS; |
2654 | 0 | PUT(code, 0, 1 + LINK_SIZE + 1); |
2655 | 0 | code += LINK_SIZE; |
2656 | 0 | *code++ = 0; |
2657 | 0 | } |
2658 | 0 | else |
2659 | 0 | { |
2660 | 0 | PCRE2_UCHAR *rest; |
2661 | 0 | PCRE2_SIZE rest_len; |
2662 | 0 | PCRE2_UCHAR flags; |
2663 | | |
2664 | | /* 1 unit: OP_XCLASS | LINK_SIZE units | 1 unit: flags | ...rest */ |
2665 | 0 | PCRE2_ASSERT(op_info.length >= 1 + LINK_SIZE + 1); |
2666 | 0 | rest = op_info.code_start + 1 + LINK_SIZE + 1; |
2667 | 0 | rest_len = (op_info.code_start + op_info.length) - rest; |
2668 | | |
2669 | | /* First read any data we use, before memmove splats it. */ |
2670 | 0 | flags = op_info.code_start[1 + LINK_SIZE]; |
2671 | 0 | PCRE2_ASSERT((flags & XCL_MAP) == 0); |
2672 | | |
2673 | | /* Next do the memmove before any writes. */ |
2674 | 0 | memmove(code + 1 + LINK_SIZE + 1 + (need_map? 32/sizeof(PCRE2_UCHAR) : 0), |
2675 | 0 | rest, CU2BYTES(rest_len)); |
2676 | | |
2677 | | /* Finally write the header data. */ |
2678 | 0 | *code++ = OP_XCLASS; |
2679 | 0 | PUT(code, 0, (int)required_len); |
2680 | 0 | code += LINK_SIZE; |
2681 | 0 | *code++ = flags | (need_map? XCL_MAP : 0); |
2682 | 0 | if (need_map) |
2683 | 0 | { |
2684 | 0 | memcpy(code, op_info.bits.classbits, 32); |
2685 | 0 | code += 32 / sizeof(PCRE2_UCHAR); |
2686 | 0 | } |
2687 | 0 | code += rest_len; |
2688 | 0 | } |
2689 | 0 | #endif /* SUPPORT_WIDE_CHARS */ |
2690 | 0 | } |
2691 | 0 | } |
2692 | | |
2693 | | /* Otherwise, we're going to keep the OP_ECLASS. However, again we need |
2694 | | to do some adjustment to insert the bitmap if we have one. */ |
2695 | | |
2696 | 0 | #ifdef SUPPORT_WIDE_CHARS |
2697 | 0 | else |
2698 | 0 | { |
2699 | 0 | BOOL need_map = context.needs_bitmap; |
2700 | 0 | PCRE2_SIZE required_len = |
2701 | 0 | 1 + LINK_SIZE + 1 + (need_map? 32/sizeof(PCRE2_UCHAR) : 0) + op_info.length; |
2702 | |
|
2703 | 0 | if (lengthptr != NULL) |
2704 | 0 | { |
2705 | 0 | if (required_len > (*lengthptr - previous_length)) |
2706 | 0 | *lengthptr = previous_length + required_len; |
2707 | | |
2708 | | /* As for the XCLASS branch above, we do have to write out a dummy |
2709 | | OP_ECLASS, because of the backwards peek by the quantifier code. Write |
2710 | | out a (truncated) OP_ECLASS, even on this branch. */ |
2711 | 0 | *lengthptr -= 1 + LINK_SIZE + 1; |
2712 | 0 | *code++ = OP_ECLASS; |
2713 | 0 | PUT(code, 0, 1 + LINK_SIZE + 1); |
2714 | 0 | code += LINK_SIZE; |
2715 | 0 | *code++ = 0; |
2716 | 0 | } |
2717 | 0 | else |
2718 | 0 | { |
2719 | 0 | if (need_map) |
2720 | 0 | { |
2721 | 0 | PCRE2_UCHAR *map_start = previous + 1 + LINK_SIZE + 1; |
2722 | 0 | previous[1 + LINK_SIZE] |= ECL_MAP; |
2723 | 0 | memmove(map_start + 32/sizeof(PCRE2_UCHAR), map_start, |
2724 | 0 | CU2BYTES(code - map_start)); |
2725 | 0 | memcpy(map_start, op_info.bits.classbits, 32); |
2726 | 0 | code += 32 / sizeof(PCRE2_UCHAR); |
2727 | 0 | } |
2728 | 0 | PUT(previous, 1, (int)(code - previous)); |
2729 | 0 | } |
2730 | 0 | } |
2731 | 0 | #endif /* SUPPORT_WIDE_CHARS */ |
2732 | |
|
2733 | 0 | *pcode = code; |
2734 | 0 | return TRUE; |
2735 | 0 | } |
2736 | | |
2737 | | /* End of pcre2_compile_class.c */ |