/src/php-src/ext/pcre/pcre2lib/pcre2_auto_possess.c
Line | Count | Source (jump to first uncovered line) |
1 | | /************************************************* |
2 | | * Perl-Compatible Regular Expressions * |
3 | | *************************************************/ |
4 | | |
5 | | /* PCRE is a library of functions to support regular expressions whose syntax |
6 | | and semantics are as close as possible to those of the Perl 5 language. |
7 | | |
8 | | Written by Philip Hazel |
9 | | Original API code Copyright (c) 1997-2012 University of Cambridge |
10 | | New API code Copyright (c) 2016-2024 University of Cambridge |
11 | | |
12 | | ----------------------------------------------------------------------------- |
13 | | Redistribution and use in source and binary forms, with or without |
14 | | modification, are permitted provided that the following conditions are met: |
15 | | |
16 | | * Redistributions of source code must retain the above copyright notice, |
17 | | this list of conditions and the following disclaimer. |
18 | | |
19 | | * Redistributions in binary form must reproduce the above copyright |
20 | | notice, this list of conditions and the following disclaimer in the |
21 | | documentation and/or other materials provided with the distribution. |
22 | | |
23 | | * Neither the name of the University of Cambridge nor the names of its |
24 | | contributors may be used to endorse or promote products derived from |
25 | | this software without specific prior written permission. |
26 | | |
27 | | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
28 | | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
29 | | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
30 | | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
31 | | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
32 | | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
33 | | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
34 | | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
35 | | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
36 | | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
37 | | POSSIBILITY OF SUCH DAMAGE. |
38 | | ----------------------------------------------------------------------------- |
39 | | */ |
40 | | |
41 | | /* This module contains functions that scan a compiled pattern and change |
42 | | repeats into possessive repeats where possible. */ |
43 | | |
44 | | |
45 | | #ifdef HAVE_CONFIG_H |
46 | | #include "config.h" |
47 | | #endif |
48 | | |
49 | | |
50 | | #include "pcre2_internal.h" |
51 | | |
52 | | /* This macro represents the max size of list[] and that is used to keep |
53 | | track of UCD info in several places, it should be kept on sync with the |
54 | | value used by GenerateUcd.py */ |
55 | 568 | #define MAX_LIST 8 |
56 | | |
57 | | /************************************************* |
58 | | * Tables for auto-possessification * |
59 | | *************************************************/ |
60 | | |
61 | | /* This table is used to check whether auto-possessification is possible |
62 | | between adjacent character-type opcodes. The left-hand (repeated) opcode is |
63 | | used to select the row, and the right-hand opcode is use to select the column. |
64 | | A value of 1 means that auto-possessification is OK. For example, the second |
65 | | value in the first row means that \D+\d can be turned into \D++\d. |
66 | | |
67 | | The Unicode property types (\P and \p) have to be present to fill out the table |
68 | | because of what their opcode values are, but the table values should always be |
69 | | zero because property types are handled separately in the code. The last four |
70 | | columns apply to items that cannot be repeated, so there is no need to have |
71 | | rows for them. Note that OP_DIGIT etc. are generated only when PCRE2_UCP is |
72 | | *not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */ |
73 | | |
74 | | #define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1) |
75 | | #define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1) |
76 | | |
77 | | static const uint8_t autoposstab[APTROWS][APTCOLS] = { |
78 | | /* \D \d \S \s \W \w . .+ \C \P \p \R \H \h \V \v \X \Z \z $ $M */ |
79 | | { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \D */ |
80 | | { 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \d */ |
81 | | { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \S */ |
82 | | { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \s */ |
83 | | { 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \W */ |
84 | | { 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \w */ |
85 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* . */ |
86 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* .+ */ |
87 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \C */ |
88 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* \P */ |
89 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* \p */ |
90 | | { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 }, /* \R */ |
91 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 }, /* \H */ |
92 | | { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 }, /* \h */ |
93 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 }, /* \V */ |
94 | | { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 }, /* \v */ |
95 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 } /* \X */ |
96 | | }; |
97 | | |
98 | | #ifdef SUPPORT_UNICODE |
99 | | /* This table is used to check whether auto-possessification is possible |
100 | | between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The |
101 | | left-hand (repeated) opcode is used to select the row, and the right-hand |
102 | | opcode is used to select the column. The values are as follows: |
103 | | |
104 | | 0 Always return FALSE (never auto-possessify) |
105 | | 1 Character groups are distinct (possessify if both are OP_PROP) |
106 | | 2 Check character categories in the same group (general or particular) |
107 | | 3 TRUE if the two opcodes are not the same (PROP vs NOTPROP) |
108 | | |
109 | | 4 Check left general category vs right particular category |
110 | | 5 Check right general category vs left particular category |
111 | | |
112 | | 6 Left alphanum vs right general category |
113 | | 7 Left space vs right general category |
114 | | 8 Left word vs right general category |
115 | | |
116 | | 9 Right alphanum vs left general category |
117 | | 10 Right space vs left general category |
118 | | 11 Right word vs left general category |
119 | | |
120 | | 12 Left alphanum vs right particular category |
121 | | 13 Left space vs right particular category |
122 | | 14 Left word vs right particular category |
123 | | |
124 | | 15 Right alphanum vs left particular category |
125 | | 16 Right space vs left particular category |
126 | | 17 Right word vs left particular category |
127 | | */ |
128 | | |
129 | | static const uint8_t propposstab[PT_TABSIZE][PT_TABSIZE] = { |
130 | | /* LAMP GC PC SC SCX ALNUM SPACE PXSPACE WORD CLIST UCNC BIDICL BOOL */ |
131 | | { 3, 0, 0, 0, 0, 3, 1, 1, 0, 0, 0, 0, 0 }, /* PT_LAMP */ |
132 | | { 0, 2, 4, 0, 0, 9, 10, 10, 11, 0, 0, 0, 0 }, /* PT_GC */ |
133 | | { 0, 5, 2, 0, 0, 15, 16, 16, 17, 0, 0, 0, 0 }, /* PT_PC */ |
134 | | { 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_SC */ |
135 | | { 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_SCX */ |
136 | | { 3, 6, 12, 0, 0, 3, 1, 1, 0, 0, 0, 0, 0 }, /* PT_ALNUM */ |
137 | | { 1, 7, 13, 0, 0, 1, 3, 3, 1, 0, 0, 0, 0 }, /* PT_SPACE */ |
138 | | { 1, 7, 13, 0, 0, 1, 3, 3, 1, 0, 0, 0, 0 }, /* PT_PXSPACE */ |
139 | | { 0, 8, 14, 0, 0, 0, 1, 1, 3, 0, 0, 0, 0 }, /* PT_WORD */ |
140 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_CLIST */ |
141 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0 }, /* PT_UCNC */ |
142 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_BIDICL */ |
143 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } /* PT_BOOL */ |
144 | | /* PT_ANY does not need a record. */ |
145 | | }; |
146 | | |
147 | | /* This table is used to check whether auto-possessification is possible |
148 | | between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one |
149 | | specifies a general category and the other specifies a particular category. The |
150 | | row is selected by the general category and the column by the particular |
151 | | category. The value is 1 if the particular category is not part of the general |
152 | | category. */ |
153 | | |
154 | | static const uint8_t catposstab[7][30] = { |
155 | | /* Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs */ |
156 | | { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* C */ |
157 | | { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* L */ |
158 | | { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* M */ |
159 | | { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* N */ |
160 | | { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 }, /* P */ |
161 | | { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1 }, /* S */ |
162 | | { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 } /* Z */ |
163 | | }; |
164 | | |
165 | | /* This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against |
166 | | a general or particular category. The properties in each row are those |
167 | | that apply to the character set in question. Duplication means that a little |
168 | | unnecessary work is done when checking, but this keeps things much simpler |
169 | | because they can all use the same code. For more details see the comment where |
170 | | this table is used. |
171 | | |
172 | | Note: SPACE and PXSPACE used to be different because Perl excluded VT from |
173 | | "space", but from Perl 5.18 it's included, so both categories are treated the |
174 | | same here. */ |
175 | | |
176 | | static const uint8_t posspropstab[3][4] = { |
177 | | { ucp_L, ucp_N, ucp_N, ucp_Nl }, /* ALNUM, 3rd and 4th values redundant */ |
178 | | { ucp_Z, ucp_Z, ucp_C, ucp_Cc }, /* SPACE and PXSPACE, 2nd value redundant */ |
179 | | { ucp_L, ucp_N, ucp_P, ucp_Po } /* WORD */ |
180 | | }; |
181 | | #endif /* SUPPORT_UNICODE */ |
182 | | |
183 | | |
184 | | |
185 | | #ifdef SUPPORT_UNICODE |
186 | | /************************************************* |
187 | | * Check a character and a property * |
188 | | *************************************************/ |
189 | | |
190 | | /* This function is called by compare_opcodes() when a property item is |
191 | | adjacent to a fixed character. |
192 | | |
193 | | Arguments: |
194 | | c the character |
195 | | ptype the property type |
196 | | pdata the data for the type |
197 | | negated TRUE if it's a negated property (\P or \p{^) |
198 | | |
199 | | Returns: TRUE if auto-possessifying is OK |
200 | | */ |
201 | | |
202 | | static BOOL |
203 | | check_char_prop(uint32_t c, unsigned int ptype, unsigned int pdata, |
204 | | BOOL negated) |
205 | 643 | { |
206 | 643 | BOOL ok, rc; |
207 | 643 | const uint32_t *p; |
208 | 643 | const ucd_record *prop = GET_UCD(c); |
209 | | |
210 | 643 | switch(ptype) |
211 | 643 | { |
212 | 0 | case PT_LAMP: |
213 | 0 | return (prop->chartype == ucp_Lu || |
214 | 0 | prop->chartype == ucp_Ll || |
215 | 0 | prop->chartype == ucp_Lt) == negated; |
216 | | |
217 | 13 | case PT_GC: |
218 | 13 | return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated; |
219 | | |
220 | 37 | case PT_PC: |
221 | 37 | return (pdata == prop->chartype) == negated; |
222 | | |
223 | 0 | case PT_SC: |
224 | 0 | return (pdata == prop->script) == negated; |
225 | | |
226 | 0 | case PT_SCX: |
227 | 0 | ok = (pdata == prop->script |
228 | 0 | || MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), pdata) != 0); |
229 | 0 | return ok == negated; |
230 | | |
231 | | /* These are specials */ |
232 | | |
233 | 0 | case PT_ALNUM: |
234 | 0 | return (PRIV(ucp_gentype)[prop->chartype] == ucp_L || |
235 | 0 | PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated; |
236 | | |
237 | | /* Perl space used to exclude VT, but from Perl 5.18 it is included, which |
238 | | means that Perl space and POSIX space are now identical. PCRE was changed |
239 | | at release 8.34. */ |
240 | | |
241 | 21 | case PT_SPACE: /* Perl space */ |
242 | 21 | case PT_PXSPACE: /* POSIX space */ |
243 | 21 | switch(c) |
244 | 21 | { |
245 | 18 | HSPACE_CASES: |
246 | 18 | VSPACE_CASES: |
247 | 1 | rc = negated; |
248 | 1 | break; |
249 | | |
250 | 20 | default: |
251 | 20 | rc = (PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == negated; |
252 | 21 | } |
253 | 21 | return rc; |
254 | | |
255 | 572 | case PT_WORD: |
256 | 572 | return (PRIV(ucp_gentype)[prop->chartype] == ucp_L || |
257 | 572 | PRIV(ucp_gentype)[prop->chartype] == ucp_N || |
258 | 572 | c == CHAR_UNDERSCORE) == negated; |
259 | | |
260 | 0 | case PT_CLIST: |
261 | 0 | p = PRIV(ucd_caseless_sets) + prop->caseset; |
262 | 0 | for (;;) |
263 | 0 | { |
264 | 0 | if (c < *p) return !negated; |
265 | 0 | if (c == *p++) return negated; |
266 | 0 | } |
267 | 0 | PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */ |
268 | 0 | break; |
269 | | |
270 | | /* Haven't yet thought these through. */ |
271 | | |
272 | 0 | case PT_BIDICL: |
273 | 0 | return FALSE; |
274 | | |
275 | 0 | case PT_BOOL: |
276 | 0 | return FALSE; |
277 | 643 | } |
278 | | |
279 | 0 | return FALSE; |
280 | 643 | } |
281 | | #endif /* SUPPORT_UNICODE */ |
282 | | |
283 | | |
284 | | |
285 | | /************************************************* |
286 | | * Base opcode of repeated opcodes * |
287 | | *************************************************/ |
288 | | |
289 | | /* Returns the base opcode for repeated single character type opcodes. If the |
290 | | opcode is not a repeated character type, it returns with the original value. |
291 | | |
292 | | Arguments: c opcode |
293 | | Returns: base opcode for the type |
294 | | */ |
295 | | |
296 | | static PCRE2_UCHAR |
297 | | get_repeat_base(PCRE2_UCHAR c) |
298 | 89.7k | { |
299 | 89.7k | return (c > OP_TYPEPOSUPTO)? c : |
300 | 89.7k | (c >= OP_TYPESTAR)? OP_TYPESTAR : |
301 | 89.7k | (c >= OP_NOTSTARI)? OP_NOTSTARI : |
302 | 65.8k | (c >= OP_NOTSTAR)? OP_NOTSTAR : |
303 | 63.9k | (c >= OP_STARI)? OP_STARI : |
304 | 60.0k | OP_STAR; |
305 | 89.7k | } |
306 | | |
307 | | |
308 | | /************************************************* |
309 | | * Fill the character property list * |
310 | | *************************************************/ |
311 | | |
312 | | /* Checks whether the code points to an opcode that can take part in auto- |
313 | | possessification, and if so, fills a list with its properties. |
314 | | |
315 | | Arguments: |
316 | | code points to start of expression |
317 | | utf TRUE if in UTF mode |
318 | | ucp TRUE if in UCP mode |
319 | | fcc points to the case-flipping table |
320 | | list points to output list |
321 | | list[0] will be filled with the opcode |
322 | | list[1] will be non-zero if this opcode |
323 | | can match an empty character string |
324 | | list[2..7] depends on the opcode |
325 | | |
326 | | Returns: points to the start of the next opcode if *code is accepted |
327 | | NULL if *code is not accepted |
328 | | */ |
329 | | |
330 | | static PCRE2_SPTR |
331 | | get_chr_property_list(PCRE2_SPTR code, BOOL utf, BOOL ucp, const uint8_t *fcc, |
332 | | uint32_t *list) |
333 | 97.5k | { |
334 | 97.5k | PCRE2_UCHAR c = *code; |
335 | 97.5k | PCRE2_UCHAR base; |
336 | 97.5k | PCRE2_SPTR end; |
337 | 97.5k | PCRE2_SPTR class_end; |
338 | 97.5k | uint32_t chr; |
339 | | |
340 | 97.5k | #ifdef SUPPORT_UNICODE |
341 | 97.5k | uint32_t *clist_dest; |
342 | 97.5k | const uint32_t *clist_src; |
343 | | #else |
344 | | (void)utf; /* Suppress "unused parameter" compiler warnings */ |
345 | | (void)ucp; |
346 | | #endif |
347 | | |
348 | 97.5k | list[0] = c; |
349 | 97.5k | list[1] = FALSE; |
350 | 97.5k | code++; |
351 | | |
352 | 97.5k | if (c >= OP_STAR && c <= OP_TYPEPOSUPTO) |
353 | 51.9k | { |
354 | 51.9k | base = get_repeat_base(c); |
355 | 51.9k | c -= (base - OP_STAR); |
356 | | |
357 | 51.9k | if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO) |
358 | 0 | code += IMM2_SIZE; |
359 | | |
360 | 51.9k | list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT && |
361 | 51.9k | c != OP_POSPLUS); |
362 | | |
363 | 51.9k | switch(base) |
364 | 51.9k | { |
365 | 21.4k | case OP_STAR: |
366 | 21.4k | list[0] = OP_CHAR; |
367 | 21.4k | break; |
368 | | |
369 | 12.8k | case OP_STARI: |
370 | 12.8k | list[0] = OP_CHARI; |
371 | 12.8k | break; |
372 | | |
373 | 2.36k | case OP_NOTSTAR: |
374 | 2.36k | list[0] = OP_NOT; |
375 | 2.36k | break; |
376 | | |
377 | 1.09k | case OP_NOTSTARI: |
378 | 1.09k | list[0] = OP_NOTI; |
379 | 1.09k | break; |
380 | | |
381 | 14.2k | case OP_TYPESTAR: |
382 | 14.2k | list[0] = *code; |
383 | 14.2k | code++; |
384 | 14.2k | break; |
385 | 51.9k | } |
386 | 51.9k | c = list[0]; |
387 | 51.9k | } |
388 | | |
389 | 97.5k | switch(c) |
390 | 97.5k | { |
391 | 1.80k | case OP_NOT_DIGIT: |
392 | 3.91k | case OP_DIGIT: |
393 | 4.03k | case OP_NOT_WHITESPACE: |
394 | 5.02k | case OP_WHITESPACE: |
395 | 5.51k | case OP_NOT_WORDCHAR: |
396 | 10.6k | case OP_WORDCHAR: |
397 | 12.6k | case OP_ANY: |
398 | 12.9k | case OP_ALLANY: |
399 | 17.2k | case OP_ANYNL: |
400 | 17.5k | case OP_NOT_HSPACE: |
401 | 17.6k | case OP_HSPACE: |
402 | 18.5k | case OP_NOT_VSPACE: |
403 | 18.7k | case OP_VSPACE: |
404 | 18.9k | case OP_EXTUNI: |
405 | 18.9k | case OP_EODN: |
406 | 18.9k | case OP_EOD: |
407 | 19.2k | case OP_DOLL: |
408 | 19.2k | case OP_DOLLM: |
409 | 19.2k | return code; |
410 | | |
411 | 37.9k | case OP_CHAR: |
412 | 40.3k | case OP_NOT: |
413 | 40.3k | GETCHARINCTEST(chr, code); |
414 | 40.3k | list[2] = chr; |
415 | 40.3k | list[3] = NOTACHAR; |
416 | 40.3k | return code; |
417 | | |
418 | 20.1k | case OP_CHARI: |
419 | 21.2k | case OP_NOTI: |
420 | 21.2k | list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT; |
421 | 21.2k | GETCHARINCTEST(chr, code); |
422 | 21.2k | list[2] = chr; |
423 | | |
424 | 21.2k | #ifdef SUPPORT_UNICODE |
425 | 21.2k | if (chr < 128 || (chr < 256 && !utf && !ucp)) |
426 | 21.1k | list[3] = fcc[chr]; |
427 | 70 | else |
428 | 70 | list[3] = UCD_OTHERCASE(chr); |
429 | | #elif defined SUPPORT_WIDE_CHARS |
430 | | list[3] = (chr < 256) ? fcc[chr] : chr; |
431 | | #else |
432 | | list[3] = fcc[chr]; |
433 | | #endif |
434 | | |
435 | | /* The othercase might be the same value. */ |
436 | | |
437 | 21.2k | if (chr == list[3]) |
438 | 13.3k | list[3] = NOTACHAR; |
439 | 7.89k | else |
440 | 7.89k | list[4] = NOTACHAR; |
441 | 21.2k | return code; |
442 | | |
443 | 0 | #ifdef SUPPORT_UNICODE |
444 | 1.10k | case OP_PROP: |
445 | 1.23k | case OP_NOTPROP: |
446 | 1.23k | if (code[0] != PT_CLIST) |
447 | 1.09k | { |
448 | 1.09k | list[2] = code[0]; |
449 | 1.09k | list[3] = code[1]; |
450 | 1.09k | return code + 2; |
451 | 1.09k | } |
452 | | |
453 | | /* Convert only if we have enough space. */ |
454 | | |
455 | 142 | clist_src = PRIV(ucd_caseless_sets) + code[1]; |
456 | 142 | clist_dest = list + 2; |
457 | 142 | code += 2; |
458 | | |
459 | 568 | do { |
460 | 568 | if (clist_dest >= list + MAX_LIST) |
461 | 0 | { |
462 | | /* Early return if there is not enough space. GenerateUcd.py |
463 | | generated a list with more than 5 characters and something |
464 | | must be done about that going forward. */ |
465 | 0 | PCRE2_DEBUG_UNREACHABLE(); /* Remove if it ever triggers */ |
466 | 0 | list[2] = code[0]; |
467 | 0 | list[3] = code[1]; |
468 | 0 | return code; |
469 | 0 | } |
470 | 568 | *clist_dest++ = *clist_src; |
471 | 568 | } |
472 | 568 | while(*clist_src++ != NOTACHAR); |
473 | | |
474 | | /* All characters are stored. The terminating NOTACHAR is copied from the |
475 | | clist itself. */ |
476 | | |
477 | 142 | list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT; |
478 | 142 | return code; |
479 | 0 | #endif |
480 | | |
481 | 3.67k | case OP_NCLASS: |
482 | 13.3k | case OP_CLASS: |
483 | 13.3k | #ifdef SUPPORT_WIDE_CHARS |
484 | 14.5k | case OP_XCLASS: |
485 | 14.5k | case OP_ECLASS: |
486 | 14.5k | if (c == OP_XCLASS || c == OP_ECLASS) |
487 | 1.28k | end = code + GET(code, 0) - 1; |
488 | 13.3k | else |
489 | 13.3k | #endif |
490 | 13.3k | end = code + 32 / sizeof(PCRE2_UCHAR); |
491 | 14.5k | class_end = end; |
492 | | |
493 | 14.5k | switch(*end) |
494 | 14.5k | { |
495 | 4.47k | case OP_CRSTAR: |
496 | 5.77k | case OP_CRMINSTAR: |
497 | 9.72k | case OP_CRQUERY: |
498 | 12.0k | case OP_CRMINQUERY: |
499 | 12.1k | case OP_CRPOSSTAR: |
500 | 12.1k | case OP_CRPOSQUERY: |
501 | 12.1k | list[1] = TRUE; |
502 | 12.1k | end++; |
503 | 12.1k | break; |
504 | | |
505 | 991 | case OP_CRPLUS: |
506 | 1.22k | case OP_CRMINPLUS: |
507 | 1.22k | case OP_CRPOSPLUS: |
508 | 1.22k | end++; |
509 | 1.22k | break; |
510 | | |
511 | 0 | case OP_CRRANGE: |
512 | 0 | case OP_CRMINRANGE: |
513 | 0 | case OP_CRPOSRANGE: |
514 | 0 | list[1] = (GET2(end, 1) == 0); |
515 | 0 | end += 1 + 2 * IMM2_SIZE; |
516 | 0 | break; |
517 | 14.5k | } |
518 | 14.5k | list[2] = (uint32_t)(end - code); |
519 | 14.5k | list[3] = (uint32_t)(end - class_end); |
520 | 14.5k | return end; |
521 | 97.5k | } |
522 | | |
523 | 872 | return NULL; /* Opcode not accepted */ |
524 | 97.5k | } |
525 | | |
526 | | |
527 | | |
528 | | /************************************************* |
529 | | * Scan further character sets for match * |
530 | | *************************************************/ |
531 | | |
532 | | /* Checks whether the base and the current opcode have a common character, in |
533 | | which case the base cannot be possessified. |
534 | | |
535 | | Arguments: |
536 | | code points to the byte code |
537 | | utf TRUE in UTF mode |
538 | | ucp TRUE in UCP mode |
539 | | cb compile data block |
540 | | base_list the data list of the base opcode |
541 | | base_end the end of the base opcode |
542 | | rec_limit points to recursion depth counter |
543 | | |
544 | | Returns: TRUE if the auto-possessification is possible |
545 | | */ |
546 | | |
547 | | static BOOL |
548 | | compare_opcodes(PCRE2_SPTR code, BOOL utf, BOOL ucp, const compile_block *cb, |
549 | | const uint32_t *base_list, PCRE2_SPTR base_end, int *rec_limit) |
550 | 44.8k | { |
551 | 44.8k | PCRE2_UCHAR c; |
552 | 44.8k | uint32_t list[MAX_LIST]; |
553 | 44.8k | const uint32_t *chr_ptr; |
554 | 44.8k | const uint32_t *ochr_ptr; |
555 | 44.8k | const uint32_t *list_ptr; |
556 | 44.8k | PCRE2_SPTR next_code; |
557 | 44.8k | #ifdef SUPPORT_WIDE_CHARS |
558 | 44.8k | PCRE2_SPTR xclass_flags; |
559 | 44.8k | #endif |
560 | 44.8k | const uint8_t *class_bitset; |
561 | 44.8k | const uint8_t *set1, *set2, *set_end; |
562 | 44.8k | uint32_t chr; |
563 | 44.8k | BOOL accepted, invert_bits; |
564 | 44.8k | BOOL entered_a_group = FALSE; |
565 | | |
566 | 44.8k | if (--(*rec_limit) <= 0) return FALSE; /* Recursion has gone too deep */ |
567 | | |
568 | | /* Note: the base_list[1] contains whether the current opcode has a greedy |
569 | | (represented by a non-zero value) quantifier. This is a different from |
570 | | other character type lists, which store here that the character iterator |
571 | | matches to an empty string (also represented by a non-zero value). */ |
572 | | |
573 | 44.8k | for(;;) |
574 | 56.2k | { |
575 | 56.2k | PCRE2_SPTR bracode; |
576 | | |
577 | | /* All operations move the code pointer forward. |
578 | | Therefore infinite recursions are not possible. */ |
579 | | |
580 | 56.2k | c = *code; |
581 | | |
582 | | /* Skip over callouts */ |
583 | | |
584 | 56.2k | if (c == OP_CALLOUT) |
585 | 0 | { |
586 | 0 | code += PRIV(OP_lengths)[c]; |
587 | 0 | continue; |
588 | 0 | } |
589 | | |
590 | 56.2k | if (c == OP_CALLOUT_STR) |
591 | 0 | { |
592 | 0 | code += GET(code, 1 + 2*LINK_SIZE); |
593 | 0 | continue; |
594 | 0 | } |
595 | | |
596 | | /* At the end of a branch, skip to the end of the group and process it. */ |
597 | | |
598 | 56.2k | if (c == OP_ALT) |
599 | 1.57k | { |
600 | 39.6k | do code += GET(code, 1); while (*code == OP_ALT); |
601 | 1.57k | c = *code; |
602 | 1.57k | } |
603 | | |
604 | | /* Inspect the next opcode. */ |
605 | | |
606 | 56.2k | switch(c) |
607 | 56.2k | { |
608 | | /* We can always possessify a greedy iterator at the end of the pattern, |
609 | | which is reached after skipping over the final OP_KET. A non-greedy |
610 | | iterator must never be possessified. */ |
611 | | |
612 | 865 | case OP_END: |
613 | 865 | return base_list[1] != 0; |
614 | | |
615 | | /* When an iterator is at the end of certain kinds of group we can inspect |
616 | | what follows the group by skipping over the closing ket. Note that this |
617 | | does not apply to OP_KETRMAX or OP_KETRMIN because what follows any given |
618 | | iteration is variable (could be another iteration or could be the next |
619 | | item). As these two opcodes are not listed in the next switch, they will |
620 | | end up as the next code to inspect, and return FALSE by virtue of being |
621 | | unsupported. */ |
622 | | |
623 | 1.69k | case OP_KET: |
624 | 1.69k | case OP_KETRPOS: |
625 | | /* The non-greedy case cannot be converted to a possessive form. */ |
626 | | |
627 | 1.69k | if (base_list[1] == 0) return FALSE; |
628 | | |
629 | | /* If the bracket is capturing it might be referenced by an OP_RECURSE |
630 | | so its last iterator can never be possessified if the pattern contains |
631 | | recursions. (This could be improved by keeping a list of group numbers that |
632 | | are called by recursion.) */ |
633 | | |
634 | 1.61k | bracode = code - GET(code, 1); |
635 | 1.61k | switch(*bracode) |
636 | 1.61k | { |
637 | 645 | case OP_CBRA: |
638 | 645 | case OP_SCBRA: |
639 | 645 | case OP_CBRAPOS: |
640 | 645 | case OP_SCBRAPOS: |
641 | 645 | if (cb->had_recurse) return FALSE; |
642 | 632 | break; |
643 | | |
644 | | /* A script run might have to backtrack if the iterated item can match |
645 | | characters from more than one script. So give up unless repeating an |
646 | | explicit character. */ |
647 | | |
648 | 632 | case OP_SCRIPT_RUN: |
649 | 0 | if (base_list[0] != OP_CHAR && base_list[0] != OP_CHARI) |
650 | 0 | return FALSE; |
651 | 0 | break; |
652 | | |
653 | | /* Atomic sub-patterns and forward assertions can always auto-possessify |
654 | | their last iterator. However, if the group was entered as a result of |
655 | | checking a previous iterator, this is not possible. */ |
656 | | |
657 | 32 | case OP_ASSERT: |
658 | 39 | case OP_ASSERT_NOT: |
659 | 39 | case OP_ONCE: |
660 | 39 | return !entered_a_group; |
661 | | |
662 | | /* Fixed-length lookbehinds can be treated the same way, but variable |
663 | | length lookbehinds must not auto-possessify their last iterator. Note |
664 | | that in order to identify a variable length lookbehind we must check |
665 | | through all branches, because some may be of fixed length. */ |
666 | | |
667 | 0 | case OP_ASSERTBACK: |
668 | 0 | case OP_ASSERTBACK_NOT: |
669 | 0 | do |
670 | 0 | { |
671 | 0 | if (bracode[1+LINK_SIZE] == OP_VREVERSE) return FALSE; /* Variable */ |
672 | 0 | bracode += GET(bracode, 1); |
673 | 0 | } |
674 | 0 | while (*bracode == OP_ALT); |
675 | 0 | return !entered_a_group; /* Not variable length */ |
676 | | |
677 | | /* Non-atomic assertions - don't possessify last iterator. This needs |
678 | | more thought. */ |
679 | | |
680 | 0 | case OP_ASSERT_NA: |
681 | 0 | case OP_ASSERTBACK_NA: |
682 | 0 | return FALSE; |
683 | 1.61k | } |
684 | | |
685 | | /* Skip over the bracket and inspect what comes next. */ |
686 | | |
687 | 1.56k | code += PRIV(OP_lengths)[c]; |
688 | 1.56k | continue; |
689 | | |
690 | | /* Handle cases where the next item is a group. */ |
691 | | |
692 | 1 | case OP_ONCE: |
693 | 1 | case OP_BRA: |
694 | 109 | case OP_CBRA: |
695 | 109 | next_code = code + GET(code, 1); |
696 | 109 | code += PRIV(OP_lengths)[c]; |
697 | | |
698 | | /* Check each branch. We have to recurse a level for all but the last |
699 | | branch. */ |
700 | | |
701 | 1.09k | while (*next_code == OP_ALT) |
702 | 998 | { |
703 | 998 | if (!compare_opcodes(code, utf, ucp, cb, base_list, base_end, rec_limit)) |
704 | 15 | return FALSE; |
705 | 983 | code = next_code + 1 + LINK_SIZE; |
706 | 983 | next_code += GET(next_code, 1); |
707 | 983 | } |
708 | | |
709 | 94 | entered_a_group = TRUE; |
710 | 94 | continue; |
711 | | |
712 | 0 | case OP_BRAZERO: |
713 | 0 | case OP_BRAMINZERO: |
714 | |
|
715 | 0 | next_code = code + 1; |
716 | 0 | if (*next_code != OP_BRA && *next_code != OP_CBRA && |
717 | 0 | *next_code != OP_ONCE) return FALSE; |
718 | | |
719 | 0 | do next_code += GET(next_code, 1); while (*next_code == OP_ALT); |
720 | | |
721 | | /* The bracket content will be checked by the OP_BRA/OP_CBRA case above. */ |
722 | |
|
723 | 0 | next_code += 1 + LINK_SIZE; |
724 | 0 | if (!compare_opcodes(next_code, utf, ucp, cb, base_list, base_end, |
725 | 0 | rec_limit)) |
726 | 0 | return FALSE; |
727 | | |
728 | 0 | code += PRIV(OP_lengths)[c]; |
729 | 0 | continue; |
730 | | |
731 | | /* The next opcode does not need special handling; fall through and use it |
732 | | to see if the base can be possessified. */ |
733 | | |
734 | 53.6k | default: |
735 | 53.6k | break; |
736 | 56.2k | } |
737 | | |
738 | | /* We now have the next appropriate opcode to compare with the base. Check |
739 | | for a supported opcode, and load its properties. */ |
740 | | |
741 | 53.6k | code = get_chr_property_list(code, utf, ucp, cb->fcc, list); |
742 | 53.6k | if (code == NULL) return FALSE; /* Unsupported */ |
743 | | |
744 | | /* If either opcode is a small character list, set pointers for comparing |
745 | | characters from that list with another list, or with a property. */ |
746 | | |
747 | 52.7k | if (base_list[0] == OP_CHAR) |
748 | 32.2k | { |
749 | 32.2k | chr_ptr = base_list + 2; |
750 | 32.2k | list_ptr = list; |
751 | 32.2k | } |
752 | 20.5k | else if (list[0] == OP_CHAR) |
753 | 12.7k | { |
754 | 12.7k | chr_ptr = list + 2; |
755 | 12.7k | list_ptr = base_list; |
756 | 12.7k | } |
757 | | |
758 | | /* Character bitsets can also be compared to certain opcodes. */ |
759 | | |
760 | 7.84k | else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS |
761 | 7.84k | #if PCRE2_CODE_UNIT_WIDTH == 8 |
762 | | /* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. */ |
763 | 7.84k | || (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS)) |
764 | 7.84k | #endif |
765 | 7.84k | ) |
766 | 3.38k | { |
767 | 3.38k | #if PCRE2_CODE_UNIT_WIDTH == 8 |
768 | 3.38k | if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS)) |
769 | | #else |
770 | | if (base_list[0] == OP_CLASS) |
771 | | #endif |
772 | 2.26k | { |
773 | 2.26k | set1 = (const uint8_t *)(base_end - base_list[2]); |
774 | 2.26k | list_ptr = list; |
775 | 2.26k | } |
776 | 1.12k | else |
777 | 1.12k | { |
778 | 1.12k | set1 = (const uint8_t *)(code - list[2]); |
779 | 1.12k | list_ptr = base_list; |
780 | 1.12k | } |
781 | | |
782 | 3.38k | invert_bits = FALSE; |
783 | 3.38k | switch(list_ptr[0]) |
784 | 3.38k | { |
785 | 656 | case OP_CLASS: |
786 | 893 | case OP_NCLASS: |
787 | 893 | set2 = (const uint8_t *) |
788 | 893 | ((list_ptr == list ? code : base_end) - list_ptr[2]); |
789 | 893 | break; |
790 | | |
791 | 0 | #ifdef SUPPORT_WIDE_CHARS |
792 | 132 | case OP_XCLASS: |
793 | 132 | xclass_flags = (list_ptr == list ? code : base_end) - |
794 | 132 | list_ptr[2] + LINK_SIZE; |
795 | 132 | if ((*xclass_flags & XCL_HASPROP) != 0) return FALSE; |
796 | 1 | if ((*xclass_flags & XCL_MAP) == 0) |
797 | 0 | { |
798 | | /* No bits are set for characters < 256. */ |
799 | 0 | if (list[1] == 0) return (*xclass_flags & XCL_NOT) == 0; |
800 | | /* Might be an empty repeat. */ |
801 | 0 | continue; |
802 | 0 | } |
803 | 1 | set2 = (const uint8_t *)(xclass_flags + 1); |
804 | 1 | break; |
805 | 0 | #endif |
806 | | |
807 | 319 | case OP_NOT_DIGIT: |
808 | 319 | invert_bits = TRUE; |
809 | | /* Fall through */ |
810 | 887 | case OP_DIGIT: |
811 | 887 | set2 = (const uint8_t *)(cb->cbits + cbit_digit); |
812 | 887 | break; |
813 | | |
814 | 0 | case OP_NOT_WHITESPACE: |
815 | 0 | invert_bits = TRUE; |
816 | | /* Fall through */ |
817 | 91 | case OP_WHITESPACE: |
818 | 91 | set2 = (const uint8_t *)(cb->cbits + cbit_space); |
819 | 91 | break; |
820 | | |
821 | 65 | case OP_NOT_WORDCHAR: |
822 | 65 | invert_bits = TRUE; |
823 | | /* Fall through */ |
824 | 278 | case OP_WORDCHAR: |
825 | 278 | set2 = (const uint8_t *)(cb->cbits + cbit_word); |
826 | 278 | break; |
827 | | |
828 | 1.10k | default: |
829 | 1.10k | return FALSE; |
830 | 3.38k | } |
831 | | |
832 | | /* Because the bit sets are unaligned bytes, we need to perform byte |
833 | | comparison here. */ |
834 | | |
835 | 2.15k | set_end = set1 + 32; |
836 | 2.15k | if (invert_bits) |
837 | 384 | { |
838 | 384 | do |
839 | 4.78k | { |
840 | 4.78k | if ((*set1++ & ~(*set2++)) != 0) return FALSE; |
841 | 4.78k | } |
842 | 4.54k | while (set1 < set_end); |
843 | 384 | } |
844 | 1.76k | else |
845 | 1.76k | { |
846 | 1.76k | do |
847 | 20.2k | { |
848 | 20.2k | if ((*set1++ & *set2++) != 0) return FALSE; |
849 | 20.2k | } |
850 | 18.8k | while (set1 < set_end); |
851 | 1.76k | } |
852 | | |
853 | 503 | if (list[1] == 0) return TRUE; |
854 | | /* Might be an empty repeat. */ |
855 | 351 | continue; |
856 | 503 | } |
857 | | |
858 | | /* Some property combinations also acceptable. Unicode property opcodes are |
859 | | processed specially; the rest can be handled with a lookup table. */ |
860 | | |
861 | 4.45k | else |
862 | 4.45k | { |
863 | 4.45k | uint32_t leftop, rightop; |
864 | | |
865 | 4.45k | leftop = base_list[0]; |
866 | 4.45k | rightop = list[0]; |
867 | | |
868 | 4.45k | #ifdef SUPPORT_UNICODE |
869 | 4.45k | accepted = FALSE; /* Always set in non-unicode case. */ |
870 | 4.45k | if (leftop == OP_PROP || leftop == OP_NOTPROP) |
871 | 88 | { |
872 | 88 | if (rightop == OP_EOD) |
873 | 0 | accepted = TRUE; |
874 | 88 | else if (rightop == OP_PROP || rightop == OP_NOTPROP) |
875 | 22 | { |
876 | 22 | int n; |
877 | 22 | const uint8_t *p; |
878 | 22 | BOOL same = leftop == rightop; |
879 | 22 | BOOL lisprop = leftop == OP_PROP; |
880 | 22 | BOOL risprop = rightop == OP_PROP; |
881 | 22 | BOOL bothprop = lisprop && risprop; |
882 | | |
883 | | /* There's a table that specifies how each combination is to be |
884 | | processed: |
885 | | 0 Always return FALSE (never auto-possessify) |
886 | | 1 Character groups are distinct (possessify if both are OP_PROP) |
887 | | 2 Check character categories in the same group (general or particular) |
888 | | 3 Return TRUE if the two opcodes are not the same |
889 | | ... see comments below |
890 | | */ |
891 | | |
892 | 22 | n = propposstab[base_list[2]][list[2]]; |
893 | 22 | switch(n) |
894 | 22 | { |
895 | 0 | case 0: break; |
896 | 0 | case 1: accepted = bothprop; break; |
897 | 1 | case 2: accepted = (base_list[3] == list[3]) != same; break; |
898 | 13 | case 3: accepted = !same; break; |
899 | | |
900 | 0 | case 4: /* Left general category, right particular category */ |
901 | 0 | accepted = risprop && catposstab[base_list[3]][list[3]] == same; |
902 | 0 | break; |
903 | | |
904 | 0 | case 5: /* Right general category, left particular category */ |
905 | 0 | accepted = lisprop && catposstab[list[3]][base_list[3]] == same; |
906 | 0 | break; |
907 | | |
908 | | /* This code is logically tricky. Think hard before fiddling with it. |
909 | | The posspropstab table has four entries per row. Each row relates to |
910 | | one of PCRE's special properties such as ALNUM or SPACE or WORD. |
911 | | Only WORD actually needs all four entries, but using repeats for the |
912 | | others means they can all use the same code below. |
913 | | |
914 | | The first two entries in each row are Unicode general categories, and |
915 | | apply always, because all the characters they include are part of the |
916 | | PCRE character set. The third and fourth entries are a general and a |
917 | | particular category, respectively, that include one or more relevant |
918 | | characters. One or the other is used, depending on whether the check |
919 | | is for a general or a particular category. However, in both cases the |
920 | | category contains more characters than the specials that are defined |
921 | | for the property being tested against. Therefore, it cannot be used |
922 | | in a NOTPROP case. |
923 | | |
924 | | Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po. |
925 | | Underscore is covered by ucp_P or ucp_Po. */ |
926 | | |
927 | 0 | case 6: /* Left alphanum vs right general category */ |
928 | 0 | case 7: /* Left space vs right general category */ |
929 | 0 | case 8: /* Left word vs right general category */ |
930 | 0 | p = posspropstab[n-6]; |
931 | 0 | accepted = risprop && lisprop == |
932 | 0 | (list[3] != p[0] && |
933 | 0 | list[3] != p[1] && |
934 | 0 | (list[3] != p[2] || !lisprop)); |
935 | 0 | break; |
936 | | |
937 | 0 | case 9: /* Right alphanum vs left general category */ |
938 | 0 | case 10: /* Right space vs left general category */ |
939 | 0 | case 11: /* Right word vs left general category */ |
940 | 0 | p = posspropstab[n-9]; |
941 | 0 | accepted = lisprop && risprop == |
942 | 0 | (base_list[3] != p[0] && |
943 | 0 | base_list[3] != p[1] && |
944 | 0 | (base_list[3] != p[2] || !risprop)); |
945 | 0 | break; |
946 | | |
947 | 0 | case 12: /* Left alphanum vs right particular category */ |
948 | 8 | case 13: /* Left space vs right particular category */ |
949 | 8 | case 14: /* Left word vs right particular category */ |
950 | 8 | p = posspropstab[n-12]; |
951 | 8 | accepted = risprop && lisprop == |
952 | 7 | (catposstab[p[0]][list[3]] && |
953 | 7 | catposstab[p[1]][list[3]] && |
954 | 7 | (list[3] != p[3] || !lisprop)); |
955 | 8 | break; |
956 | | |
957 | 0 | case 15: /* Right alphanum vs left particular category */ |
958 | 0 | case 16: /* Right space vs left particular category */ |
959 | 0 | case 17: /* Right word vs left particular category */ |
960 | 0 | p = posspropstab[n-15]; |
961 | 0 | accepted = lisprop && risprop == |
962 | 0 | (catposstab[p[0]][base_list[3]] && |
963 | 0 | catposstab[p[1]][base_list[3]] && |
964 | 0 | (base_list[3] != p[3] || !risprop)); |
965 | 0 | break; |
966 | 22 | } |
967 | 22 | } |
968 | 88 | } |
969 | | |
970 | 4.37k | else |
971 | 4.37k | #endif /* SUPPORT_UNICODE */ |
972 | | |
973 | 4.37k | accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP && |
974 | 4.37k | rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP && |
975 | 4.37k | autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP]; |
976 | | |
977 | 4.45k | if (!accepted) return FALSE; |
978 | | |
979 | 1.35k | if (list[1] == 0) return TRUE; |
980 | | /* Might be an empty repeat. */ |
981 | 845 | continue; |
982 | 1.35k | } |
983 | | |
984 | | /* Control reaches here only if one of the items is a small character list. |
985 | | All characters are checked against the other side. */ |
986 | | |
987 | 44.9k | do |
988 | 48.4k | { |
989 | 48.4k | chr = *chr_ptr; |
990 | | |
991 | 48.4k | switch(list_ptr[0]) |
992 | 48.4k | { |
993 | 22.2k | case OP_CHAR: |
994 | 22.2k | ochr_ptr = list_ptr + 2; |
995 | 22.2k | do |
996 | 26.2k | { |
997 | 26.2k | if (chr == *ochr_ptr) return FALSE; |
998 | 25.0k | ochr_ptr++; |
999 | 25.0k | } |
1000 | 25.0k | while(*ochr_ptr != NOTACHAR); |
1001 | 21.0k | break; |
1002 | | |
1003 | 21.0k | case OP_NOT: |
1004 | 1.45k | ochr_ptr = list_ptr + 2; |
1005 | 1.45k | do |
1006 | 1.68k | { |
1007 | 1.68k | if (chr == *ochr_ptr) |
1008 | 124 | break; |
1009 | 1.56k | ochr_ptr++; |
1010 | 1.56k | } |
1011 | 1.56k | while(*ochr_ptr != NOTACHAR); |
1012 | 1.45k | if (*ochr_ptr == NOTACHAR) return FALSE; /* Not found */ |
1013 | 124 | break; |
1014 | | |
1015 | | /* Note that OP_DIGIT etc. are generated only when PCRE2_UCP is *not* |
1016 | | set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */ |
1017 | | |
1018 | 1.40k | case OP_DIGIT: |
1019 | 1.40k | if (chr < 256 && (cb->ctypes[chr] & ctype_digit) != 0) return FALSE; |
1020 | 1.38k | break; |
1021 | | |
1022 | 1.38k | case OP_NOT_DIGIT: |
1023 | 1.17k | if (chr > 255 || (cb->ctypes[chr] & ctype_digit) == 0) return FALSE; |
1024 | 14 | break; |
1025 | | |
1026 | 509 | case OP_WHITESPACE: |
1027 | 509 | if (chr < 256 && (cb->ctypes[chr] & ctype_space) != 0) return FALSE; |
1028 | 498 | break; |
1029 | | |
1030 | 498 | case OP_NOT_WHITESPACE: |
1031 | 55 | if (chr > 255 || (cb->ctypes[chr] & ctype_space) == 0) return FALSE; |
1032 | 0 | break; |
1033 | | |
1034 | 2.58k | case OP_WORDCHAR: |
1035 | 2.58k | if (chr < 255 && (cb->ctypes[chr] & ctype_word) != 0) return FALSE; |
1036 | 2.05k | break; |
1037 | | |
1038 | 2.05k | case OP_NOT_WORDCHAR: |
1039 | 543 | if (chr > 255 || (cb->ctypes[chr] & ctype_word) == 0) return FALSE; |
1040 | 363 | break; |
1041 | | |
1042 | 363 | case OP_HSPACE: |
1043 | 55 | switch(chr) |
1044 | 55 | { |
1045 | 0 | HSPACE_CASES: return FALSE; |
1046 | 55 | default: break; |
1047 | 55 | } |
1048 | 55 | break; |
1049 | | |
1050 | 287 | case OP_NOT_HSPACE: |
1051 | 287 | switch(chr) |
1052 | 287 | { |
1053 | 3 | HSPACE_CASES: break; |
1054 | 284 | default: return FALSE; |
1055 | 287 | } |
1056 | 3 | break; |
1057 | | |
1058 | 3.25k | case OP_ANYNL: |
1059 | 3.45k | case OP_VSPACE: |
1060 | 3.45k | switch(chr) |
1061 | 3.45k | { |
1062 | 180 | VSPACE_CASES: return FALSE; |
1063 | 3.27k | default: break; |
1064 | 3.45k | } |
1065 | 3.27k | break; |
1066 | | |
1067 | 3.27k | case OP_NOT_VSPACE: |
1068 | 634 | switch(chr) |
1069 | 634 | { |
1070 | 8 | VSPACE_CASES: break; |
1071 | 626 | default: return FALSE; |
1072 | 634 | } |
1073 | 8 | break; |
1074 | | |
1075 | 204 | case OP_DOLL: |
1076 | 210 | case OP_EODN: |
1077 | 210 | switch (chr) |
1078 | 210 | { |
1079 | 92 | case CHAR_CR: |
1080 | 92 | case CHAR_LF: |
1081 | 95 | case CHAR_VT: |
1082 | 95 | case CHAR_FF: |
1083 | 95 | case CHAR_NEL: |
1084 | 95 | #ifndef EBCDIC |
1085 | 95 | case 0x2028: |
1086 | 95 | case 0x2029: |
1087 | 95 | #endif /* Not EBCDIC */ |
1088 | 95 | return FALSE; |
1089 | 210 | } |
1090 | 115 | break; |
1091 | | |
1092 | 115 | case OP_EOD: /* Can always possessify before \z */ |
1093 | 80 | break; |
1094 | | |
1095 | 0 | #ifdef SUPPORT_UNICODE |
1096 | 613 | case OP_PROP: |
1097 | 643 | case OP_NOTPROP: |
1098 | 643 | if (!check_char_prop(chr, list_ptr[2], list_ptr[3], |
1099 | 643 | list_ptr[0] == OP_NOTPROP)) |
1100 | 246 | return FALSE; |
1101 | 397 | break; |
1102 | 397 | #endif |
1103 | | |
1104 | 2.78k | case OP_NCLASS: |
1105 | 2.78k | if (chr > 255) return FALSE; |
1106 | | /* Fall through */ |
1107 | | |
1108 | 10.8k | case OP_CLASS: |
1109 | 10.8k | if (chr > 255) break; |
1110 | 10.7k | class_bitset = (const uint8_t *) |
1111 | 10.7k | ((list_ptr == list ? code : base_end) - list_ptr[2]); |
1112 | 10.7k | if ((class_bitset[chr >> 3] & (1u << (chr & 7))) != 0) return FALSE; |
1113 | 6.33k | break; |
1114 | | |
1115 | 6.33k | #ifdef SUPPORT_WIDE_CHARS |
1116 | 6.33k | case OP_XCLASS: |
1117 | 1.32k | if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) - |
1118 | 1.32k | list_ptr[2] + LINK_SIZE, (const uint8_t*)cb->start_code, utf)) |
1119 | 680 | return FALSE; |
1120 | 644 | break; |
1121 | | |
1122 | 644 | case OP_ECLASS: |
1123 | 0 | if (PRIV(eclass)(chr, |
1124 | 0 | (list_ptr == list ? code : base_end) - list_ptr[2] + LINK_SIZE, |
1125 | 0 | (list_ptr == list ? code : base_end) - list_ptr[3], |
1126 | 0 | (const uint8_t*)cb->start_code, utf)) |
1127 | 0 | return FALSE; |
1128 | 0 | break; |
1129 | 0 | #endif /* SUPPORT_WIDE_CHARS */ |
1130 | | |
1131 | 944 | default: |
1132 | 944 | return FALSE; |
1133 | 48.4k | } |
1134 | | |
1135 | 36.4k | chr_ptr++; |
1136 | 36.4k | } |
1137 | 44.9k | while(*chr_ptr != NOTACHAR); |
1138 | | |
1139 | | /* At least one character must be matched from this opcode. */ |
1140 | | |
1141 | 32.9k | if (list[1] == 0) return TRUE; |
1142 | 32.9k | } |
1143 | | |
1144 | 0 | PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */ |
1145 | 0 | return FALSE; /* Avoid compiler warnings */ |
1146 | 44.8k | } |
1147 | | |
1148 | | |
1149 | | |
1150 | | /************************************************* |
1151 | | * Scan compiled regex for auto-possession * |
1152 | | *************************************************/ |
1153 | | |
1154 | | /* Replaces single character iterations with their possessive alternatives |
1155 | | if appropriate. This function modifies the compiled opcode! Hitting a |
1156 | | non-existent opcode may indicate a bug in PCRE2, but it can also be caused if a |
1157 | | bad UTF string was compiled with PCRE2_NO_UTF_CHECK. The rec_limit catches |
1158 | | overly complicated or large patterns. In these cases, the check just stops, |
1159 | | leaving the remainder of the pattern unpossessified. |
1160 | | |
1161 | | Arguments: |
1162 | | code points to start of the byte code |
1163 | | cb compile data block |
1164 | | |
1165 | | Returns: 0 for success |
1166 | | -1 if a non-existant opcode is encountered |
1167 | | */ |
1168 | | |
1169 | | int |
1170 | | PRIV(auto_possessify)(PCRE2_UCHAR *code, const compile_block *cb) |
1171 | 1.16k | { |
1172 | 1.16k | PCRE2_UCHAR c; |
1173 | 1.16k | PCRE2_SPTR end; |
1174 | 1.16k | PCRE2_UCHAR *repeat_opcode; |
1175 | 1.16k | uint32_t list[MAX_LIST]; |
1176 | 1.16k | int rec_limit = 1000; /* Was 10,000 but clang+ASAN uses a lot of stack. */ |
1177 | 1.16k | BOOL utf = (cb->external_options & PCRE2_UTF) != 0; |
1178 | 1.16k | BOOL ucp = (cb->external_options & PCRE2_UCP) != 0; |
1179 | | |
1180 | 1.16k | for (;;) |
1181 | 539k | { |
1182 | 539k | c = *code; |
1183 | | |
1184 | 539k | if (c >= OP_TABLE_LENGTH) |
1185 | 0 | { |
1186 | 0 | PCRE2_DEBUG_UNREACHABLE(); |
1187 | 0 | return -1; /* Something gone wrong */ |
1188 | 0 | } |
1189 | | |
1190 | 539k | if (c >= OP_STAR && c <= OP_TYPEPOSUPTO) |
1191 | 37.7k | { |
1192 | 37.7k | c -= get_repeat_base(c) - OP_STAR; |
1193 | 37.7k | end = (c <= OP_MINUPTO) ? |
1194 | 36.9k | get_chr_property_list(code, utf, ucp, cb->fcc, list) : NULL; |
1195 | 37.7k | list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO; |
1196 | | |
1197 | 37.7k | if (end != NULL && compare_opcodes(end, utf, ucp, cb, list, end, |
1198 | 36.9k | &rec_limit)) |
1199 | 22.5k | { |
1200 | 22.5k | switch(c) |
1201 | 22.5k | { |
1202 | 1.81k | case OP_STAR: |
1203 | 1.81k | *code += OP_POSSTAR - OP_STAR; |
1204 | 1.81k | break; |
1205 | | |
1206 | 749 | case OP_MINSTAR: |
1207 | 749 | *code += OP_POSSTAR - OP_MINSTAR; |
1208 | 749 | break; |
1209 | | |
1210 | 4.54k | case OP_PLUS: |
1211 | 4.54k | *code += OP_POSPLUS - OP_PLUS; |
1212 | 4.54k | break; |
1213 | | |
1214 | 1.27k | case OP_MINPLUS: |
1215 | 1.27k | *code += OP_POSPLUS - OP_MINPLUS; |
1216 | 1.27k | break; |
1217 | | |
1218 | 12.7k | case OP_QUERY: |
1219 | 12.7k | *code += OP_POSQUERY - OP_QUERY; |
1220 | 12.7k | break; |
1221 | | |
1222 | 1.44k | case OP_MINQUERY: |
1223 | 1.44k | *code += OP_POSQUERY - OP_MINQUERY; |
1224 | 1.44k | break; |
1225 | | |
1226 | 0 | case OP_UPTO: |
1227 | 0 | *code += OP_POSUPTO - OP_UPTO; |
1228 | 0 | break; |
1229 | | |
1230 | 0 | case OP_MINUPTO: |
1231 | 0 | *code += OP_POSUPTO - OP_MINUPTO; |
1232 | 0 | break; |
1233 | 22.5k | } |
1234 | 22.5k | } |
1235 | 37.7k | c = *code; |
1236 | 37.7k | } |
1237 | 502k | else if (c == OP_CLASS || c == OP_NCLASS |
1238 | 502k | #ifdef SUPPORT_WIDE_CHARS |
1239 | 502k | || c == OP_XCLASS || c == OP_ECLASS |
1240 | 502k | #endif |
1241 | 502k | ) |
1242 | 9.78k | { |
1243 | 9.78k | #ifdef SUPPORT_WIDE_CHARS |
1244 | 9.78k | if (c == OP_XCLASS || c == OP_ECLASS) |
1245 | 1.40k | repeat_opcode = code + GET(code, 1); |
1246 | 8.38k | else |
1247 | 8.38k | #endif |
1248 | 8.38k | repeat_opcode = code + 1 + (32 / sizeof(PCRE2_UCHAR)); |
1249 | | |
1250 | 9.78k | c = *repeat_opcode; |
1251 | 9.78k | if (c >= OP_CRSTAR && c <= OP_CRMINRANGE) |
1252 | 6.96k | { |
1253 | | /* The return from get_chr_property_list() will never be NULL when |
1254 | | *code (aka c) is one of the four class opcodes. However, gcc with |
1255 | | -fanalyzer notes that a NULL return is possible, and grumbles. Hence we |
1256 | | put in a check. */ |
1257 | | |
1258 | 6.96k | end = get_chr_property_list(code, utf, ucp, cb->fcc, list); |
1259 | 6.96k | list[1] = (c & 1) == 0; |
1260 | | |
1261 | 6.96k | if (end != NULL && |
1262 | 6.96k | compare_opcodes(end, utf, ucp, cb, list, end, &rec_limit)) |
1263 | 2.40k | { |
1264 | 2.40k | switch (c) |
1265 | 2.40k | { |
1266 | 554 | case OP_CRSTAR: |
1267 | 646 | case OP_CRMINSTAR: |
1268 | 646 | *repeat_opcode = OP_CRPOSSTAR; |
1269 | 646 | break; |
1270 | | |
1271 | 490 | case OP_CRPLUS: |
1272 | 510 | case OP_CRMINPLUS: |
1273 | 510 | *repeat_opcode = OP_CRPOSPLUS; |
1274 | 510 | break; |
1275 | | |
1276 | 911 | case OP_CRQUERY: |
1277 | 1.24k | case OP_CRMINQUERY: |
1278 | 1.24k | *repeat_opcode = OP_CRPOSQUERY; |
1279 | 1.24k | break; |
1280 | | |
1281 | 0 | case OP_CRRANGE: |
1282 | 0 | case OP_CRMINRANGE: |
1283 | 0 | *repeat_opcode = OP_CRPOSRANGE; |
1284 | 0 | break; |
1285 | 2.40k | } |
1286 | 2.40k | } |
1287 | 6.96k | } |
1288 | 9.78k | c = *code; |
1289 | 9.78k | } |
1290 | | |
1291 | 539k | switch(c) |
1292 | 539k | { |
1293 | 1.16k | case OP_END: |
1294 | 1.16k | return 0; |
1295 | | |
1296 | 122 | case OP_TYPESTAR: |
1297 | 130 | case OP_TYPEMINSTAR: |
1298 | 1.10k | case OP_TYPEPLUS: |
1299 | 1.51k | case OP_TYPEMINPLUS: |
1300 | 3.74k | case OP_TYPEQUERY: |
1301 | 4.04k | case OP_TYPEMINQUERY: |
1302 | 4.10k | case OP_TYPEPOSSTAR: |
1303 | 5.28k | case OP_TYPEPOSPLUS: |
1304 | 9.62k | case OP_TYPEPOSQUERY: |
1305 | 9.62k | if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; |
1306 | 9.62k | break; |
1307 | | |
1308 | 0 | case OP_TYPEUPTO: |
1309 | 0 | case OP_TYPEMINUPTO: |
1310 | 0 | case OP_TYPEEXACT: |
1311 | 0 | case OP_TYPEPOSUPTO: |
1312 | 0 | if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP) |
1313 | 0 | code += 2; |
1314 | 0 | break; |
1315 | | |
1316 | 0 | case OP_CALLOUT_STR: |
1317 | 0 | code += GET(code, 1 + 2*LINK_SIZE); |
1318 | 0 | break; |
1319 | | |
1320 | 0 | #ifdef SUPPORT_WIDE_CHARS |
1321 | 1.40k | case OP_XCLASS: |
1322 | 1.40k | case OP_ECLASS: |
1323 | 1.40k | code += GET(code, 1); |
1324 | 1.40k | break; |
1325 | 0 | #endif |
1326 | | |
1327 | 49 | case OP_MARK: |
1328 | 49 | case OP_COMMIT_ARG: |
1329 | 49 | case OP_PRUNE_ARG: |
1330 | 49 | case OP_SKIP_ARG: |
1331 | 49 | case OP_THEN_ARG: |
1332 | 49 | code += code[1]; |
1333 | 49 | break; |
1334 | 539k | } |
1335 | | |
1336 | | /* Add in the fixed length from the table */ |
1337 | | |
1338 | 538k | code += PRIV(OP_lengths)[c]; |
1339 | | |
1340 | | /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may be |
1341 | | followed by a multi-byte character. The length in the table is a minimum, so |
1342 | | we have to arrange to skip the extra code units. */ |
1343 | | |
1344 | 538k | #ifdef MAYBE_UTF_MULTI |
1345 | 538k | if (utf) switch(c) |
1346 | 30.2k | { |
1347 | 2.46k | case OP_CHAR: |
1348 | 20.3k | case OP_CHARI: |
1349 | 20.3k | case OP_NOT: |
1350 | 20.3k | case OP_NOTI: |
1351 | 20.3k | case OP_STAR: |
1352 | 20.3k | case OP_MINSTAR: |
1353 | 20.4k | case OP_PLUS: |
1354 | 20.4k | case OP_MINPLUS: |
1355 | 20.4k | case OP_QUERY: |
1356 | 20.4k | case OP_MINQUERY: |
1357 | 20.4k | case OP_UPTO: |
1358 | 20.4k | case OP_MINUPTO: |
1359 | 20.4k | case OP_EXACT: |
1360 | 20.4k | case OP_POSSTAR: |
1361 | 20.4k | case OP_POSPLUS: |
1362 | 20.5k | case OP_POSQUERY: |
1363 | 20.5k | case OP_POSUPTO: |
1364 | 20.5k | case OP_STARI: |
1365 | 20.5k | case OP_MINSTARI: |
1366 | 20.5k | case OP_PLUSI: |
1367 | 20.5k | case OP_MINPLUSI: |
1368 | 20.6k | case OP_QUERYI: |
1369 | 20.7k | case OP_MINQUERYI: |
1370 | 20.7k | case OP_UPTOI: |
1371 | 20.7k | case OP_MINUPTOI: |
1372 | 20.7k | case OP_EXACTI: |
1373 | 20.7k | case OP_POSSTARI: |
1374 | 21.2k | case OP_POSPLUSI: |
1375 | 21.7k | case OP_POSQUERYI: |
1376 | 21.7k | case OP_POSUPTOI: |
1377 | 21.7k | case OP_NOTSTAR: |
1378 | 21.7k | case OP_NOTMINSTAR: |
1379 | 21.7k | case OP_NOTPLUS: |
1380 | 21.7k | case OP_NOTMINPLUS: |
1381 | 21.7k | case OP_NOTQUERY: |
1382 | 21.7k | case OP_NOTMINQUERY: |
1383 | 21.7k | case OP_NOTUPTO: |
1384 | 21.7k | case OP_NOTMINUPTO: |
1385 | 21.7k | case OP_NOTEXACT: |
1386 | 21.7k | case OP_NOTPOSSTAR: |
1387 | 21.7k | case OP_NOTPOSPLUS: |
1388 | 21.7k | case OP_NOTPOSQUERY: |
1389 | 21.7k | case OP_NOTPOSUPTO: |
1390 | 21.7k | case OP_NOTSTARI: |
1391 | 21.7k | case OP_NOTMINSTARI: |
1392 | 21.7k | case OP_NOTPLUSI: |
1393 | 21.7k | case OP_NOTMINPLUSI: |
1394 | 21.8k | case OP_NOTQUERYI: |
1395 | 22.0k | case OP_NOTMINQUERYI: |
1396 | 22.0k | case OP_NOTUPTOI: |
1397 | 22.0k | case OP_NOTMINUPTOI: |
1398 | 22.0k | case OP_NOTEXACTI: |
1399 | 22.0k | case OP_NOTPOSSTARI: |
1400 | 22.0k | case OP_NOTPOSPLUSI: |
1401 | 22.0k | case OP_NOTPOSQUERYI: |
1402 | 22.0k | case OP_NOTPOSUPTOI: |
1403 | 22.0k | if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]); |
1404 | 22.0k | break; |
1405 | 30.2k | } |
1406 | | #else |
1407 | | (void)(utf); /* Keep compiler happy by referencing function argument */ |
1408 | | #endif /* SUPPORT_WIDE_CHARS */ |
1409 | 538k | } |
1410 | 1.16k | } |
1411 | | |
1412 | | /* End of pcre2_auto_possess.c */ |