/src/glib-2.80.0/subprojects/pcre2-10.42/src/pcre2_auto_possess.c
Line | Count | Source |
1 | | /************************************************* |
2 | | * Perl-Compatible Regular Expressions * |
3 | | *************************************************/ |
4 | | |
5 | | /* PCRE is a library of functions to support regular expressions whose syntax |
6 | | and semantics are as close as possible to those of the Perl 5 language. |
7 | | |
8 | | Written by Philip Hazel |
9 | | Original API code Copyright (c) 1997-2012 University of Cambridge |
10 | | New API code Copyright (c) 2016-2022 University of Cambridge |
11 | | |
12 | | ----------------------------------------------------------------------------- |
13 | | Redistribution and use in source and binary forms, with or without |
14 | | modification, are permitted provided that the following conditions are met: |
15 | | |
16 | | * Redistributions of source code must retain the above copyright notice, |
17 | | this list of conditions and the following disclaimer. |
18 | | |
19 | | * Redistributions in binary form must reproduce the above copyright |
20 | | notice, this list of conditions and the following disclaimer in the |
21 | | documentation and/or other materials provided with the distribution. |
22 | | |
23 | | * Neither the name of the University of Cambridge nor the names of its |
24 | | contributors may be used to endorse or promote products derived from |
25 | | this software without specific prior written permission. |
26 | | |
27 | | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
28 | | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
29 | | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
30 | | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
31 | | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
32 | | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
33 | | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
34 | | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
35 | | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
36 | | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
37 | | POSSIBILITY OF SUCH DAMAGE. |
38 | | ----------------------------------------------------------------------------- |
39 | | */ |
40 | | |
41 | | /* This module contains functions that scan a compiled pattern and change |
42 | | repeats into possessive repeats where possible. */ |
43 | | |
44 | | |
45 | | #ifdef HAVE_CONFIG_H |
46 | | #include "config.h" |
47 | | #endif |
48 | | |
49 | | |
50 | | #include "pcre2_internal.h" |
51 | | |
52 | | |
53 | | /************************************************* |
54 | | * Tables for auto-possessification * |
55 | | *************************************************/ |
56 | | |
57 | | /* This table is used to check whether auto-possessification is possible |
58 | | between adjacent character-type opcodes. The left-hand (repeated) opcode is |
59 | | used to select the row, and the right-hand opcode is used to select the column. |
60 | | A value of 1 means that auto-possessification is OK. For example, the second |
61 | | value in the first row means that \D+\d can be turned into \D++\d. |
62 | | |
63 | | The Unicode property types (\P and \p) have to be present to fill out the table |
64 | | because of what their opcode values are, but the table values should always be |
65 | | zero because property types are handled separately in the code. The last four |
66 | | columns apply to items that cannot be repeated, so there is no need to have |
67 | | rows for them. Note that OP_DIGIT etc. are generated only when PCRE_UCP is |
68 | | *not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */ |
69 | | |
70 | | #define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1) /* row count: opcodes usable as the repeated (left-hand) item */ |
71 | | #define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1) /* column count: opcodes usable as the following (right-hand) item */ |
72 | | |
73 | | static const uint8_t autoposstab[APTROWS][APTCOLS] = { |
74 | | /* \D \d \S \s \W \w . .+ \C \P \p \R \H \h \V \v \X \Z \z $ $M */ |
75 | | { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \D */ |
76 | | { 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \d */ |
77 | | { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \S */ |
78 | | { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \s */ |
79 | | { 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \W */ |
80 | | { 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \w */ |
81 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* . */ |
82 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* .+ */ |
83 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \C */ |
84 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* \P */ |
85 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* \p */ |
86 | | { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 }, /* \R */ |
87 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 }, /* \H */ |
88 | | { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 }, /* \h */ |
89 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 }, /* \V */ |
90 | | { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 }, /* \v */ |
91 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 } /* \X */ |
92 | | }; |
93 | | |
94 | | #ifdef SUPPORT_UNICODE |
95 | | /* This table is used to check whether auto-possessification is possible |
96 | | between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The |
97 | | left-hand (repeated) opcode is used to select the row, and the right-hand |
98 | | opcode is used to select the column. The values are as follows: |
99 | | |
100 | | 0 Always return FALSE (never auto-possessify) |
101 | | 1 Character groups are distinct (possessify if both are OP_PROP) |
102 | | 2 Check character categories in the same group (general or particular) |
103 | | 3 TRUE if the two opcodes are not the same (PROP vs NOTPROP) |
104 | | |
105 | | 4 Check left general category vs right particular category |
106 | | 5 Check right general category vs left particular category |
107 | | |
108 | | 6 Left alphanum vs right general category |
109 | | 7 Left space vs right general category |
110 | | 8 Left word vs right general category |
111 | | |
112 | | 9 Right alphanum vs left general category |
113 | | 10 Right space vs left general category |
114 | | 11 Right word vs left general category |
115 | | |
116 | | 12 Left alphanum vs right particular category |
117 | | 13 Left space vs right particular category |
118 | | 14 Left word vs right particular category |
119 | | |
120 | | 15 Right alphanum vs left particular category |
121 | | 16 Right space vs left particular category |
122 | | 17 Right word vs left particular category |
123 | | */ |
124 | | |
125 | | static const uint8_t propposstab[PT_TABSIZE][PT_TABSIZE] = { /* indexed [left property type][right property type]; each entry is an action code 0..17 */ |
126 | | /* ANY LAMP GC PC SC SCX ALNUM SPACE PXSPACE WORD CLIST UCNC BIDICL BOOL */ |
127 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_ANY */ |
128 | | { 0, 3, 0, 0, 0, 0, 3, 1, 1, 0, 0, 0, 0, 0 }, /* PT_LAMP */ |
129 | | { 0, 0, 2, 4, 0, 0, 9, 10, 10, 11, 0, 0, 0, 0 }, /* PT_GC */ |
130 | | { 0, 0, 5, 2, 0, 0, 15, 16, 16, 17, 0, 0, 0, 0 }, /* PT_PC */ |
131 | | { 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_SC */ |
132 | | { 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_SCX */ |
133 | | { 0, 3, 6, 12, 0, 0, 3, 1, 1, 0, 0, 0, 0, 0 }, /* PT_ALNUM */ |
134 | | { 0, 1, 7, 13, 0, 0, 1, 3, 3, 1, 0, 0, 0, 0 }, /* PT_SPACE */ |
135 | | { 0, 1, 7, 13, 0, 0, 1, 3, 3, 1, 0, 0, 0, 0 }, /* PT_PXSPACE */ |
136 | | { 0, 0, 8, 14, 0, 0, 0, 1, 1, 3, 0, 0, 0, 0 }, /* PT_WORD */ |
137 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_CLIST */ |
138 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0 }, /* PT_UCNC */ |
139 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_BIDICL */ |
140 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } /* PT_BOOL */ |
141 | | }; |
142 | | |
143 | | /* This table is used to check whether auto-possessification is possible |
144 | | between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one |
145 | | specifies a general category and the other specifies a particular category. The |
146 | | row is selected by the general category and the column by the particular |
147 | | category. The value is 1 if the particular category is not part of the general |
148 | | category. */ |
149 | | |
150 | | static const uint8_t catposstab[7][30] = { /* indexed [general category][particular category]; 1 means the particular category lies outside the general one */ |
151 | | /* Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs */ |
152 | | { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* C */ |
153 | | { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* L */ |
154 | | { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* M */ |
155 | | { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* N */ |
156 | | { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 }, /* P */ |
157 | | { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1 }, /* S */ |
158 | | { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 } /* Z */ |
159 | | }; |
160 | | |
161 | | /* This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against |
162 | | a general or particular category. The properties in each row are those |
163 | | that apply to the character set in question. Duplication means that a little |
164 | | unnecessary work is done when checking, but this keeps things much simpler |
165 | | because they can all use the same code. For more details see the comment where |
166 | | this table is used. |
167 | | |
168 | | Note: SPACE and PXSPACE used to be different because Perl excluded VT from |
169 | | "space", but from Perl 5.18 it's included, so both categories are treated the |
170 | | same here. */ |
171 | | |
172 | | static const uint8_t posspropstab[3][4] = { /* one row each for ALNUM, SPACE/PXSPACE and WORD, in that order */ |
173 | | { ucp_L, ucp_N, ucp_N, ucp_Nl }, /* ALNUM, 3rd and 4th values redundant */ |
174 | | { ucp_Z, ucp_Z, ucp_C, ucp_Cc }, /* SPACE and PXSPACE, 2nd value redundant */ |
175 | | { ucp_L, ucp_N, ucp_P, ucp_Po } /* WORD */ |
176 | | }; |
177 | | #endif /* SUPPORT_UNICODE */ |
178 | | |
179 | | |
180 | | |
181 | | #ifdef SUPPORT_UNICODE |
182 | | /************************************************* |
183 | | * Check a character and a property * |
184 | | *************************************************/ |
185 | | |
186 | | /* This function is called by compare_opcodes() when a property item is |
187 | | adjacent to a fixed character. |
188 | | |
189 | | Arguments: |
190 | | c the character |
191 | | ptype the property type |
192 | | pdata the data for the type |
193 | | negated TRUE if it's a negated property (\P or \p{^) |
194 | | |
195 | | Returns: TRUE if auto-possessifying is OK |
196 | | */ |
197 | | |
198 | | static BOOL |
199 | | check_char_prop(uint32_t c, unsigned int ptype, unsigned int pdata, |
200 | | BOOL negated) |
201 | 0 | { |
202 | 0 | BOOL ok; |
203 | 0 | const uint32_t *p; |
204 | 0 | const ucd_record *prop = GET_UCD(c); |
205 | |
|
206 | 0 | switch(ptype) |
207 | 0 | { |
208 | 0 | case PT_LAMP: |
209 | 0 | return (prop->chartype == ucp_Lu || |
210 | 0 | prop->chartype == ucp_Ll || |
211 | 0 | prop->chartype == ucp_Lt) == negated; |
212 | | |
213 | 0 | case PT_GC: |
214 | 0 | return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated; |
215 | | |
216 | 0 | case PT_PC: |
217 | 0 | return (pdata == prop->chartype) == negated; |
218 | | |
219 | 0 | case PT_SC: |
220 | 0 | return (pdata == prop->script) == negated; |
221 | | |
222 | 0 | case PT_SCX: |
223 | 0 | ok = (pdata == prop->script |
224 | 0 | || MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), pdata) != 0); |
225 | 0 | return ok == negated; |
226 | | |
227 | | /* These are specials */ |
228 | | |
229 | 0 | case PT_ALNUM: |
230 | 0 | return (PRIV(ucp_gentype)[prop->chartype] == ucp_L || |
231 | 0 | PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated; |
232 | | |
233 | | /* Perl space used to exclude VT, but from Perl 5.18 it is included, which |
234 | | means that Perl space and POSIX space are now identical. PCRE was changed |
235 | | at release 8.34. */ |
236 | | |
237 | 0 | case PT_SPACE: /* Perl space */ |
238 | 0 | case PT_PXSPACE: /* POSIX space */ |
239 | 0 | switch(c) |
240 | 0 | { |
241 | 0 | HSPACE_CASES: |
242 | 0 | VSPACE_CASES: |
243 | 0 | return negated; |
244 | | |
245 | 0 | default: |
246 | 0 | return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == negated; |
247 | 0 | } |
248 | 0 | break; /* Control never reaches here */ |
249 | | |
250 | 0 | case PT_WORD: |
251 | 0 | return (PRIV(ucp_gentype)[prop->chartype] == ucp_L || |
252 | 0 | PRIV(ucp_gentype)[prop->chartype] == ucp_N || |
253 | 0 | c == CHAR_UNDERSCORE) == negated; |
254 | | |
255 | 0 | case PT_CLIST: |
256 | 0 | p = PRIV(ucd_caseless_sets) + prop->caseset; |
257 | 0 | for (;;) |
258 | 0 | { |
259 | 0 | if (c < *p) return !negated; |
260 | 0 | if (c == *p++) return negated; |
261 | 0 | } |
262 | 0 | break; /* Control never reaches here */ |
263 | | |
264 | | /* Haven't yet thought these through. */ |
265 | | |
266 | 0 | case PT_BIDICL: |
267 | 0 | return FALSE; |
268 | | |
269 | 0 | case PT_BOOL: |
270 | 0 | return FALSE; |
271 | 0 | } |
272 | | |
273 | 0 | return FALSE; |
274 | 0 | } |
275 | | #endif /* SUPPORT_UNICODE */ |
276 | | |
277 | | |
278 | | |
279 | | /************************************************* |
280 | | * Base opcode of repeated opcodes * |
281 | | *************************************************/ |
282 | | |
283 | | /* Returns the base opcode for repeated single character type opcodes. If the |
284 | | opcode is not a repeated character type, it returns with the original value. |
285 | | |
286 | | Arguments: c opcode |
287 | | Returns: base opcode for the type |
288 | | */ |
289 | | |
290 | | static PCRE2_UCHAR |
291 | | get_repeat_base(PCRE2_UCHAR c) |
292 | 0 | { |
293 | 0 | return (c > OP_TYPEPOSUPTO)? c : |
294 | 0 | (c >= OP_TYPESTAR)? OP_TYPESTAR : |
295 | 0 | (c >= OP_NOTSTARI)? OP_NOTSTARI : |
296 | 0 | (c >= OP_NOTSTAR)? OP_NOTSTAR : |
297 | 0 | (c >= OP_STARI)? OP_STARI : |
298 | 0 | OP_STAR; |
299 | 0 | } |
300 | | |
301 | | |
302 | | /************************************************* |
303 | | * Fill the character property list * |
304 | | *************************************************/ |
305 | | |
306 | | /* Checks whether the code points to an opcode that can take part in auto- |
307 | | possessification, and if so, fills a list with its properties. |
308 | | |
309 | | Arguments: |
310 | | code points to start of expression |
311 | | utf TRUE if in UTF mode |
312 | | ucp TRUE if in UCP mode |
313 | | fcc points to the case-flipping table |
314 | | list points to output list |
315 | | list[0] will be filled with the opcode |
316 | | list[1] will be non-zero if this opcode |
317 | | can match an empty character string |
318 | | list[2..7] depends on the opcode |
319 | | |
320 | | Returns: points to the start of the next opcode if *code is accepted |
321 | | NULL if *code is not accepted |
322 | | */ |
323 | | |
324 | | static PCRE2_SPTR |
325 | | get_chr_property_list(PCRE2_SPTR code, BOOL utf, BOOL ucp, const uint8_t *fcc, |
326 | | uint32_t *list) |
327 | 0 | { |
328 | 0 | PCRE2_UCHAR c = *code; |
329 | 0 | PCRE2_UCHAR base; |
330 | 0 | PCRE2_SPTR end; |
331 | 0 | uint32_t chr; |
332 | |
|
333 | 0 | #ifdef SUPPORT_UNICODE |
334 | 0 | uint32_t *clist_dest; |
335 | 0 | const uint32_t *clist_src; |
336 | | #else |
337 | | (void)utf; /* Suppress "unused parameter" compiler warnings */ |
338 | | (void)ucp; |
339 | | #endif |
340 | |
|
341 | 0 | list[0] = c; |
342 | 0 | list[1] = FALSE; |
343 | 0 | code++; |
344 | |
|
345 | 0 | if (c >= OP_STAR && c <= OP_TYPEPOSUPTO) |
346 | 0 | { |
347 | 0 | base = get_repeat_base(c); |
348 | 0 | c -= (base - OP_STAR); |
349 | |
|
350 | 0 | if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO) |
351 | 0 | code += IMM2_SIZE; |
352 | |
|
353 | 0 | list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT && |
354 | 0 | c != OP_POSPLUS); |
355 | |
|
356 | 0 | switch(base) |
357 | 0 | { |
358 | 0 | case OP_STAR: |
359 | 0 | list[0] = OP_CHAR; |
360 | 0 | break; |
361 | | |
362 | 0 | case OP_STARI: |
363 | 0 | list[0] = OP_CHARI; |
364 | 0 | break; |
365 | | |
366 | 0 | case OP_NOTSTAR: |
367 | 0 | list[0] = OP_NOT; |
368 | 0 | break; |
369 | | |
370 | 0 | case OP_NOTSTARI: |
371 | 0 | list[0] = OP_NOTI; |
372 | 0 | break; |
373 | | |
374 | 0 | case OP_TYPESTAR: |
375 | 0 | list[0] = *code; |
376 | 0 | code++; |
377 | 0 | break; |
378 | 0 | } |
379 | 0 | c = list[0]; |
380 | 0 | } |
381 | | |
382 | 0 | switch(c) |
383 | 0 | { |
384 | 0 | case OP_NOT_DIGIT: |
385 | 0 | case OP_DIGIT: |
386 | 0 | case OP_NOT_WHITESPACE: |
387 | 0 | case OP_WHITESPACE: |
388 | 0 | case OP_NOT_WORDCHAR: |
389 | 0 | case OP_WORDCHAR: |
390 | 0 | case OP_ANY: |
391 | 0 | case OP_ALLANY: |
392 | 0 | case OP_ANYNL: |
393 | 0 | case OP_NOT_HSPACE: |
394 | 0 | case OP_HSPACE: |
395 | 0 | case OP_NOT_VSPACE: |
396 | 0 | case OP_VSPACE: |
397 | 0 | case OP_EXTUNI: |
398 | 0 | case OP_EODN: |
399 | 0 | case OP_EOD: |
400 | 0 | case OP_DOLL: |
401 | 0 | case OP_DOLLM: |
402 | 0 | return code; |
403 | | |
404 | 0 | case OP_CHAR: |
405 | 0 | case OP_NOT: |
406 | 0 | GETCHARINCTEST(chr, code); |
407 | 0 | list[2] = chr; |
408 | 0 | list[3] = NOTACHAR; |
409 | 0 | return code; |
410 | | |
411 | 0 | case OP_CHARI: |
412 | 0 | case OP_NOTI: |
413 | 0 | list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT; |
414 | 0 | GETCHARINCTEST(chr, code); |
415 | 0 | list[2] = chr; |
416 | |
|
417 | 0 | #ifdef SUPPORT_UNICODE |
418 | 0 | if (chr < 128 || (chr < 256 && !utf && !ucp)) |
419 | 0 | list[3] = fcc[chr]; |
420 | 0 | else |
421 | 0 | list[3] = UCD_OTHERCASE(chr); |
422 | | #elif defined SUPPORT_WIDE_CHARS |
423 | | list[3] = (chr < 256) ? fcc[chr] : chr; |
424 | | #else |
425 | | list[3] = fcc[chr]; |
426 | | #endif |
427 | | |
428 | | /* The othercase might be the same value. */ |
429 | |
|
430 | 0 | if (chr == list[3]) |
431 | 0 | list[3] = NOTACHAR; |
432 | 0 | else |
433 | 0 | list[4] = NOTACHAR; |
434 | 0 | return code; |
435 | | |
436 | 0 | #ifdef SUPPORT_UNICODE |
437 | 0 | case OP_PROP: |
438 | 0 | case OP_NOTPROP: |
439 | 0 | if (code[0] != PT_CLIST) |
440 | 0 | { |
441 | 0 | list[2] = code[0]; |
442 | 0 | list[3] = code[1]; |
443 | 0 | return code + 2; |
444 | 0 | } |
445 | | |
446 | | /* Convert only if we have enough space. */ |
447 | | |
448 | 0 | clist_src = PRIV(ucd_caseless_sets) + code[1]; |
449 | 0 | clist_dest = list + 2; |
450 | 0 | code += 2; |
451 | |
|
452 | 0 | do { |
453 | 0 | if (clist_dest >= list + 8) |
454 | 0 | { |
455 | | /* Early return if there is not enough space. This should never |
456 | | happen, since all clists are shorter than 5 character now. */ |
457 | 0 | list[2] = code[0]; |
458 | 0 | list[3] = code[1]; |
459 | 0 | return code; |
460 | 0 | } |
461 | 0 | *clist_dest++ = *clist_src; |
462 | 0 | } |
463 | 0 | while(*clist_src++ != NOTACHAR); |
464 | | |
465 | | /* All characters are stored. The terminating NOTACHAR is copied from the |
466 | | clist itself. */ |
467 | | |
468 | 0 | list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT; |
469 | 0 | return code; |
470 | 0 | #endif |
471 | | |
472 | 0 | case OP_NCLASS: |
473 | 0 | case OP_CLASS: |
474 | 0 | #ifdef SUPPORT_WIDE_CHARS |
475 | 0 | case OP_XCLASS: |
476 | 0 | if (c == OP_XCLASS) |
477 | 0 | end = code + GET(code, 0) - 1; |
478 | 0 | else |
479 | 0 | #endif |
480 | 0 | end = code + 32 / sizeof(PCRE2_UCHAR); |
481 | |
|
482 | 0 | switch(*end) |
483 | 0 | { |
484 | 0 | case OP_CRSTAR: |
485 | 0 | case OP_CRMINSTAR: |
486 | 0 | case OP_CRQUERY: |
487 | 0 | case OP_CRMINQUERY: |
488 | 0 | case OP_CRPOSSTAR: |
489 | 0 | case OP_CRPOSQUERY: |
490 | 0 | list[1] = TRUE; |
491 | 0 | end++; |
492 | 0 | break; |
493 | | |
494 | 0 | case OP_CRPLUS: |
495 | 0 | case OP_CRMINPLUS: |
496 | 0 | case OP_CRPOSPLUS: |
497 | 0 | end++; |
498 | 0 | break; |
499 | | |
500 | 0 | case OP_CRRANGE: |
501 | 0 | case OP_CRMINRANGE: |
502 | 0 | case OP_CRPOSRANGE: |
503 | 0 | list[1] = (GET2(end, 1) == 0); |
504 | 0 | end += 1 + 2 * IMM2_SIZE; |
505 | 0 | break; |
506 | 0 | } |
507 | 0 | list[2] = (uint32_t)(end - code); |
508 | 0 | return end; |
509 | 0 | } |
510 | | |
511 | 0 | return NULL; /* Opcode not accepted */ |
512 | 0 | } Unexecuted instantiation: pcre2_auto_possess.c:get_chr_property_list Unexecuted instantiation: pcre2_auto_possess.c:get_chr_property_list |
513 | | |
514 | | |
515 | | |
516 | | /************************************************* |
517 | | * Scan further character sets for match * |
518 | | *************************************************/ |
519 | | |
520 | | /* Checks whether the base and the current opcode have a common character, in |
521 | | which case the base cannot be possessified. |
522 | | |
523 | | Arguments: |
524 | | code points to the byte code |
525 | | utf TRUE in UTF mode |
526 | | ucp TRUE in UCP mode |
527 | | cb compile data block |
528 | | base_list the data list of the base opcode |
529 | | base_end the end of the base opcode |
530 | | rec_limit points to recursion depth counter |
531 | | |
532 | | Returns: TRUE if the auto-possessification is possible |
533 | | */ |
534 | | |
535 | | static BOOL |
536 | | compare_opcodes(PCRE2_SPTR code, BOOL utf, BOOL ucp, const compile_block *cb, |
537 | | const uint32_t *base_list, PCRE2_SPTR base_end, int *rec_limit) |
538 | 0 | { |
539 | 0 | PCRE2_UCHAR c; |
540 | 0 | uint32_t list[8]; |
541 | 0 | const uint32_t *chr_ptr; |
542 | 0 | const uint32_t *ochr_ptr; |
543 | 0 | const uint32_t *list_ptr; |
544 | 0 | PCRE2_SPTR next_code; |
545 | 0 | #ifdef SUPPORT_WIDE_CHARS |
546 | 0 | PCRE2_SPTR xclass_flags; |
547 | 0 | #endif |
548 | 0 | const uint8_t *class_bitset; |
549 | 0 | const uint8_t *set1, *set2, *set_end; |
550 | 0 | uint32_t chr; |
551 | 0 | BOOL accepted, invert_bits; |
552 | 0 | BOOL entered_a_group = FALSE; |
553 | |
|
554 | 0 | if (--(*rec_limit) <= 0) return FALSE; /* Recursion has gone too deep */ |
555 | | |
556 | | /* Note: the base_list[1] contains whether the current opcode has a greedy |
557 | | (represented by a non-zero value) quantifier. This is different from |
558 | | other character type lists, which store here that the character iterator |
559 | | matches to an empty string (also represented by a non-zero value). */ |
560 | | |
561 | 0 | for(;;) |
562 | 0 | { |
563 | | /* All operations move the code pointer forward. |
564 | | Therefore infinite recursions are not possible. */ |
565 | |
|
566 | 0 | c = *code; |
567 | | |
568 | | /* Skip over callouts */ |
569 | |
|
570 | 0 | if (c == OP_CALLOUT) |
571 | 0 | { |
572 | 0 | code += PRIV(OP_lengths)[c]; |
573 | 0 | continue; |
574 | 0 | } |
575 | | |
576 | 0 | if (c == OP_CALLOUT_STR) |
577 | 0 | { |
578 | 0 | code += GET(code, 1 + 2*LINK_SIZE); |
579 | 0 | continue; |
580 | 0 | } |
581 | | |
582 | | /* At the end of a branch, skip to the end of the group. */ |
583 | | |
584 | 0 | if (c == OP_ALT) |
585 | 0 | { |
586 | 0 | do code += GET(code, 1); while (*code == OP_ALT); |
587 | 0 | c = *code; |
588 | 0 | } |
589 | | |
590 | | /* Inspect the next opcode. */ |
591 | |
|
592 | 0 | switch(c) |
593 | 0 | { |
594 | | /* We can always possessify a greedy iterator at the end of the pattern, |
595 | | which is reached after skipping over the final OP_KET. A non-greedy |
596 | | iterator must never be possessified. */ |
597 | | |
598 | 0 | case OP_END: |
599 | 0 | return base_list[1] != 0; |
600 | | |
601 | | /* When an iterator is at the end of certain kinds of group we can inspect |
602 | | what follows the group by skipping over the closing ket. Note that this |
603 | | does not apply to OP_KETRMAX or OP_KETRMIN because what follows any given |
604 | | iteration is variable (could be another iteration or could be the next |
605 | | item). As these two opcodes are not listed in the next switch, they will |
606 | | end up as the next code to inspect, and return FALSE by virtue of being |
607 | | unsupported. */ |
608 | | |
609 | 0 | case OP_KET: |
610 | 0 | case OP_KETRPOS: |
611 | | /* The non-greedy case cannot be converted to a possessive form. */ |
612 | |
|
613 | 0 | if (base_list[1] == 0) return FALSE; |
614 | | |
615 | | /* If the bracket is capturing it might be referenced by an OP_RECURSE |
616 | | so its last iterator can never be possessified if the pattern contains |
617 | | recursions. (This could be improved by keeping a list of group numbers that |
618 | | are called by recursion.) */ |
619 | | |
620 | 0 | switch(*(code - GET(code, 1))) |
621 | 0 | { |
622 | 0 | case OP_CBRA: |
623 | 0 | case OP_SCBRA: |
624 | 0 | case OP_CBRAPOS: |
625 | 0 | case OP_SCBRAPOS: |
626 | 0 | if (cb->had_recurse) return FALSE; |
627 | 0 | break; |
628 | | |
629 | | /* A script run might have to backtrack if the iterated item can match |
630 | | characters from more than one script. So give up unless repeating an |
631 | | explicit character. */ |
632 | | |
633 | 0 | case OP_SCRIPT_RUN: |
634 | 0 | if (base_list[0] != OP_CHAR && base_list[0] != OP_CHARI) |
635 | 0 | return FALSE; |
636 | 0 | break; |
637 | | |
638 | | /* Atomic sub-patterns and assertions can always auto-possessify their |
639 | | last iterator. However, if the group was entered as a result of checking |
640 | | a previous iterator, this is not possible. */ |
641 | | |
642 | 0 | case OP_ASSERT: |
643 | 0 | case OP_ASSERT_NOT: |
644 | 0 | case OP_ASSERTBACK: |
645 | 0 | case OP_ASSERTBACK_NOT: |
646 | 0 | case OP_ONCE: |
647 | 0 | return !entered_a_group; |
648 | | |
649 | | /* Non-atomic assertions - don't possessify last iterator. This needs |
650 | | more thought. */ |
651 | | |
652 | 0 | case OP_ASSERT_NA: |
653 | 0 | case OP_ASSERTBACK_NA: |
654 | 0 | return FALSE; |
655 | 0 | } |
656 | | |
657 | | /* Skip over the bracket and inspect what comes next. */ |
658 | | |
659 | 0 | code += PRIV(OP_lengths)[c]; |
660 | 0 | continue; |
661 | | |
662 | | /* Handle cases where the next item is a group. */ |
663 | | |
664 | 0 | case OP_ONCE: |
665 | 0 | case OP_BRA: |
666 | 0 | case OP_CBRA: |
667 | 0 | next_code = code + GET(code, 1); |
668 | 0 | code += PRIV(OP_lengths)[c]; |
669 | | |
670 | | /* Check each branch. We have to recurse a level for all but the last |
671 | | branch. */ |
672 | |
|
673 | 0 | while (*next_code == OP_ALT) |
674 | 0 | { |
675 | 0 | if (!compare_opcodes(code, utf, ucp, cb, base_list, base_end, rec_limit)) |
676 | 0 | return FALSE; |
677 | 0 | code = next_code + 1 + LINK_SIZE; |
678 | 0 | next_code += GET(next_code, 1); |
679 | 0 | } |
680 | | |
681 | 0 | entered_a_group = TRUE; |
682 | 0 | continue; |
683 | | |
684 | 0 | case OP_BRAZERO: |
685 | 0 | case OP_BRAMINZERO: |
686 | |
|
687 | 0 | next_code = code + 1; |
688 | 0 | if (*next_code != OP_BRA && *next_code != OP_CBRA && |
689 | 0 | *next_code != OP_ONCE) return FALSE; |
690 | | |
691 | 0 | do next_code += GET(next_code, 1); while (*next_code == OP_ALT); |
692 | | |
693 | | /* The bracket content will be checked by the OP_BRA/OP_CBRA case above. */ |
694 | |
|
695 | 0 | next_code += 1 + LINK_SIZE; |
696 | 0 | if (!compare_opcodes(next_code, utf, ucp, cb, base_list, base_end, |
697 | 0 | rec_limit)) |
698 | 0 | return FALSE; |
699 | | |
700 | 0 | code += PRIV(OP_lengths)[c]; |
701 | 0 | continue; |
702 | | |
703 | | /* The next opcode does not need special handling; fall through and use it |
704 | | to see if the base can be possessified. */ |
705 | | |
706 | 0 | default: |
707 | 0 | break; |
708 | 0 | } |
709 | | |
710 | | /* We now have the next appropriate opcode to compare with the base. Check |
711 | | for a supported opcode, and load its properties. */ |
712 | | |
713 | 0 | code = get_chr_property_list(code, utf, ucp, cb->fcc, list); |
714 | 0 | if (code == NULL) return FALSE; /* Unsupported */ |
715 | | |
716 | | /* If either opcode is a small character list, set pointers for comparing |
717 | | characters from that list with another list, or with a property. */ |
718 | | |
719 | 0 | if (base_list[0] == OP_CHAR) |
720 | 0 | { |
721 | 0 | chr_ptr = base_list + 2; |
722 | 0 | list_ptr = list; |
723 | 0 | } |
724 | 0 | else if (list[0] == OP_CHAR) |
725 | 0 | { |
726 | 0 | chr_ptr = list + 2; |
727 | 0 | list_ptr = base_list; |
728 | 0 | } |
729 | | |
730 | | /* Character bitsets can also be compared to certain opcodes. */ |
731 | | |
732 | 0 | else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS |
733 | | #if PCRE2_CODE_UNIT_WIDTH == 8 |
734 | | /* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. */ |
735 | 0 | || (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS)) |
736 | | #endif |
737 | 0 | ) |
738 | 0 | { |
739 | | #if PCRE2_CODE_UNIT_WIDTH == 8 |
740 | 0 | if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS)) |
741 | | #else |
742 | 0 | if (base_list[0] == OP_CLASS) |
743 | 0 | #endif |
744 | 0 | { |
745 | 0 | set1 = (uint8_t *)(base_end - base_list[2]); |
746 | 0 | list_ptr = list; |
747 | 0 | } |
748 | 0 | else |
749 | 0 | { |
750 | 0 | set1 = (uint8_t *)(code - list[2]); |
751 | 0 | list_ptr = base_list; |
752 | 0 | } |
753 | |
|
754 | 0 | invert_bits = FALSE; |
755 | 0 | switch(list_ptr[0]) |
756 | 0 | { |
757 | 0 | case OP_CLASS: |
758 | 0 | case OP_NCLASS: |
759 | 0 | set2 = (uint8_t *) |
760 | 0 | ((list_ptr == list ? code : base_end) - list_ptr[2]); |
761 | 0 | break; |
762 | | |
763 | 0 | #ifdef SUPPORT_WIDE_CHARS |
764 | 0 | case OP_XCLASS: |
765 | 0 | xclass_flags = (list_ptr == list ? code : base_end) - list_ptr[2] + LINK_SIZE; |
766 | 0 | if ((*xclass_flags & XCL_HASPROP) != 0) return FALSE; |
767 | 0 | if ((*xclass_flags & XCL_MAP) == 0) |
768 | 0 | { |
769 | | /* No bits are set for characters < 256. */ |
770 | 0 | if (list[1] == 0) return (*xclass_flags & XCL_NOT) == 0; |
771 | | /* Might be an empty repeat. */ |
772 | 0 | continue; |
773 | 0 | } |
774 | 0 | set2 = (uint8_t *)(xclass_flags + 1); |
775 | 0 | break; |
776 | 0 | #endif |
777 | | |
778 | 0 | case OP_NOT_DIGIT: |
779 | 0 | invert_bits = TRUE; |
780 | | /* Fall through */ |
781 | 0 | case OP_DIGIT: |
782 | 0 | set2 = (uint8_t *)(cb->cbits + cbit_digit); |
783 | 0 | break; |
784 | | |
785 | 0 | case OP_NOT_WHITESPACE: |
786 | 0 | invert_bits = TRUE; |
787 | | /* Fall through */ |
788 | 0 | case OP_WHITESPACE: |
789 | 0 | set2 = (uint8_t *)(cb->cbits + cbit_space); |
790 | 0 | break; |
791 | | |
792 | 0 | case OP_NOT_WORDCHAR: |
793 | 0 | invert_bits = TRUE; |
794 | | /* Fall through */ |
795 | 0 | case OP_WORDCHAR: |
796 | 0 | set2 = (uint8_t *)(cb->cbits + cbit_word); |
797 | 0 | break; |
798 | | |
799 | 0 | default: |
800 | 0 | return FALSE; |
801 | 0 | } |
802 | | |
803 | | /* Because the bit sets are unaligned bytes, we need to perform byte |
804 | | comparison here. */ |
805 | | |
806 | 0 | set_end = set1 + 32; |
807 | 0 | if (invert_bits) |
808 | 0 | { |
809 | 0 | do |
810 | 0 | { |
811 | 0 | if ((*set1++ & ~(*set2++)) != 0) return FALSE; |
812 | 0 | } |
813 | 0 | while (set1 < set_end); |
814 | 0 | } |
815 | 0 | else |
816 | 0 | { |
817 | 0 | do |
818 | 0 | { |
819 | 0 | if ((*set1++ & *set2++) != 0) return FALSE; |
820 | 0 | } |
821 | 0 | while (set1 < set_end); |
822 | 0 | } |
823 | | |
824 | 0 | if (list[1] == 0) return TRUE; |
825 | | /* Might be an empty repeat. */ |
826 | 0 | continue; |
827 | 0 | } |
828 | | |
829 | | /* Some property combinations also acceptable. Unicode property opcodes are |
830 | | processed specially; the rest can be handled with a lookup table. */ |
831 | | |
832 | 0 | else |
833 | 0 | { |
834 | 0 | uint32_t leftop, rightop; |
835 | |
|
836 | 0 | leftop = base_list[0]; |
837 | 0 | rightop = list[0]; |
838 | |
|
839 | 0 | #ifdef SUPPORT_UNICODE |
840 | 0 | accepted = FALSE; /* Always set in non-unicode case. */ |
841 | 0 | if (leftop == OP_PROP || leftop == OP_NOTPROP) |
842 | 0 | { |
843 | 0 | if (rightop == OP_EOD) |
844 | 0 | accepted = TRUE; |
845 | 0 | else if (rightop == OP_PROP || rightop == OP_NOTPROP) |
846 | 0 | { |
847 | 0 | int n; |
848 | 0 | const uint8_t *p; |
849 | 0 | BOOL same = leftop == rightop; |
850 | 0 | BOOL lisprop = leftop == OP_PROP; |
851 | 0 | BOOL risprop = rightop == OP_PROP; |
852 | 0 | BOOL bothprop = lisprop && risprop; |
853 | | |
854 | | /* There's a table that specifies how each combination is to be |
855 | | processed: |
856 | | 0 Always return FALSE (never auto-possessify) |
857 | | 1 Character groups are distinct (possessify if both are OP_PROP) |
858 | | 2 Check character categories in the same group (general or particular) |
859 | | 3 Return TRUE if the two opcodes are not the same |
860 | | ... see comments below |
861 | | */ |
862 | |
|
863 | 0 | n = propposstab[base_list[2]][list[2]]; |
864 | 0 | switch(n) |
865 | 0 | { |
866 | 0 | case 0: break; |
867 | 0 | case 1: accepted = bothprop; break; |
868 | 0 | case 2: accepted = (base_list[3] == list[3]) != same; break; |
869 | 0 | case 3: accepted = !same; break; |
870 | | |
871 | 0 | case 4: /* Left general category, right particular category */ |
872 | 0 | accepted = risprop && catposstab[base_list[3]][list[3]] == same; |
873 | 0 | break; |
874 | | |
875 | 0 | case 5: /* Right general category, left particular category */ |
876 | 0 | accepted = lisprop && catposstab[list[3]][base_list[3]] == same; |
877 | 0 | break; |
878 | | |
879 | | /* This code is logically tricky. Think hard before fiddling with it. |
880 | | The posspropstab table has four entries per row. Each row relates to |
881 | | one of PCRE's special properties such as ALNUM or SPACE or WORD. |
882 | | Only WORD actually needs all four entries, but using repeats for the |
883 | | others means they can all use the same code below. |
884 | | |
885 | | The first two entries in each row are Unicode general categories, and |
886 | | apply always, because all the characters they include are part of the |
887 | | PCRE character set. The third and fourth entries are a general and a |
888 | | particular category, respectively, that include one or more relevant |
889 | | characters. One or the other is used, depending on whether the check |
890 | | is for a general or a particular category. However, in both cases the |
891 | | category contains more characters than the specials that are defined |
892 | | for the property being tested against. Therefore, it cannot be used |
893 | | in a NOTPROP case. |
894 | | |
895 | | Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po. |
896 | | Underscore is covered by ucp_P or ucp_Po. */ |
897 | | |
898 | 0 | case 6: /* Left alphanum vs right general category */ |
899 | 0 | case 7: /* Left space vs right general category */ |
900 | 0 | case 8: /* Left word vs right general category */ |
901 | 0 | p = posspropstab[n-6]; |
902 | 0 | accepted = risprop && lisprop == |
903 | 0 | (list[3] != p[0] && |
904 | 0 | list[3] != p[1] && |
905 | 0 | (list[3] != p[2] || !lisprop)); |
906 | 0 | break; |
907 | | |
908 | 0 | case 9: /* Right alphanum vs left general category */ |
909 | 0 | case 10: /* Right space vs left general category */ |
910 | 0 | case 11: /* Right word vs left general category */ |
911 | 0 | p = posspropstab[n-9]; |
912 | 0 | accepted = lisprop && risprop == |
913 | 0 | (base_list[3] != p[0] && |
914 | 0 | base_list[3] != p[1] && |
915 | 0 | (base_list[3] != p[2] || !risprop)); |
916 | 0 | break; |
917 | | |
918 | 0 | case 12: /* Left alphanum vs right particular category */ |
919 | 0 | case 13: /* Left space vs right particular category */ |
920 | 0 | case 14: /* Left word vs right particular category */ |
921 | 0 | p = posspropstab[n-12]; |
922 | 0 | accepted = risprop && lisprop == |
923 | 0 | (catposstab[p[0]][list[3]] && |
924 | 0 | catposstab[p[1]][list[3]] && |
925 | 0 | (list[3] != p[3] || !lisprop)); |
926 | 0 | break; |
927 | | |
928 | 0 | case 15: /* Right alphanum vs left particular category */ |
929 | 0 | case 16: /* Right space vs left particular category */ |
930 | 0 | case 17: /* Right word vs left particular category */ |
931 | 0 | p = posspropstab[n-15]; |
932 | 0 | accepted = lisprop && risprop == |
933 | 0 | (catposstab[p[0]][base_list[3]] && |
934 | 0 | catposstab[p[1]][base_list[3]] && |
935 | 0 | (base_list[3] != p[3] || !risprop)); |
936 | 0 | break; |
937 | 0 | } |
938 | 0 | } |
939 | 0 | } |
940 | | |
941 | 0 | else |
942 | 0 | #endif /* SUPPORT_UNICODE */ |
943 | | |
944 | 0 | accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP && |
945 | 0 | rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP && |
946 | 0 | autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP]; |
947 | | |
948 | 0 | if (!accepted) return FALSE; |
949 | | |
950 | 0 | if (list[1] == 0) return TRUE; |
951 | | /* Might be an empty repeat. */ |
952 | 0 | continue; |
953 | 0 | } |
954 | | |
955 | | /* Control reaches here only if one of the items is a small character list. |
956 | | All characters are checked against the other side. */ |
957 | | |
958 | 0 | do |
959 | 0 | { |
960 | 0 | chr = *chr_ptr; |
961 | |
|
962 | 0 | switch(list_ptr[0]) |
963 | 0 | { |
964 | 0 | case OP_CHAR: |
965 | 0 | ochr_ptr = list_ptr + 2; |
966 | 0 | do |
967 | 0 | { |
968 | 0 | if (chr == *ochr_ptr) return FALSE; |
969 | 0 | ochr_ptr++; |
970 | 0 | } |
971 | 0 | while(*ochr_ptr != NOTACHAR); |
972 | 0 | break; |
973 | | |
974 | 0 | case OP_NOT: |
975 | 0 | ochr_ptr = list_ptr + 2; |
976 | 0 | do |
977 | 0 | { |
978 | 0 | if (chr == *ochr_ptr) |
979 | 0 | break; |
980 | 0 | ochr_ptr++; |
981 | 0 | } |
982 | 0 | while(*ochr_ptr != NOTACHAR); |
983 | 0 | if (*ochr_ptr == NOTACHAR) return FALSE; /* Not found */ |
984 | 0 | break; |
985 | | |
986 | | /* Note that OP_DIGIT etc. are generated only when PCRE2_UCP is *not* |
987 | | set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */ |
988 | | |
989 | 0 | case OP_DIGIT: |
990 | 0 | if (chr < 256 && (cb->ctypes[chr] & ctype_digit) != 0) return FALSE; |
991 | 0 | break; |
992 | | |
993 | 0 | case OP_NOT_DIGIT: |
994 | 0 | if (chr > 255 || (cb->ctypes[chr] & ctype_digit) == 0) return FALSE; |
995 | 0 | break; |
996 | | |
997 | 0 | case OP_WHITESPACE: |
998 | 0 | if (chr < 256 && (cb->ctypes[chr] & ctype_space) != 0) return FALSE; |
999 | 0 | break; |
1000 | | |
1001 | 0 | case OP_NOT_WHITESPACE: |
1002 | 0 | if (chr > 255 || (cb->ctypes[chr] & ctype_space) == 0) return FALSE; |
1003 | 0 | break; |
1004 | | |
1005 | 0 | case OP_WORDCHAR: |
1006 | 0 | if (chr < 255 && (cb->ctypes[chr] & ctype_word) != 0) return FALSE; |
1007 | 0 | break; |
1008 | | |
1009 | 0 | case OP_NOT_WORDCHAR: |
1010 | 0 | if (chr > 255 || (cb->ctypes[chr] & ctype_word) == 0) return FALSE; |
1011 | 0 | break; |
1012 | | |
1013 | 0 | case OP_HSPACE: |
1014 | 0 | switch(chr) |
1015 | 0 | { |
1016 | 0 | HSPACE_CASES: return FALSE; |
1017 | 0 | default: break; |
1018 | 0 | } |
1019 | 0 | break; |
1020 | | |
1021 | 0 | case OP_NOT_HSPACE: |
1022 | 0 | switch(chr) |
1023 | 0 | { |
1024 | 0 | HSPACE_CASES: break; |
1025 | 0 | default: return FALSE; |
1026 | 0 | } |
1027 | 0 | break; |
1028 | | |
1029 | 0 | case OP_ANYNL: |
1030 | 0 | case OP_VSPACE: |
1031 | 0 | switch(chr) |
1032 | 0 | { |
1033 | 0 | VSPACE_CASES: return FALSE; |
1034 | 0 | default: break; |
1035 | 0 | } |
1036 | 0 | break; |
1037 | | |
1038 | 0 | case OP_NOT_VSPACE: |
1039 | 0 | switch(chr) |
1040 | 0 | { |
1041 | 0 | VSPACE_CASES: break; |
1042 | 0 | default: return FALSE; |
1043 | 0 | } |
1044 | 0 | break; |
1045 | | |
1046 | 0 | case OP_DOLL: |
1047 | 0 | case OP_EODN: |
1048 | 0 | switch (chr) |
1049 | 0 | { |
1050 | 0 | case CHAR_CR: |
1051 | 0 | case CHAR_LF: |
1052 | 0 | case CHAR_VT: |
1053 | 0 | case CHAR_FF: |
1054 | 0 | case CHAR_NEL: |
1055 | 0 | #ifndef EBCDIC |
1056 | 0 | case 0x2028: |
1057 | 0 | case 0x2029: |
1058 | 0 | #endif /* Not EBCDIC */ |
1059 | 0 | return FALSE; |
1060 | 0 | } |
1061 | 0 | break; |
1062 | | |
1063 | 0 | case OP_EOD: /* Can always possessify before \z */ |
1064 | 0 | break; |
1065 | | |
1066 | 0 | #ifdef SUPPORT_UNICODE |
1067 | 0 | case OP_PROP: |
1068 | 0 | case OP_NOTPROP: |
1069 | 0 | if (!check_char_prop(chr, list_ptr[2], list_ptr[3], |
1070 | 0 | list_ptr[0] == OP_NOTPROP)) |
1071 | 0 | return FALSE; |
1072 | 0 | break; |
1073 | 0 | #endif |
1074 | | |
1075 | 0 | case OP_NCLASS: |
1076 | 0 | if (chr > 255) return FALSE; |
1077 | | /* Fall through */ |
1078 | | |
1079 | 0 | case OP_CLASS: |
1080 | 0 | if (chr > 255) break; |
1081 | 0 | class_bitset = (uint8_t *) |
1082 | 0 | ((list_ptr == list ? code : base_end) - list_ptr[2]); |
1083 | 0 | if ((class_bitset[chr >> 3] & (1u << (chr & 7))) != 0) return FALSE; |
1084 | 0 | break; |
1085 | | |
1086 | 0 | #ifdef SUPPORT_WIDE_CHARS |
1087 | 0 | case OP_XCLASS: |
1088 | 0 | if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) - |
1089 | 0 | list_ptr[2] + LINK_SIZE, utf)) return FALSE; |
1090 | 0 | break; |
1091 | 0 | #endif |
1092 | | |
1093 | 0 | default: |
1094 | 0 | return FALSE; |
1095 | 0 | } |
1096 | | |
1097 | 0 | chr_ptr++; |
1098 | 0 | } |
1099 | 0 | while(*chr_ptr != NOTACHAR); |
1100 | | |
1101 | | /* At least one character must be matched from this opcode. */ |
1102 | | |
1103 | 0 | if (list[1] == 0) return TRUE; |
1104 | 0 | } |
1105 | | |
1106 | | /* Control never reaches here. There used to be a fail-safe return FALSE; here, |
1107 | | but some compilers complain about an unreachable statement. */ |
1108 | 0 | } Unexecuted instantiation: pcre2_auto_possess.c:compare_opcodes Unexecuted instantiation: pcre2_auto_possess.c:compare_opcodes |
1109 | | |
1110 | | |
1111 | | |
1112 | | /************************************************* |
1113 | | * Scan compiled regex for auto-possession * |
1114 | | *************************************************/ |
1115 | | |
1116 | | /* Replaces single character iterations with their possessive alternatives |
1117 | | if appropriate. This function modifies the compiled opcode! Hitting a |
1118 | | non-existent opcode may indicate a bug in PCRE2, but it can also be caused if a |
1119 | | bad UTF string was compiled with PCRE2_NO_UTF_CHECK. The rec_limit catches |
1120 | | overly complicated or large patterns. In these cases, the check just stops, |
1121 | | leaving the remainder of the pattern unpossessified. |
1122 | | The code is scanned one item at a time until OP_END is found. A single-character repeat, or a repeat opcode that follows a class, is changed to its possessive form only when compare_opcodes(), started at the position returned by get_chr_property_list(), returns TRUE. |
1123 | | Arguments: |
1124 | | code points to start of the byte code |
1125 | | cb compile data block |
1126 | | |
1127 | | Returns: 0 for success (OP_END was reached) |
1128 | | -1 if a non-existent opcode (a value >= OP_TABLE_LENGTH) is encountered |
1129 | | */ |
1130 | |
1131 | | int |
1132 | | PRIV(auto_possessify)(PCRE2_UCHAR *code, const compile_block *cb) |
1133 | 0 | { |
1134 | 0 | PCRE2_UCHAR c; /* Opcode being examined; re-read from *code after any rewrite */ |
1135 | 0 | PCRE2_SPTR end; /* Result of get_chr_property_list(); NULL means no comparison is made */ |
1136 | 0 | PCRE2_UCHAR *repeat_opcode; /* Position just past a class item, where a repeat opcode may be */ |
1137 | 0 | uint32_t list[8]; /* Filled in by get_chr_property_list(); list[1] is set here */ |
1138 | 0 | int rec_limit = 1000; /* Was 10,000 but clang+ASAN uses a lot of stack. */ |
1139 | 0 | BOOL utf = (cb->external_options & PCRE2_UTF) != 0; |
1140 | 0 | BOOL ucp = (cb->external_options & PCRE2_UCP) != 0; |
1141 |

1142 | 0 | for (;;) |
1143 | 0 | { |
1144 | 0 | c = *code; |
1145 |

1146 | 0 | if (c >= OP_TABLE_LENGTH) return -1; /* Something gone wrong */ |
1147 | |
1148 | 0 | if (c >= OP_STAR && c <= OP_TYPEPOSUPTO) |
1149 | 0 | { |
1150 | 0 | c -= get_repeat_base(c) - OP_STAR; /* Rebase c so it can be tested against the OP_STAR..OP_MINUPTO names below; *code itself is untouched */ |
1151 | 0 | end = (c <= OP_MINUPTO) ? /* Rebased opcodes above OP_MINUPTO are not examined */ |
1152 | 0 | get_chr_property_list(code, utf, ucp, cb->fcc, list) : NULL; |
1153 | 0 | list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO; /* Non-zero for these four only, not for the MIN forms */ |
1154 |

1155 | 0 | if (end != NULL && compare_opcodes(end, utf, ucp, cb, list, end, |
1156 | 0 | &rec_limit)) |
1157 | 0 | { |
1158 | 0 | switch(c) /* An offset is added to *code in each case because only c was rebased */ |
1159 | 0 | { |
1160 | 0 | case OP_STAR: |
1161 | 0 | *code += OP_POSSTAR - OP_STAR; |
1162 | 0 | break; |
1163 | |
1164 | 0 | case OP_MINSTAR: |
1165 | 0 | *code += OP_POSSTAR - OP_MINSTAR; |
1166 | 0 | break; |
1167 | |
1168 | 0 | case OP_PLUS: |
1169 | 0 | *code += OP_POSPLUS - OP_PLUS; |
1170 | 0 | break; |
1171 | |
1172 | 0 | case OP_MINPLUS: |
1173 | 0 | *code += OP_POSPLUS - OP_MINPLUS; |
1174 | 0 | break; |
1175 | |
1176 | 0 | case OP_QUERY: |
1177 | 0 | *code += OP_POSQUERY - OP_QUERY; |
1178 | 0 | break; |
1179 | |
1180 | 0 | case OP_MINQUERY: |
1181 | 0 | *code += OP_POSQUERY - OP_MINQUERY; |
1182 | 0 | break; |
1183 | |
1184 | 0 | case OP_UPTO: |
1185 | 0 | *code += OP_POSUPTO - OP_UPTO; |
1186 | 0 | break; |
1187 | |
1188 | 0 | case OP_MINUPTO: |
1189 | 0 | *code += OP_POSUPTO - OP_MINUPTO; |
1190 | 0 | break; |
1191 | 0 | } |
1192 | 0 | } |
1193 | 0 | c = *code; /* Re-read: c was rebased and *code may just have been changed */ |
1194 | 0 | } |
1195 | 0 | else if (c == OP_CLASS || c == OP_NCLASS || c == OP_XCLASS) |
1196 | 0 | { |
1197 | 0 | #ifdef SUPPORT_WIDE_CHARS |
1198 | 0 | if (c == OP_XCLASS) |
1199 | 0 | repeat_opcode = code + GET(code, 1); /* Same offset that the OP_XCLASS case further down uses to step over the item */ |
1200 | 0 | else |
1201 | 0 | #endif |
1202 | 0 | repeat_opcode = code + 1 + (32 / sizeof(PCRE2_UCHAR)); /* The opcode plus 32 bytes, expressed in code units */ |
1203 |

1204 | 0 | c = *repeat_opcode; /* From here until it is re-read below, c is the opcode after the class, not the class opcode */ |
1205 | 0 | if (c >= OP_CRSTAR && c <= OP_CRMINRANGE) |
1206 | 0 | { |
1207 | | /* The return from get_chr_property_list() will never be NULL when |
1208 | | *code is one of the three class opcodes. However, gcc with |
1209 | | -fanalyzer notes that a NULL return is possible, and grumbles. Hence we |
1210 | | put in a check. */ |
1211 |

1212 | 0 | end = get_chr_property_list(code, utf, ucp, cb->fcc, list); |
1213 | 0 | list[1] = (c & 1) == 0; /* Set when the repeat opcode value is even; presumably that selects the non-MIN forms - TODO confirm against the opcode numbering */ |
1214 |

1215 | 0 | if (end != NULL && |
1216 | 0 | compare_opcodes(end, utf, ucp, cb, list, end, &rec_limit)) |
1217 | 0 | { |
1218 | 0 | switch (c) /* The repeat opcode is overwritten directly; the class opcode at *code is left alone */ |
1219 | 0 | { |
1220 | 0 | case OP_CRSTAR: |
1221 | 0 | case OP_CRMINSTAR: |
1222 | 0 | *repeat_opcode = OP_CRPOSSTAR; |
1223 | 0 | break; |
1224 | |
1225 | 0 | case OP_CRPLUS: |
1226 | 0 | case OP_CRMINPLUS: |
1227 | 0 | *repeat_opcode = OP_CRPOSPLUS; |
1228 | 0 | break; |
1229 | |
1230 | 0 | case OP_CRQUERY: |
1231 | 0 | case OP_CRMINQUERY: |
1232 | 0 | *repeat_opcode = OP_CRPOSQUERY; |
1233 | 0 | break; |
1234 | |
1235 | 0 | case OP_CRRANGE: |
1236 | 0 | case OP_CRMINRANGE: |
1237 | 0 | *repeat_opcode = OP_CRPOSRANGE; |
1238 | 0 | break; |
1239 | 0 | } |
1240 | 0 | } |
1241 | 0 | } |
1242 | 0 | c = *code; /* Back to the class opcode, so that the length handling below applies to the class item */ |
1243 | 0 | } |
1244 | |
1245 | 0 | switch(c) /* Extra, opcode-specific amounts to skip; the table length is added afterwards. There is no default: other opcodes need nothing extra here. */ |
1246 | 0 | { |
1247 | 0 | case OP_END: |
1248 | 0 | return 0; |
1249 | |
1250 | 0 | case OP_TYPESTAR: |
1251 | 0 | case OP_TYPEMINSTAR: |
1252 | 0 | case OP_TYPEPLUS: |
1253 | 0 | case OP_TYPEMINPLUS: |
1254 | 0 | case OP_TYPEQUERY: |
1255 | 0 | case OP_TYPEMINQUERY: |
1256 | 0 | case OP_TYPEPOSSTAR: |
1257 | 0 | case OP_TYPEPOSPLUS: |
1258 | 0 | case OP_TYPEPOSQUERY: |
1259 | 0 | if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; /* Two more code units are skipped for a property item; presumably its type and value - TODO confirm */ |
1260 | 0 | break; |
1261 | |
1262 | 0 | case OP_TYPEUPTO: |
1263 | 0 | case OP_TYPEMINUPTO: |
1264 | 0 | case OP_TYPEEXACT: |
1265 | 0 | case OP_TYPEPOSUPTO: |
1266 | 0 | if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP) /* In these forms the item type is IMM2_SIZE units further on */ |
1267 | 0 | code += 2; |
1268 | 0 | break; |
1269 | |
1270 | 0 | case OP_CALLOUT_STR: |
1271 | 0 | code += GET(code, 1 + 2*LINK_SIZE); /* The amount to skip is read from the item itself */ |
1272 | 0 | break; |
1273 | |
1274 | 0 | #ifdef SUPPORT_WIDE_CHARS |
1275 | 0 | case OP_XCLASS: |
1276 | 0 | code += GET(code, 1); |
1277 | 0 | break; |
1278 | 0 | #endif |
1279 | |
1280 | 0 | case OP_MARK: |
1281 | 0 | case OP_COMMIT_ARG: |
1282 | 0 | case OP_PRUNE_ARG: |
1283 | 0 | case OP_SKIP_ARG: |
1284 | 0 | case OP_THEN_ARG: |
1285 | 0 | code += code[1]; /* The extra amount is held in the unit after the opcode; presumably the argument length - TODO confirm */ |
1286 | 0 | break; |
1287 | 0 | } |
1288 | |
1289 | | /* Add in the fixed length from the table; this is on top of anything skipped in the switch above */ |
1290 | |
1291 | 0 | code += PRIV(OP_lengths)[c]; |
1292 | |
1293 | | /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may be |
1294 | | followed by a multi-byte character. The length in the table is a minimum, so |
1295 | | we have to arrange to skip the extra code units. */ |
1296 |

1297 | 0 | #ifdef MAYBE_UTF_MULTI |
1298 | 0 | if (utf) switch(c) |
1299 | 0 | { |
1300 | 0 | case OP_CHAR: |
1301 | 0 | case OP_CHARI: |
1302 | 0 | case OP_NOT: |
1303 | 0 | case OP_NOTI: |
1304 | 0 | case OP_STAR: |
1305 | 0 | case OP_MINSTAR: |
1306 | 0 | case OP_PLUS: |
1307 | 0 | case OP_MINPLUS: |
1308 | 0 | case OP_QUERY: |
1309 | 0 | case OP_MINQUERY: |
1310 | 0 | case OP_UPTO: |
1311 | 0 | case OP_MINUPTO: |
1312 | 0 | case OP_EXACT: |
1313 | 0 | case OP_POSSTAR: |
1314 | 0 | case OP_POSPLUS: |
1315 | 0 | case OP_POSQUERY: |
1316 | 0 | case OP_POSUPTO: |
1317 | 0 | case OP_STARI: |
1318 | 0 | case OP_MINSTARI: |
1319 | 0 | case OP_PLUSI: |
1320 | 0 | case OP_MINPLUSI: |
1321 | 0 | case OP_QUERYI: |
1322 | 0 | case OP_MINQUERYI: |
1323 | 0 | case OP_UPTOI: |
1324 | 0 | case OP_MINUPTOI: |
1325 | 0 | case OP_EXACTI: |
1326 | 0 | case OP_POSSTARI: |
1327 | 0 | case OP_POSPLUSI: |
1328 | 0 | case OP_POSQUERYI: |
1329 | 0 | case OP_POSUPTOI: |
1330 | 0 | case OP_NOTSTAR: |
1331 | 0 | case OP_NOTMINSTAR: |
1332 | 0 | case OP_NOTPLUS: |
1333 | 0 | case OP_NOTMINPLUS: |
1334 | 0 | case OP_NOTQUERY: |
1335 | 0 | case OP_NOTMINQUERY: |
1336 | 0 | case OP_NOTUPTO: |
1337 | 0 | case OP_NOTMINUPTO: |
1338 | 0 | case OP_NOTEXACT: |
1339 | 0 | case OP_NOTPOSSTAR: |
1340 | 0 | case OP_NOTPOSPLUS: |
1341 | 0 | case OP_NOTPOSQUERY: |
1342 | 0 | case OP_NOTPOSUPTO: |
1343 | 0 | case OP_NOTSTARI: |
1344 | 0 | case OP_NOTMINSTARI: |
1345 | 0 | case OP_NOTPLUSI: |
1346 | 0 | case OP_NOTMINPLUSI: |
1347 | 0 | case OP_NOTQUERYI: |
1348 | 0 | case OP_NOTMINQUERYI: |
1349 | 0 | case OP_NOTUPTOI: |
1350 | 0 | case OP_NOTMINUPTOI: |
1351 | 0 | case OP_NOTEXACTI: |
1352 | 0 | case OP_NOTPOSSTARI: |
1353 | 0 | case OP_NOTPOSPLUSI: |
1354 | 0 | case OP_NOTPOSQUERYI: |
1355 | 0 | case OP_NOTPOSUPTOI: |
1356 | 0 | if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]); /* code has already been advanced, so code[-1] is the last unit of the fixed-length part */ |
1357 | 0 | break; |
1358 | 0 | } |
1359 | | #else |
1360 | | (void)(utf); /* Keep compiler happy by referencing the otherwise unused local variable */ |
1361 | | #endif /* MAYBE_UTF_MULTI */ |
1362 | 0 | } |
1363 | 0 | } Unexecuted instantiation: _pcre2_auto_possessify_8 Unexecuted instantiation: _pcre2_auto_possessify_16 |
1364 | | |
1365 | | /* End of pcre2_auto_possess.c */ |