/src/pcre2/src/pcre2_auto_possess.c
Line | Count | Source |
1 | | /************************************************* |
2 | | * Perl-Compatible Regular Expressions * |
3 | | *************************************************/ |
4 | | |
5 | | /* PCRE is a library of functions to support regular expressions whose syntax |
6 | | and semantics are as close as possible to those of the Perl 5 language. |
7 | | |
8 | | Written by Philip Hazel |
9 | | Original API code Copyright (c) 1997-2012 University of Cambridge |
10 | | New API code Copyright (c) 2016-2024 University of Cambridge |
11 | | |
12 | | ----------------------------------------------------------------------------- |
13 | | Redistribution and use in source and binary forms, with or without |
14 | | modification, are permitted provided that the following conditions are met: |
15 | | |
16 | | * Redistributions of source code must retain the above copyright notice, |
17 | | this list of conditions and the following disclaimer. |
18 | | |
19 | | * Redistributions in binary form must reproduce the above copyright |
20 | | notice, this list of conditions and the following disclaimer in the |
21 | | documentation and/or other materials provided with the distribution. |
22 | | |
23 | | * Neither the name of the University of Cambridge nor the names of its |
24 | | contributors may be used to endorse or promote products derived from |
25 | | this software without specific prior written permission. |
26 | | |
27 | | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
28 | | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
29 | | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
30 | | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
31 | | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
32 | | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
33 | | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
34 | | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
35 | | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
36 | | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
37 | | POSSIBILITY OF SUCH DAMAGE. |
38 | | ----------------------------------------------------------------------------- |
39 | | */ |
40 | | |
41 | | |
42 | | /* This module contains functions that scan a compiled pattern and change |
43 | | repeats into possessive repeats where possible. */ |
44 | | |
45 | | |
46 | | #include "pcre2_internal.h" |
47 | | |
48 | | |
49 | | |
50 | | /* This macro is the maximum size of list[], which is used to keep |
51 | | track of UCD info in several places; it should be kept in sync with the |
52 | | value used by GenerateUcd.py */ |
53 | 1.83M | #define MAX_LIST 8 |
54 | | |
55 | | /************************************************* |
56 | | * Tables for auto-possessification * |
57 | | *************************************************/ |
58 | | |
59 | | /* This table is used to check whether auto-possessification is possible |
60 | | between adjacent character-type opcodes. The left-hand (repeated) opcode is |
61 | | used to select the row, and the right-hand opcode is used to select the column. |
62 | | A value of 1 means that auto-possessification is OK. For example, the second |
63 | | value in the first row means that \D+\d can be turned into \D++\d. |
64 | | |
65 | | The Unicode property types (\P and \p) have to be present to fill out the table |
66 | | because of what their opcode values are, but the table values should always be |
67 | | zero because property types are handled separately in the code. The last four |
68 | | columns apply to items that cannot be repeated, so there is no need to have |
69 | | rows for them. Note that OP_DIGIT etc. are generated only when PCRE2_UCP is |
70 | | *not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */ |
71 | | |
72 | | #define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1) |
73 | | #define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1) |
74 | | |
75 | | static const uint8_t autoposstab[APTROWS][APTCOLS] = { |
76 | | /* \D \d \S \s \W \w . .+ \C \P \p \R \H \h \V \v \X \Z \z $ $M */ |
77 | | { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \D */ |
78 | | { 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \d */ |
79 | | { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \S */ |
80 | | { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \s */ |
81 | | { 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \W */ |
82 | | { 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \w */ |
83 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* . */ |
84 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* .+ */ |
85 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \C */ |
86 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* \P */ |
87 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* \p */ |
88 | | { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 }, /* \R */ |
89 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 }, /* \H */ |
90 | | { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 }, /* \h */ |
91 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 }, /* \V */ |
92 | | { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 }, /* \v */ |
93 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 } /* \X */ |
94 | | }; |
95 | | |
96 | | #ifdef SUPPORT_UNICODE |
97 | | /* This table is used to check whether auto-possessification is possible |
98 | | between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The |
99 | | left-hand (repeated) opcode is used to select the row, and the right-hand |
100 | | opcode is used to select the column. The values are as follows: |
101 | | |
102 | | 0 Always return FALSE (never auto-possessify) |
103 | | 1 Character groups are distinct (possessify if both are OP_PROP) |
104 | | 2 Check character categories in the same group (general or particular) |
105 | | 3 TRUE if the two opcodes are not the same (PROP vs NOTPROP) |
106 | | |
107 | | 4 Check left general category vs right particular category |
108 | | 5 Check right general category vs left particular category |
109 | | |
110 | | 6 Left alphanum vs right general category |
111 | | 7 Left space vs right general category |
112 | | 8 Left word vs right general category |
113 | | |
114 | | 9 Right alphanum vs left general category |
115 | | 10 Right space vs left general category |
116 | | 11 Right word vs left general category |
117 | | |
118 | | 12 Left alphanum vs right particular category |
119 | | 13 Left space vs right particular category |
120 | | 14 Left word vs right particular category |
121 | | |
122 | | 15 Right alphanum vs left particular category |
123 | | 16 Right space vs left particular category |
124 | | 17 Right word vs left particular category |
125 | | */ |
126 | | |
127 | | static const uint8_t propposstab[PT_TABSIZE][PT_TABSIZE] = { |
128 | | /* LAMP GC PC SC SCX ALNUM SPACE PXSPACE WORD CLIST UCNC BIDICL BOOL */ |
129 | | { 3, 0, 0, 0, 0, 3, 1, 1, 0, 0, 0, 0, 0 }, /* PT_LAMP */ |
130 | | { 0, 2, 4, 0, 0, 9, 10, 10, 11, 0, 0, 0, 0 }, /* PT_GC */ |
131 | | { 0, 5, 2, 0, 0, 15, 16, 16, 17, 0, 0, 0, 0 }, /* PT_PC */ |
132 | | { 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_SC */ |
133 | | { 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_SCX */ |
134 | | { 3, 6, 12, 0, 0, 3, 1, 1, 0, 0, 0, 0, 0 }, /* PT_ALNUM */ |
135 | | { 1, 7, 13, 0, 0, 1, 3, 3, 1, 0, 0, 0, 0 }, /* PT_SPACE */ |
136 | | { 1, 7, 13, 0, 0, 1, 3, 3, 1, 0, 0, 0, 0 }, /* PT_PXSPACE */ |
137 | | { 0, 8, 14, 0, 0, 0, 1, 1, 3, 0, 0, 0, 0 }, /* PT_WORD */ |
138 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_CLIST */ |
139 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0 }, /* PT_UCNC */ |
140 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_BIDICL */ |
141 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } /* PT_BOOL */ |
142 | | /* PT_ANY does not need a record. */ |
143 | | }; |
144 | | |
145 | | /* This table is used to check whether auto-possessification is possible |
146 | | between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one |
147 | | specifies a general category and the other specifies a particular category. The |
148 | | row is selected by the general category and the column by the particular |
149 | | category. The value is 1 if the particular category is not part of the general |
150 | | category. */ |
151 | | |
152 | | static const uint8_t catposstab[7][30] = { |
153 | | /* Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs */ |
154 | | { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* C */ |
155 | | { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* L */ |
156 | | { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* M */ |
157 | | { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* N */ |
158 | | { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 }, /* P */ |
159 | | { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1 }, /* S */ |
160 | | { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 } /* Z */ |
161 | | }; |
162 | | |
163 | | /* This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against |
164 | | a general or particular category. The properties in each row are those |
165 | | that apply to the character set in question. Duplication means that a little |
166 | | unnecessary work is done when checking, but this keeps things much simpler |
167 | | because they can all use the same code. For more details see the comment where |
168 | | this table is used. |
169 | | |
170 | | Note: SPACE and PXSPACE used to be different because Perl excluded VT from |
171 | | "space", but from Perl 5.18 it's included, so both categories are treated the |
172 | | same here. */ |
173 | | |
174 | | static const uint8_t posspropstab[3][4] = { |
175 | | { ucp_L, ucp_N, ucp_N, ucp_Nl }, /* ALNUM, 3rd and 4th values redundant */ |
176 | | { ucp_Z, ucp_Z, ucp_C, ucp_Cc }, /* SPACE and PXSPACE, 2nd value redundant */ |
177 | | { ucp_L, ucp_N, ucp_P, ucp_Po } /* WORD */ |
178 | | }; |
179 | | #endif /* SUPPORT_UNICODE */ |
180 | | |
181 | | |
182 | | |
183 | | #ifdef SUPPORT_UNICODE |
184 | | /************************************************* |
185 | | * Check a character and a property * |
186 | | *************************************************/ |
187 | | |
188 | | /* This function is called by compare_opcodes() when a property item is |
189 | | adjacent to a fixed character. |
190 | | |
191 | | Arguments: |
192 | | c the character |
193 | | ptype the property type |
194 | | pdata the data for the type |
195 | | negated TRUE if it's a negated property (\P or \p{^) |
196 | | |
197 | | Returns: TRUE if auto-possessifying is OK |
198 | | */ |
199 | | |
200 | | static BOOL |
201 | | check_char_prop(uint32_t c, unsigned int ptype, unsigned int pdata, |
202 | | BOOL negated) |
203 | 998k | { |
204 | 998k | BOOL ok, rc; |
205 | 998k | const uint32_t *p; |
206 | 998k | const ucd_record *prop = GET_UCD(c); |
207 | | |
208 | 998k | switch(ptype) |
209 | 998k | { |
210 | 35.5k | case PT_LAMP: |
211 | 35.5k | return (prop->chartype == ucp_Lu || |
212 | 24.6k | prop->chartype == ucp_Ll || |
213 | 14.1k | prop->chartype == ucp_Lt) == negated; |
214 | | |
215 | 79.2k | case PT_GC: |
216 | 79.2k | return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated; |
217 | | |
218 | 118k | case PT_PC: |
219 | 118k | return (pdata == prop->chartype) == negated; |
220 | | |
221 | 32.7k | case PT_SC: |
222 | 32.7k | return (pdata == prop->script) == negated; |
223 | | |
224 | 82.2k | case PT_SCX: |
225 | 82.2k | ok = (pdata == prop->script |
226 | 72.2k | || MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), pdata) != 0); |
227 | 82.2k | return ok == negated; |
228 | | |
229 | | /* These are specials */ |
230 | | |
231 | 10.2k | case PT_ALNUM: |
232 | 10.2k | return (PRIV(ucp_gentype)[prop->chartype] == ucp_L || |
233 | 6.93k | PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated; |
234 | | |
235 | | /* Perl space used to exclude VT, but from Perl 5.18 it is included, which |
236 | | means that Perl space and POSIX space are now identical. PCRE was changed |
237 | | at release 8.34. */ |
238 | | |
239 | 354k | case PT_SPACE: /* Perl space */ |
240 | 359k | case PT_PXSPACE: /* POSIX space */ |
241 | 359k | switch(c) |
242 | 359k | { |
243 | 1.09M | HSPACE_CASES: |
244 | 1.09M | VSPACE_CASES: |
245 | 155k | rc = negated; |
246 | 155k | break; |
247 | | |
248 | 203k | default: |
249 | 203k | rc = (PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == negated; |
250 | 359k | } |
251 | 359k | return rc; |
252 | | |
253 | 150k | case PT_WORD: |
254 | 150k | return (PRIV(ucp_gentype)[prop->chartype] == ucp_L || |
255 | 77.7k | PRIV(ucp_gentype)[prop->chartype] == ucp_N || |
256 | 69.1k | c == CHAR_UNDERSCORE) == negated; |
257 | | |
258 | 0 | case PT_CLIST: |
259 | 0 | p = PRIV(ucd_caseless_sets) + prop->caseset; |
260 | 0 | for (;;) |
261 | 0 | { |
262 | 0 | if (c < *p) return !negated; |
263 | 0 | if (c == *p++) return negated; |
264 | 0 | } |
265 | | /* LCOV_EXCL_START */ |
266 | 0 | PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */ |
267 | 0 | break; |
268 | | /* LCOV_EXCL_STOP */ |
269 | | |
270 | | /* Haven't yet thought these through. */ |
271 | | |
272 | 20.9k | case PT_BIDICL: |
273 | 20.9k | return FALSE; |
274 | | |
275 | 48.8k | case PT_BOOL: |
276 | 48.8k | return FALSE; |
277 | 998k | } |
278 | | |
279 | 59.8k | return FALSE; |
280 | 998k | } pcre2_auto_possess.c:check_char_prop Line | Count | Source | 203 | 661k | { | 204 | 661k | BOOL ok, rc; | 205 | 661k | const uint32_t *p; | 206 | 661k | const ucd_record *prop = GET_UCD(c); | 207 | | | 208 | 661k | switch(ptype) | 209 | 661k | { | 210 | 27.1k | case PT_LAMP: | 211 | 27.1k | return (prop->chartype == ucp_Lu || | 212 | 19.0k | prop->chartype == ucp_Ll || | 213 | 10.4k | prop->chartype == ucp_Lt) == negated; | 214 | | | 215 | 46.2k | case PT_GC: | 216 | 46.2k | return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated; | 217 | | | 218 | 76.6k | case PT_PC: | 219 | 76.6k | return (pdata == prop->chartype) == negated; | 220 | | | 221 | 28.3k | case PT_SC: | 222 | 28.3k | return (pdata == prop->script) == negated; | 223 | | | 224 | 50.6k | case PT_SCX: | 225 | 50.6k | ok = (pdata == prop->script | 226 | 41.9k | || MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), pdata) != 0); | 227 | 50.6k | return ok == negated; | 228 | | | 229 | | /* These are specials */ | 230 | | | 231 | 7.37k | case PT_ALNUM: | 232 | 7.37k | return (PRIV(ucp_gentype)[prop->chartype] == ucp_L || | 233 | 4.57k | PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated; | 234 | | | 235 | | /* Perl space used to exclude VT, but from Perl 5.18 it is included, which | 236 | | means that Perl space and POSIX space are now identical. PCRE was changed | 237 | | at release 8.34. 
*/ | 238 | | | 239 | 224k | case PT_SPACE: /* Perl space */ | 240 | 227k | case PT_PXSPACE: /* POSIX space */ | 241 | 227k | switch(c) | 242 | 227k | { | 243 | 752k | HSPACE_CASES: | 244 | 752k | VSPACE_CASES: | 245 | 108k | rc = negated; | 246 | 108k | break; | 247 | | | 248 | 119k | default: | 249 | 119k | rc = (PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == negated; | 250 | 227k | } | 251 | 227k | return rc; | 252 | | | 253 | 100k | case PT_WORD: | 254 | 100k | return (PRIV(ucp_gentype)[prop->chartype] == ucp_L || | 255 | 51.3k | PRIV(ucp_gentype)[prop->chartype] == ucp_N || | 256 | 44.2k | c == CHAR_UNDERSCORE) == negated; | 257 | | | 258 | 0 | case PT_CLIST: | 259 | 0 | p = PRIV(ucd_caseless_sets) + prop->caseset; | 260 | 0 | for (;;) | 261 | 0 | { | 262 | 0 | if (c < *p) return !negated; | 263 | 0 | if (c == *p++) return negated; | 264 | 0 | } | 265 | | /* LCOV_EXCL_START */ | 266 | 0 | PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */ | 267 | 0 | break; | 268 | | /* LCOV_EXCL_STOP */ | 269 | | | 270 | | /* Haven't yet thought these through. */ | 271 | | | 272 | 15.5k | case PT_BIDICL: | 273 | 15.5k | return FALSE; | 274 | | | 275 | 21.9k | case PT_BOOL: | 276 | 21.9k | return FALSE; | 277 | 661k | } | 278 | | | 279 | 59.4k | return FALSE; | 280 | 661k | } |
pcre2_auto_possess.c:check_char_prop Line | Count | Source | 203 | 336k | { | 204 | 336k | BOOL ok, rc; | 205 | 336k | const uint32_t *p; | 206 | 336k | const ucd_record *prop = GET_UCD(c); | 207 | | | 208 | 336k | switch(ptype) | 209 | 336k | { | 210 | 8.45k | case PT_LAMP: | 211 | 8.45k | return (prop->chartype == ucp_Lu || | 212 | 5.58k | prop->chartype == ucp_Ll || | 213 | 3.78k | prop->chartype == ucp_Lt) == negated; | 214 | | | 215 | 33.0k | case PT_GC: | 216 | 33.0k | return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated; | 217 | | | 218 | 41.9k | case PT_PC: | 219 | 41.9k | return (pdata == prop->chartype) == negated; | 220 | | | 221 | 4.48k | case PT_SC: | 222 | 4.48k | return (pdata == prop->script) == negated; | 223 | | | 224 | 31.6k | case PT_SCX: | 225 | 31.6k | ok = (pdata == prop->script | 226 | 30.2k | || MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), pdata) != 0); | 227 | 31.6k | return ok == negated; | 228 | | | 229 | | /* These are specials */ | 230 | | | 231 | 2.90k | case PT_ALNUM: | 232 | 2.90k | return (PRIV(ucp_gentype)[prop->chartype] == ucp_L || | 233 | 2.35k | PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated; | 234 | | | 235 | | /* Perl space used to exclude VT, but from Perl 5.18 it is included, which | 236 | | means that Perl space and POSIX space are now identical. PCRE was changed | 237 | | at release 8.34. 
*/ | 238 | | | 239 | 130k | case PT_SPACE: /* Perl space */ | 240 | 131k | case PT_PXSPACE: /* POSIX space */ | 241 | 131k | switch(c) | 242 | 131k | { | 243 | 345k | HSPACE_CASES: | 244 | 345k | VSPACE_CASES: | 245 | 47.3k | rc = negated; | 246 | 47.3k | break; | 247 | | | 248 | 84.5k | default: | 249 | 84.5k | rc = (PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == negated; | 250 | 131k | } | 251 | 131k | return rc; | 252 | | | 253 | 49.9k | case PT_WORD: | 254 | 49.9k | return (PRIV(ucp_gentype)[prop->chartype] == ucp_L || | 255 | 26.4k | PRIV(ucp_gentype)[prop->chartype] == ucp_N || | 256 | 24.8k | c == CHAR_UNDERSCORE) == negated; | 257 | | | 258 | 0 | case PT_CLIST: | 259 | 0 | p = PRIV(ucd_caseless_sets) + prop->caseset; | 260 | 0 | for (;;) | 261 | 0 | { | 262 | 0 | if (c < *p) return !negated; | 263 | 0 | if (c == *p++) return negated; | 264 | 0 | } | 265 | | /* LCOV_EXCL_START */ | 266 | 0 | PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */ | 267 | 0 | break; | 268 | | /* LCOV_EXCL_STOP */ | 269 | | | 270 | | /* Haven't yet thought these through. */ | 271 | | | 272 | 5.36k | case PT_BIDICL: | 273 | 5.36k | return FALSE; | 274 | | | 275 | 26.9k | case PT_BOOL: | 276 | 26.9k | return FALSE; | 277 | 336k | } | 278 | | | 279 | 364 | return FALSE; | 280 | 336k | } |
|
281 | | #endif /* SUPPORT_UNICODE */ |
282 | | |
283 | | |
284 | | |
285 | | /************************************************* |
286 | | * Base opcode of repeated opcodes * |
287 | | *************************************************/ |
288 | | |
289 | | /* Returns the base opcode for repeated single character type opcodes. If the |
290 | | opcode is not a repeated character type, it returns with the original value. |
291 | | |
292 | | Arguments: c opcode |
293 | | Returns: base opcode for the type |
294 | | */ |
295 | | |
296 | | static PCRE2_UCHAR |
297 | | get_repeat_base(PCRE2_UCHAR c) |
298 | 129M | { |
299 | 129M | return (c > OP_TYPEPOSUPTO)? c : |
300 | 129M | (c >= OP_TYPESTAR)? OP_TYPESTAR : |
301 | 129M | (c >= OP_NOTSTARI)? OP_NOTSTARI : |
302 | 86.5M | (c >= OP_NOTSTAR)? OP_NOTSTAR : |
303 | 85.6M | (c >= OP_STARI)? OP_STARI : |
304 | 82.6M | OP_STAR; |
305 | 129M | } |
306 | | |
307 | | |
308 | | /************************************************* |
309 | | * Fill the character property list * |
310 | | *************************************************/ |
311 | | |
312 | | /* Checks whether the code points to an opcode that can take part in auto- |
313 | | possessification, and if so, fills a list with its properties. |
314 | | |
315 | | Arguments: |
316 | | code points to start of expression |
317 | | utf TRUE if in UTF mode |
318 | | ucp TRUE if in UCP mode |
319 | | fcc points to the case-flipping table |
320 | | list points to output list |
321 | | list[0] will be filled with the opcode |
322 | | list[1] will be non-zero if this opcode |
323 | | can match an empty character string |
324 | | list[2..7] depends on the opcode |
325 | | |
326 | | Returns: points to the start of the next opcode if *code is accepted |
327 | | NULL if *code is not accepted |
328 | | */ |
329 | | |
330 | | static PCRE2_SPTR |
331 | | get_chr_property_list(PCRE2_SPTR code, BOOL utf, BOOL ucp, const uint8_t *fcc, |
332 | | uint32_t *list) |
333 | 89.4M | { |
334 | 89.4M | PCRE2_UCHAR c = *code; |
335 | 89.4M | PCRE2_UCHAR base; |
336 | 89.4M | PCRE2_SPTR end; |
337 | 89.4M | PCRE2_SPTR class_end; |
338 | 89.4M | uint32_t chr; |
339 | | |
340 | 89.4M | #ifdef SUPPORT_UNICODE |
341 | 89.4M | uint32_t *clist_dest; |
342 | 89.4M | const uint32_t *clist_src; |
343 | | #else |
344 | | (void)utf; /* Suppress "unused parameter" compiler warnings */ |
345 | | (void)ucp; |
346 | | #endif |
347 | | |
348 | 89.4M | list[0] = c; |
349 | 89.4M | list[1] = FALSE; |
350 | 89.4M | code++; |
351 | | |
352 | 89.4M | if (c >= OP_STAR && c <= OP_TYPEPOSUPTO) |
353 | 64.2M | { |
354 | 64.2M | base = get_repeat_base(c); |
355 | 64.2M | c -= (base - OP_STAR); |
356 | | |
357 | 64.2M | if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO) |
358 | 3.00M | code += IMM2_SIZE; |
359 | | |
360 | 64.2M | list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT && |
361 | 46.5M | c != OP_POSPLUS); |
362 | | |
363 | 64.2M | switch(base) |
364 | 64.2M | { |
365 | 33.3M | case OP_STAR: |
366 | 33.3M | list[0] = OP_CHAR; |
367 | 33.3M | break; |
368 | | |
369 | 7.73M | case OP_STARI: |
370 | 7.73M | list[0] = OP_CHARI; |
371 | 7.73M | break; |
372 | | |
373 | 1.41M | case OP_NOTSTAR: |
374 | 1.41M | list[0] = OP_NOT; |
375 | 1.41M | break; |
376 | | |
377 | 420k | case OP_NOTSTARI: |
378 | 420k | list[0] = OP_NOTI; |
379 | 420k | break; |
380 | | |
381 | 21.3M | case OP_TYPESTAR: |
382 | 21.3M | list[0] = *code; |
383 | 21.3M | code++; |
384 | 21.3M | break; |
385 | 64.2M | } |
386 | 64.2M | c = list[0]; |
387 | 64.2M | } |
388 | | |
389 | 89.4M | switch(c) |
390 | 89.4M | { |
391 | 486k | case OP_NOT_DIGIT: |
392 | 3.19M | case OP_DIGIT: |
393 | 5.18M | case OP_NOT_WHITESPACE: |
394 | 5.65M | case OP_WHITESPACE: |
395 | 7.30M | case OP_NOT_WORDCHAR: |
396 | 8.41M | case OP_WORDCHAR: |
397 | 11.6M | case OP_ANY: |
398 | 12.3M | case OP_ALLANY: |
399 | 13.9M | case OP_ANYNL: |
400 | 15.1M | case OP_NOT_HSPACE: |
401 | 16.2M | case OP_HSPACE: |
402 | 17.0M | case OP_NOT_VSPACE: |
403 | 17.4M | case OP_VSPACE: |
404 | 18.7M | case OP_EXTUNI: |
405 | 18.8M | case OP_EODN: |
406 | 18.9M | case OP_EOD: |
407 | 19.1M | case OP_DOLL: |
408 | 19.2M | case OP_DOLLM: |
409 | 19.2M | return code; |
410 | | |
411 | 45.7M | case OP_CHAR: |
412 | 47.1M | case OP_NOT: |
413 | 47.1M | GETCHARINCTEST(chr, code); |
414 | 47.1M | list[2] = chr; |
415 | 47.1M | list[3] = NOTACHAR; |
416 | 47.1M | return code; |
417 | | |
418 | 11.6M | case OP_CHARI: |
419 | 12.0M | case OP_NOTI: |
420 | 12.0M | list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT; |
421 | 12.0M | GETCHARINCTEST(chr, code); |
422 | 12.0M | list[2] = chr; |
423 | | |
424 | 12.0M | #ifdef SUPPORT_UNICODE |
425 | 12.0M | if (chr < 128 || (chr < 256 && !utf && !ucp)) |
426 | 7.41M | list[3] = fcc[chr]; |
427 | 4.65M | else |
428 | 4.65M | list[3] = UCD_OTHERCASE(chr); |
429 | | #elif defined SUPPORT_WIDE_CHARS |
430 | | list[3] = (chr < 256) ? fcc[chr] : chr; |
431 | | #else |
432 | | list[3] = fcc[chr]; |
433 | | #endif |
434 | | |
435 | | /* The othercase might be the same value. */ |
436 | | |
437 | 12.0M | if (chr == list[3]) |
438 | 9.17M | list[3] = NOTACHAR; |
439 | 2.89M | else |
440 | 2.89M | list[4] = NOTACHAR; |
441 | 12.0M | return code; |
442 | | |
443 | 0 | #ifdef SUPPORT_UNICODE |
444 | 1.95M | case OP_PROP: |
445 | 4.35M | case OP_NOTPROP: |
446 | 4.35M | if (code[0] != PT_CLIST) |
447 | 3.89M | { |
448 | 3.89M | list[2] = code[0]; |
449 | 3.89M | list[3] = code[1]; |
450 | 3.89M | return code + 2; |
451 | 3.89M | } |
452 | | |
453 | | /* Convert only if we have enough space. */ |
454 | | |
455 | 456k | clist_src = PRIV(ucd_caseless_sets) + code[1]; |
456 | 456k | clist_dest = list + 2; |
457 | 456k | code += 2; |
458 | | |
459 | 1.83M | do { |
460 | 1.83M | if (clist_dest >= list + MAX_LIST) |
461 | 0 | { |
462 | | /* Early return if there is not enough space. GenerateUcd.py |
463 | | generated a list with more than 5 characters and something |
464 | | must be done about that going forward. */ |
465 | 0 | PCRE2_DEBUG_UNREACHABLE(); /* Remove if it ever triggers */ |
466 | 0 | list[2] = code[0]; |
467 | 0 | list[3] = code[1]; |
468 | 0 | return code; |
469 | 0 | } |
470 | 1.83M | *clist_dest++ = *clist_src; |
471 | 1.83M | } |
472 | 1.83M | while(*clist_src++ != NOTACHAR); |
473 | | |
474 | | /* All characters are stored. The terminating NOTACHAR is copied from the |
475 | | clist itself. */ |
476 | | |
477 | 456k | list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT; |
478 | 456k | return code; |
479 | 0 | #endif |
480 | | |
481 | 761k | case OP_NCLASS: |
482 | 2.51M | case OP_CLASS: |
483 | 2.51M | #ifdef SUPPORT_WIDE_CHARS |
484 | 4.25M | case OP_XCLASS: |
485 | 4.45M | case OP_ECLASS: |
486 | 4.45M | if (c == OP_XCLASS || c == OP_ECLASS) |
487 | 1.93M | end = code + GET(code, 0) - 1; |
488 | 2.51M | else |
489 | 2.51M | #endif |
490 | 2.51M | end = code + 32 / sizeof(PCRE2_UCHAR); |
491 | 4.45M | class_end = end; |
492 | | |
493 | 4.45M | switch(*end) |
494 | 4.45M | { |
495 | 506k | case OP_CRSTAR: |
496 | 625k | case OP_CRMINSTAR: |
497 | 1.44M | case OP_CRQUERY: |
498 | 1.61M | case OP_CRMINQUERY: |
499 | 1.62M | case OP_CRPOSSTAR: |
500 | 1.62M | case OP_CRPOSQUERY: |
501 | 1.62M | list[1] = TRUE; |
502 | 1.62M | end++; |
503 | 1.62M | break; |
504 | | |
505 | 1.27M | case OP_CRPLUS: |
506 | 1.55M | case OP_CRMINPLUS: |
507 | 1.57M | case OP_CRPOSPLUS: |
508 | 1.57M | end++; |
509 | 1.57M | break; |
510 | | |
511 | 558k | case OP_CRRANGE: |
512 | 793k | case OP_CRMINRANGE: |
513 | 805k | case OP_CRPOSRANGE: |
514 | 805k | list[1] = (GET2(end, 1) == 0); |
515 | 805k | end += 1 + 2 * IMM2_SIZE; |
516 | 805k | break; |
517 | 4.45M | } |
518 | 4.45M | list[2] = (uint32_t)(end - code); |
519 | 4.45M | list[3] = (uint32_t)(end - class_end); |
520 | 4.45M | return end; |
521 | 89.4M | } |
522 | | |
523 | 2.15M | return NULL; /* Opcode not accepted */ |
524 | 89.4M | } pcre2_auto_possess.c:get_chr_property_list Line | Count | Source | 333 | 44.8M | { | 334 | 44.8M | PCRE2_UCHAR c = *code; | 335 | 44.8M | PCRE2_UCHAR base; | 336 | 44.8M | PCRE2_SPTR end; | 337 | 44.8M | PCRE2_SPTR class_end; | 338 | 44.8M | uint32_t chr; | 339 | | | 340 | 44.8M | #ifdef SUPPORT_UNICODE | 341 | 44.8M | uint32_t *clist_dest; | 342 | 44.8M | const uint32_t *clist_src; | 343 | | #else | 344 | | (void)utf; /* Suppress "unused parameter" compiler warnings */ | 345 | | (void)ucp; | 346 | | #endif | 347 | | | 348 | 44.8M | list[0] = c; | 349 | 44.8M | list[1] = FALSE; | 350 | 44.8M | code++; | 351 | | | 352 | 44.8M | if (c >= OP_STAR && c <= OP_TYPEPOSUPTO) | 353 | 33.9M | { | 354 | 33.9M | base = get_repeat_base(c); | 355 | 33.9M | c -= (base - OP_STAR); | 356 | | | 357 | 33.9M | if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO) | 358 | 1.38M | code += IMM2_SIZE; | 359 | | | 360 | 33.9M | list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT && | 361 | 24.9M | c != OP_POSPLUS); | 362 | | | 363 | 33.9M | switch(base) | 364 | 33.9M | { | 365 | 19.7M | case OP_STAR: | 366 | 19.7M | list[0] = OP_CHAR; | 367 | 19.7M | break; | 368 | | | 369 | 3.90M | case OP_STARI: | 370 | 3.90M | list[0] = OP_CHARI; | 371 | 3.90M | break; | 372 | | | 373 | 688k | case OP_NOTSTAR: | 374 | 688k | list[0] = OP_NOT; | 375 | 688k | break; | 376 | | | 377 | 258k | case OP_NOTSTARI: | 378 | 258k | list[0] = OP_NOTI; | 379 | 258k | break; | 380 | | | 381 | 9.36M | case OP_TYPESTAR: | 382 | 9.36M | list[0] = *code; | 383 | 9.36M | code++; | 384 | 9.36M | break; | 385 | 33.9M | } | 386 | 33.9M | c = list[0]; | 387 | 33.9M | } | 388 | | | 389 | 44.8M | switch(c) | 390 | 44.8M | { | 391 | 269k | case OP_NOT_DIGIT: | 392 | 720k | case OP_DIGIT: | 393 | 1.45M | case OP_NOT_WHITESPACE: | 394 | 1.65M | case OP_WHITESPACE: | 395 | 2.07M | case OP_NOT_WORDCHAR: | 396 | 2.86M | case OP_WORDCHAR: | 397 | 5.45M | case OP_ANY: | 398 | 5.74M | case 
OP_ALLANY: | 399 | 6.19M | case OP_ANYNL: | 400 | 6.89M | case OP_NOT_HSPACE: | 401 | 7.67M | case OP_HSPACE: | 402 | 7.81M | case OP_NOT_VSPACE: | 403 | 7.91M | case OP_VSPACE: | 404 | 8.49M | case OP_EXTUNI: | 405 | 8.52M | case OP_EODN: | 406 | 8.54M | case OP_EOD: | 407 | 8.66M | case OP_DOLL: | 408 | 8.69M | case OP_DOLLM: | 409 | 8.69M | return code; | 410 | | | 411 | 24.9M | case OP_CHAR: | 412 | 25.6M | case OP_NOT: | 413 | 25.6M | GETCHARINCTEST(chr, code); | 414 | 25.6M | list[2] = chr; | 415 | 25.6M | list[3] = NOTACHAR; | 416 | 25.6M | return code; | 417 | | | 418 | 5.42M | case OP_CHARI: | 419 | 5.72M | case OP_NOTI: | 420 | 5.72M | list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT; | 421 | 5.72M | GETCHARINCTEST(chr, code); | 422 | 5.72M | list[2] = chr; | 423 | | | 424 | 5.72M | #ifdef SUPPORT_UNICODE | 425 | 5.72M | if (chr < 128 || (chr < 256 && !utf && !ucp)) | 426 | 4.88M | list[3] = fcc[chr]; | 427 | 845k | else | 428 | 845k | list[3] = UCD_OTHERCASE(chr); | 429 | | #elif defined SUPPORT_WIDE_CHARS | 430 | | list[3] = (chr < 256) ? fcc[chr] : chr; | 431 | | #else | 432 | | list[3] = fcc[chr]; | 433 | | #endif | 434 | | | 435 | | /* The othercase might be the same value. */ | 436 | | | 437 | 5.72M | if (chr == list[3]) | 438 | 3.93M | list[3] = NOTACHAR; | 439 | 1.79M | else | 440 | 1.79M | list[4] = NOTACHAR; | 441 | 5.72M | return code; | 442 | | | 443 | 0 | #ifdef SUPPORT_UNICODE | 444 | 595k | case OP_PROP: | 445 | 1.65M | case OP_NOTPROP: | 446 | 1.65M | if (code[0] != PT_CLIST) | 447 | 1.40M | { | 448 | 1.40M | list[2] = code[0]; | 449 | 1.40M | list[3] = code[1]; | 450 | 1.40M | return code + 2; | 451 | 1.40M | } | 452 | | | 453 | | /* Convert only if we have enough space. 
*/ | 454 | | | 455 | 253k | clist_src = PRIV(ucd_caseless_sets) + code[1]; | 456 | 253k | clist_dest = list + 2; | 457 | 253k | code += 2; | 458 | | | 459 | 1.01M | do { | 460 | 1.01M | if (clist_dest >= list + MAX_LIST) | 461 | 0 | { | 462 | | /* Early return if there is not enough space. GenerateUcd.py | 463 | | generated a list with more than 5 characters and something | 464 | | must be done about that going forward. */ | 465 | 0 | PCRE2_DEBUG_UNREACHABLE(); /* Remove if it ever triggers */ | 466 | 0 | list[2] = code[0]; | 467 | 0 | list[3] = code[1]; | 468 | 0 | return code; | 469 | 0 | } | 470 | 1.01M | *clist_dest++ = *clist_src; | 471 | 1.01M | } | 472 | 1.01M | while(*clist_src++ != NOTACHAR); | 473 | | | 474 | | /* All characters are stored. The terminating NOTACHAR is copied from the | 475 | | clist itself. */ | 476 | | | 477 | 253k | list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT; | 478 | 253k | return code; | 479 | 0 | #endif | 480 | | | 481 | 593k | case OP_NCLASS: | 482 | 1.97M | case OP_CLASS: | 483 | 1.97M | #ifdef SUPPORT_WIDE_CHARS | 484 | 2.30M | case OP_XCLASS: | 485 | 2.42M | case OP_ECLASS: | 486 | 2.42M | if (c == OP_XCLASS || c == OP_ECLASS) | 487 | 453k | end = code + GET(code, 0) - 1; | 488 | 1.97M | else | 489 | 1.97M | #endif | 490 | 1.97M | end = code + 32 / sizeof(PCRE2_UCHAR); | 491 | 2.42M | class_end = end; | 492 | | | 493 | 2.42M | switch(*end) | 494 | 2.42M | { | 495 | 248k | case OP_CRSTAR: | 496 | 286k | case OP_CRMINSTAR: | 497 | 667k | case OP_CRQUERY: | 498 | 767k | case OP_CRMINQUERY: | 499 | 770k | case OP_CRPOSSTAR: | 500 | 774k | case OP_CRPOSQUERY: | 501 | 774k | list[1] = TRUE; | 502 | 774k | end++; | 503 | 774k | break; | 504 | | | 505 | 890k | case OP_CRPLUS: | 506 | 1.07M | case OP_CRMINPLUS: | 507 | 1.08M | case OP_CRPOSPLUS: | 508 | 1.08M | end++; | 509 | 1.08M | break; | 510 | | | 511 | 230k | case OP_CRRANGE: | 512 | 345k | case OP_CRMINRANGE: | 513 | 350k | case OP_CRPOSRANGE: | 514 | 350k | list[1] = (GET2(end, 1) 
== 0); | 515 | 350k | end += 1 + 2 * IMM2_SIZE; | 516 | 350k | break; | 517 | 2.42M | } | 518 | 2.42M | list[2] = (uint32_t)(end - code); | 519 | 2.42M | list[3] = (uint32_t)(end - class_end); | 520 | 2.42M | return end; | 521 | 44.8M | } | 522 | | | 523 | 694k | return NULL; /* Opcode not accepted */ | 524 | 44.8M | } |
pcre2_auto_possess.c:get_chr_property_list Line | Count | Source | 333 | 17.6M | { | 334 | 17.6M | PCRE2_UCHAR c = *code; | 335 | 17.6M | PCRE2_UCHAR base; | 336 | 17.6M | PCRE2_SPTR end; | 337 | 17.6M | PCRE2_SPTR class_end; | 338 | 17.6M | uint32_t chr; | 339 | | | 340 | 17.6M | #ifdef SUPPORT_UNICODE | 341 | 17.6M | uint32_t *clist_dest; | 342 | 17.6M | const uint32_t *clist_src; | 343 | | #else | 344 | | (void)utf; /* Suppress "unused parameter" compiler warnings */ | 345 | | (void)ucp; | 346 | | #endif | 347 | | | 348 | 17.6M | list[0] = c; | 349 | 17.6M | list[1] = FALSE; | 350 | 17.6M | code++; | 351 | | | 352 | 17.6M | if (c >= OP_STAR && c <= OP_TYPEPOSUPTO) | 353 | 11.2M | { | 354 | 11.2M | base = get_repeat_base(c); | 355 | 11.2M | c -= (base - OP_STAR); | 356 | | | 357 | 11.2M | if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO) | 358 | 652k | code += IMM2_SIZE; | 359 | | | 360 | 11.2M | list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT && | 361 | 8.17M | c != OP_POSPLUS); | 362 | | | 363 | 11.2M | switch(base) | 364 | 11.2M | { | 365 | 5.67M | case OP_STAR: | 366 | 5.67M | list[0] = OP_CHAR; | 367 | 5.67M | break; | 368 | | | 369 | 2.09M | case OP_STARI: | 370 | 2.09M | list[0] = OP_CHARI; | 371 | 2.09M | break; | 372 | | | 373 | 212k | case OP_NOTSTAR: | 374 | 212k | list[0] = OP_NOT; | 375 | 212k | break; | 376 | | | 377 | 73.2k | case OP_NOTSTARI: | 378 | 73.2k | list[0] = OP_NOTI; | 379 | 73.2k | break; | 380 | | | 381 | 3.17M | case OP_TYPESTAR: | 382 | 3.17M | list[0] = *code; | 383 | 3.17M | code++; | 384 | 3.17M | break; | 385 | 11.2M | } | 386 | 11.2M | c = list[0]; | 387 | 11.2M | } | 388 | | | 389 | 17.6M | switch(c) | 390 | 17.6M | { | 391 | 64.4k | case OP_NOT_DIGIT: | 392 | 201k | case OP_DIGIT: | 393 | 780k | case OP_NOT_WHITESPACE: | 394 | 893k | case OP_WHITESPACE: | 395 | 998k | case OP_NOT_WORDCHAR: | 396 | 1.12M | case OP_WORDCHAR: | 397 | 1.25M | case OP_ANY: | 398 | 1.29M | case OP_ALLANY: | 399 | 
1.72M | case OP_ANYNL: | 400 | 1.89M | case OP_NOT_HSPACE: | 401 | 2.05M | case OP_HSPACE: | 402 | 2.27M | case OP_NOT_VSPACE: | 403 | 2.34M | case OP_VSPACE: | 404 | 2.72M | case OP_EXTUNI: | 405 | 2.74M | case OP_EODN: | 406 | 2.76M | case OP_EOD: | 407 | 2.80M | case OP_DOLL: | 408 | 2.84M | case OP_DOLLM: | 409 | 2.84M | return code; | 410 | | | 411 | 8.95M | case OP_CHAR: | 412 | 9.16M | case OP_NOT: | 413 | 9.16M | GETCHARINCTEST(chr, code); | 414 | 9.16M | list[2] = chr; | 415 | 9.16M | list[3] = NOTACHAR; | 416 | 9.16M | return code; | 417 | | | 418 | 3.34M | case OP_CHARI: | 419 | 3.41M | case OP_NOTI: | 420 | 3.41M | list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT; | 421 | 3.41M | GETCHARINCTEST(chr, code); | 422 | 3.41M | list[2] = chr; | 423 | | | 424 | 3.41M | #ifdef SUPPORT_UNICODE | 425 | 3.41M | if (chr < 128 || (chr < 256 && !utf && !ucp)) | 426 | 1.23M | list[3] = fcc[chr]; | 427 | 2.18M | else | 428 | 2.18M | list[3] = UCD_OTHERCASE(chr); | 429 | | #elif defined SUPPORT_WIDE_CHARS | 430 | | list[3] = (chr < 256) ? fcc[chr] : chr; | 431 | | #else | 432 | | list[3] = fcc[chr]; | 433 | | #endif | 434 | | | 435 | | /* The othercase might be the same value. */ | 436 | | | 437 | 3.41M | if (chr == list[3]) | 438 | 2.86M | list[3] = NOTACHAR; | 439 | 549k | else | 440 | 549k | list[4] = NOTACHAR; | 441 | 3.41M | return code; | 442 | | | 443 | 0 | #ifdef SUPPORT_UNICODE | 444 | 447k | case OP_PROP: | 445 | 839k | case OP_NOTPROP: | 446 | 839k | if (code[0] != PT_CLIST) | 447 | 748k | { | 448 | 748k | list[2] = code[0]; | 449 | 748k | list[3] = code[1]; | 450 | 748k | return code + 2; | 451 | 748k | } | 452 | | | 453 | | /* Convert only if we have enough space. */ | 454 | | | 455 | 91.0k | clist_src = PRIV(ucd_caseless_sets) + code[1]; | 456 | 91.0k | clist_dest = list + 2; | 457 | 91.0k | code += 2; | 458 | | | 459 | 365k | do { | 460 | 365k | if (clist_dest >= list + MAX_LIST) | 461 | 0 | { | 462 | | /* Early return if there is not enough space. 
GenerateUcd.py | 463 | | generated a list with more than 5 characters and something | 464 | | must be done about that going forward. */ | 465 | 0 | PCRE2_DEBUG_UNREACHABLE(); /* Remove if it ever triggers */ | 466 | 0 | list[2] = code[0]; | 467 | 0 | list[3] = code[1]; | 468 | 0 | return code; | 469 | 0 | } | 470 | 365k | *clist_dest++ = *clist_src; | 471 | 365k | } | 472 | 365k | while(*clist_src++ != NOTACHAR); | 473 | | | 474 | | /* All characters are stored. The terminating NOTACHAR is copied from the | 475 | | clist itself. */ | 476 | | | 477 | 91.0k | list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT; | 478 | 91.0k | return code; | 479 | 0 | #endif | 480 | | | 481 | 70.6k | case OP_NCLASS: | 482 | 215k | case OP_CLASS: | 483 | 215k | #ifdef SUPPORT_WIDE_CHARS | 484 | 652k | case OP_XCLASS: | 485 | 688k | case OP_ECLASS: | 486 | 688k | if (c == OP_XCLASS || c == OP_ECLASS) | 487 | 472k | end = code + GET(code, 0) - 1; | 488 | 215k | else | 489 | 215k | #endif | 490 | 215k | end = code + 32 / sizeof(PCRE2_UCHAR); | 491 | 688k | class_end = end; | 492 | | | 493 | 688k | switch(*end) | 494 | 688k | { | 495 | 114k | case OP_CRSTAR: | 496 | 164k | case OP_CRMINSTAR: | 497 | 265k | case OP_CRQUERY: | 498 | 302k | case OP_CRMINQUERY: | 499 | 304k | case OP_CRPOSSTAR: | 500 | 305k | case OP_CRPOSQUERY: | 501 | 305k | list[1] = TRUE; | 502 | 305k | end++; | 503 | 305k | break; | 504 | | | 505 | 113k | case OP_CRPLUS: | 506 | 139k | case OP_CRMINPLUS: | 507 | 144k | case OP_CRPOSPLUS: | 508 | 144k | end++; | 509 | 144k | break; | 510 | | | 511 | 123k | case OP_CRRANGE: | 512 | 162k | case OP_CRMINRANGE: | 513 | 163k | case OP_CRPOSRANGE: | 514 | 163k | list[1] = (GET2(end, 1) == 0); | 515 | 163k | end += 1 + 2 * IMM2_SIZE; | 516 | 163k | break; | 517 | 688k | } | 518 | 688k | list[2] = (uint32_t)(end - code); | 519 | 688k | list[3] = (uint32_t)(end - class_end); | 520 | 688k | return end; | 521 | 17.6M | } | 522 | | | 523 | 708k | return NULL; /* Opcode not accepted */ | 524 
| 17.6M | } |
pcre2_auto_possess.c:get_chr_property_list Line | Count | Source | 333 | 26.8M | { | 334 | 26.8M | PCRE2_UCHAR c = *code; | 335 | 26.8M | PCRE2_UCHAR base; | 336 | 26.8M | PCRE2_SPTR end; | 337 | 26.8M | PCRE2_SPTR class_end; | 338 | 26.8M | uint32_t chr; | 339 | | | 340 | 26.8M | #ifdef SUPPORT_UNICODE | 341 | 26.8M | uint32_t *clist_dest; | 342 | 26.8M | const uint32_t *clist_src; | 343 | | #else | 344 | | (void)utf; /* Suppress "unused parameter" compiler warnings */ | 345 | | (void)ucp; | 346 | | #endif | 347 | | | 348 | 26.8M | list[0] = c; | 349 | 26.8M | list[1] = FALSE; | 350 | 26.8M | code++; | 351 | | | 352 | 26.8M | if (c >= OP_STAR && c <= OP_TYPEPOSUPTO) | 353 | 19.0M | { | 354 | 19.0M | base = get_repeat_base(c); | 355 | 19.0M | c -= (base - OP_STAR); | 356 | | | 357 | 19.0M | if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO) | 358 | 970k | code += IMM2_SIZE; | 359 | | | 360 | 19.0M | list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT && | 361 | 13.4M | c != OP_POSPLUS); | 362 | | | 363 | 19.0M | switch(base) | 364 | 19.0M | { | 365 | 7.86M | case OP_STAR: | 366 | 7.86M | list[0] = OP_CHAR; | 367 | 7.86M | break; | 368 | | | 369 | 1.73M | case OP_STARI: | 370 | 1.73M | list[0] = OP_CHARI; | 371 | 1.73M | break; | 372 | | | 373 | 517k | case OP_NOTSTAR: | 374 | 517k | list[0] = OP_NOT; | 375 | 517k | break; | 376 | | | 377 | 88.9k | case OP_NOTSTARI: | 378 | 88.9k | list[0] = OP_NOTI; | 379 | 88.9k | break; | 380 | | | 381 | 8.84M | case OP_TYPESTAR: | 382 | 8.84M | list[0] = *code; | 383 | 8.84M | code++; | 384 | 8.84M | break; | 385 | 19.0M | } | 386 | 19.0M | c = list[0]; | 387 | 19.0M | } | 388 | | | 389 | 26.8M | switch(c) | 390 | 26.8M | { | 391 | 152k | case OP_NOT_DIGIT: | 392 | 2.27M | case OP_DIGIT: | 393 | 2.94M | case OP_NOT_WHITESPACE: | 394 | 3.11M | case OP_WHITESPACE: | 395 | 4.23M | case OP_NOT_WORDCHAR: | 396 | 4.42M | case OP_WORDCHAR: | 397 | 4.95M | case OP_ANY: | 398 | 5.32M | case OP_ALLANY: | 399 
| 6.02M | case OP_ANYNL: | 400 | 6.36M | case OP_NOT_HSPACE: | 401 | 6.56M | case OP_HSPACE: | 402 | 6.93M | case OP_NOT_VSPACE: | 403 | 7.17M | case OP_VSPACE: | 404 | 7.57M | case OP_EXTUNI: | 405 | 7.59M | case OP_EODN: | 406 | 7.60M | case OP_EOD: | 407 | 7.67M | case OP_DOLL: | 408 | 7.68M | case OP_DOLLM: | 409 | 7.68M | return code; | 410 | | | 411 | 11.8M | case OP_CHAR: | 412 | 12.3M | case OP_NOT: | 413 | 12.3M | GETCHARINCTEST(chr, code); | 414 | 12.3M | list[2] = chr; | 415 | 12.3M | list[3] = NOTACHAR; | 416 | 12.3M | return code; | 417 | | | 418 | 2.82M | case OP_CHARI: | 419 | 2.92M | case OP_NOTI: | 420 | 2.92M | list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT; | 421 | 2.92M | GETCHARINCTEST(chr, code); | 422 | 2.92M | list[2] = chr; | 423 | | | 424 | 2.92M | #ifdef SUPPORT_UNICODE | 425 | 2.92M | if (chr < 128 || (chr < 256 && !utf && !ucp)) | 426 | 1.30M | list[3] = fcc[chr]; | 427 | 1.62M | else | 428 | 1.62M | list[3] = UCD_OTHERCASE(chr); | 429 | | #elif defined SUPPORT_WIDE_CHARS | 430 | | list[3] = (chr < 256) ? fcc[chr] : chr; | 431 | | #else | 432 | | list[3] = fcc[chr]; | 433 | | #endif | 434 | | | 435 | | /* The othercase might be the same value. */ | 436 | | | 437 | 2.92M | if (chr == list[3]) | 438 | 2.36M | list[3] = NOTACHAR; | 439 | 552k | else | 440 | 552k | list[4] = NOTACHAR; | 441 | 2.92M | return code; | 442 | | | 443 | 0 | #ifdef SUPPORT_UNICODE | 444 | 915k | case OP_PROP: | 445 | 1.85M | case OP_NOTPROP: | 446 | 1.85M | if (code[0] != PT_CLIST) | 447 | 1.74M | { | 448 | 1.74M | list[2] = code[0]; | 449 | 1.74M | list[3] = code[1]; | 450 | 1.74M | return code + 2; | 451 | 1.74M | } | 452 | | | 453 | | /* Convert only if we have enough space. */ | 454 | | | 455 | 112k | clist_src = PRIV(ucd_caseless_sets) + code[1]; | 456 | 112k | clist_dest = list + 2; | 457 | 112k | code += 2; | 458 | | | 459 | 454k | do { | 460 | 454k | if (clist_dest >= list + MAX_LIST) | 461 | 0 | { | 462 | | /* Early return if there is not enough space. 
GenerateUcd.py | 463 | | generated a list with more than 5 characters and something | 464 | | must be done about that going forward. */ | 465 | 0 | PCRE2_DEBUG_UNREACHABLE(); /* Remove if it ever triggers */ | 466 | 0 | list[2] = code[0]; | 467 | 0 | list[3] = code[1]; | 468 | 0 | return code; | 469 | 0 | } | 470 | 454k | *clist_dest++ = *clist_src; | 471 | 454k | } | 472 | 454k | while(*clist_src++ != NOTACHAR); | 473 | | | 474 | | /* All characters are stored. The terminating NOTACHAR is copied from the | 475 | | clist itself. */ | 476 | | | 477 | 112k | list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT; | 478 | 112k | return code; | 479 | 0 | #endif | 480 | | | 481 | 97.8k | case OP_NCLASS: | 482 | 328k | case OP_CLASS: | 483 | 328k | #ifdef SUPPORT_WIDE_CHARS | 484 | 1.28M | case OP_XCLASS: | 485 | 1.34M | case OP_ECLASS: | 486 | 1.34M | if (c == OP_XCLASS || c == OP_ECLASS) | 487 | 1.01M | end = code + GET(code, 0) - 1; | 488 | 328k | else | 489 | 328k | #endif | 490 | 328k | end = code + 32 / sizeof(PCRE2_UCHAR); | 491 | 1.34M | class_end = end; | 492 | | | 493 | 1.34M | switch(*end) | 494 | 1.34M | { | 495 | 143k | case OP_CRSTAR: | 496 | 174k | case OP_CRMINSTAR: | 497 | 507k | case OP_CRQUERY: | 498 | 545k | case OP_CRMINQUERY: | 499 | 547k | case OP_CRPOSSTAR: | 500 | 549k | case OP_CRPOSQUERY: | 501 | 549k | list[1] = TRUE; | 502 | 549k | end++; | 503 | 549k | break; | 504 | | | 505 | 272k | case OP_CRPLUS: | 506 | 344k | case OP_CRMINPLUS: | 507 | 346k | case OP_CRPOSPLUS: | 508 | 346k | end++; | 509 | 346k | break; | 510 | | | 511 | 203k | case OP_CRRANGE: | 512 | 285k | case OP_CRMINRANGE: | 513 | 291k | case OP_CRPOSRANGE: | 514 | 291k | list[1] = (GET2(end, 1) == 0); | 515 | 291k | end += 1 + 2 * IMM2_SIZE; | 516 | 291k | break; | 517 | 1.34M | } | 518 | 1.34M | list[2] = (uint32_t)(end - code); | 519 | 1.34M | list[3] = (uint32_t)(end - class_end); | 520 | 1.34M | return end; | 521 | 26.8M | } | 522 | | | 523 | 750k | return NULL; /* Opcode not accepted 
*/ | 524 | 26.8M | } |
|
525 | | |
526 | | |
527 | | |
528 | | /************************************************* |
529 | | * Scan further character sets for match * |
530 | | *************************************************/ |
531 | | |
532 | | /* Checks whether the base and the current opcode have a common character, in |
533 | | which case the base cannot be possessified. |
534 | | |
535 | | Arguments: |
536 | | code points to the byte code |
537 | | utf TRUE in UTF mode |
538 | | ucp TRUE in UCP mode |
539 | | cb compile data block |
540 | | base_list the data list of the base opcode |
541 | | base_end the end of the base opcode |
542 | | rec_limit points to recursion depth counter |
543 | | |
544 | | Returns: TRUE if the auto-possessification is possible |
545 | | */ |
546 | | |
547 | | static BOOL |
548 | | compare_opcodes(PCRE2_SPTR code, BOOL utf, BOOL ucp, const compile_block *cb, |
549 | | const uint32_t *base_list, PCRE2_SPTR base_end, int *rec_limit) |
550 | 67.3M | { |
551 | 67.3M | PCRE2_UCHAR c; |
552 | 67.3M | uint32_t list[MAX_LIST]; |
553 | 67.3M | const uint32_t *chr_ptr; |
554 | 67.3M | const uint32_t *ochr_ptr; |
555 | 67.3M | const uint32_t *list_ptr; |
556 | 67.3M | PCRE2_SPTR next_code; |
557 | 67.3M | #ifdef SUPPORT_WIDE_CHARS |
558 | 67.3M | PCRE2_SPTR xclass_flags; |
559 | 67.3M | #endif |
560 | 67.3M | const uint8_t *class_bitset; |
561 | 67.3M | const uint8_t *set1, *set2, *set_end; |
562 | 67.3M | uint32_t chr; |
563 | 67.3M | BOOL accepted, invert_bits; |
564 | 67.3M | BOOL entered_a_group = FALSE; |
565 | | |
566 | 67.3M | if (--(*rec_limit) <= 0) return FALSE; /* Recursion has gone too deep */ |
567 | | |
568 | | /* Note: the base_list[1] contains whether the current opcode has a greedy |
569 | | (represented by a non-zero value) quantifier. This is different from |
570 | | other character type lists, which store here that the character iterator |
571 | | matches to an empty string (also represented by a non-zero value). */ |
572 | | |
573 | 26.3M | for(;;) |
574 | 76.2M | { |
575 | 76.2M | PCRE2_SPTR bracode; |
576 | | |
577 | | /* All operations move the code pointer forward. |
578 | | Therefore infinite recursions are not possible. */ |
579 | | |
580 | 76.2M | c = *code; |
581 | | |
582 | | /* Skip over callouts */ |
583 | | |
584 | 76.2M | if (c == OP_CALLOUT) |
585 | 3.36M | { |
586 | 3.36M | code += PRIV(OP_lengths)[c]; |
587 | 3.36M | continue; |
588 | 3.36M | } |
589 | | |
590 | 72.9M | if (c == OP_CALLOUT_STR) |
591 | 20.5k | { |
592 | 20.5k | code += GET(code, 1 + 2*LINK_SIZE); |
593 | 20.5k | continue; |
594 | 20.5k | } |
595 | | |
596 | | /* At the end of a branch, skip to the end of the group and process it. */ |
597 | | |
598 | 72.8M | if (c == OP_ALT) |
599 | 1.85M | { |
600 | 8.95M | do code += GET(code, 1); while (*code == OP_ALT); |
601 | 1.85M | c = *code; |
602 | 1.85M | } |
603 | | |
604 | | /* Inspect the next opcode. */ |
605 | | |
606 | 72.8M | switch(c) |
607 | 72.8M | { |
608 | | /* We can always possessify a greedy iterator at the end of the pattern, |
609 | | which is reached after skipping over the final OP_KET. A non-greedy |
610 | | iterator must never be possessified. */ |
611 | | |
612 | 289k | case OP_END: |
613 | 289k | return base_list[1] != 0; |
614 | | |
615 | | /* When an iterator is at the end of certain kinds of group we can inspect |
616 | | what follows the group by skipping over the closing ket. Note that this |
617 | | does not apply to OP_KETRMAX or OP_KETRMIN because what follows any given |
618 | | iteration is variable (could be another iteration or could be the next |
619 | | item). As these two opcodes are not listed in the next switch, they will |
620 | | end up as the next code to inspect, and return FALSE by virtue of being |
621 | | unsupported. */ |
622 | | |
623 | 46.1M | case OP_KET: |
624 | 46.3M | case OP_KETRPOS: |
625 | | /* The non-greedy case cannot be converted to a possessive form. */ |
626 | | |
627 | 46.3M | if (base_list[1] == 0) return FALSE; |
628 | | |
629 | | /* If the bracket is capturing it might be referenced by an OP_RECURSE |
630 | | so its last iterator can never be possessified if the pattern contains |
631 | | recursions. (This could be improved by keeping a list of group numbers that |
632 | | are called by recursion.) */ |
633 | | |
634 | 45.3M | bracode = code - GET(code, 1); |
635 | 45.3M | switch(*bracode) |
636 | 45.3M | { |
637 | 2.89M | case OP_CBRA: |
638 | 2.89M | case OP_SCBRA: |
639 | 2.93M | case OP_CBRAPOS: |
640 | 3.00M | case OP_SCBRAPOS: |
641 | 3.00M | if (cb->had_recurse) return FALSE; |
642 | 2.58M | break; |
643 | | |
644 | | /* A script run might have to backtrack if the iterated item can match |
645 | | characters from more than one script. So give up unless repeating an |
646 | | explicit character. */ |
647 | | |
648 | 2.58M | case OP_SCRIPT_RUN: |
649 | 73.0k | if (base_list[0] != OP_CHAR && base_list[0] != OP_CHARI) |
650 | 19.4k | return FALSE; |
651 | 53.6k | break; |
652 | | |
653 | | /* Atomic sub-patterns and forward assertions can always auto-possessify |
654 | | their last iterator. However, if the group was entered as a result of |
655 | | checking a previous iterator, this is not possible. */ |
656 | | |
657 | 275k | case OP_ASSERT: |
658 | 426k | case OP_ASSERT_NOT: |
659 | 518k | case OP_ONCE: |
660 | 518k | return !entered_a_group; |
661 | | |
662 | | /* Fixed-length lookbehinds can be treated the same way, but variable |
663 | | length lookbehinds must not auto-possessify their last iterator. Note |
664 | | that in order to identify a variable length lookbehind we must check |
665 | | through all branches, because some may be of fixed length. */ |
666 | | |
667 | 152k | case OP_ASSERTBACK: |
668 | 264k | case OP_ASSERTBACK_NOT: |
669 | 264k | do |
670 | 293k | { |
671 | 293k | if (bracode[1+LINK_SIZE] == OP_VREVERSE) return FALSE; /* Variable */ |
672 | 42.5k | bracode += GET(bracode, 1); |
673 | 42.5k | } |
674 | 264k | while (*bracode == OP_ALT); |
675 | 14.1k | return !entered_a_group; /* Not variable length */ |
676 | | |
677 | | /* Non-atomic assertions - don't possessify last iterator. This needs |
678 | | more thought. */ |
679 | | |
680 | 170k | case OP_ASSERT_NA: |
681 | 546k | case OP_ASSERTBACK_NA: |
682 | 546k | return FALSE; |
683 | 45.3M | } |
684 | | |
685 | | /* Skip over the bracket and inspect what comes next. */ |
686 | | |
687 | 43.6M | code += PRIV(OP_lengths)[c]; |
688 | 43.6M | continue; |
689 | | |
690 | | /* Handle cases where the next item is a group. */ |
691 | | |
692 | 36.1k | case OP_ONCE: |
693 | 678k | case OP_BRA: |
694 | 2.09M | case OP_CBRA: |
695 | 2.09M | next_code = code + GET(code, 1); |
696 | 2.09M | code += PRIV(OP_lengths)[c]; |
697 | | |
698 | | /* Check each branch. We have to recurse a level for all but the last |
699 | | branch. */ |
700 | | |
701 | 2.75M | while (*next_code == OP_ALT) |
702 | 1.07M | { |
703 | 1.07M | if (!compare_opcodes(code, utf, ucp, cb, base_list, base_end, rec_limit)) |
704 | 424k | return FALSE; |
705 | 653k | code = next_code + 1 + LINK_SIZE; |
706 | 653k | next_code += GET(next_code, 1); |
707 | 653k | } |
708 | | |
709 | 1.67M | entered_a_group = TRUE; |
710 | 1.67M | continue; |
711 | | |
712 | 517k | case OP_BRAZERO: |
713 | 559k | case OP_BRAMINZERO: |
714 | | |
715 | 559k | next_code = code + 1; |
716 | 559k | if (*next_code != OP_BRA && *next_code != OP_CBRA && |
717 | 75.1k | *next_code != OP_ONCE) return FALSE; |
718 | | |
719 | 556k | do next_code += GET(next_code, 1); while (*next_code == OP_ALT); |
720 | | |
721 | | /* The bracket content will be checked by the OP_BRA/OP_CBRA case above. */ |
722 | | |
723 | 502k | next_code += 1 + LINK_SIZE; |
724 | 502k | if (!compare_opcodes(next_code, utf, ucp, cb, base_list, base_end, |
725 | 502k | rec_limit)) |
726 | 55.6k | return FALSE; |
727 | | |
728 | 446k | code += PRIV(OP_lengths)[c]; |
729 | 446k | continue; |
730 | | |
731 | | /* The next opcode does not need special handling; fall through and use it |
732 | | to see if the base can be possessified. */ |
733 | | |
734 | 23.6M | default: |
735 | 23.6M | break; |
736 | 72.8M | } |
737 | | |
738 | | /* We now have the next appropriate opcode to compare with the base. Check |
739 | | for a supported opcode, and load its properties. */ |
740 | | |
741 | 23.6M | code = get_chr_property_list(code, utf, ucp, cb->fcc, list); |
742 | 23.6M | if (code == NULL) return FALSE; /* Unsupported */ |
743 | | |
744 | | /* If either opcode is a small character list, set pointers for comparing |
745 | | characters from that list with another list, or with a property. */ |
746 | | |
747 | 21.4M | if (base_list[0] == OP_CHAR) |
748 | 13.4M | { |
749 | 13.4M | chr_ptr = base_list + 2; |
750 | 13.4M | list_ptr = list; |
751 | 13.4M | } |
752 | 8.00M | else if (list[0] == OP_CHAR) |
753 | 6.37M | { |
754 | 6.37M | chr_ptr = list + 2; |
755 | 6.37M | list_ptr = base_list; |
756 | 6.37M | } |
757 | | |
758 | | /* Character bitsets can also be compared to certain opcodes. */ |
759 | | |
760 | 1.63M | else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS |
761 | | #if PCRE2_CODE_UNIT_WIDTH == 8 |
762 | | /* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. */ |
763 | 590k | || (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS)) |
764 | | #endif |
765 | 1.63M | ) |
766 | 312k | { |
767 | | #if PCRE2_CODE_UNIT_WIDTH == 8 |
768 | 208k | if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS)) |
769 | | #else |
770 | 104k | if (base_list[0] == OP_CLASS) |
771 | 68.5k | #endif |
772 | 191k | { |
773 | 191k | set1 = (const uint8_t *)(base_end - base_list[2]); |
774 | 191k | list_ptr = list; |
775 | 191k | } |
776 | 120k | else |
777 | 120k | { |
778 | 120k | set1 = (const uint8_t *)(code - list[2]); |
779 | 120k | list_ptr = base_list; |
780 | 120k | } |
781 | | |
782 | 312k | invert_bits = FALSE; |
783 | 312k | switch(list_ptr[0]) |
784 | 312k | { |
785 | 45.0k | case OP_CLASS: |
786 | 65.8k | case OP_NCLASS: |
787 | 65.8k | set2 = (const uint8_t *) |
788 | 65.8k | ((list_ptr == list ? code : base_end) - list_ptr[2]); |
789 | 65.8k | break; |
790 | | |
791 | 0 | #ifdef SUPPORT_WIDE_CHARS |
792 | 44.1k | case OP_XCLASS: |
793 | 44.1k | xclass_flags = (list_ptr == list ? code : base_end) - |
794 | 44.1k | list_ptr[2] + LINK_SIZE; |
795 | 44.1k | if ((*xclass_flags & XCL_HASPROP) != 0) return FALSE; |
796 | 33.2k | if ((*xclass_flags & XCL_MAP) == 0) |
797 | 12.0k | { |
798 | | /* No bits are set for characters < 256. */ |
799 | 12.0k | if (list[1] == 0) return (*xclass_flags & XCL_NOT) == 0; |
800 | | /* Might be an empty repeat. */ |
801 | 4.93k | continue; |
802 | 12.0k | } |
803 | 21.2k | set2 = (const uint8_t *)(xclass_flags + 1); |
804 | 21.2k | break; |
805 | 0 | #endif |
806 | | |
807 | 15.6k | case OP_NOT_DIGIT: |
808 | 15.6k | invert_bits = TRUE; |
809 | 15.6k | PCRE2_FALLTHROUGH /* Fall through */ |
810 | 26.6k | case OP_DIGIT: |
811 | 26.6k | set2 = (const uint8_t *)(cb->cbits + cbit_digit); |
812 | 26.6k | break; |
813 | | |
814 | 11.8k | case OP_NOT_WHITESPACE: |
815 | 11.8k | invert_bits = TRUE; |
816 | 11.8k | PCRE2_FALLTHROUGH /* Fall through */ |
817 | 23.7k | case OP_WHITESPACE: |
818 | 23.7k | set2 = (const uint8_t *)(cb->cbits + cbit_space); |
819 | 23.7k | break; |
820 | | |
821 | 12.3k | case OP_NOT_WORDCHAR: |
822 | 12.3k | invert_bits = TRUE; |
823 | 12.3k | PCRE2_FALLTHROUGH /* Fall through */ |
824 | 26.9k | case OP_WORDCHAR: |
825 | 26.9k | set2 = (const uint8_t *)(cb->cbits + cbit_word); |
826 | 26.9k | break; |
827 | | |
828 | 125k | default: |
829 | 125k | return FALSE; |
830 | 312k | } |
831 | | |
832 | | /* Because the bit sets are unaligned bytes, we need to perform byte |
833 | | comparison here. */ |
834 | | |
835 | 164k | set_end = set1 + 32; |
836 | 164k | if (invert_bits) |
837 | 39.8k | { |
838 | 39.8k | do |
839 | 561k | { |
840 | 561k | if ((*set1++ & ~(*set2++)) != 0) return FALSE; |
841 | 561k | } |
842 | 533k | while (set1 < set_end); |
843 | 39.8k | } |
844 | 124k | else |
845 | 124k | { |
846 | 124k | do |
847 | 2.20M | { |
848 | 2.20M | if ((*set1++ & *set2++) != 0) return FALSE; |
849 | 2.20M | } |
850 | 2.13M | while (set1 < set_end); |
851 | 124k | } |
852 | | |
853 | 64.4k | if (list[1] == 0) return TRUE; |
854 | | /* Might be an empty repeat. */ |
855 | 20.3k | continue; |
856 | 64.4k | } |
857 | | |
858 | | /* Some property combinations are also acceptable. Unicode property opcodes are |
859 | | processed specially; the rest can be handled with a lookup table. */ |
860 | | |
861 | 1.32M | else |
862 | 1.32M | { |
863 | 1.32M | uint32_t leftop, rightop; |
864 | | |
865 | 1.32M | leftop = base_list[0]; |
866 | 1.32M | rightop = list[0]; |
867 | | |
868 | 1.32M | #ifdef SUPPORT_UNICODE |
869 | 1.32M | accepted = FALSE; /* Always set in non-unicode case. */ |
870 | 1.32M | if (leftop == OP_PROP || leftop == OP_NOTPROP) |
871 | 306k | { |
872 | 306k | if (rightop == OP_EOD) |
873 | 9.22k | accepted = TRUE; |
874 | 296k | else if (rightop == OP_PROP || rightop == OP_NOTPROP) |
875 | 218k | { |
876 | 218k | int n; |
877 | 218k | const uint8_t *p; |
878 | 218k | BOOL same = leftop == rightop; |
879 | 218k | BOOL lisprop = leftop == OP_PROP; |
880 | 218k | BOOL risprop = rightop == OP_PROP; |
881 | 218k | BOOL bothprop = lisprop && risprop; |
882 | | |
883 | | /* There's a table that specifies how each combination is to be |
884 | | processed: |
885 | | 0 Always return FALSE (never auto-possessify) |
886 | | 1 Character groups are distinct (possessify if both are OP_PROP) |
887 | | 2 Check character categories in the same group (general or particular) |
888 | | 3 Return TRUE if the two opcodes are not the same |
889 | | ... see comments below |
890 | | */ |
891 | | |
892 | 218k | n = propposstab[base_list[2]][list[2]]; |
893 | 218k | switch(n) |
894 | 218k | { |
895 | 7.64k | case 0: break; |
896 | 11.6k | case 1: accepted = bothprop; break; |
897 | 16.1k | case 2: accepted = (base_list[3] == list[3]) != same; break; |
898 | 16.3k | case 3: accepted = !same; break; |
899 | | |
900 | 16.6k | case 4: /* Left general category, right particular category */ |
901 | 16.6k | accepted = risprop && catposstab[base_list[3]][list[3]] == same; |
902 | 16.6k | break; |
903 | | |
904 | 13.8k | case 5: /* Right general category, left particular category */ |
905 | 13.8k | accepted = lisprop && catposstab[list[3]][base_list[3]] == same; |
906 | 13.8k | break; |
907 | | |
908 | | /* This code is logically tricky. Think hard before fiddling with it. |
909 | | The posspropstab table has four entries per row. Each row relates to |
910 | | one of PCRE's special properties such as ALNUM or SPACE or WORD. |
911 | | Only WORD actually needs all four entries, but using repeats for the |
912 | | others means they can all use the same code below. |
913 | | |
914 | | The first two entries in each row are Unicode general categories, and |
915 | | apply always, because all the characters they include are part of the |
916 | | PCRE character set. The third and fourth entries are a general and a |
917 | | particular category, respectively, that include one or more relevant |
918 | | characters. One or the other is used, depending on whether the check |
919 | | is for a general or a particular category. However, in both cases the |
920 | | category contains more characters than the specials that are defined |
921 | | for the property being tested against. Therefore, it cannot be used |
922 | | in a NOTPROP case. |
923 | | |
924 | | Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po. |
925 | | Underscore is covered by ucp_P or ucp_Po. */ |
926 | | |
927 | 4.58k | case 6: /* Left alphanum vs right general category */ |
928 | 18.2k | case 7: /* Left space vs right general category */ |
929 | 36.6k | case 8: /* Left word vs right general category */ |
930 | 36.6k | p = posspropstab[n-6]; |
931 | 36.6k | accepted = risprop && lisprop == |
932 | 27.1k | (list[3] != p[0] && |
933 | 20.7k | list[3] != p[1] && |
934 | 15.3k | (list[3] != p[2] || !lisprop)); |
935 | 36.6k | break; |
936 | | |
937 | 1.87k | case 9: /* Right alphanum vs left general category */ |
938 | 14.0k | case 10: /* Right space vs left general category */ |
939 | 28.9k | case 11: /* Right word vs left general category */ |
940 | 28.9k | p = posspropstab[n-9]; |
941 | 28.9k | accepted = lisprop && risprop == |
942 | 21.0k | (base_list[3] != p[0] && |
943 | 16.7k | base_list[3] != p[1] && |
944 | 10.9k | (base_list[3] != p[2] || !risprop)); |
945 | 28.9k | break; |
946 | | |
947 | 2.44k | case 12: /* Left alphanum vs right particular category */ |
948 | 26.3k | case 13: /* Left space vs right particular category */ |
949 | 38.3k | case 14: /* Left word vs right particular category */ |
950 | 38.3k | p = posspropstab[n-12]; |
951 | 38.3k | accepted = risprop && lisprop == |
952 | 26.2k | (catposstab[p[0]][list[3]] && |
953 | 20.5k | catposstab[p[1]][list[3]] && |
954 | 12.8k | (list[3] != p[3] || !lisprop)); |
955 | 38.3k | break; |
956 | | |
957 | 4.04k | case 15: /* Right alphanum vs left particular category */ |
958 | 19.6k | case 16: /* Right space vs left particular category */ |
959 | 31.8k | case 17: /* Right word vs left particular category */ |
960 | 31.8k | p = posspropstab[n-15]; |
961 | 31.8k | accepted = lisprop && risprop == |
962 | 22.6k | (catposstab[p[0]][base_list[3]] && |
963 | 19.0k | catposstab[p[1]][base_list[3]] && |
964 | 9.58k | (base_list[3] != p[3] || !risprop)); |
965 | 31.8k | break; |
966 | 218k | } |
967 | 218k | } |
968 | 306k | } |
969 | | |
970 | 1.02M | else |
971 | 1.02M | #endif /* SUPPORT_UNICODE */ |
972 | | |
973 | 1.02M | accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP && |
974 | 786k | rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP && |
975 | 744k | autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP]; |
976 | | |
977 | 1.32M | if (!accepted) return FALSE; |
978 | | |
979 | 211k | if (list[1] == 0) return TRUE; |
980 | | /* Might be an empty repeat. */ |
981 | 53.5k | continue; |
982 | 211k | } |
983 | | |
984 | | /* Control reaches here only if one of the items is a small character list. |
985 | | All characters are checked against the other side. */ |
986 | | |
987 | 19.8M | do |
988 | 20.9M | { |
989 | 20.9M | chr = *chr_ptr; |
990 | | |
991 | 20.9M | switch(list_ptr[0]) |
992 | 20.9M | { |
993 | 12.0M | case OP_CHAR: |
994 | 12.0M | ochr_ptr = list_ptr + 2; |
995 | 12.0M | do |
996 | 13.0M | { |
997 | 13.0M | if (chr == *ochr_ptr) return FALSE; |
998 | 12.3M | ochr_ptr++; |
999 | 12.3M | } |
1000 | 12.3M | while(*ochr_ptr != NOTACHAR); |
1001 | 11.4M | break; |
1002 | | |
1003 | 11.4M | case OP_NOT: |
1004 | 793k | ochr_ptr = list_ptr + 2; |
1005 | 793k | do |
1006 | 874k | { |
1007 | 874k | if (chr == *ochr_ptr) |
1008 | 70.7k | break; |
1009 | 803k | ochr_ptr++; |
1010 | 803k | } |
1011 | 803k | while(*ochr_ptr != NOTACHAR); |
1012 | 793k | if (*ochr_ptr == NOTACHAR) return FALSE; /* Not found */ |
1013 | 70.7k | break; |
1014 | | |
1015 | | /* Note that OP_DIGIT etc. are generated only when PCRE2_UCP is *not* |
1016 | | set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */ |
1017 | | |
1018 | 225k | case OP_DIGIT: |
1019 | 225k | if (chr < 256 && (cb->ctypes[chr] & ctype_digit) != 0) return FALSE; |
1020 | 204k | break; |
1021 | | |
1022 | 204k | case OP_NOT_DIGIT: |
1023 | 146k | if (chr > 255 || (cb->ctypes[chr] & ctype_digit) == 0) return FALSE; |
1024 | 8.45k | break; |
1025 | | |
1026 | 237k | case OP_WHITESPACE: |
1027 | 237k | if (chr < 256 && (cb->ctypes[chr] & ctype_space) != 0) return FALSE; |
1028 | 221k | break; |
1029 | | |
1030 | 539k | case OP_NOT_WHITESPACE: |
1031 | 539k | if (chr > 255 || (cb->ctypes[chr] & ctype_space) == 0) return FALSE; |
1032 | 32.9k | break; |
1033 | | |
1034 | 210k | case OP_WORDCHAR: |
1035 | 210k | if (chr < 255 && (cb->ctypes[chr] & ctype_word) != 0) return FALSE; |
1036 | 150k | break; |
1037 | | |
1038 | 225k | case OP_NOT_WORDCHAR: |
1039 | 225k | if (chr > 255 || (cb->ctypes[chr] & ctype_word) == 0) return FALSE; |
1040 | 53.6k | break; |
1041 | | |
1042 | 408k | case OP_HSPACE: |
1043 | 408k | switch(chr) |
1044 | 408k | { |
1045 | 160k | HSPACE_CASES: return FALSE; |
1046 | 247k | default: break; |
1047 | 408k | } |
1048 | 247k | break; |
1049 | | |
1050 | 379k | case OP_NOT_HSPACE: |
1051 | 379k | switch(chr) |
1052 | 379k | { |
1053 | 143k | HSPACE_CASES: break; |
1054 | 236k | default: return FALSE; |
1055 | 379k | } |
1056 | 143k | break; |
1057 | | |
1058 | 625k | case OP_ANYNL: |
1059 | 728k | case OP_VSPACE: |
1060 | 728k | switch(chr) |
1061 | 728k | { |
1062 | 127k | VSPACE_CASES: return FALSE; |
1063 | 601k | default: break; |
1064 | 728k | } |
1065 | 601k | break; |
1066 | | |
1067 | 601k | case OP_NOT_VSPACE: |
1068 | 264k | switch(chr) |
1069 | 264k | { |
1070 | 57.9k | VSPACE_CASES: break; |
1071 | 206k | default: return FALSE; |
1072 | 264k | } |
1073 | 57.9k | break; |
1074 | | |
1075 | 162k | case OP_DOLL: |
1076 | 211k | case OP_EODN: |
1077 | 211k | switch (chr) |
1078 | 211k | { |
1079 | 9.66k | case CHAR_CR: |
1080 | 20.2k | case CHAR_LF: |
1081 | 28.4k | case CHAR_VT: |
1082 | 35.9k | case CHAR_FF: |
1083 | 42.4k | case CHAR_NEL: |
1084 | 42.4k | #ifndef EBCDIC |
1085 | 50.8k | case 0x2028: |
1086 | 57.2k | case 0x2029: |
1087 | 57.2k | #endif /* Not EBCDIC */ |
1088 | 57.2k | return FALSE; |
1089 | 211k | } |
1090 | 153k | break; |
1091 | | |
1092 | 153k | case OP_EOD: /* Can always possessify before \z */ |
1093 | 26.0k | break; |
1094 | | |
1095 | 0 | #ifdef SUPPORT_UNICODE |
1096 | 390k | case OP_PROP: |
1097 | 998k | case OP_NOTPROP: |
1098 | 998k | if (!check_char_prop(chr, list_ptr[2], list_ptr[3], |
1099 | 998k | list_ptr[0] == OP_NOTPROP)) |
1100 | 563k | return FALSE; |
1101 | 434k | break; |
1102 | 434k | #endif |
1103 | | |
1104 | 434k | case OP_NCLASS: |
1105 | 362k | if (chr > 255) return FALSE; |
1106 | 289k | PCRE2_FALLTHROUGH /* Fall through */ |
1107 | 289k | |
1108 | 907k | case OP_CLASS: |
1109 | 907k | if (chr > 255) break; |
1110 | 804k | class_bitset = (const uint8_t *) |
1111 | 804k | ((list_ptr == list ? code : base_end) - list_ptr[2]); |
1112 | 804k | if ((class_bitset[chr >> 3] & (1u << (chr & 7))) != 0) return FALSE; |
1113 | 450k | break; |
1114 | | |
1115 | 450k | #ifdef SUPPORT_WIDE_CHARS |
1116 | 1.02M | case OP_XCLASS: |
1117 | 1.02M | if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) - |
1118 | 1.02M | list_ptr[2] + LINK_SIZE, (const uint8_t*)cb->start_code, utf)) |
1119 | 378k | return FALSE; |
1120 | 641k | break; |
1121 | | |
1122 | 641k | case OP_ECLASS: |
1123 | 133k | if (PRIV(eclass)(chr, |
1124 | 133k | (list_ptr == list ? code : base_end) - list_ptr[2] + LINK_SIZE, |
1125 | 133k | (list_ptr == list ? code : base_end) - list_ptr[3], |
1126 | 133k | (const uint8_t*)cb->start_code, utf)) |
1127 | 62.4k | return FALSE; |
1128 | 70.5k | break; |
1129 | 70.5k | #endif /* SUPPORT_WIDE_CHARS */ |
1130 | | |
1131 | 1.35M | default: |
1132 | 1.35M | return FALSE; |
1133 | 20.9M | } |
1134 | | |
1135 | 15.1M | chr_ptr++; |
1136 | 15.1M | } |
1137 | 19.8M | while(*chr_ptr != NOTACHAR); |
1138 | | |
1139 | | /* At least one character must be matched from this opcode. */ |
1140 | | |
1141 | 13.9M | if (list[1] == 0) return TRUE; |
1142 | 13.9M | } |
1143 | | |
1144 | | /* LCOV_EXCL_START */ |
1145 | 0 | PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */ |
1146 | 0 | return FALSE; /* Avoid compiler warnings */ |
1147 | | /* LCOV_EXCL_STOP */ |
1148 | 26.3M | } pcre2_auto_possess.c:compare_opcodes Line | Count | Source | 550 | 35.7M | { | 551 | 35.7M | PCRE2_UCHAR c; | 552 | 35.7M | uint32_t list[MAX_LIST]; | 553 | 35.7M | const uint32_t *chr_ptr; | 554 | 35.7M | const uint32_t *ochr_ptr; | 555 | 35.7M | const uint32_t *list_ptr; | 556 | 35.7M | PCRE2_SPTR next_code; | 557 | 35.7M | #ifdef SUPPORT_WIDE_CHARS | 558 | 35.7M | PCRE2_SPTR xclass_flags; | 559 | 35.7M | #endif | 560 | 35.7M | const uint8_t *class_bitset; | 561 | 35.7M | const uint8_t *set1, *set2, *set_end; | 562 | 35.7M | uint32_t chr; | 563 | 35.7M | BOOL accepted, invert_bits; | 564 | 35.7M | BOOL entered_a_group = FALSE; | 565 | | | 566 | 35.7M | if (--(*rec_limit) <= 0) return FALSE; /* Recursion has gone too deep */ | 567 | | | 568 | | /* Note: the base_list[1] contains whether the current opcode has a greedy | 569 | | (represented by a non-zero value) quantifier. This is a different from | 570 | | other character type lists, which store here that the character iterator | 571 | | matches to an empty string (also represented by a non-zero value). */ | 572 | | | 573 | 10.6M | for(;;) | 574 | 27.6M | { | 575 | 27.6M | PCRE2_SPTR bracode; | 576 | | | 577 | | /* All operations move the code pointer forward. | 578 | | Therefore infinite recursions are not possible. */ | 579 | | | 580 | 27.6M | c = *code; | 581 | | | 582 | | /* Skip over callouts */ | 583 | | | 584 | 27.6M | if (c == OP_CALLOUT) | 585 | 1.16M | { | 586 | 1.16M | code += PRIV(OP_lengths)[c]; | 587 | 1.16M | continue; | 588 | 1.16M | } | 589 | | | 590 | 26.4M | if (c == OP_CALLOUT_STR) | 591 | 10.9k | { | 592 | 10.9k | code += GET(code, 1 + 2*LINK_SIZE); | 593 | 10.9k | continue; | 594 | 10.9k | } | 595 | | | 596 | | /* At the end of a branch, skip to the end of the group and process it. 
*/ | 597 | | | 598 | 26.4M | if (c == OP_ALT) | 599 | 645k | { | 600 | 5.91M | do code += GET(code, 1); while (*code == OP_ALT); | 601 | 645k | c = *code; | 602 | 645k | } | 603 | | | 604 | | /* Inspect the next opcode. */ | 605 | | | 606 | 26.4M | switch(c) | 607 | 26.4M | { | 608 | | /* We can always possessify a greedy iterator at the end of the pattern, | 609 | | which is reached after skipping over the final OP_KET. A non-greedy | 610 | | iterator must never be possessified. */ | 611 | | | 612 | 101k | case OP_END: | 613 | 101k | return base_list[1] != 0; | 614 | | | 615 | | /* When an iterator is at the end of certain kinds of group we can inspect | 616 | | what follows the group by skipping over the closing ket. Note that this | 617 | | does not apply to OP_KETRMAX or OP_KETRMIN because what follows any given | 618 | | iteration is variable (could be another iteration or could be the next | 619 | | item). As these two opcodes are not listed in the next switch, they will | 620 | | end up as the next code to inspect, and return FALSE by virtue of being | 621 | | unsupported. */ | 622 | | | 623 | 15.4M | case OP_KET: | 624 | 15.5M | case OP_KETRPOS: | 625 | | /* The non-greedy case cannot be converted to a possessive form. */ | 626 | | | 627 | 15.5M | if (base_list[1] == 0) return FALSE; | 628 | | | 629 | | /* If the bracket is capturing it might be referenced by an OP_RECURSE | 630 | | so its last iterator can never be possessified if the pattern contains | 631 | | recursions. (This could be improved by keeping a list of group numbers that | 632 | | are called by recursion.) 
*/ | 633 | | | 634 | 15.2M | bracode = code - GET(code, 1); | 635 | 15.2M | switch(*bracode) | 636 | 15.2M | { | 637 | 1.02M | case OP_CBRA: | 638 | 1.02M | case OP_SCBRA: | 639 | 1.02M | case OP_CBRAPOS: | 640 | 1.08M | case OP_SCBRAPOS: | 641 | 1.08M | if (cb->had_recurse) return FALSE; | 642 | 960k | break; | 643 | | | 644 | | /* A script run might have to backtrack if the iterated item can match | 645 | | characters from more than one script. So give up unless repeating an | 646 | | explicit character. */ | 647 | | | 648 | 960k | case OP_SCRIPT_RUN: | 649 | 35.6k | if (base_list[0] != OP_CHAR && base_list[0] != OP_CHARI) | 650 | 9.03k | return FALSE; | 651 | 26.5k | break; | 652 | | | 653 | | /* Atomic sub-patterns and forward assertions can always auto-possessify | 654 | | their last iterator. However, if the group was entered as a result of | 655 | | checking a previous iterator, this is not possible. */ | 656 | | | 657 | 118k | case OP_ASSERT: | 658 | 199k | case OP_ASSERT_NOT: | 659 | 239k | case OP_ONCE: | 660 | 239k | return !entered_a_group; | 661 | | | 662 | | /* Fixed-length lookbehinds can be treated the same way, but variable | 663 | | length lookbehinds must not auto-possessify their last iterator. Note | 664 | | that in order to identify a variable length lookbehind we must check | 665 | | through all branches, because some may be of fixed length. */ | 666 | | | 667 | 63.2k | case OP_ASSERTBACK: | 668 | 124k | case OP_ASSERTBACK_NOT: | 669 | 124k | do | 670 | 136k | { | 671 | 136k | if (bracode[1+LINK_SIZE] == OP_VREVERSE) return FALSE; /* Variable */ | 672 | 17.9k | bracode += GET(bracode, 1); | 673 | 17.9k | } | 674 | 124k | while (*bracode == OP_ALT); | 675 | 6.12k | return !entered_a_group; /* Not variable length */ | 676 | | | 677 | | /* Non-atomic assertions - don't possessify last iterator. This needs | 678 | | more thought. 
*/ | 679 | | | 680 | 91.5k | case OP_ASSERT_NA: | 681 | 133k | case OP_ASSERTBACK_NA: | 682 | 133k | return FALSE; | 683 | 15.2M | } | 684 | | | 685 | | /* Skip over the bracket and inspect what comes next. */ | 686 | | | 687 | 14.6M | code += PRIV(OP_lengths)[c]; | 688 | 14.6M | continue; | 689 | | | 690 | | /* Handle cases where the next item is a group. */ | 691 | | | 692 | 18.3k | case OP_ONCE: | 693 | 241k | case OP_BRA: | 694 | 867k | case OP_CBRA: | 695 | 867k | next_code = code + GET(code, 1); | 696 | 867k | code += PRIV(OP_lengths)[c]; | 697 | | | 698 | | /* Check each branch. We have to recurse a level for all but the last | 699 | | branch. */ | 700 | | | 701 | 1.08M | while (*next_code == OP_ALT) | 702 | 386k | { | 703 | 386k | if (!compare_opcodes(code, utf, ucp, cb, base_list, base_end, rec_limit)) | 704 | 164k | return FALSE; | 705 | 221k | code = next_code + 1 + LINK_SIZE; | 706 | 221k | next_code += GET(next_code, 1); | 707 | 221k | } | 708 | | | 709 | 702k | entered_a_group = TRUE; | 710 | 702k | continue; | 711 | | | 712 | 181k | case OP_BRAZERO: | 713 | 208k | case OP_BRAMINZERO: | 714 | | | 715 | 208k | next_code = code + 1; | 716 | 208k | if (*next_code != OP_BRA && *next_code != OP_CBRA && | 717 | 45.6k | *next_code != OP_ONCE) return FALSE; | 718 | | | 719 | 183k | do next_code += GET(next_code, 1); while (*next_code == OP_ALT); | 720 | | | 721 | | /* The bracket content will be checked by the OP_BRA/OP_CBRA case above. */ | 722 | | | 723 | 172k | next_code += 1 + LINK_SIZE; | 724 | 172k | if (!compare_opcodes(next_code, utf, ucp, cb, base_list, base_end, | 725 | 172k | rec_limit)) | 726 | 28.8k | return FALSE; | 727 | | | 728 | 143k | code += PRIV(OP_lengths)[c]; | 729 | 143k | continue; | 730 | | | 731 | | /* The next opcode does not need special handling; fall through and use it | 732 | | to see if the base can be possessified. 
*/ | 733 | | | 734 | 9.68M | default: | 735 | 9.68M | break; | 736 | 26.4M | } | 737 | | | 738 | | /* We now have the next appropriate opcode to compare with the base. Check | 739 | | for a supported opcode, and load its properties. */ | 740 | | | 741 | 9.68M | code = get_chr_property_list(code, utf, ucp, cb->fcc, list); | 742 | 9.68M | if (code == NULL) return FALSE; /* Unsupported */ | 743 | | | 744 | | /* If either opcode is a small character list, set pointers for comparing | 745 | | characters from that list with another list, or with a property. */ | 746 | | | 747 | 8.99M | if (base_list[0] == OP_CHAR) | 748 | 6.01M | { | 749 | 6.01M | chr_ptr = base_list + 2; | 750 | 6.01M | list_ptr = list; | 751 | 6.01M | } | 752 | 2.97M | else if (list[0] == OP_CHAR) | 753 | 2.25M | { | 754 | 2.25M | chr_ptr = list + 2; | 755 | 2.25M | list_ptr = base_list; | 756 | 2.25M | } | 757 | | | 758 | | /* Character bitsets can also be compared to certain opcodes. */ | 759 | | | 760 | 722k | else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS | 761 | 590k | #if PCRE2_CODE_UNIT_WIDTH == 8 | 762 | | /* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. 
*/ | 763 | 590k | || (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS)) | 764 | 722k | #endif | 765 | 722k | ) | 766 | 208k | { | 767 | 208k | #if PCRE2_CODE_UNIT_WIDTH == 8 | 768 | 208k | if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS)) | 769 | | #else | 770 | | if (base_list[0] == OP_CLASS) | 771 | | #endif | 772 | 123k | { | 773 | 123k | set1 = (const uint8_t *)(base_end - base_list[2]); | 774 | 123k | list_ptr = list; | 775 | 123k | } | 776 | 84.9k | else | 777 | 84.9k | { | 778 | 84.9k | set1 = (const uint8_t *)(code - list[2]); | 779 | 84.9k | list_ptr = base_list; | 780 | 84.9k | } | 781 | | | 782 | 208k | invert_bits = FALSE; | 783 | 208k | switch(list_ptr[0]) | 784 | 208k | { | 785 | 26.7k | case OP_CLASS: | 786 | 39.2k | case OP_NCLASS: | 787 | 39.2k | set2 = (const uint8_t *) | 788 | 39.2k | ((list_ptr == list ? code : base_end) - list_ptr[2]); | 789 | 39.2k | break; | 790 | | | 791 | 0 | #ifdef SUPPORT_WIDE_CHARS | 792 | 11.4k | case OP_XCLASS: | 793 | 11.4k | xclass_flags = (list_ptr == list ? code : base_end) - | 794 | 11.4k | list_ptr[2] + LINK_SIZE; | 795 | 11.4k | if ((*xclass_flags & XCL_HASPROP) != 0) return FALSE; | 796 | 7.38k | if ((*xclass_flags & XCL_MAP) == 0) | 797 | 3.62k | { | 798 | | /* No bits are set for characters < 256. */ | 799 | 3.62k | if (list[1] == 0) return (*xclass_flags & XCL_NOT) == 0; | 800 | | /* Might be an empty repeat. 
*/ | 801 | 2.00k | continue; | 802 | 3.62k | } | 803 | 3.76k | set2 = (const uint8_t *)(xclass_flags + 1); | 804 | 3.76k | break; | 805 | 0 | #endif | 806 | | | 807 | 7.84k | case OP_NOT_DIGIT: | 808 | 7.84k | invert_bits = TRUE; | 809 | 7.84k | PCRE2_FALLTHROUGH /* Fall through */ | 810 | 14.5k | case OP_DIGIT: | 811 | 14.5k | set2 = (const uint8_t *)(cb->cbits + cbit_digit); | 812 | 14.5k | break; | 813 | | | 814 | 4.90k | case OP_NOT_WHITESPACE: | 815 | 4.90k | invert_bits = TRUE; | 816 | 4.90k | PCRE2_FALLTHROUGH /* Fall through */ | 817 | 10.7k | case OP_WHITESPACE: | 818 | 10.7k | set2 = (const uint8_t *)(cb->cbits + cbit_space); | 819 | 10.7k | break; | 820 | | | 821 | 4.25k | case OP_NOT_WORDCHAR: | 822 | 4.25k | invert_bits = TRUE; | 823 | 4.25k | PCRE2_FALLTHROUGH /* Fall through */ | 824 | 14.8k | case OP_WORDCHAR: | 825 | 14.8k | set2 = (const uint8_t *)(cb->cbits + cbit_word); | 826 | 14.8k | break; | 827 | | | 828 | 117k | default: | 829 | 117k | return FALSE; | 830 | 208k | } | 831 | | | 832 | | /* Because the bit sets are unaligned bytes, we need to perform byte | 833 | | comparison here. */ | 834 | | | 835 | 83.2k | set_end = set1 + 32; | 836 | 83.2k | if (invert_bits) | 837 | 17.0k | { | 838 | 17.0k | do | 839 | 179k | { | 840 | 179k | if ((*set1++ & ~(*set2++)) != 0) return FALSE; | 841 | 179k | } | 842 | 166k | while (set1 < set_end); | 843 | 17.0k | } | 844 | 66.2k | else | 845 | 66.2k | { | 846 | 66.2k | do | 847 | 1.04M | { | 848 | 1.04M | if ((*set1++ & *set2++) != 0) return FALSE; | 849 | 1.04M | } | 850 | 999k | while (set1 < set_end); | 851 | 66.2k | } | 852 | | | 853 | 25.6k | if (list[1] == 0) return TRUE; | 854 | | /* Might be an empty repeat. */ | 855 | 8.20k | continue; | 856 | 25.6k | } | 857 | | | 858 | | /* Some property combinations also acceptable. Unicode property opcodes are | 859 | | processed specially; the rest can be handled with a lookup table. 
*/ | 860 | | | 861 | 514k | else | 862 | 514k | { | 863 | 514k | uint32_t leftop, rightop; | 864 | | | 865 | 514k | leftop = base_list[0]; | 866 | 514k | rightop = list[0]; | 867 | | | 868 | 514k | #ifdef SUPPORT_UNICODE | 869 | 514k | accepted = FALSE; /* Always set in non-unicode case. */ | 870 | 514k | if (leftop == OP_PROP || leftop == OP_NOTPROP) | 871 | 109k | { | 872 | 109k | if (rightop == OP_EOD) | 873 | 4.06k | accepted = TRUE; | 874 | 105k | else if (rightop == OP_PROP || rightop == OP_NOTPROP) | 875 | 76.0k | { | 876 | 76.0k | int n; | 877 | 76.0k | const uint8_t *p; | 878 | 76.0k | BOOL same = leftop == rightop; | 879 | 76.0k | BOOL lisprop = leftop == OP_PROP; | 880 | 76.0k | BOOL risprop = rightop == OP_PROP; | 881 | 76.0k | BOOL bothprop = lisprop && risprop; | 882 | | | 883 | | /* There's a table that specifies how each combination is to be | 884 | | processed: | 885 | | 0 Always return FALSE (never auto-possessify) | 886 | | 1 Character groups are distinct (possessify if both are OP_PROP) | 887 | | 2 Check character categories in the same group (general or particular) | 888 | | 3 Return TRUE if the two opcodes are not the same | 889 | | ... see comments below | 890 | | */ | 891 | | | 892 | 76.0k | n = propposstab[base_list[2]][list[2]]; | 893 | 76.0k | switch(n) | 894 | 76.0k | { | 895 | 2.03k | case 0: break; | 896 | 4.18k | case 1: accepted = bothprop; break; | 897 | 5.06k | case 2: accepted = (base_list[3] == list[3]) != same; break; | 898 | 6.55k | case 3: accepted = !same; break; | 899 | | | 900 | 5.17k | case 4: /* Left general category, right particular category */ | 901 | 5.17k | accepted = risprop && catposstab[base_list[3]][list[3]] == same; | 902 | 5.17k | break; | 903 | | | 904 | 3.28k | case 5: /* Right general category, left particular category */ | 905 | 3.28k | accepted = lisprop && catposstab[list[3]][base_list[3]] == same; | 906 | 3.28k | break; | 907 | | | 908 | | /* This code is logically tricky. 
Think hard before fiddling with it. | 909 | | The posspropstab table has four entries per row. Each row relates to | 910 | | one of PCRE's special properties such as ALNUM or SPACE or WORD. | 911 | | Only WORD actually needs all four entries, but using repeats for the | 912 | | others means they can all use the same code below. | 913 | | | 914 | | The first two entries in each row are Unicode general categories, and | 915 | | apply always, because all the characters they include are part of the | 916 | | PCRE character set. The third and fourth entries are a general and a | 917 | | particular category, respectively, that include one or more relevant | 918 | | characters. One or the other is used, depending on whether the check | 919 | | is for a general or a particular category. However, in both cases the | 920 | | category contains more characters than the specials that are defined | 921 | | for the property being tested against. Therefore, it cannot be used | 922 | | in a NOTPROP case. | 923 | | | 924 | | Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po. | 925 | | Underscore is covered by ucp_P or ucp_Po. 
*/ | 926 | | | 927 | 3.58k | case 6: /* Left alphanum vs right general category */ | 928 | 8.29k | case 7: /* Left space vs right general category */ | 929 | 12.2k | case 8: /* Left word vs right general category */ | 930 | 12.2k | p = posspropstab[n-6]; | 931 | 12.2k | accepted = risprop && lisprop == | 932 | 7.07k | (list[3] != p[0] && | 933 | 5.28k | list[3] != p[1] && | 934 | 3.96k | (list[3] != p[2] || !lisprop)); | 935 | 12.2k | break; | 936 | | | 937 | 1.29k | case 9: /* Right alphanum vs left general category */ | 938 | 5.32k | case 10: /* Right space vs left general category */ | 939 | 9.64k | case 11: /* Right word vs left general category */ | 940 | 9.64k | p = posspropstab[n-9]; | 941 | 9.64k | accepted = lisprop && risprop == | 942 | 6.15k | (base_list[3] != p[0] && | 943 | 4.65k | base_list[3] != p[1] && | 944 | 2.74k | (base_list[3] != p[2] || !risprop)); | 945 | 9.64k | break; | 946 | | | 947 | 1.56k | case 12: /* Left alphanum vs right particular category */ | 948 | 12.0k | case 13: /* Left space vs right particular category */ | 949 | 15.1k | case 14: /* Left word vs right particular category */ | 950 | 15.1k | p = posspropstab[n-12]; | 951 | 15.1k | accepted = risprop && lisprop == | 952 | 8.71k | (catposstab[p[0]][list[3]] && | 953 | 6.73k | catposstab[p[1]][list[3]] && | 954 | 4.32k | (list[3] != p[3] || !lisprop)); | 955 | 15.1k | break; | 956 | | | 957 | 2.81k | case 15: /* Right alphanum vs left particular category */ | 958 | 9.63k | case 16: /* Right space vs left particular category */ | 959 | 12.6k | case 17: /* Right word vs left particular category */ | 960 | 12.6k | p = posspropstab[n-15]; | 961 | 12.6k | accepted = lisprop && risprop == | 962 | 9.51k | (catposstab[p[0]][base_list[3]] && | 963 | 8.21k | catposstab[p[1]][base_list[3]] && | 964 | 4.05k | (base_list[3] != p[3] || !risprop)); | 965 | 12.6k | break; | 966 | 76.0k | } | 967 | 76.0k | } | 968 | 109k | } | 969 | | | 970 | 404k | else | 971 | 404k | #endif /* SUPPORT_UNICODE */ 
| 972 | | | 973 | 404k | accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP && | 974 | 336k | rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP && | 975 | 314k | autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP]; | 976 | | | 977 | 514k | if (!accepted) return FALSE; | 978 | | | 979 | 86.8k | if (list[1] == 0) return TRUE; | 980 | | /* Might be an empty repeat. */ | 981 | 24.0k | continue; | 982 | 86.8k | } | 983 | | | 984 | | /* Control reaches here only if one of the items is a small character list. | 985 | | All characters are checked against the other side. */ | 986 | | | 987 | 8.27M | do | 988 | 8.87M | { | 989 | 8.87M | chr = *chr_ptr; | 990 | | | 991 | 8.87M | switch(list_ptr[0]) | 992 | 8.87M | { | 993 | 5.47M | case OP_CHAR: | 994 | 5.47M | ochr_ptr = list_ptr + 2; | 995 | 5.47M | do | 996 | 6.10M | { | 997 | 6.10M | if (chr == *ochr_ptr) return FALSE; | 998 | 5.75M | ochr_ptr++; | 999 | 5.75M | } | 1000 | 5.75M | while(*ochr_ptr != NOTACHAR); | 1001 | 5.12M | break; | 1002 | | | 1003 | 5.12M | case OP_NOT: | 1004 | 307k | ochr_ptr = list_ptr + 2; | 1005 | 307k | do | 1006 | 342k | { | 1007 | 342k | if (chr == *ochr_ptr) | 1008 | 28.8k | break; | 1009 | 313k | ochr_ptr++; | 1010 | 313k | } | 1011 | 313k | while(*ochr_ptr != NOTACHAR); | 1012 | 307k | if (*ochr_ptr == NOTACHAR) return FALSE; /* Not found */ | 1013 | 28.8k | break; | 1014 | | | 1015 | | /* Note that OP_DIGIT etc. are generated only when PCRE2_UCP is *not* | 1016 | | set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. 
*/ | 1017 | | | 1018 | 105k | case OP_DIGIT: | 1019 | 105k | if (chr < 256 && (cb->ctypes[chr] & ctype_digit) != 0) return FALSE; | 1020 | 93.6k | break; | 1021 | | | 1022 | 93.6k | case OP_NOT_DIGIT: | 1023 | 60.5k | if (chr > 255 || (cb->ctypes[chr] & ctype_digit) == 0) return FALSE; | 1024 | 5.55k | break; | 1025 | | | 1026 | 82.2k | case OP_WHITESPACE: | 1027 | 82.2k | if (chr < 256 && (cb->ctypes[chr] & ctype_space) != 0) return FALSE; | 1028 | 76.2k | break; | 1029 | | | 1030 | 134k | case OP_NOT_WHITESPACE: | 1031 | 134k | if (chr > 255 || (cb->ctypes[chr] & ctype_space) == 0) return FALSE; | 1032 | 11.5k | break; | 1033 | | | 1034 | 81.0k | case OP_WORDCHAR: | 1035 | 81.0k | if (chr < 255 && (cb->ctypes[chr] & ctype_word) != 0) return FALSE; | 1036 | 51.4k | break; | 1037 | | | 1038 | 73.3k | case OP_NOT_WORDCHAR: | 1039 | 73.3k | if (chr > 255 || (cb->ctypes[chr] & ctype_word) == 0) return FALSE; | 1040 | 30.4k | break; | 1041 | | | 1042 | 201k | case OP_HSPACE: | 1043 | 201k | switch(chr) | 1044 | 201k | { | 1045 | 34.4k | HSPACE_CASES: return FALSE; | 1046 | 166k | default: break; | 1047 | 201k | } | 1048 | 166k | break; | 1049 | | | 1050 | 166k | case OP_NOT_HSPACE: | 1051 | 125k | switch(chr) | 1052 | 125k | { | 1053 | 40.3k | HSPACE_CASES: break; | 1054 | 85.0k | default: return FALSE; | 1055 | 125k | } | 1056 | 40.3k | break; | 1057 | | | 1058 | 163k | case OP_ANYNL: | 1059 | 195k | case OP_VSPACE: | 1060 | 195k | switch(chr) | 1061 | 195k | { | 1062 | 20.3k | VSPACE_CASES: return FALSE; | 1063 | 175k | default: break; | 1064 | 195k | } | 1065 | 175k | break; | 1066 | | | 1067 | 175k | case OP_NOT_VSPACE: | 1068 | 44.8k | switch(chr) | 1069 | 44.8k | { | 1070 | 16.0k | VSPACE_CASES: break; | 1071 | 28.8k | default: return FALSE; | 1072 | 44.8k | } | 1073 | 16.0k | break; | 1074 | | | 1075 | 82.1k | case OP_DOLL: | 1076 | 98.7k | case OP_EODN: | 1077 | 98.7k | switch (chr) | 1078 | 98.7k | { | 1079 | 4.91k | case CHAR_CR: | 1080 | 6.65k | case 
CHAR_LF: | 1081 | 8.61k | case CHAR_VT: | 1082 | 10.9k | case CHAR_FF: | 1083 | 14.0k | case CHAR_NEL: | 1084 | 14.0k | #ifndef EBCDIC | 1085 | 15.4k | case 0x2028: | 1086 | 17.0k | case 0x2029: | 1087 | 17.0k | #endif /* Not EBCDIC */ | 1088 | 17.0k | return FALSE; | 1089 | 98.7k | } | 1090 | 81.7k | break; | 1091 | | | 1092 | 81.7k | case OP_EOD: /* Can always possessify before \z */ | 1093 | 8.79k | break; | 1094 | | | 1095 | 0 | #ifdef SUPPORT_UNICODE | 1096 | 119k | case OP_PROP: | 1097 | 343k | case OP_NOTPROP: | 1098 | 343k | if (!check_char_prop(chr, list_ptr[2], list_ptr[3], | 1099 | 343k | list_ptr[0] == OP_NOTPROP)) | 1100 | 208k | return FALSE; | 1101 | 135k | break; | 1102 | 135k | #endif | 1103 | | | 1104 | 247k | case OP_NCLASS: | 1105 | 247k | if (chr > 255) return FALSE; | 1106 | 245k | PCRE2_FALLTHROUGH /* Fall through */ | 1107 | 245k | | 1108 | 656k | case OP_CLASS: | 1109 | 656k | if (chr > 255) break; | 1110 | 649k | class_bitset = (const uint8_t *) | 1111 | 649k | ((list_ptr == list ? code : base_end) - list_ptr[2]); | 1112 | 649k | if ((class_bitset[chr >> 3] & (1u << (chr & 7))) != 0) return FALSE; | 1113 | 354k | break; | 1114 | | | 1115 | 354k | #ifdef SUPPORT_WIDE_CHARS | 1116 | 354k | case OP_XCLASS: | 1117 | 111k | if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) - | 1118 | 111k | list_ptr[2] + LINK_SIZE, (const uint8_t*)cb->start_code, utf)) | 1119 | 47.0k | return FALSE; | 1120 | 64.3k | break; | 1121 | | | 1122 | 64.3k | case OP_ECLASS: | 1123 | 51.3k | if (PRIV(eclass)(chr, | 1124 | 51.3k | (list_ptr == list ? code : base_end) - list_ptr[2] + LINK_SIZE, | 1125 | 51.3k | (list_ptr == list ? 
code : base_end) - list_ptr[3], | 1126 | 51.3k | (const uint8_t*)cb->start_code, utf)) | 1127 | 33.5k | return FALSE; | 1128 | 17.7k | break; | 1129 | 17.7k | #endif /* SUPPORT_WIDE_CHARS */ | 1130 | | | 1131 | 710k | default: | 1132 | 710k | return FALSE; | 1133 | 8.87M | } | 1134 | | | 1135 | 6.49M | chr_ptr++; | 1136 | 6.49M | } | 1137 | 8.27M | while(*chr_ptr != NOTACHAR); | 1138 | | | 1139 | | /* At least one character must be matched from this opcode. */ | 1140 | | | 1141 | 5.89M | if (list[1] == 0) return TRUE; | 1142 | 5.89M | } | 1143 | | | 1144 | | /* LCOV_EXCL_START */ | 1145 | 0 | PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */ | 1146 | 0 | return FALSE; /* Avoid compiler warnings */ | 1147 | | /* LCOV_EXCL_STOP */ | 1148 | 10.6M | } |
pcre2_auto_possess.c:compare_opcodes Line | Count | Source | 550 | 31.6M | { | 551 | 31.6M | PCRE2_UCHAR c; | 552 | 31.6M | uint32_t list[MAX_LIST]; | 553 | 31.6M | const uint32_t *chr_ptr; | 554 | 31.6M | const uint32_t *ochr_ptr; | 555 | 31.6M | const uint32_t *list_ptr; | 556 | 31.6M | PCRE2_SPTR next_code; | 557 | 31.6M | #ifdef SUPPORT_WIDE_CHARS | 558 | 31.6M | PCRE2_SPTR xclass_flags; | 559 | 31.6M | #endif | 560 | 31.6M | const uint8_t *class_bitset; | 561 | 31.6M | const uint8_t *set1, *set2, *set_end; | 562 | 31.6M | uint32_t chr; | 563 | 31.6M | BOOL accepted, invert_bits; | 564 | 31.6M | BOOL entered_a_group = FALSE; | 565 | | | 566 | 31.6M | if (--(*rec_limit) <= 0) return FALSE; /* Recursion has gone too deep */ | 567 | | | 568 | | /* Note: the base_list[1] contains whether the current opcode has a greedy | 569 | | (represented by a non-zero value) quantifier. This is a different from | 570 | | other character type lists, which store here that the character iterator | 571 | | matches to an empty string (also represented by a non-zero value). */ | 572 | | | 573 | 15.6M | for(;;) | 574 | 48.6M | { | 575 | 48.6M | PCRE2_SPTR bracode; | 576 | | | 577 | | /* All operations move the code pointer forward. | 578 | | Therefore infinite recursions are not possible. */ | 579 | | | 580 | 48.6M | c = *code; | 581 | | | 582 | | /* Skip over callouts */ | 583 | | | 584 | 48.6M | if (c == OP_CALLOUT) | 585 | 2.20M | { | 586 | 2.20M | code += PRIV(OP_lengths)[c]; | 587 | 2.20M | continue; | 588 | 2.20M | } | 589 | | | 590 | 46.4M | if (c == OP_CALLOUT_STR) | 591 | 9.53k | { | 592 | 9.53k | code += GET(code, 1 + 2*LINK_SIZE); | 593 | 9.53k | continue; | 594 | 9.53k | } | 595 | | | 596 | | /* At the end of a branch, skip to the end of the group and process it. 
*/ | 597 | | | 598 | 46.4M | if (c == OP_ALT) | 599 | 1.21M | { | 600 | 3.04M | do code += GET(code, 1); while (*code == OP_ALT); | 601 | 1.21M | c = *code; | 602 | 1.21M | } | 603 | | | 604 | | /* Inspect the next opcode. */ | 605 | | | 606 | 46.4M | switch(c) | 607 | 46.4M | { | 608 | | /* We can always possessify a greedy iterator at the end of the pattern, | 609 | | which is reached after skipping over the final OP_KET. A non-greedy | 610 | | iterator must never be possessified. */ | 611 | | | 612 | 188k | case OP_END: | 613 | 188k | return base_list[1] != 0; | 614 | | | 615 | | /* When an iterator is at the end of certain kinds of group we can inspect | 616 | | what follows the group by skipping over the closing ket. Note that this | 617 | | does not apply to OP_KETRMAX or OP_KETRMIN because what follows any given | 618 | | iteration is variable (could be another iteration or could be the next | 619 | | item). As these two opcodes are not listed in the next switch, they will | 620 | | end up as the next code to inspect, and return FALSE by virtue of being | 621 | | unsupported. */ | 622 | | | 623 | 30.6M | case OP_KET: | 624 | 30.7M | case OP_KETRPOS: | 625 | | /* The non-greedy case cannot be converted to a possessive form. */ | 626 | | | 627 | 30.7M | if (base_list[1] == 0) return FALSE; | 628 | | | 629 | | /* If the bracket is capturing it might be referenced by an OP_RECURSE | 630 | | so its last iterator can never be possessified if the pattern contains | 631 | | recursions. (This could be improved by keeping a list of group numbers that | 632 | | are called by recursion.) 
*/ | 633 | | | 634 | 30.1M | bracode = code - GET(code, 1); | 635 | 30.1M | switch(*bracode) | 636 | 30.1M | { | 637 | 1.87M | case OP_CBRA: | 638 | 1.87M | case OP_SCBRA: | 639 | 1.90M | case OP_CBRAPOS: | 640 | 1.91M | case OP_SCBRAPOS: | 641 | 1.91M | if (cb->had_recurse) return FALSE; | 642 | 1.62M | break; | 643 | | | 644 | | /* A script run might have to backtrack if the iterated item can match | 645 | | characters from more than one script. So give up unless repeating an | 646 | | explicit character. */ | 647 | | | 648 | 1.62M | case OP_SCRIPT_RUN: | 649 | 37.4k | if (base_list[0] != OP_CHAR && base_list[0] != OP_CHARI) | 650 | 10.4k | return FALSE; | 651 | 27.0k | break; | 652 | | | 653 | | /* Atomic sub-patterns and forward assertions can always auto-possessify | 654 | | their last iterator. However, if the group was entered as a result of | 655 | | checking a previous iterator, this is not possible. */ | 656 | | | 657 | 156k | case OP_ASSERT: | 658 | 227k | case OP_ASSERT_NOT: | 659 | 278k | case OP_ONCE: | 660 | 278k | return !entered_a_group; | 661 | | | 662 | | /* Fixed-length lookbehinds can be treated the same way, but variable | 663 | | length lookbehinds must not auto-possessify their last iterator. Note | 664 | | that in order to identify a variable length lookbehind we must check | 665 | | through all branches, because some may be of fixed length. */ | 666 | | | 667 | 89.3k | case OP_ASSERTBACK: | 668 | 140k | case OP_ASSERTBACK_NOT: | 669 | 140k | do | 670 | 156k | { | 671 | 156k | if (bracode[1+LINK_SIZE] == OP_VREVERSE) return FALSE; /* Variable */ | 672 | 24.6k | bracode += GET(bracode, 1); | 673 | 24.6k | } | 674 | 140k | while (*bracode == OP_ALT); | 675 | 8.05k | return !entered_a_group; /* Not variable length */ | 676 | | | 677 | | /* Non-atomic assertions - don't possessify last iterator. This needs | 678 | | more thought. 
*/ | 679 | | | 680 | 79.3k | case OP_ASSERT_NA: | 681 | 412k | case OP_ASSERTBACK_NA: | 682 | 412k | return FALSE; | 683 | 30.1M | } | 684 | | | 685 | | /* Skip over the bracket and inspect what comes next. */ | 686 | | | 687 | 28.9M | code += PRIV(OP_lengths)[c]; | 688 | 28.9M | continue; | 689 | | | 690 | | /* Handle cases where the next item is a group. */ | 691 | | | 692 | 17.8k | case OP_ONCE: | 693 | 437k | case OP_BRA: | 694 | 1.23M | case OP_CBRA: | 695 | 1.23M | next_code = code + GET(code, 1); | 696 | 1.23M | code += PRIV(OP_lengths)[c]; | 697 | | | 698 | | /* Check each branch. We have to recurse a level for all but the last | 699 | | branch. */ | 700 | | | 701 | 1.66M | while (*next_code == OP_ALT) | 702 | 691k | { | 703 | 691k | if (!compare_opcodes(code, utf, ucp, cb, base_list, base_end, rec_limit)) | 704 | 259k | return FALSE; | 705 | 432k | code = next_code + 1 + LINK_SIZE; | 706 | 432k | next_code += GET(next_code, 1); | 707 | 432k | } | 708 | | | 709 | 971k | entered_a_group = TRUE; | 710 | 971k | continue; | 711 | | | 712 | 335k | case OP_BRAZERO: | 713 | 350k | case OP_BRAMINZERO: | 714 | | | 715 | 350k | next_code = code + 1; | 716 | 350k | if (*next_code != OP_BRA && *next_code != OP_CBRA && | 717 | 29.5k | *next_code != OP_ONCE) return FALSE; | 718 | | | 719 | 372k | do next_code += GET(next_code, 1); while (*next_code == OP_ALT); | 720 | | | 721 | | /* The bracket content will be checked by the OP_BRA/OP_CBRA case above. */ | 722 | | | 723 | 329k | next_code += 1 + LINK_SIZE; | 724 | 329k | if (!compare_opcodes(next_code, utf, ucp, cb, base_list, base_end, | 725 | 329k | rec_limit)) | 726 | 26.8k | return FALSE; | 727 | | | 728 | 302k | code += PRIV(OP_lengths)[c]; | 729 | 302k | continue; | 730 | | | 731 | | /* The next opcode does not need special handling; fall through and use it | 732 | | to see if the base can be possessified. 
*/ | 733 | | | 734 | 13.9M | default: | 735 | 13.9M | break; | 736 | 46.4M | } | 737 | | | 738 | | /* We now have the next appropriate opcode to compare with the base. Check | 739 | | for a supported opcode, and load its properties. */ | 740 | | | 741 | 13.9M | code = get_chr_property_list(code, utf, ucp, cb->fcc, list); | 742 | 13.9M | if (code == NULL) return FALSE; /* Unsupported */ | 743 | | | 744 | | /* If either opcode is a small character list, set pointers for comparing | 745 | | characters from that list with another list, or with a property. */ | 746 | | | 747 | 12.4M | if (base_list[0] == OP_CHAR) | 748 | 7.43M | { | 749 | 7.43M | chr_ptr = base_list + 2; | 750 | 7.43M | list_ptr = list; | 751 | 7.43M | } | 752 | 5.02M | else if (list[0] == OP_CHAR) | 753 | 4.11M | { | 754 | 4.11M | chr_ptr = list + 2; | 755 | 4.11M | list_ptr = base_list; | 756 | 4.11M | } | 757 | | | 758 | | /* Character bitsets can also be compared to certain opcodes. */ | 759 | | | 760 | 916k | else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS | 761 | | #if PCRE2_CODE_UNIT_WIDTH == 8 | 762 | | /* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. 
*/ | 763 | | || (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS)) | 764 | | #endif | 765 | 916k | ) | 766 | 104k | { | 767 | | #if PCRE2_CODE_UNIT_WIDTH == 8 | 768 | | if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS)) | 769 | | #else | 770 | 104k | if (base_list[0] == OP_CLASS) | 771 | 68.5k | #endif | 772 | 68.5k | { | 773 | 68.5k | set1 = (const uint8_t *)(base_end - base_list[2]); | 774 | 68.5k | list_ptr = list; | 775 | 68.5k | } | 776 | 35.7k | else | 777 | 35.7k | { | 778 | 35.7k | set1 = (const uint8_t *)(code - list[2]); | 779 | 35.7k | list_ptr = base_list; | 780 | 35.7k | } | 781 | | | 782 | 104k | invert_bits = FALSE; | 783 | 104k | switch(list_ptr[0]) | 784 | 104k | { | 785 | 18.3k | case OP_CLASS: | 786 | 26.5k | case OP_NCLASS: | 787 | 26.5k | set2 = (const uint8_t *) | 788 | 26.5k | ((list_ptr == list ? code : base_end) - list_ptr[2]); | 789 | 26.5k | break; | 790 | | | 791 | 0 | #ifdef SUPPORT_WIDE_CHARS | 792 | 32.7k | case OP_XCLASS: | 793 | 32.7k | xclass_flags = (list_ptr == list ? code : base_end) - | 794 | 32.7k | list_ptr[2] + LINK_SIZE; | 795 | 32.7k | if ((*xclass_flags & XCL_HASPROP) != 0) return FALSE; | 796 | 25.8k | if ((*xclass_flags & XCL_MAP) == 0) | 797 | 8.43k | { | 798 | | /* No bits are set for characters < 256. */ | 799 | 8.43k | if (list[1] == 0) return (*xclass_flags & XCL_NOT) == 0; | 800 | | /* Might be an empty repeat. 
*/ | 801 | 2.92k | continue; | 802 | 8.43k | } | 803 | 17.4k | set2 = (const uint8_t *)(xclass_flags + 1); | 804 | 17.4k | break; | 805 | 0 | #endif | 806 | | | 807 | 7.79k | case OP_NOT_DIGIT: | 808 | 7.79k | invert_bits = TRUE; | 809 | 7.79k | PCRE2_FALLTHROUGH /* Fall through */ | 810 | 12.0k | case OP_DIGIT: | 811 | 12.0k | set2 = (const uint8_t *)(cb->cbits + cbit_digit); | 812 | 12.0k | break; | 813 | | | 814 | 6.94k | case OP_NOT_WHITESPACE: | 815 | 6.94k | invert_bits = TRUE; | 816 | 6.94k | PCRE2_FALLTHROUGH /* Fall through */ | 817 | 12.9k | case OP_WHITESPACE: | 818 | 12.9k | set2 = (const uint8_t *)(cb->cbits + cbit_space); | 819 | 12.9k | break; | 820 | | | 821 | 8.06k | case OP_NOT_WORDCHAR: | 822 | 8.06k | invert_bits = TRUE; | 823 | 8.06k | PCRE2_FALLTHROUGH /* Fall through */ | 824 | 12.0k | case OP_WORDCHAR: | 825 | 12.0k | set2 = (const uint8_t *)(cb->cbits + cbit_word); | 826 | 12.0k | break; | 827 | | | 828 | 7.86k | default: | 829 | 7.86k | return FALSE; | 830 | 104k | } | 831 | | | 832 | | /* Because the bit sets are unaligned bytes, we need to perform byte | 833 | | comparison here. */ | 834 | | | 835 | 81.1k | set_end = set1 + 32; | 836 | 81.1k | if (invert_bits) | 837 | 22.7k | { | 838 | 22.7k | do | 839 | 382k | { | 840 | 382k | if ((*set1++ & ~(*set2++)) != 0) return FALSE; | 841 | 382k | } | 842 | 367k | while (set1 < set_end); | 843 | 22.7k | } | 844 | 58.3k | else | 845 | 58.3k | { | 846 | 58.3k | do | 847 | 1.16M | { | 848 | 1.16M | if ((*set1++ & *set2++) != 0) return FALSE; | 849 | 1.16M | } | 850 | 1.13M | while (set1 < set_end); | 851 | 58.3k | } | 852 | | | 853 | 38.8k | if (list[1] == 0) return TRUE; | 854 | | /* Might be an empty repeat. */ | 855 | 12.1k | continue; | 856 | 38.8k | } | 857 | | | 858 | | /* Some property combinations also acceptable. Unicode property opcodes are | 859 | | processed specially; the rest can be handled with a lookup table. 
*/ | 860 | | | 861 | 811k | else | 862 | 811k | { | 863 | 811k | uint32_t leftop, rightop; | 864 | | | 865 | 811k | leftop = base_list[0]; | 866 | 811k | rightop = list[0]; | 867 | | | 868 | 811k | #ifdef SUPPORT_UNICODE | 869 | 811k | accepted = FALSE; /* Always set in non-unicode case. */ | 870 | 811k | if (leftop == OP_PROP || leftop == OP_NOTPROP) | 871 | 196k | { | 872 | 196k | if (rightop == OP_EOD) | 873 | 5.16k | accepted = TRUE; | 874 | 191k | else if (rightop == OP_PROP || rightop == OP_NOTPROP) | 875 | 142k | { | 876 | 142k | int n; | 877 | 142k | const uint8_t *p; | 878 | 142k | BOOL same = leftop == rightop; | 879 | 142k | BOOL lisprop = leftop == OP_PROP; | 880 | 142k | BOOL risprop = rightop == OP_PROP; | 881 | 142k | BOOL bothprop = lisprop && risprop; | 882 | | | 883 | | /* There's a table that specifies how each combination is to be | 884 | | processed: | 885 | | 0 Always return FALSE (never auto-possessify) | 886 | | 1 Character groups are distinct (possessify if both are OP_PROP) | 887 | | 2 Check character categories in the same group (general or particular) | 888 | | 3 Return TRUE if the two opcodes are not the same | 889 | | ... see comments below | 890 | | */ | 891 | | | 892 | 142k | n = propposstab[base_list[2]][list[2]]; | 893 | 142k | switch(n) | 894 | 142k | { | 895 | 5.61k | case 0: break; | 896 | 7.50k | case 1: accepted = bothprop; break; | 897 | 11.0k | case 2: accepted = (base_list[3] == list[3]) != same; break; | 898 | 9.83k | case 3: accepted = !same; break; | 899 | | | 900 | 11.4k | case 4: /* Left general category, right particular category */ | 901 | 11.4k | accepted = risprop && catposstab[base_list[3]][list[3]] == same; | 902 | 11.4k | break; | 903 | | | 904 | 10.5k | case 5: /* Right general category, left particular category */ | 905 | 10.5k | accepted = lisprop && catposstab[list[3]][base_list[3]] == same; | 906 | 10.5k | break; | 907 | | | 908 | | /* This code is logically tricky. Think hard before fiddling with it. 
| 909 | | The posspropstab table has four entries per row. Each row relates to | 910 | | one of PCRE's special properties such as ALNUM or SPACE or WORD. | 911 | | Only WORD actually needs all four entries, but using repeats for the | 912 | | others means they can all use the same code below. | 913 | | | 914 | | The first two entries in each row are Unicode general categories, and | 915 | | apply always, because all the characters they include are part of the | 916 | | PCRE character set. The third and fourth entries are a general and a | 917 | | particular category, respectively, that include one or more relevant | 918 | | characters. One or the other is used, depending on whether the check | 919 | | is for a general or a particular category. However, in both cases the | 920 | | category contains more characters than the specials that are defined | 921 | | for the property being tested against. Therefore, it cannot be used | 922 | | in a NOTPROP case. | 923 | | | 924 | | Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po. | 925 | | Underscore is covered by ucp_P or ucp_Po. 
*/ | 926 | | | 927 | 1.00k | case 6: /* Left alphanum vs right general category */ | 928 | 10.0k | case 7: /* Left space vs right general category */ | 929 | 24.3k | case 8: /* Left word vs right general category */ | 930 | 24.3k | p = posspropstab[n-6]; | 931 | 24.3k | accepted = risprop && lisprop == | 932 | 20.0k | (list[3] != p[0] && | 933 | 15.4k | list[3] != p[1] && | 934 | 11.3k | (list[3] != p[2] || !lisprop)); | 935 | 24.3k | break; | 936 | | | 937 | 578 | case 9: /* Right alphanum vs left general category */ | 938 | 8.69k | case 10: /* Right space vs left general category */ | 939 | 19.2k | case 11: /* Right word vs left general category */ | 940 | 19.2k | p = posspropstab[n-9]; | 941 | 19.2k | accepted = lisprop && risprop == | 942 | 14.8k | (base_list[3] != p[0] && | 943 | 12.1k | base_list[3] != p[1] && | 944 | 8.20k | (base_list[3] != p[2] || !risprop)); | 945 | 19.2k | break; | 946 | | | 947 | 879 | case 12: /* Left alphanum vs right particular category */ | 948 | 14.2k | case 13: /* Left space vs right particular category */ | 949 | 23.1k | case 14: /* Left word vs right particular category */ | 950 | 23.1k | p = posspropstab[n-12]; | 951 | 23.1k | accepted = risprop && lisprop == | 952 | 17.5k | (catposstab[p[0]][list[3]] && | 953 | 13.8k | catposstab[p[1]][list[3]] && | 954 | 8.48k | (list[3] != p[3] || !lisprop)); | 955 | 23.1k | break; | 956 | | | 957 | 1.23k | case 15: /* Right alphanum vs left particular category */ | 958 | 9.96k | case 16: /* Right space vs left particular category */ | 959 | 19.1k | case 17: /* Right word vs left particular category */ | 960 | 19.1k | p = posspropstab[n-15]; | 961 | 19.1k | accepted = lisprop && risprop == | 962 | 13.0k | (catposstab[p[0]][base_list[3]] && | 963 | 10.8k | catposstab[p[1]][base_list[3]] && | 964 | 5.52k | (base_list[3] != p[3] || !risprop)); | 965 | 19.1k | break; | 966 | 142k | } | 967 | 142k | } | 968 | 196k | } | 969 | | | 970 | 615k | else | 971 | 615k | #endif /* SUPPORT_UNICODE */ | 972 
| | | 973 | 615k | accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP && | 974 | 450k | rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP && | 975 | 430k | autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP]; | 976 | | | 977 | 811k | if (!accepted) return FALSE; | 978 | | | 979 | 124k | if (list[1] == 0) return TRUE; | 980 | | /* Might be an empty repeat. */ | 981 | 29.4k | continue; | 982 | 124k | } | 983 | | | 984 | | /* Control reaches here only if one of the items is a small character list. | 985 | | All characters are checked against the other side. */ | 986 | | | 987 | 11.5M | do | 988 | 12.1M | { | 989 | 12.1M | chr = *chr_ptr; | 990 | | | 991 | 12.1M | switch(list_ptr[0]) | 992 | 12.1M | { | 993 | 6.61M | case OP_CHAR: | 994 | 6.61M | ochr_ptr = list_ptr + 2; | 995 | 6.61M | do | 996 | 6.92M | { | 997 | 6.92M | if (chr == *ochr_ptr) return FALSE; | 998 | 6.63M | ochr_ptr++; | 999 | 6.63M | } | 1000 | 6.63M | while(*ochr_ptr != NOTACHAR); | 1001 | 6.31M | break; | 1002 | | | 1003 | 6.31M | case OP_NOT: | 1004 | 485k | ochr_ptr = list_ptr + 2; | 1005 | 485k | do | 1006 | 531k | { | 1007 | 531k | if (chr == *ochr_ptr) | 1008 | 41.8k | break; | 1009 | 489k | ochr_ptr++; | 1010 | 489k | } | 1011 | 489k | while(*ochr_ptr != NOTACHAR); | 1012 | 485k | if (*ochr_ptr == NOTACHAR) return FALSE; /* Not found */ | 1013 | 41.8k | break; | 1014 | | | 1015 | | /* Note that OP_DIGIT etc. are generated only when PCRE2_UCP is *not* | 1016 | | set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. 
*/ | 1017 | | | 1018 | 119k | case OP_DIGIT: | 1019 | 119k | if (chr < 256 && (cb->ctypes[chr] & ctype_digit) != 0) return FALSE; | 1020 | 110k | break; | 1021 | | | 1022 | 110k | case OP_NOT_DIGIT: | 1023 | 86.3k | if (chr > 255 || (cb->ctypes[chr] & ctype_digit) == 0) return FALSE; | 1024 | 2.89k | break; | 1025 | | | 1026 | 155k | case OP_WHITESPACE: | 1027 | 155k | if (chr < 256 && (cb->ctypes[chr] & ctype_space) != 0) return FALSE; | 1028 | 145k | break; | 1029 | | | 1030 | 404k | case OP_NOT_WHITESPACE: | 1031 | 404k | if (chr > 255 || (cb->ctypes[chr] & ctype_space) == 0) return FALSE; | 1032 | 21.4k | break; | 1033 | | | 1034 | 129k | case OP_WORDCHAR: | 1035 | 129k | if (chr < 255 && (cb->ctypes[chr] & ctype_word) != 0) return FALSE; | 1036 | 98.9k | break; | 1037 | | | 1038 | 152k | case OP_NOT_WORDCHAR: | 1039 | 152k | if (chr > 255 || (cb->ctypes[chr] & ctype_word) == 0) return FALSE; | 1040 | 23.2k | break; | 1041 | | | 1042 | 207k | case OP_HSPACE: | 1043 | 207k | switch(chr) | 1044 | 207k | { | 1045 | 126k | HSPACE_CASES: return FALSE; | 1046 | 80.9k | default: break; | 1047 | 207k | } | 1048 | 80.9k | break; | 1049 | | | 1050 | 254k | case OP_NOT_HSPACE: | 1051 | 254k | switch(chr) | 1052 | 254k | { | 1053 | 103k | HSPACE_CASES: break; | 1054 | 151k | default: return FALSE; | 1055 | 254k | } | 1056 | 103k | break; | 1057 | | | 1058 | 461k | case OP_ANYNL: | 1059 | 532k | case OP_VSPACE: | 1060 | 532k | switch(chr) | 1061 | 532k | { | 1062 | 106k | VSPACE_CASES: return FALSE; | 1063 | 425k | default: break; | 1064 | 532k | } | 1065 | 425k | break; | 1066 | | | 1067 | 425k | case OP_NOT_VSPACE: | 1068 | 219k | switch(chr) | 1069 | 219k | { | 1070 | 41.8k | VSPACE_CASES: break; | 1071 | 177k | default: return FALSE; | 1072 | 219k | } | 1073 | 41.8k | break; | 1074 | | | 1075 | 80.1k | case OP_DOLL: | 1076 | 112k | case OP_EODN: | 1077 | 112k | switch (chr) | 1078 | 112k | { | 1079 | 4.75k | case CHAR_CR: | 1080 | 13.6k | case CHAR_LF: | 1081 | 19.7k | 
case CHAR_VT: | 1082 | 24.9k | case CHAR_FF: | 1083 | 28.4k | case CHAR_NEL: | 1084 | 28.4k | #ifndef EBCDIC | 1085 | 35.3k | case 0x2028: | 1086 | 40.2k | case 0x2029: | 1087 | 40.2k | #endif /* Not EBCDIC */ | 1088 | 40.2k | return FALSE; | 1089 | 112k | } | 1090 | 72.0k | break; | 1091 | | | 1092 | 72.0k | case OP_EOD: /* Can always possessify before \z */ | 1093 | 17.2k | break; | 1094 | | | 1095 | 0 | #ifdef SUPPORT_UNICODE | 1096 | 271k | case OP_PROP: | 1097 | 655k | case OP_NOTPROP: | 1098 | 655k | if (!check_char_prop(chr, list_ptr[2], list_ptr[3], | 1099 | 655k | list_ptr[0] == OP_NOTPROP)) | 1100 | 355k | return FALSE; | 1101 | 299k | break; | 1102 | 299k | #endif | 1103 | | | 1104 | 299k | case OP_NCLASS: | 1105 | 114k | if (chr > 255) return FALSE; | 1106 | 44.2k | PCRE2_FALLTHROUGH /* Fall through */ | 1107 | 44.2k | | 1108 | 251k | case OP_CLASS: | 1109 | 251k | if (chr > 255) break; | 1110 | 155k | class_bitset = (const uint8_t *) | 1111 | 155k | ((list_ptr == list ? code : base_end) - list_ptr[2]); | 1112 | 155k | if ((class_bitset[chr >> 3] & (1u << (chr & 7))) != 0) return FALSE; | 1113 | 96.2k | break; | 1114 | | | 1115 | 96.2k | #ifdef SUPPORT_WIDE_CHARS | 1116 | 908k | case OP_XCLASS: | 1117 | 908k | if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) - | 1118 | 908k | list_ptr[2] + LINK_SIZE, (const uint8_t*)cb->start_code, utf)) | 1119 | 331k | return FALSE; | 1120 | 577k | break; | 1121 | | | 1122 | 577k | case OP_ECLASS: | 1123 | 81.7k | if (PRIV(eclass)(chr, | 1124 | 81.7k | (list_ptr == list ? code : base_end) - list_ptr[2] + LINK_SIZE, | 1125 | 81.7k | (list_ptr == list ? 
code : base_end) - list_ptr[3], | 1126 | 81.7k | (const uint8_t*)cb->start_code, utf)) | 1127 | 28.9k | return FALSE; | 1128 | 52.7k | break; | 1129 | 52.7k | #endif /* SUPPORT_WIDE_CHARS */ | 1130 | | | 1131 | 645k | default: | 1132 | 645k | return FALSE; | 1133 | 12.1M | } | 1134 | | | 1135 | 8.62M | chr_ptr++; | 1136 | 8.62M | } | 1137 | 11.5M | while(*chr_ptr != NOTACHAR); | 1138 | | | 1139 | | /* At least one character must be matched from this opcode. */ | 1140 | | | 1141 | 8.07M | if (list[1] == 0) return TRUE; | 1142 | 8.07M | } | 1143 | | | 1144 | | /* LCOV_EXCL_START */ | 1145 | 0 | PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */ | 1146 | 0 | return FALSE; /* Avoid compiler warnings */ | 1147 | | /* LCOV_EXCL_STOP */ | 1148 | 15.6M | } |
|
1149 | | |
1150 | | |
1151 | | |
1152 | | /************************************************* |
1153 | | * Scan compiled regex for auto-possession * |
1154 | | *************************************************/ |
1155 | | |
1156 | | /* Replaces single character iterations with their possessive alternatives |
1157 | | if appropriate. This function modifies the compiled opcode! Hitting a |
1158 | | non-existent opcode may indicate a bug in PCRE2, but it can also be caused if a |
1159 | | bad UTF string was compiled with PCRE2_NO_UTF_CHECK. The rec_limit catches |
1160 | | overly complicated or large patterns. In these cases, the check just stops, |
1161 | | leaving the remainder of the pattern unpossessified. |
1162 | | |
1163 | | Arguments: |
1164 | | code points to start of the byte code (repeat opcodes are rewritten in place) |
1165 | | cb compile data block |
1166 | | |
1167 | | Returns: 0 for success (OP_END was reached) |
1168 | | -1 if a non-existent opcode is encountered |
1169 | | */ |
1170 | | |
1171 | | int |
1172 | | PRIV(auto_possessify)(PCRE2_UCHAR *code, const compile_block *cb) |
1173 | 530k | { |
1174 | 530k | PCRE2_UCHAR c; |
1175 | 530k | PCRE2_SPTR end; |
1176 | 530k | PCRE2_UCHAR *repeat_opcode; |
1177 | 530k | uint32_t list[MAX_LIST]; |
1178 | 530k | int rec_limit = 1000; /* Passed by address to every compare_opcodes() call. Was 10,000 but clang+ASAN uses a lot of stack. */ |
1179 | 530k | BOOL utf = (cb->external_options & PCRE2_UTF) != 0; |
1180 | 530k | BOOL ucp = (cb->external_options & PCRE2_UCP) != 0; |
1181 | | |
1182 | 530k | for (;;) |
1183 | 1.41G | { |
1184 | 1.41G | c = *code; |
1185 | | |
1186 | | /* LCOV_EXCL_START */ |
1187 | 1.41G | if (c >= OP_TABLE_LENGTH) |
1188 | 0 | { |
1189 | 0 | PCRE2_DEBUG_UNREACHABLE(); |
1190 | 0 | return -1; /* Something has gone wrong */ |
1191 | 0 | } |
1192 | | /* LCOV_EXCL_STOP */ |
1193 | | |
1194 | 1.41G | if (c >= OP_STAR && c <= OP_TYPEPOSUPTO) |
1195 | 65.0M | { |
1196 | 65.0M | c -= get_repeat_base(c) - OP_STAR; /* Map c onto the matching opcode of the OP_STAR group (assumes get_repeat_base() returns the first opcode of c's own group - confirm) */ |
1197 | 65.0M | end = (c <= OP_MINUPTO) ? |
1198 | 65.0M | get_chr_property_list(code, utf, ucp, cb->fcc, list) : NULL; |
1199 | 65.0M | list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO; /* 1 only for the greedy forms, 0 for the minimizing ones */ |
1200 | | |
1201 | 65.0M | if (end != NULL && compare_opcodes(end, utf, ucp, cb, list, end, |
1202 | 62.0M | &rec_limit)) |
1203 | 12.1M | { |
1204 | 12.1M | switch(c) |
1205 | 12.1M | { |
1206 | 2.86M | case OP_STAR: |
1207 | 2.86M | *code += OP_POSSTAR - OP_STAR; |
1208 | 2.86M | break; |
1209 | | |
1210 | 555k | case OP_MINSTAR: |
1211 | 555k | *code += OP_POSSTAR - OP_MINSTAR; |
1212 | 555k | break; |
1213 | | |
1214 | 2.80M | case OP_PLUS: |
1215 | 2.80M | *code += OP_POSPLUS - OP_PLUS; |
1216 | 2.80M | break; |
1217 | | |
1218 | 514k | case OP_MINPLUS: |
1219 | 514k | *code += OP_POSPLUS - OP_MINPLUS; |
1220 | 514k | break; |
1221 | | |
1222 | 3.93M | case OP_QUERY: |
1223 | 3.93M | *code += OP_POSQUERY - OP_QUERY; |
1224 | 3.93M | break; |
1225 | | |
1226 | 785k | case OP_MINQUERY: |
1227 | 785k | *code += OP_POSQUERY - OP_MINQUERY; |
1228 | 785k | break; |
1229 | | |
1230 | 612k | case OP_UPTO: |
1231 | 612k | *code += OP_POSUPTO - OP_UPTO; |
1232 | 612k | break; |
1233 | | |
1234 | 126k | case OP_MINUPTO: |
1235 | 126k | *code += OP_POSUPTO - OP_MINUPTO; |
1236 | 126k | break; |
1237 | 12.1M | } |
1238 | 12.1M | } |
1239 | 65.0M | c = *code; |
1240 | 65.0M | } |
1241 | 1.34G | else if (c == OP_CLASS || c == OP_NCLASS |
1242 | 1.34G | #ifdef SUPPORT_WIDE_CHARS |
1243 | 1.34G | || c == OP_XCLASS || c == OP_ECLASS |
1244 | 1.34G | #endif |
1245 | 1.34G | ) |
1246 | 11.2M | { |
1247 | 11.2M | #ifdef SUPPORT_WIDE_CHARS |
1248 | 11.2M | if (c == OP_XCLASS || c == OP_ECLASS) |
1249 | 5.77M | repeat_opcode = code + GET(code, 1); |
1250 | 5.49M | else |
1251 | 5.49M | #endif |
1252 | 5.49M | repeat_opcode = code + 1 + (32 / sizeof(PCRE2_UCHAR)); /* Step over the opcode plus 32 bytes of class data, counted in code units */ |
1253 | | |
1254 | 11.2M | c = *repeat_opcode; |
1255 | 11.2M | if (c >= OP_CRSTAR && c <= OP_CRMINRANGE) |
1256 | 3.71M | { |
1257 | | /* The return from get_chr_property_list() will never be NULL when |
1258 | | *code (aka c) is one of the four class opcodes. However, gcc with |
1259 | | -fanalyzer notes that a NULL return is possible, and grumbles. Hence we |
1260 | | put in a check. */ |
1261 | | |
1262 | 3.71M | end = get_chr_property_list(code, utf, ucp, cb->fcc, list); |
1263 | 3.71M | list[1] = (c & 1) == 0; /* NOTE(review): presumably the greedy OP_CR* forms have even opcode values - confirm */ |
1264 | | |
1265 | 3.71M | if (end != NULL && |
1266 | 3.71M | compare_opcodes(end, utf, ucp, cb, list, end, &rec_limit)) |
1267 | 943k | { |
1268 | 943k | switch (c) |
1269 | 943k | { |
1270 | 158k | case OP_CRSTAR: |
1271 | 192k | case OP_CRMINSTAR: |
1272 | 192k | *repeat_opcode = OP_CRPOSSTAR; |
1273 | 192k | break; |
1274 | | |
1275 | 222k | case OP_CRPLUS: |
1276 | 266k | case OP_CRMINPLUS: |
1277 | 266k | *repeat_opcode = OP_CRPOSPLUS; |
1278 | 266k | break; |
1279 | | |
1280 | 166k | case OP_CRQUERY: |
1281 | 197k | case OP_CRMINQUERY: |
1282 | 197k | *repeat_opcode = OP_CRPOSQUERY; |
1283 | 197k | break; |
1284 | | |
1285 | 221k | case OP_CRRANGE: |
1286 | 287k | case OP_CRMINRANGE: |
1287 | 287k | *repeat_opcode = OP_CRPOSRANGE; |
1288 | 287k | break; |
1289 | 943k | } |
1290 | 943k | } |
1291 | 3.71M | } |
1292 | 11.2M | c = *code; |
1293 | 11.2M | } |
1294 | | |
1295 | 1.41G | switch(c) |
1296 | 1.41G | { |
1297 | 530k | case OP_END: |
1298 | 530k | return 0; /* Whole of the byte code has been scanned */ |
1299 | | |
1300 | 4.34M | case OP_TYPESTAR: |
1301 | 5.17M | case OP_TYPEMINSTAR: |
1302 | 9.66M | case OP_TYPEPLUS: |
1303 | 10.6M | case OP_TYPEMINPLUS: |
1304 | 16.3M | case OP_TYPEQUERY: |
1305 | 17.7M | case OP_TYPEMINQUERY: |
1306 | 18.4M | case OP_TYPEPOSSTAR: |
1307 | 19.2M | case OP_TYPEPOSPLUS: |
1308 | 19.6M | case OP_TYPEPOSQUERY: |
1309 | 19.6M | if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; /* A repeated property item takes 2 more code units */ |
1310 | 19.6M | break; |
1311 | | |
1312 | 656k | case OP_TYPEUPTO: |
1313 | 801k | case OP_TYPEMINUPTO: |
1314 | 1.31M | case OP_TYPEEXACT: |
1315 | 1.63M | case OP_TYPEPOSUPTO: |
1316 | 1.63M | if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP) |
1317 | 222k | code += 2; |
1318 | 1.63M | break; |
1319 | | |
1320 | 120k | case OP_CALLOUT_STR: |
1321 | 120k | code += GET(code, 1 + 2*LINK_SIZE); /* Amount to skip is read from the item itself */ |
1322 | 120k | break; |
1323 | | |
1324 | 0 | #ifdef SUPPORT_WIDE_CHARS |
1325 | 5.11M | case OP_XCLASS: |
1326 | 5.77M | case OP_ECLASS: |
1327 | 5.77M | code += GET(code, 1); |
1328 | 5.77M | break; |
1329 | 0 | #endif |
1330 | | |
1331 | 650k | case OP_MARK: |
1332 | 745k | case OP_COMMIT_ARG: |
1333 | 818k | case OP_PRUNE_ARG: |
1334 | 1.45M | case OP_SKIP_ARG: |
1335 | 1.60M | case OP_THEN_ARG: |
1336 | 1.60M | code += code[1]; /* Add the variable part, whose size is held in code[1] */ |
1337 | 1.60M | break; |
1338 | 1.41G | } |
1339 | | |
1340 | | /* Add in the fixed length from the table */ |
1341 | | |
1342 | 1.41G | code += PRIV(OP_lengths)[c]; |
1343 | | |
1344 | | /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may be |
1345 | | followed by a multi-byte character. The length in the table is a minimum, so |
1346 | | we have to arrange to skip the extra code units. */ |
1347 | | |
1348 | | #ifdef MAYBE_UTF_MULTI |
1349 | 1.06G | if (utf) switch(c) |
1350 | 151M | { |
1351 | 28.1M | case OP_CHAR: |
1352 | 71.1M | case OP_CHARI: |
1353 | 71.1M | case OP_NOT: |
1354 | 71.3M | case OP_NOTI: |
1355 | 71.5M | case OP_STAR: |
1356 | 71.7M | case OP_MINSTAR: |
1357 | 71.8M | case OP_PLUS: |
1358 | 72.0M | case OP_MINPLUS: |
1359 | 72.3M | case OP_QUERY: |
1360 | 72.8M | case OP_MINQUERY: |
1361 | 72.8M | case OP_UPTO: |
1362 | 72.9M | case OP_MINUPTO: |
1363 | 72.9M | case OP_EXACT: |
1364 | 73.0M | case OP_POSSTAR: |
1365 | 73.1M | case OP_POSPLUS: |
1366 | 73.3M | case OP_POSQUERY: |
1367 | 73.3M | case OP_POSUPTO: |
1368 | 73.4M | case OP_STARI: |
1369 | 73.7M | case OP_MINSTARI: |
1370 | 73.9M | case OP_PLUSI: |
1371 | 74.1M | case OP_MINPLUSI: |
1372 | 74.3M | case OP_QUERYI: |
1373 | 74.6M | case OP_MINQUERYI: |
1374 | 74.7M | case OP_UPTOI: |
1375 | 74.7M | case OP_MINUPTOI: |
1376 | 74.7M | case OP_EXACTI: |
1377 | 74.9M | case OP_POSSTARI: |
1378 | 75.1M | case OP_POSPLUSI: |
1379 | 75.3M | case OP_POSQUERYI: |
1380 | 75.4M | case OP_POSUPTOI: |
1381 | 75.4M | case OP_NOTSTAR: |
1382 | 75.4M | case OP_NOTMINSTAR: |
1383 | 75.4M | case OP_NOTPLUS: |
1384 | 75.5M | case OP_NOTMINPLUS: |
1385 | 75.5M | case OP_NOTQUERY: |
1386 | 75.5M | case OP_NOTMINQUERY: |
1387 | 75.6M | case OP_NOTUPTO: |
1388 | 75.6M | case OP_NOTMINUPTO: |
1389 | 75.6M | case OP_NOTEXACT: |
1390 | 75.6M | case OP_NOTPOSSTAR: |
1391 | 75.6M | case OP_NOTPOSPLUS: |
1392 | 75.6M | case OP_NOTPOSQUERY: |
1393 | 75.6M | case OP_NOTPOSUPTO: |
1394 | 75.6M | case OP_NOTSTARI: |
1395 | 75.7M | case OP_NOTMINSTARI: |
1396 | 75.7M | case OP_NOTPLUSI: |
1397 | 75.7M | case OP_NOTMINPLUSI: |
1398 | 75.8M | case OP_NOTQUERYI: |
1399 | 75.8M | case OP_NOTMINQUERYI: |
1400 | 75.8M | case OP_NOTUPTOI: |
1401 | 75.8M | case OP_NOTMINUPTOI: |
1402 | 75.8M | case OP_NOTEXACTI: |
1403 | 75.8M | case OP_NOTPOSSTARI: |
1404 | 75.8M | case OP_NOTPOSPLUSI: |
1405 | 75.8M | case OP_NOTPOSQUERYI: |
1406 | 75.8M | case OP_NOTPOSUPTOI: |
1407 | 75.8M | if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]); |
1408 | 75.8M | break; |
1409 | 151M | } |
1410 | | #else |
1411 | | (void)(utf); /* Keep compiler happy by referencing the local variable */ |
1412 | | #endif /* MAYBE_UTF_MULTI */ |
1413 | 1.41G | } |
1414 | 530k | } Line | Count | Source | 1173 | 169k | { | 1174 | 169k | PCRE2_UCHAR c; | 1175 | 169k | PCRE2_SPTR end; | 1176 | 169k | PCRE2_UCHAR *repeat_opcode; | 1177 | 169k | uint32_t list[MAX_LIST]; | 1178 | 169k | int rec_limit = 1000; /* Was 10,000 but clang+ASAN uses a lot of stack. */ | 1179 | 169k | BOOL utf = (cb->external_options & PCRE2_UTF) != 0; | 1180 | 169k | BOOL ucp = (cb->external_options & PCRE2_UCP) != 0; | 1181 | | | 1182 | 169k | for (;;) | 1183 | 644M | { | 1184 | 644M | c = *code; | 1185 | | | 1186 | | /* LCOV_EXCL_START */ | 1187 | 644M | if (c >= OP_TABLE_LENGTH) | 1188 | 0 | { | 1189 | 0 | PCRE2_DEBUG_UNREACHABLE(); | 1190 | 0 | return -1; /* Something gone wrong */ | 1191 | 0 | } | 1192 | | /* LCOV_EXCL_STOP */ | 1193 | | | 1194 | 644M | if (c >= OP_STAR && c <= OP_TYPEPOSUPTO) | 1195 | 34.8M | { | 1196 | 34.8M | c -= get_repeat_base(c) - OP_STAR; | 1197 | 34.8M | end = (c <= OP_MINUPTO) ? | 1198 | 34.8M | get_chr_property_list(code, utf, ucp, cb->fcc, list) : NULL; | 1199 | 34.8M | list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO; | 1200 | | | 1201 | 34.8M | if (end != NULL && compare_opcodes(end, utf, ucp, cb, list, end, | 1202 | 33.1M | &rec_limit)) | 1203 | 5.36M | { | 1204 | 5.36M | switch(c) | 1205 | 5.36M | { | 1206 | 1.35M | case OP_STAR: | 1207 | 1.35M | *code += OP_POSSTAR - OP_STAR; | 1208 | 1.35M | break; | 1209 | | | 1210 | 256k | case OP_MINSTAR: | 1211 | 256k | *code += OP_POSSTAR - OP_MINSTAR; | 1212 | 256k | break; | 1213 | | | 1214 | 1.13M | case OP_PLUS: | 1215 | 1.13M | *code += OP_POSPLUS - OP_PLUS; | 1216 | 1.13M | break; | 1217 | | | 1218 | 210k | case OP_MINPLUS: | 1219 | 210k | *code += OP_POSPLUS - OP_MINPLUS; | 1220 | 210k | break; | 1221 | | | 1222 | 1.84M | case OP_QUERY: | 1223 | 1.84M | *code += OP_POSQUERY - OP_QUERY; | 1224 | 1.84M | break; | 1225 | | | 1226 | 338k | case OP_MINQUERY: | 1227 | 338k | *code += OP_POSQUERY - OP_MINQUERY; | 1228 | 338k | break; | 1229 | | | 1230 | 
195k | case OP_UPTO: | 1231 | 195k | *code += OP_POSUPTO - OP_UPTO; | 1232 | 195k | break; | 1233 | | | 1234 | 41.1k | case OP_MINUPTO: | 1235 | 41.1k | *code += OP_POSUPTO - OP_MINUPTO; | 1236 | 41.1k | break; | 1237 | 5.36M | } | 1238 | 5.36M | } | 1239 | 34.8M | c = *code; | 1240 | 34.8M | } | 1241 | 609M | else if (c == OP_CLASS || c == OP_NCLASS | 1242 | 604M | #ifdef SUPPORT_WIDE_CHARS | 1243 | 604M | || c == OP_XCLASS || c == OP_ECLASS | 1244 | 609M | #endif | 1245 | 609M | ) | 1246 | 5.47M | { | 1247 | 5.47M | #ifdef SUPPORT_WIDE_CHARS | 1248 | 5.47M | if (c == OP_XCLASS || c == OP_ECLASS) | 1249 | 1.09M | repeat_opcode = code + GET(code, 1); | 1250 | 4.37M | else | 1251 | 4.37M | #endif | 1252 | 4.37M | repeat_opcode = code + 1 + (32 / sizeof(PCRE2_UCHAR)); | 1253 | | | 1254 | 5.47M | c = *repeat_opcode; | 1255 | 5.47M | if (c >= OP_CRSTAR && c <= OP_CRMINRANGE) | 1256 | 2.05M | { | 1257 | | /* The return from get_chr_property_list() will never be NULL when | 1258 | | *code (aka c) is one of the four class opcodes. However, gcc with | 1259 | | -fanalyzer notes that a NULL return is possible, and grumbles. Hence we | 1260 | | put in a check. 
*/ | 1261 | | | 1262 | 2.05M | end = get_chr_property_list(code, utf, ucp, cb->fcc, list); | 1263 | 2.05M | list[1] = (c & 1) == 0; | 1264 | | | 1265 | 2.05M | if (end != NULL && | 1266 | 2.05M | compare_opcodes(end, utf, ucp, cb, list, end, &rec_limit)) | 1267 | 307k | { | 1268 | 307k | switch (c) | 1269 | 307k | { | 1270 | 48.3k | case OP_CRSTAR: | 1271 | 59.3k | case OP_CRMINSTAR: | 1272 | 59.3k | *repeat_opcode = OP_CRPOSSTAR; | 1273 | 59.3k | break; | 1274 | | | 1275 | 63.1k | case OP_CRPLUS: | 1276 | 72.3k | case OP_CRMINPLUS: | 1277 | 72.3k | *repeat_opcode = OP_CRPOSPLUS; | 1278 | 72.3k | break; | 1279 | | | 1280 | 56.4k | case OP_CRQUERY: | 1281 | 64.4k | case OP_CRMINQUERY: | 1282 | 64.4k | *repeat_opcode = OP_CRPOSQUERY; | 1283 | 64.4k | break; | 1284 | | | 1285 | 74.8k | case OP_CRRANGE: | 1286 | 111k | case OP_CRMINRANGE: | 1287 | 111k | *repeat_opcode = OP_CRPOSRANGE; | 1288 | 111k | break; | 1289 | 307k | } | 1290 | 307k | } | 1291 | 2.05M | } | 1292 | 5.47M | c = *code; | 1293 | 5.47M | } | 1294 | | | 1295 | 644M | switch(c) | 1296 | 644M | { | 1297 | 169k | case OP_END: | 1298 | 169k | return 0; | 1299 | | | 1300 | 2.75M | case OP_TYPESTAR: | 1301 | 3.20M | case OP_TYPEMINSTAR: | 1302 | 5.03M | case OP_TYPEPLUS: | 1303 | 5.53M | case OP_TYPEMINPLUS: | 1304 | 7.46M | case OP_TYPEQUERY: | 1305 | 8.01M | case OP_TYPEMINQUERY: | 1306 | 8.27M | case OP_TYPEPOSSTAR: | 1307 | 8.55M | case OP_TYPEPOSPLUS: | 1308 | 8.76M | case OP_TYPEPOSQUERY: | 1309 | 8.76M | if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; | 1310 | 8.76M | break; | 1311 | | | 1312 | 286k | case OP_TYPEUPTO: | 1313 | 335k | case OP_TYPEMINUPTO: | 1314 | 559k | case OP_TYPEEXACT: | 1315 | 686k | case OP_TYPEPOSUPTO: | 1316 | 686k | if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP) | 1317 | 90.8k | code += 2; | 1318 | 686k | break; | 1319 | | | 1320 | 37.8k | case OP_CALLOUT_STR: | 1321 | 37.8k | code += GET(code, 1 + 2*LINK_SIZE); | 1322 | 37.8k | break; | 
1323 | | | 1324 | 0 | #ifdef SUPPORT_WIDE_CHARS | 1325 | 803k | case OP_XCLASS: | 1326 | 1.09M | case OP_ECLASS: | 1327 | 1.09M | code += GET(code, 1); | 1328 | 1.09M | break; | 1329 | 0 | #endif | 1330 | | | 1331 | 354k | case OP_MARK: | 1332 | 383k | case OP_COMMIT_ARG: | 1333 | 405k | case OP_PRUNE_ARG: | 1334 | 730k | case OP_SKIP_ARG: | 1335 | 790k | case OP_THEN_ARG: | 1336 | 790k | code += code[1]; | 1337 | 790k | break; | 1338 | 644M | } | 1339 | | | 1340 | | /* Add in the fixed length from the table */ | 1341 | | | 1342 | 643M | code += PRIV(OP_lengths)[c]; | 1343 | | | 1344 | | /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may be | 1345 | | followed by a multi-byte character. The length in the table is a minimum, so | 1346 | | we have to arrange to skip the extra code units. */ | 1347 | | | 1348 | 643M | #ifdef MAYBE_UTF_MULTI | 1349 | 643M | if (utf) switch(c) | 1350 | 56.5M | { | 1351 | 9.35M | case OP_CHAR: | 1352 | 23.5M | case OP_CHARI: | 1353 | 23.6M | case OP_NOT: | 1354 | 23.7M | case OP_NOTI: | 1355 | 23.8M | case OP_STAR: | 1356 | 23.9M | case OP_MINSTAR: | 1357 | 24.0M | case OP_PLUS: | 1358 | 24.1M | case OP_MINPLUS: | 1359 | 24.2M | case OP_QUERY: | 1360 | 24.3M | case OP_MINQUERY: | 1361 | 24.4M | case OP_UPTO: | 1362 | 24.4M | case OP_MINUPTO: | 1363 | 24.4M | case OP_EXACT: | 1364 | 24.5M | case OP_POSSTAR: | 1365 | 24.5M | case OP_POSPLUS: | 1366 | 24.6M | case OP_POSQUERY: | 1367 | 24.6M | case OP_POSUPTO: | 1368 | 24.6M | case OP_STARI: | 1369 | 24.9M | case OP_MINSTARI: | 1370 | 24.9M | case OP_PLUSI: | 1371 | 25.0M | case OP_MINPLUSI: | 1372 | 25.2M | case OP_QUERYI: | 1373 | 25.4M | case OP_MINQUERYI: | 1374 | 25.4M | case OP_UPTOI: | 1375 | 25.4M | case OP_MINUPTOI: | 1376 | 25.4M | case OP_EXACTI: | 1377 | 25.6M | case OP_POSSTARI: | 1378 | 25.6M | case OP_POSPLUSI: | 1379 | 25.7M | case OP_POSQUERYI: | 1380 | 25.8M | case OP_POSUPTOI: | 1381 | 25.8M | case OP_NOTSTAR: | 1382 | 25.8M | case OP_NOTMINSTAR: | 
1383 | 25.8M | case OP_NOTPLUS: | 1384 | 25.9M | case OP_NOTMINPLUS: | 1385 | 25.9M | case OP_NOTQUERY: | 1386 | 25.9M | case OP_NOTMINQUERY: | 1387 | 25.9M | case OP_NOTUPTO: | 1388 | 26.0M | case OP_NOTMINUPTO: | 1389 | 26.0M | case OP_NOTEXACT: | 1390 | 26.0M | case OP_NOTPOSSTAR: | 1391 | 26.0M | case OP_NOTPOSPLUS: | 1392 | 26.0M | case OP_NOTPOSQUERY: | 1393 | 26.0M | case OP_NOTPOSUPTO: | 1394 | 26.0M | case OP_NOTSTARI: | 1395 | 26.0M | case OP_NOTMINSTARI: | 1396 | 26.0M | case OP_NOTPLUSI: | 1397 | 26.1M | case OP_NOTMINPLUSI: | 1398 | 26.1M | case OP_NOTQUERYI: | 1399 | 26.1M | case OP_NOTMINQUERYI: | 1400 | 26.1M | case OP_NOTUPTOI: | 1401 | 26.1M | case OP_NOTMINUPTOI: | 1402 | 26.1M | case OP_NOTEXACTI: | 1403 | 26.1M | case OP_NOTPOSSTARI: | 1404 | 26.1M | case OP_NOTPOSPLUSI: | 1405 | 26.1M | case OP_NOTPOSQUERYI: | 1406 | 26.1M | case OP_NOTPOSUPTOI: | 1407 | 26.1M | if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]); | 1408 | 26.1M | break; | 1409 | 56.5M | } | 1410 | | #else | 1411 | | (void)(utf); /* Keep compiler happy by referencing function argument */ | 1412 | | #endif /* SUPPORT_WIDE_CHARS */ | 1413 | 643M | } | 1414 | 169k | } |
_pcre2_auto_possessify_32 Line | Count | Source | 1173 | 179k | { /* Walks the compiled opcodes starting at code until OP_END is reached (returns 0); returns -1 if an opcode value is not below OP_TABLE_LENGTH. Repeat opcodes for which compare_opcodes() succeeds are rewritten in place to their possessive (POS) forms. In this instantiation the execution counts fall on the #else branch of the MAYBE_UTF_MULTI conditional at the end of the loop. */ | 1174 | 179k | PCRE2_UCHAR c; | 1175 | 179k | PCRE2_SPTR end; | 1176 | 179k | PCRE2_UCHAR *repeat_opcode; | 1177 | 179k | uint32_t list[MAX_LIST]; | 1178 | 179k | int rec_limit = 1000; /* Was 10,000 but clang+ASAN uses a lot of stack. */ | 1179 | 179k | BOOL utf = (cb->external_options & PCRE2_UTF) != 0; | 1180 | 179k | BOOL ucp = (cb->external_options & PCRE2_UCP) != 0; | 1181 | | | 1182 | 179k | for (;;) | 1183 | 343M | { | 1184 | 343M | c = *code; | 1185 | | | 1186 | | /* LCOV_EXCL_START */ | 1187 | 343M | if (c >= OP_TABLE_LENGTH) | 1188 | 0 | { | 1189 | 0 | PCRE2_DEBUG_UNREACHABLE(); | 1190 | 0 | return -1; /* Something gone wrong */ | 1191 | 0 | } | 1192 | | /* LCOV_EXCL_STOP */ | 1193 | | | 1194 | 343M | if (c >= OP_STAR && c <= OP_TYPEPOSUPTO) | 1195 | 10.9M | { | 1196 | 10.9M | c -= get_repeat_base(c) - OP_STAR; /* Rebase c onto the OP_STAR group so the OP_STAR..OP_MINUPTO names can be used below; presumably get_repeat_base() yields the first opcode of c's own repeat group - confirm. */ | 1197 | 10.9M | end = (c <= OP_MINUPTO) ? | 1198 | 10.9M | get_chr_property_list(code, utf, ucp, cb->fcc, list) : NULL; | 1199 | 10.9M | list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO; | 1200 | | | 1201 | 10.9M | if (end != NULL && compare_opcodes(end, utf, ucp, cb, list, end, | 1202 | 10.4M | &rec_limit)) | 1203 | 3.22M | { | 1204 | 3.22M | switch(c) | 1205 | 3.22M | { | 1206 | 765k | case OP_STAR: | 1207 | 765k | *code += OP_POSSTAR - OP_STAR; | 1208 | 765k | break; | 1209 | | | 1210 | 157k | case OP_MINSTAR: | 1211 | 157k | *code += OP_POSSTAR - OP_MINSTAR; | 1212 | 157k | break; | 1213 | | | 1214 | 773k | case OP_PLUS: | 1215 | 773k | *code += OP_POSPLUS - OP_PLUS; | 1216 | 773k | break; | 1217 | | | 1218 | 141k | case OP_MINPLUS: | 1219 | 141k | *code += OP_POSPLUS - OP_MINPLUS; | 1220 | 141k | break; | 1221 | | | 1222 | 977k | case OP_QUERY: | 1223 | 977k | *code += OP_POSQUERY - OP_QUERY; | 1224 | 977k | break; | 1225 | | | 1226 | 182k | case OP_MINQUERY: | 1227 | 182k | *code += OP_POSQUERY - OP_MINQUERY; | 1228 | 182k | break; | 1229 | | | 1230 | 
200k | case OP_UPTO: | 1231 | 200k | *code += OP_POSUPTO - OP_UPTO; | 1232 | 200k | break; | 1233 | | | 1234 | 24.6k | case OP_MINUPTO: | 1235 | 24.6k | *code += OP_POSUPTO - OP_MINUPTO; | 1236 | 24.6k | break; | 1237 | 3.22M | } | 1238 | 3.22M | } | 1239 | 10.9M | c = *code; /* Reload: c was rebased above and *code may have been rewritten. */ | 1240 | 10.9M | } | 1241 | 332M | else if (c == OP_CLASS || c == OP_NCLASS | 1242 | 332M | #ifdef SUPPORT_WIDE_CHARS | 1243 | 332M | || c == OP_XCLASS || c == OP_ECLASS | 1244 | 332M | #endif | 1245 | 332M | ) | 1246 | 2.24M | { | 1247 | 2.24M | #ifdef SUPPORT_WIDE_CHARS | 1248 | 2.24M | if (c == OP_XCLASS || c == OP_ECLASS) | 1249 | 1.85M | repeat_opcode = code + GET(code, 1); | 1250 | 390k | else | 1251 | 390k | #endif | 1252 | 390k | repeat_opcode = code + 1 + (32 / sizeof(PCRE2_UCHAR)); | 1253 | | | 1254 | 2.24M | c = *repeat_opcode; | 1255 | 2.24M | if (c >= OP_CRSTAR && c <= OP_CRMINRANGE) | 1256 | 540k | { | 1257 | | /* The return from get_chr_property_list() will never be NULL when | 1258 | | *code is one of the four class opcodes (c holds the repeat opcode by now). However, gcc with | 1259 | | -fanalyzer notes that a NULL return is possible, and grumbles. Hence we | 1260 | | put in a check. 
*/ | 1261 | | | 1262 | 540k | end = get_chr_property_list(code, utf, ucp, cb->fcc, list); | 1263 | 540k | list[1] = (c & 1) == 0; | 1264 | | | 1265 | 540k | if (end != NULL && | 1266 | 540k | compare_opcodes(end, utf, ucp, cb, list, end, &rec_limit)) | 1267 | 235k | { | 1268 | 235k | switch (c) | 1269 | 235k | { | 1270 | 51.4k | case OP_CRSTAR: | 1271 | 64.6k | case OP_CRMINSTAR: | 1272 | 64.6k | *repeat_opcode = OP_CRPOSSTAR; | 1273 | 64.6k | break; | 1274 | | | 1275 | 41.5k | case OP_CRPLUS: | 1276 | 46.4k | case OP_CRMINPLUS: | 1277 | 46.4k | *repeat_opcode = OP_CRPOSPLUS; | 1278 | 46.4k | break; | 1279 | | | 1280 | 48.2k | case OP_CRQUERY: | 1281 | 57.6k | case OP_CRMINQUERY: | 1282 | 57.6k | *repeat_opcode = OP_CRPOSQUERY; | 1283 | 57.6k | break; | 1284 | | | 1285 | 58.1k | case OP_CRRANGE: | 1286 | 67.3k | case OP_CRMINRANGE: | 1287 | 67.3k | *repeat_opcode = OP_CRPOSRANGE; | 1288 | 67.3k | break; | 1289 | 235k | } | 1290 | 235k | } | 1291 | 540k | } | 1292 | 2.24M | c = *code; /* c held the repeat opcode; switch back to the class opcode for the switch below. */ | 1293 | 2.24M | } | 1294 | | | 1295 | 343M | switch(c) | 1296 | 343M | { | 1297 | 179k | case OP_END: | 1298 | 179k | return 0; | 1299 | | | 1300 | 789k | case OP_TYPESTAR: | 1301 | 977k | case OP_TYPEMINSTAR: | 1302 | 1.53M | case OP_TYPEPLUS: | 1303 | 1.73M | case OP_TYPEMINPLUS: | 1304 | 2.03M | case OP_TYPEQUERY: | 1305 | 2.08M | case OP_TYPEMINQUERY: | 1306 | 2.26M | case OP_TYPEPOSSTAR: | 1307 | 2.46M | case OP_TYPEPOSPLUS: | 1308 | 2.55M | case OP_TYPEPOSQUERY: | 1309 | 2.55M | if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; /* Two extra code units, on top of the table length added after the switch. */ | 1310 | 2.55M | break; | 1311 | | | 1312 | 155k | case OP_TYPEUPTO: | 1313 | 230k | case OP_TYPEMINUPTO: | 1314 | 318k | case OP_TYPEEXACT: | 1315 | 444k | case OP_TYPEPOSUPTO: | 1316 | 444k | if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP) | 1317 | 23.4k | code += 2; | 1318 | 444k | break; | 1319 | | | 1320 | 22.0k | case OP_CALLOUT_STR: | 1321 | 22.0k | code += GET(code, 1 + 2*LINK_SIZE); | 1322 | 22.0k | break; | 
1323 | | | 1324 | 0 | #ifdef SUPPORT_WIDE_CHARS | 1325 | 1.68M | case OP_XCLASS: | 1326 | 1.85M | case OP_ECLASS: | 1327 | 1.85M | code += GET(code, 1); | 1328 | 1.85M | break; | 1329 | 0 | #endif | 1330 | | | 1331 | 123k | case OP_MARK: | 1332 | 167k | case OP_COMMIT_ARG: | 1333 | 190k | case OP_PRUNE_ARG: | 1334 | 315k | case OP_SKIP_ARG: | 1335 | 337k | case OP_THEN_ARG: | 1336 | 337k | code += code[1]; | 1337 | 337k | break; | 1338 | 343M | } | 1339 | | | 1340 | | /* Add in the fixed length from the table */ | 1341 | | | 1342 | 343M | code += PRIV(OP_lengths)[c]; | 1343 | | | 1344 | | /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may be | 1345 | | followed by a multi-byte character. The length in the table is a minimum, so | 1346 | | we have to arrange to skip the extra code units. */ | 1347 | | | 1348 | | #ifdef MAYBE_UTF_MULTI | 1349 | | if (utf) switch(c) | 1350 | | { | 1351 | | case OP_CHAR: | 1352 | | case OP_CHARI: | 1353 | | case OP_NOT: | 1354 | | case OP_NOTI: | 1355 | | case OP_STAR: | 1356 | | case OP_MINSTAR: | 1357 | | case OP_PLUS: | 1358 | | case OP_MINPLUS: | 1359 | | case OP_QUERY: | 1360 | | case OP_MINQUERY: | 1361 | | case OP_UPTO: | 1362 | | case OP_MINUPTO: | 1363 | | case OP_EXACT: | 1364 | | case OP_POSSTAR: | 1365 | | case OP_POSPLUS: | 1366 | | case OP_POSQUERY: | 1367 | | case OP_POSUPTO: | 1368 | | case OP_STARI: | 1369 | | case OP_MINSTARI: | 1370 | | case OP_PLUSI: | 1371 | | case OP_MINPLUSI: | 1372 | | case OP_QUERYI: | 1373 | | case OP_MINQUERYI: | 1374 | | case OP_UPTOI: | 1375 | | case OP_MINUPTOI: | 1376 | | case OP_EXACTI: | 1377 | | case OP_POSSTARI: | 1378 | | case OP_POSPLUSI: | 1379 | | case OP_POSQUERYI: | 1380 | | case OP_POSUPTOI: | 1381 | | case OP_NOTSTAR: | 1382 | | case OP_NOTMINSTAR: | 1383 | | case OP_NOTPLUS: | 1384 | | case OP_NOTMINPLUS: | 1385 | | case OP_NOTQUERY: | 1386 | | case OP_NOTMINQUERY: | 1387 | | case OP_NOTUPTO: | 1388 | | case OP_NOTMINUPTO: | 1389 | | case OP_NOTEXACT: 
| 1390 | | case OP_NOTPOSSTAR: | 1391 | | case OP_NOTPOSPLUS: | 1392 | | case OP_NOTPOSQUERY: | 1393 | | case OP_NOTPOSUPTO: | 1394 | | case OP_NOTSTARI: | 1395 | | case OP_NOTMINSTARI: | 1396 | | case OP_NOTPLUSI: | 1397 | | case OP_NOTMINPLUSI: | 1398 | | case OP_NOTQUERYI: | 1399 | | case OP_NOTMINQUERYI: | 1400 | | case OP_NOTUPTOI: | 1401 | | case OP_NOTMINUPTOI: | 1402 | | case OP_NOTEXACTI: | 1403 | | case OP_NOTPOSSTARI: | 1404 | | case OP_NOTPOSPLUSI: | 1405 | | case OP_NOTPOSQUERYI: | 1406 | | case OP_NOTPOSUPTOI: | 1407 | | if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]); | 1408 | | break; | 1409 | | } | 1410 | | #else | 1411 | 343M | (void)(utf); /* Keep compiler happy by referencing the local utf, which is otherwise unused on this branch */ | 1412 | 343M | #endif /* MAYBE_UTF_MULTI */ | 1413 | 343M | } | 1414 | 179k | } |
_pcre2_auto_possessify_16 Line | Count | Source | 1173 | 181k | { /* Walks the compiled opcodes starting at code until OP_END is reached (returns 0); returns -1 if an opcode value is not below OP_TABLE_LENGTH. Repeat opcodes for which compare_opcodes() succeeds are rewritten in place to their possessive (POS) forms. In this instantiation the execution counts fall on the MAYBE_UTF_MULTI branch at the end of the loop, not on its #else branch. */ | 1174 | 181k | PCRE2_UCHAR c; | 1175 | 181k | PCRE2_SPTR end; | 1176 | 181k | PCRE2_UCHAR *repeat_opcode; | 1177 | 181k | uint32_t list[MAX_LIST]; | 1178 | 181k | int rec_limit = 1000; /* Was 10,000 but clang+ASAN uses a lot of stack. */ | 1179 | 181k | BOOL utf = (cb->external_options & PCRE2_UTF) != 0; | 1180 | 181k | BOOL ucp = (cb->external_options & PCRE2_UCP) != 0; | 1181 | | | 1182 | 181k | for (;;) | 1183 | 424M | { | 1184 | 424M | c = *code; | 1185 | | | 1186 | | /* LCOV_EXCL_START */ | 1187 | 424M | if (c >= OP_TABLE_LENGTH) | 1188 | 0 | { | 1189 | 0 | PCRE2_DEBUG_UNREACHABLE(); | 1190 | 0 | return -1; /* Something gone wrong */ | 1191 | 0 | } | 1192 | | /* LCOV_EXCL_STOP */ | 1193 | | | 1194 | 424M | if (c >= OP_STAR && c <= OP_TYPEPOSUPTO) | 1195 | 19.2M | { | 1196 | 19.2M | c -= get_repeat_base(c) - OP_STAR; /* Rebase c onto the OP_STAR group so the OP_STAR..OP_MINUPTO names can be used below; presumably get_repeat_base() yields the first opcode of c's own repeat group - confirm. */ | 1197 | 19.2M | end = (c <= OP_MINUPTO) ? | 1198 | 19.2M | get_chr_property_list(code, utf, ucp, cb->fcc, list) : NULL; | 1199 | 19.2M | list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO; | 1200 | | | 1201 | 19.2M | if (end != NULL && compare_opcodes(end, utf, ucp, cb, list, end, | 1202 | 18.4M | &rec_limit)) | 1203 | 3.60M | { | 1204 | 3.60M | switch(c) | 1205 | 3.60M | { | 1206 | 746k | case OP_STAR: | 1207 | 746k | *code += OP_POSSTAR - OP_STAR; | 1208 | 746k | break; | 1209 | | | 1210 | 140k | case OP_MINSTAR: | 1211 | 140k | *code += OP_POSSTAR - OP_MINSTAR; | 1212 | 140k | break; | 1213 | | | 1214 | 899k | case OP_PLUS: | 1215 | 899k | *code += OP_POSPLUS - OP_PLUS; | 1216 | 899k | break; | 1217 | | | 1218 | 163k | case OP_MINPLUS: | 1219 | 163k | *code += OP_POSPLUS - OP_MINPLUS; | 1220 | 163k | break; | 1221 | | | 1222 | 1.11M | case OP_QUERY: | 1223 | 1.11M | *code += OP_POSQUERY - OP_QUERY; | 1224 | 1.11M | break; | 1225 | | | 1226 | 265k | case OP_MINQUERY: | 1227 | 265k | *code += OP_POSQUERY - OP_MINQUERY; | 1228 | 265k | break; | 1229 | | | 1230 
| 216k | case OP_UPTO: | 1231 | 216k | *code += OP_POSUPTO - OP_UPTO; | 1232 | 216k | break; | 1233 | | | 1234 | 60.5k | case OP_MINUPTO: | 1235 | 60.5k | *code += OP_POSUPTO - OP_MINUPTO; | 1236 | 60.5k | break; | 1237 | 3.60M | } | 1238 | 3.60M | } | 1239 | 19.2M | c = *code; /* Reload: c was rebased above and *code may have been rewritten. */ | 1240 | 19.2M | } | 1241 | 405M | else if (c == OP_CLASS || c == OP_NCLASS | 1242 | 404M | #ifdef SUPPORT_WIDE_CHARS | 1243 | 404M | || c == OP_XCLASS || c == OP_ECLASS | 1244 | 405M | #endif | 1245 | 405M | ) | 1246 | 3.56M | { | 1247 | 3.56M | #ifdef SUPPORT_WIDE_CHARS | 1248 | 3.56M | if (c == OP_XCLASS || c == OP_ECLASS) | 1249 | 2.83M | repeat_opcode = code + GET(code, 1); | 1250 | 728k | else | 1251 | 728k | #endif | 1252 | 728k | repeat_opcode = code + 1 + (32 / sizeof(PCRE2_UCHAR)); | 1253 | | | 1254 | 3.56M | c = *repeat_opcode; | 1255 | 3.56M | if (c >= OP_CRSTAR && c <= OP_CRMINRANGE) | 1256 | 1.12M | { | 1257 | | /* The return from get_chr_property_list() will never be NULL when | 1258 | | *code is one of the four class opcodes (c holds the repeat opcode by now). However, gcc with | 1259 | | -fanalyzer notes that a NULL return is possible, and grumbles. Hence we | 1260 | | put in a check. 
*/ | 1261 | | | 1262 | 1.12M | end = get_chr_property_list(code, utf, ucp, cb->fcc, list); | 1263 | 1.12M | list[1] = (c & 1) == 0; | 1264 | | | 1265 | 1.12M | if (end != NULL && | 1266 | 1.12M | compare_opcodes(end, utf, ucp, cb, list, end, &rec_limit)) | 1267 | 400k | { | 1268 | 400k | switch (c) | 1269 | 400k | { | 1270 | 58.5k | case OP_CRSTAR: | 1271 | 68.4k | case OP_CRMINSTAR: | 1272 | 68.4k | *repeat_opcode = OP_CRPOSSTAR; | 1273 | 68.4k | break; | 1274 | | | 1275 | 118k | case OP_CRPLUS: | 1276 | 147k | case OP_CRMINPLUS: | 1277 | 147k | *repeat_opcode = OP_CRPOSPLUS; | 1278 | 147k | break; | 1279 | | | 1280 | 61.5k | case OP_CRQUERY: | 1281 | 75.3k | case OP_CRMINQUERY: | 1282 | 75.3k | *repeat_opcode = OP_CRPOSQUERY; | 1283 | 75.3k | break; | 1284 | | | 1285 | 88.1k | case OP_CRRANGE: | 1286 | 108k | case OP_CRMINRANGE: | 1287 | 108k | *repeat_opcode = OP_CRPOSRANGE; | 1288 | 108k | break; | 1289 | 400k | } | 1290 | 400k | } | 1291 | 1.12M | } | 1292 | 3.56M | c = *code; /* c held the repeat opcode; switch back to the class opcode for the switch below. */ | 1293 | 3.56M | } | 1294 | | | 1295 | 424M | switch(c) | 1296 | 424M | { | 1297 | 181k | case OP_END: | 1298 | 181k | return 0; | 1299 | | | 1300 | 803k | case OP_TYPESTAR: | 1301 | 1.00M | case OP_TYPEMINSTAR: | 1302 | 3.09M | case OP_TYPEPLUS: | 1303 | 3.35M | case OP_TYPEMINPLUS: | 1304 | 6.83M | case OP_TYPEQUERY: | 1305 | 7.67M | case OP_TYPEMINQUERY: | 1306 | 7.92M | case OP_TYPEPOSSTAR: | 1307 | 8.21M | case OP_TYPEPOSPLUS: | 1308 | 8.37M | case OP_TYPEPOSQUERY: | 1309 | 8.37M | if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; /* Two extra code units, on top of the table length added after the switch. */ | 1310 | 8.37M | break; | 1311 | | | 1312 | 214k | case OP_TYPEUPTO: | 1313 | 235k | case OP_TYPEMINUPTO: | 1314 | 435k | case OP_TYPEEXACT: | 1315 | 505k | case OP_TYPEPOSUPTO: | 1316 | 505k | if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP) | 1317 | 108k | code += 2; | 1318 | 505k | break; | 1319 | | | 1320 | 60.2k | case OP_CALLOUT_STR: | 1321 | 60.2k | code += GET(code, 1 + 2*LINK_SIZE); | 1322 | 60.2k | break; | 1323 
| | | 1324 | 0 | #ifdef SUPPORT_WIDE_CHARS | 1325 | 2.62M | case OP_XCLASS: | 1326 | 2.83M | case OP_ECLASS: | 1327 | 2.83M | code += GET(code, 1); | 1328 | 2.83M | break; | 1329 | 0 | #endif | 1330 | | | 1331 | 173k | case OP_MARK: | 1332 | 194k | case OP_COMMIT_ARG: | 1333 | 222k | case OP_PRUNE_ARG: | 1334 | 408k | case OP_SKIP_ARG: | 1335 | 472k | case OP_THEN_ARG: | 1336 | 472k | code += code[1]; | 1337 | 472k | break; | 1338 | 424M | } | 1339 | | | 1340 | | /* Add in the fixed length from the table */ | 1341 | | | 1342 | 424M | code += PRIV(OP_lengths)[c]; | 1343 | | | 1344 | | /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may be | 1345 | | followed by a multi-byte character. The length in the table is a minimum, so | 1346 | | we have to arrange to skip the extra code units. */ | 1347 | | | 1348 | 424M | #ifdef MAYBE_UTF_MULTI | 1349 | 424M | if (utf) switch(c) | 1350 | 95.1M | { | 1351 | 18.7M | case OP_CHAR: | 1352 | 47.5M | case OP_CHARI: | 1353 | 47.5M | case OP_NOT: | 1354 | 47.6M | case OP_NOTI: | 1355 | 47.6M | case OP_STAR: | 1356 | 47.7M | case OP_MINSTAR: | 1357 | 47.8M | case OP_PLUS: | 1358 | 47.9M | case OP_MINPLUS: | 1359 | 48.1M | case OP_QUERY: | 1360 | 48.4M | case OP_MINQUERY: | 1361 | 48.4M | case OP_UPTO: | 1362 | 48.4M | case OP_MINUPTO: | 1363 | 48.4M | case OP_EXACT: | 1364 | 48.5M | case OP_POSSTAR: | 1365 | 48.5M | case OP_POSPLUS: | 1366 | 48.6M | case OP_POSQUERY: | 1367 | 48.7M | case OP_POSUPTO: | 1368 | 48.7M | case OP_STARI: | 1369 | 48.8M | case OP_MINSTARI: | 1370 | 48.9M | case OP_PLUSI: | 1371 | 49.0M | case OP_MINPLUSI: | 1372 | 49.1M | case OP_QUERYI: | 1373 | 49.2M | case OP_MINQUERYI: | 1374 | 49.2M | case OP_UPTOI: | 1375 | 49.2M | case OP_MINUPTOI: | 1376 | 49.2M | case OP_EXACTI: | 1377 | 49.3M | case OP_POSSTARI: | 1378 | 49.4M | case OP_POSPLUSI: | 1379 | 49.5M | case OP_POSQUERYI: | 1380 | 49.6M | case OP_POSUPTOI: | 1381 | 49.6M | case OP_NOTSTAR: | 1382 | 49.6M | case OP_NOTMINSTAR: | 
1383 | 49.6M | case OP_NOTPLUS: | 1384 | 49.6M | case OP_NOTMINPLUS: | 1385 | 49.6M | case OP_NOTQUERY: | 1386 | 49.6M | case OP_NOTMINQUERY: | 1387 | 49.6M | case OP_NOTUPTO: | 1388 | 49.6M | case OP_NOTMINUPTO: | 1389 | 49.6M | case OP_NOTEXACT: | 1390 | 49.6M | case OP_NOTPOSSTAR: | 1391 | 49.6M | case OP_NOTPOSPLUS: | 1392 | 49.6M | case OP_NOTPOSQUERY: | 1393 | 49.6M | case OP_NOTPOSUPTO: | 1394 | 49.6M | case OP_NOTSTARI: | 1395 | 49.6M | case OP_NOTMINSTARI: | 1396 | 49.6M | case OP_NOTPLUSI: | 1397 | 49.6M | case OP_NOTMINPLUSI: | 1398 | 49.6M | case OP_NOTQUERYI: | 1399 | 49.6M | case OP_NOTMINQUERYI: | 1400 | 49.6M | case OP_NOTUPTOI: | 1401 | 49.7M | case OP_NOTMINUPTOI: | 1402 | 49.7M | case OP_NOTEXACTI: | 1403 | 49.7M | case OP_NOTPOSSTARI: | 1404 | 49.7M | case OP_NOTPOSPLUSI: | 1405 | 49.7M | case OP_NOTPOSQUERYI: | 1406 | 49.7M | case OP_NOTPOSUPTOI: | 1407 | 49.7M | if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]); | 1408 | 49.7M | break; | 1409 | 95.1M | } | 1410 | | #else | 1411 | | (void)(utf); /* Keep compiler happy by referencing the local utf, which is otherwise unused on this branch */ | 1412 | | #endif /* MAYBE_UTF_MULTI */ | 1413 | 424M | } | 1414 | 181k | } |
|
1415 | | |
1416 | | /* End of pcre2_auto_possess.c */ |