/src/php-src/ext/pcre/pcre2lib/pcre2_match.c
Line | Count | Source (jump to first uncovered line) |
1 | | /************************************************* |
2 | | * Perl-Compatible Regular Expressions * |
3 | | *************************************************/ |
4 | | |
5 | | /* PCRE is a library of functions to support regular expressions whose syntax |
6 | | and semantics are as close as possible to those of the Perl 5 language. |
7 | | |
8 | | Written by Philip Hazel |
9 | | Original API code Copyright (c) 1997-2012 University of Cambridge |
10 | | New API code Copyright (c) 2015-2024 University of Cambridge |
11 | | |
12 | | ----------------------------------------------------------------------------- |
13 | | Redistribution and use in source and binary forms, with or without |
14 | | modification, are permitted provided that the following conditions are met: |
15 | | |
16 | | * Redistributions of source code must retain the above copyright notice, |
17 | | this list of conditions and the following disclaimer. |
18 | | |
19 | | * Redistributions in binary form must reproduce the above copyright |
20 | | notice, this list of conditions and the following disclaimer in the |
21 | | documentation and/or other materials provided with the distribution. |
22 | | |
23 | | * Neither the name of the University of Cambridge nor the names of its |
24 | | contributors may be used to endorse or promote products derived from |
25 | | this software without specific prior written permission. |
26 | | |
27 | | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
28 | | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
29 | | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
30 | | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
31 | | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
32 | | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
33 | | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
34 | | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
35 | | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
36 | | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
37 | | POSSIBILITY OF SUCH DAMAGE. |
38 | | ----------------------------------------------------------------------------- |
39 | | */ |
40 | | |
41 | | |
42 | | #ifdef HAVE_CONFIG_H |
43 | | #include "config.h" |
44 | | #endif |
45 | | |
46 | | #include "pcre2_internal.h" |
47 | | |
48 | | /* These defines enable debugging code */ |
49 | | |
50 | | /* #define DEBUG_FRAMES_DISPLAY */ |
51 | | /* #define DEBUG_SHOW_OPS */ |
52 | | /* #define DEBUG_SHOW_RMATCH */ |
53 | | |
54 | | #ifdef DEBUG_FRAMES_DISPLAY |
55 | | #include <stdarg.h> |
56 | | #endif |
57 | | |
58 | | #ifdef DEBUG_SHOW_OPS |
59 | | static const char *OP_names[] = { OP_NAME_LIST }; |
60 | | #endif |
61 | | |
62 | | /* These defines identify the name of the block containing "static" |
63 | | information, and fields within it. */ |
64 | | |
65 | 64.6M | #define NLBLOCK mb /* Block containing newline information */ |
66 | 7.92k | #define PSSTART start_subject /* Field containing processed string start */ |
67 | 16.1M | #define PSEND end_subject /* Field containing processed string end */ |
68 | | |
69 | 379k | #define RECURSE_UNSET 0xffffffffu /* Bigger than max group number */ |
70 | | |
71 | | /* Masks for identifying the public options that are permitted at match time. PUBLIC_MATCH_OPTIONS is the full set; PUBLIC_JIT_MATCH_OPTIONS is that set without PCRE2_ANCHORED, PCRE2_ENDANCHORED, PCRE2_NO_JIT and PCRE2_DISABLE_RECURSELOOP_CHECK. */ |
72 | | |
73 | | #define PUBLIC_MATCH_OPTIONS \ |
74 | 4.32k | (PCRE2_ANCHORED|PCRE2_ENDANCHORED|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY| \ |
75 | 4.32k | PCRE2_NOTEMPTY_ATSTART|PCRE2_NO_UTF_CHECK|PCRE2_PARTIAL_HARD| \ |
76 | 4.32k | PCRE2_PARTIAL_SOFT|PCRE2_NO_JIT|PCRE2_COPY_MATCHED_SUBJECT| \ |
77 | 4.32k | PCRE2_DISABLE_RECURSELOOP_CHECK) |
78 | | |
79 | | #define PUBLIC_JIT_MATCH_OPTIONS \ |
80 | | (PCRE2_NO_UTF_CHECK|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY|\ |
81 | | PCRE2_NOTEMPTY_ATSTART|PCRE2_PARTIAL_SOFT|PCRE2_PARTIAL_HARD|\ |
82 | | PCRE2_COPY_MATCHED_SUBJECT) |
83 | | |
84 | | /* Non-error returns from and within the match() function. Error returns are |
85 | | externally defined PCRE2_ERROR_xxx codes, which are all negative. */ |
86 | | |
87 | 4.77k | #define MATCH_MATCH 1 |
88 | 428M | #define MATCH_NOMATCH 0 |
89 | | |
90 | | /* Special internal returns used in the match() function. Make them |
91 | | sufficiently negative to avoid the external error codes. */ |
92 | | |
93 | 1.42k | #define MATCH_ACCEPT (-999) |
94 | 1.96k | #define MATCH_KETRPOS (-998) |
95 | | /* The next 5 must be kept together and in sequence so that a test that checks |
96 | | for any one of them can use a range. The ends of that range are named by MATCH_BACKTRACK_MIN (the lowest value, MATCH_COMMIT) and MATCH_BACKTRACK_MAX (the highest, MATCH_THEN) below. */ |
97 | 2.00M | #define MATCH_COMMIT (-997) |
98 | 360k | #define MATCH_PRUNE (-996) |
99 | 0 | #define MATCH_SKIP (-995) |
100 | 2.94k | #define MATCH_SKIP_ARG (-994) |
101 | 9.61M | #define MATCH_THEN (-993) |
102 | 1.00M | #define MATCH_BACKTRACK_MAX MATCH_THEN |
103 | 1.00M | #define MATCH_BACKTRACK_MIN MATCH_COMMIT |
104 | | |
105 | | /* Group frame type values. Zero means the frame is not a group frame. The |
106 | | lower 16 bits are used for data (e.g. the capture number). Group frames are |
107 | | used for most groups so that information about the start is easily available at |
108 | | the end without having to scan back through intermediate frames (backtrack |
109 | | points). The identities are consecutive values held in the upper 16 bits, not independent bit flags (GF_CONDASSERT is numerically GF_CAPTURE|GF_NOCAPTURE), so they must be tested by comparing GF_IDMASK(x) for equality. */ |
110 | | |
111 | 2.04M | #define GF_CAPTURE 0x00010000u |
112 | 462 | #define GF_NOCAPTURE 0x00020000u |
113 | 2.48M | #define GF_CONDASSERT 0x00030000u |
114 | 7.06M | #define GF_RECURSE 0x00040000u |
115 | | |
116 | | /* Masks for the identity (upper 16 bits) and data (lower 16 bits) parts of the group frame type. */ |
117 | | |
118 | 9.51M | #define GF_IDMASK(a) ((a) & 0xffff0000u) |
119 | 1.00M | #define GF_DATAMASK(a) ((a) & 0x0000ffffu) |
120 | | |
121 | | /* Repetition types: minimizing (e.g. *?), maximizing (e.g. *) and possessive (the OP_CRPOSxxx repeats), as mapped by rep_typ[] below. */ |
122 | | |
123 | | enum { REPTYPE_MIN, REPTYPE_MAX, REPTYPE_POS }; |
124 | | |
125 | | /* Min and max values for the common repeats; a maximum of UINT32_MAX => |
126 | | infinity. The two tables are parallel: entry n of rep_min and entry n of rep_max describe the same repeat. */ |
127 | | |
128 | | static const uint32_t rep_min[] = { |
129 | | 0, 0, /* * and *? */ |
130 | | 1, 1, /* + and +? */ |
131 | | 0, 0, /* ? and ?? */ |
132 | | 0, 0, /* dummy placefillers for OP_CR[MIN]RANGE */ |
133 | | 0, 1, 0 }; /* OP_CRPOS{STAR, PLUS, QUERY} */ |
134 | | |
135 | | static const uint32_t rep_max[] = { |
136 | | UINT32_MAX, UINT32_MAX, /* * and *? */ |
137 | | UINT32_MAX, UINT32_MAX, /* + and +? */ |
138 | | 1, 1, /* ? and ?? */ |
139 | | 0, 0, /* dummy placefillers for OP_CR[MIN]RANGE */ |
140 | | UINT32_MAX, UINT32_MAX, 1 }; /* OP_CRPOS{STAR, PLUS, QUERY} */ |
141 | | |
142 | | /* Repetition type for each repeat. Unlike rep_min/rep_max, this table must also include an entry for OP_CRPOSRANGE, so it is one element longer than the tables above. */ |
143 | | |
144 | | static const uint32_t rep_typ[] = { |
145 | | REPTYPE_MAX, REPTYPE_MIN, /* * and *? */ |
146 | | REPTYPE_MAX, REPTYPE_MIN, /* + and +? */ |
147 | | REPTYPE_MAX, REPTYPE_MIN, /* ? and ?? */ |
148 | | REPTYPE_MAX, REPTYPE_MIN, /* OP_CRRANGE and OP_CRMINRANGE */ |
149 | | REPTYPE_POS, REPTYPE_POS, /* OP_CRPOSSTAR, OP_CRPOSPLUS */ |
150 | | REPTYPE_POS, REPTYPE_POS }; /* OP_CRPOSQUERY, OP_CRPOSRANGE */ |
151 | | |
152 | | /* Numbers for RMATCH calls at backtracking points. When these lists are |
153 | | changed, the code at RETURN_SWITCH below must be updated in sync. RM1-RM39 are always defined; RM100-RM103 exist only when SUPPORT_WIDE_CHARS is defined and RM200-RM224 only when SUPPORT_UNICODE is defined. RMATCH() stores the number in the frame's return_id and pastes its name into an L_RMn label. */ |
154 | | |
155 | | enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10, |
156 | | RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20, |
157 | | RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30, |
158 | | RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39 }; |
159 | | |
160 | | #ifdef SUPPORT_WIDE_CHARS |
161 | | enum { RM100=100, RM101, RM102, RM103 }; |
162 | | #endif |
163 | | |
164 | | #ifdef SUPPORT_UNICODE |
165 | | enum { RM200=200, RM201, RM202, RM203, RM204, RM205, RM206, RM207, |
166 | | RM208, RM209, RM210, RM211, RM212, RM213, RM214, RM215, |
167 | | RM216, RM217, RM218, RM219, RM220, RM221, RM222, RM223, |
168 | | RM224 }; |
169 | | #endif |
170 | | |
171 | | /* Define short names for general fields in the current backtrack frame, which |
172 | | is always pointed to by the F variable. Occasional references to fields in |
173 | | other frames are written out explicitly. There are also some fields in the |
174 | | current frame whose names start with "temp" that are used for short-term, |
175 | | localised backtracking memory. These are #defined with Lxxx names at the point |
176 | | of use and undefined afterwards. Because every macro below expands to F->field, it can be used only where a heapframe pointer named F is in scope; do_callout() and match_ref() take a parameter called F for this reason. */ |
177 | | |
178 | 855M | #define Fback_frame F->back_frame |
179 | 2.84M | #define Fcapture_last F->capture_last |
180 | 3.86M | #define Fcurrent_recurse F->current_recurse |
181 | 1.55G | #define Fecode F->ecode |
182 | 2.22G | #define Feptr F->eptr |
183 | 427M | #define Fgroup_frame_type F->group_frame_type |
184 | 12.3M | #define Flast_group_offset F->last_group_offset |
185 | 322M | #define Flength F->length |
186 | 362k | #define Fmark F->mark |
187 | 1.28G | #define Frdepth F->rdepth |
188 | 374k | #define Fstart_match F->start_match |
189 | 5.32M | #define Foffset_top F->offset_top |
190 | 0 | #define Foccu F->occu |
191 | 1.20G | #define Fop F->op |
192 | 4.96M | #define Fovector F->ovector |
193 | 855M | #define Freturn_id F->return_id |
194 | | |
195 | | |
196 | | #ifdef DEBUG_FRAMES_DISPLAY |
197 | | /************************************************* |
198 | | * Display current frames and contents * |
199 | | *************************************************/ |
200 | | |
201 | | /* This debugging function displays the current set of frames and their |
202 | | contents. It is not called automatically from anywhere, the intention being |
203 | | that calls can be inserted where necessary when debugging frame-related |
204 | | problems. |
205 | | |
206 | | Arguments: |
207 | | f the file to write to |
208 | | F the current top frame |
209 | | P a previous frame of interest |
210 | | frame_size the frame size |
211 | | mb points to the match block |
212 | | match_data points to the match data block |
213 | | s identification text |
214 | | |
215 | | Returns: nothing |
216 | | */ |
217 | | |
218 | | static void |
219 | | display_frames(FILE *f, heapframe *F, heapframe *P, PCRE2_SIZE frame_size, |
220 | | match_block *mb, pcre2_match_data *match_data, const char *s, ...) |
221 | | { |
222 | | uint32_t i; |
223 | | heapframe *Q; |
224 | | va_list ap; |
225 | | va_start(ap, s); |
226 | | /* s is used as a printf-style format; the variadic arguments supply its values. */ |
227 | | fprintf(f, "FRAMES "); |
228 | | vfprintf(f, s, ap); |
229 | | va_end(ap); |
230 | | /* P, when given, is shown as a frame index: its byte offset from the first frame divided by frame_size. */ |
231 | | if (P != NULL) fprintf(f, " P=%lu", |
232 | | ((char *)P - (char *)(match_data->heapframes))/frame_size); |
233 | | fprintf(f, "\n"); |
234 | | /* Walk every frame from the first one up to and including F. NOTE(review): %lu is paired with pointer differences and PCRE2_SIZE values, and %d with the uint32_t counter i - confirm these agree on targets where long is narrower than a pointer. */ |
235 | | for (i = 0, Q = match_data->heapframes; |
236 | | Q <= F; |
237 | | i++, Q = (heapframe *)((char *)Q + frame_size)) |
238 | | { |
239 | | fprintf(f, "Frame %d type=%x subj=%lu code=%d back=%lu id=%d", |
240 | | i, Q->group_frame_type, Q->eptr - mb->start_subject, *(Q->ecode), |
241 | | Q->back_frame, Q->return_id); |
242 | | /* last_group_offset is a byte offset into the frames vector; print it as a frame index unless it is unset. */ |
243 | | if (Q->last_group_offset == PCRE2_UNSET) |
244 | | fprintf(f, " lgoffset=unset\n"); |
245 | | else |
246 | | fprintf(f, " lgoffset=%lu\n", Q->last_group_offset/frame_size); |
247 | | } |
248 | | } |
249 | | |
250 | | #endif |
251 | | |
252 | | |
253 | | |
254 | | /************************************************* |
255 | | * Process a callout * |
256 | | *************************************************/ |
257 | | |
258 | | /* This function is called for all callouts, whether "standalone" or at the |
259 | | start of a conditional group. Fecode will be pointing to either OP_CALLOUT or |
260 | | OP_CALLOUT_STR. A callout block is allocated in pcre2_match() and initialized |
261 | | with fixed values. |
262 | | |
263 | | Arguments: |
264 | | F points to the current backtracking frame |
265 | | mb points to the match block |
266 | | lengthptr where to return the length of the callout item |
267 | | |
268 | | Returns: the return from the callout |
269 | | or 0 if no callout function exists |
270 | | */ |
271 | | |
272 | | static int |
273 | | do_callout(heapframe *F, match_block *mb, PCRE2_SIZE *lengthptr) |
274 | 0 | { |
275 | 0 | int rc; |
276 | 0 | PCRE2_SIZE save0, save1; |
277 | 0 | PCRE2_SIZE *callout_ovector; |
278 | 0 | pcre2_callout_block *cb; |
279 |

280 | 0 | *lengthptr = (*Fecode == OP_CALLOUT)? |
281 | 0 | PRIV(OP_lengths)[OP_CALLOUT] : GET(Fecode, 1 + 2*LINK_SIZE); /* Set before the NULL check, so the item length is reported even when no callout function exists */ |
282 |

283 | 0 | if (mb->callout == NULL) return 0; /* No callout function provided */ |
284 | |
285 | | /* The original matching code (pre 10.30) worked directly with the ovector |
286 | | passed by the user, and this was passed to callouts. Now that the working |
287 | | ovector is in the backtracking frame, it no longer needs to reserve space for |
288 | | the overall match offsets (which would waste space in the frame). For backward |
289 | | compatibility, however, we pass capture_top and offset_vector to the callout as |
290 | | if for the extended ovector, and we ensure that the first two slots are unset |
291 | | by preserving and restoring their current contents. Picky compilers complain if |
292 | | references such as Fovector[-2] are used directly, so we set up a separate |
293 | | pointer. */ |
294 | |
295 | 0 | callout_ovector = (PCRE2_SIZE *)(Fovector) - 2; |
296 | |
297 | | /* The cb->version, cb->subject, cb->subject_length, and cb->start_match fields |
298 | | are set externally. The first 3 never change; the last is updated for each |
299 | | bumpalong. */ |
300 |

301 | 0 | cb = mb->cb; |
302 | 0 | cb->capture_top = (uint32_t)Foffset_top/2 + 1; |
303 | 0 | cb->capture_last = Fcapture_last; |
304 | 0 | cb->offset_vector = callout_ovector; |
305 | 0 | cb->mark = mb->nomatch_mark; |
306 | 0 | cb->current_position = (PCRE2_SIZE)(Feptr - mb->start_subject); |
307 | 0 | cb->pattern_position = GET(Fecode, 1); |
308 | 0 | cb->next_item_length = GET(Fecode, 1 + LINK_SIZE); |
309 |

310 | 0 | if (*Fecode == OP_CALLOUT) /* Numerical callout */ |
311 | 0 | { |
312 | 0 | cb->callout_number = Fecode[1 + 2*LINK_SIZE]; |
313 | 0 | cb->callout_string_offset = 0; |
314 | 0 | cb->callout_string = NULL; |
315 | 0 | cb->callout_string_length = 0; |
316 | 0 | } |
317 | 0 | else /* String callout */ |
318 | 0 | { |
319 | 0 | cb->callout_number = 0; |
320 | 0 | cb->callout_string_offset = GET(Fecode, 1 + 3*LINK_SIZE); |
321 | 0 | cb->callout_string = Fecode + (1 + 4*LINK_SIZE) + 1; |
322 | 0 | cb->callout_string_length = |
323 | 0 | *lengthptr - (1 + 4*LINK_SIZE) - 2; /* NOTE(review): the +1 above and the -2 here presumably skip a leading delimiter and a trailing terminator - confirm against the compiled-code layout */ |
324 | 0 | } |
325 |

326 | 0 | save0 = callout_ovector[0]; |
327 | 0 | save1 = callout_ovector[1]; |
328 | 0 | callout_ovector[0] = callout_ovector[1] = PCRE2_UNSET; |
329 | 0 | rc = mb->callout(cb, mb->callout_data); |
330 | 0 | callout_ovector[0] = save0; |
331 | 0 | callout_ovector[1] = save1; |
332 | 0 | cb->callout_flags = 0; /* Flags are cleared once the callout has returned */ |
333 | 0 | return rc; |
334 | 0 | } |
335 | | |
336 | | |
337 | | |
338 | | /************************************************* |
339 | | * Match a back-reference * |
340 | | *************************************************/ |
341 | | |
342 | | /* This function is called only when it is known that the offset lies within |
343 | | the offsets that have so far been used in the match. Note that in caseless |
344 | | UTF-8 mode, the number of subject bytes matched may be different to the number |
345 | | of reference bytes. (In theory this could also happen in UTF-16 mode, but it |
346 | | seems unlikely.) |
347 | | |
348 | | Arguments: |
349 | | offset index into the offset vector |
350 | | caseless TRUE if caseless |
351 | | caseopts bitmask of REFI_FLAG_XYZ values |
352 | | F the current backtracking frame pointer |
353 | | mb points to match block |
354 | | lengthptr pointer for returning the length matched |
355 | | |
356 | | Returns: = 0 successful match; number of code units matched is set |
357 | | < 0 no match |
358 | | > 0 partial match |
359 | | */ |
360 | | |
361 | | static int |
362 | | match_ref(PCRE2_SIZE offset, BOOL caseless, int caseopts, heapframe *F, |
363 | | match_block *mb, PCRE2_SIZE *lengthptr) |
364 | 0 | { |
365 | 0 | PCRE2_SPTR p; |
366 | 0 | PCRE2_SIZE length; |
367 | 0 | PCRE2_SPTR eptr; |
368 | 0 | PCRE2_SPTR eptr_start; |
369 | |
370 | | /* Deal with an unset group, or an offset at or beyond the top of the offsets in use. The default is no match, but the PCRE2_MATCH_UNSET_BACKREF option makes the reference |
371 | | match an empty string. */ |
372 |

373 | 0 | if (offset >= Foffset_top || Fovector[offset] == PCRE2_UNSET) |
374 | 0 | { |
375 | 0 | if ((mb->poptions & PCRE2_MATCH_UNSET_BACKREF) != 0) |
376 | 0 | { |
377 | 0 | *lengthptr = 0; |
378 | 0 | return 0; /* Match */ |
379 | 0 | } |
380 | 0 | else return -1; /* No match */ |
381 | 0 | } |
382 | |
383 | | /* Separate the caseless and UTF cases for speed. In what follows, p walks the text captured by the group and eptr walks the subject from the current position. */ |
384 | |
385 | 0 | eptr = eptr_start = Feptr; |
386 | 0 | p = mb->start_subject + Fovector[offset]; |
387 | 0 | length = Fovector[offset+1] - Fovector[offset]; |
388 |

389 | 0 | if (caseless) |
390 | 0 | { |
391 | 0 | #if defined SUPPORT_UNICODE |
392 | 0 | BOOL utf = (mb->poptions & PCRE2_UTF) != 0; |
393 | 0 | BOOL caseless_restrict = (caseopts & REFI_FLAG_CASELESS_RESTRICT) != 0; |
394 | 0 | BOOL turkish_casing = !caseless_restrict && (caseopts & REFI_FLAG_TURKISH_CASING) != 0; |
395 |

396 | 0 | if (utf || (mb->poptions & PCRE2_UCP) != 0) |
397 | 0 | { |
398 | 0 | PCRE2_SPTR endptr = p + length; |
399 | |
400 | | /* Match characters up to the end of the reference. NOTE: the number of |
401 | | code units matched may differ, because in UTF-8 there are some characters |
402 | | whose upper and lower case codes have different numbers of bytes. For |
403 | | example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65 (3 |
404 | | bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a |
405 | | sequence of two of the latter. It is important, therefore, to check the |
406 | | length along the reference, not along the subject (earlier code did this |
407 | | wrong). UCP without UTF uses Unicode properties but not UTF encoding, so code units are then read one at a time. */ |
408 |

409 | 0 | while (p < endptr) |
410 | 0 | { |
411 | 0 | uint32_t c, d; |
412 | 0 | const ucd_record *ur; |
413 | 0 | if (eptr >= mb->end_subject) return 1; /* Partial match */ |
414 | |
415 | 0 | if (utf) |
416 | 0 | { |
417 | 0 | GETCHARINC(c, eptr); |
418 | 0 | GETCHARINC(d, p); |
419 | 0 | } |
420 | 0 | else |
421 | 0 | { |
422 | 0 | c = *eptr++; |
423 | 0 | d = *p++; |
424 | 0 | } |
425 |

426 | 0 | if (turkish_casing && UCD_ANY_I(d)) |
427 | 0 | { |
428 | 0 | c = UCD_FOLD_I_TURKISH(c); |
429 | 0 | d = UCD_FOLD_I_TURKISH(d); |
430 | 0 | if (c != d) return -1; /* No match */ |
431 | 0 | } |
432 | 0 | else if (c != d && c != (uint32_t)((int)d + (ur = GET_UCD(d))->other_case)) |
433 | 0 | { /* ur was assigned while evaluating the condition above; this block runs only when c is neither d nor d's other case */ |
434 | 0 | const uint32_t *pp = PRIV(ucd_caseless_sets) + ur->caseset; |
435 | |
436 | | /* When PCRE2_EXTRA_CASELESS_RESTRICT is set, ignore any caseless sets |
437 | | that start with an ASCII character. */ |
438 | 0 | if (caseless_restrict && *pp < 128) return -1; /* No match */ |
439 | |
440 | 0 | for (;;) |
441 | 0 | { /* NOTE(review): the scan relies on the set being in ascending order and ending with a value larger than any character - confirm against the table definition */ |
442 | 0 | if (c < *pp) return -1; /* No match */ |
443 | 0 | if (c == *pp++) break; |
444 | 0 | } |
445 | 0 | } |
446 | 0 | } |
447 | 0 | } |
448 | 0 | else |
449 | 0 | #endif |
450 | |
451 | | /* Not in UTF or UCP mode */ |
452 | 0 | { |
453 | 0 | for (; length > 0; length--) |
454 | 0 | { |
455 | 0 | uint32_t cc, cp; |
456 | 0 | if (eptr >= mb->end_subject) return 1; /* Partial match */ |
457 | 0 | cc = UCHAR21TEST(eptr); |
458 | 0 | cp = UCHAR21TEST(p); |
459 | 0 | if (TABLE_GET(cp, mb->lcc, cp) != TABLE_GET(cc, mb->lcc, cc)) |
460 | 0 | return -1; /* No match */ |
461 | 0 | p++; |
462 | 0 | eptr++; |
463 | 0 | } |
464 | 0 | } |
465 | 0 | } |
466 | |
467 | | /* In the caseful case, we can just compare the code units, whether or not we |
468 | | are in UTF and/or UCP mode. When partial matching, we have to do this unit by |
469 | | unit. */ |
470 | |
471 | 0 | else |
472 | 0 | { |
473 | 0 | if (mb->partial != 0) |
474 | 0 | { |
475 | 0 | for (; length > 0; length--) |
476 | 0 | { |
477 | 0 | if (eptr >= mb->end_subject) return 1; /* Partial match */ |
478 | 0 | if (UCHAR21INCTEST(p) != UCHAR21INCTEST(eptr)) return -1; /* No match */ |
479 | 0 | } |
480 | 0 | } |
481 | |
482 | | /* Not partial matching */ |
483 | |
484 | 0 | else |
485 | 0 | { |
486 | 0 | if ((PCRE2_SIZE)(mb->end_subject - eptr) < length) return 1; /* Partial */ |
487 | 0 | if (memcmp(p, eptr, CU2BYTES(length)) != 0) return -1; /* No match */ |
488 | 0 | eptr += length; |
489 | 0 | } |
490 | 0 | } |
491 | |
492 | 0 | *lengthptr = eptr - eptr_start; /* Number of subject code units consumed */ |
493 | 0 | return 0; /* Match */ |
494 | 0 | } |
495 | | |
496 | | |
497 | | |
498 | | /****************************************************************************** |
499 | | ******************************************************************************* |
500 | | "Recursion" in the match() function |
501 | | |
502 | | The original match() function was highly recursive, but this proved to be the |
503 | | source of a number of problems over the years, mostly because of the relatively |
504 | | small system stacks that are commonly found. As new features were added to |
505 | | patterns, various kludges were invented to reduce the amount of stack used, |
506 | | making the code hard to understand in places. |
507 | | |
508 | | A version did exist that used individual frames on the heap instead of calling |
509 | | match() recursively, but this ran substantially slower. The current version is |
510 | | a refactoring that uses a vector of frames to remember backtracking points. |
511 | | This runs no slower, and possibly even a bit faster than the original recursive |
512 | | implementation. |
513 | | |
514 | | At first, an initial vector of size START_FRAMES_SIZE (enough for maybe 50 |
515 | | frames) was allocated on the system stack. If this was not big enough, the heap |
516 | | was used for a larger vector. However, it turns out that there are environments |
517 | | where taking as little as 20KiB from the system stack is an embarrassment. |
518 | | After another refactoring, the heap is used exclusively, but a pointer to the |
519 | | frames vector and its size are cached in the match_data block, so that there is |
520 | | no new memory allocation if the same match_data block is used for multiple |
521 | | matches (unless the frames vector has to be extended). |
522 | | ******************************************************************************* |
523 | | ******************************************************************************/ |
524 | | |
525 | | |
526 | | |
527 | | |
528 | | /************************************************* |
529 | | * Macros for the match() function * |
530 | | *************************************************/ |
531 | | |
532 | | /* These macros pack up tests that are used for partial matching several times |
533 | | in the code. The second one is used when we already know we are at or past the end of |
534 | | the subject. We set the "hit end" flag if the pointer is at the end of the |
535 | | subject and either (a) the pointer is past the earliest inspected character |
536 | | (i.e. something has been matched, even if not part of the actual matched |
537 | | string), or (b) the pattern contains a lookbehind. These are the conditions for |
538 | | which adding more characters may allow the current match to continue. |
539 | | |
540 | | For hard partial matching, we immediately return a partial match. Otherwise, |
541 | | carrying on means that a complete match on the current subject will be sought. |
542 | | A partial match is returned only if no complete match can be found. In the code below, hard partial matching is the mb->partial > 1 case. Both macros can execute a return statement, so they may be used only inside a function that can return PCRE2_ERROR_PARTIAL. */ |
543 | |
544 | | #define CHECK_PARTIAL() \ |
545 | 8.56M | do { \ |
546 | 8.56M | if (Feptr >= mb->end_subject) \ |
547 | 8.56M | { \ |
548 | 1.04M | SCHECK_PARTIAL(); \ |
549 | 1.04M | } \ |
550 | 8.56M | } \ |
551 | 8.56M | while (0) |
552 | |
553 | | #define SCHECK_PARTIAL() \ |
554 | 10.1M | do { \ |
555 | 10.1M | if (mb->partial != 0 && \ |
556 | 10.1M | (Feptr > mb->start_used_ptr || mb->allowemptypartial)) \ |
557 | 10.1M | { \ |
558 | 0 | mb->hitend = TRUE; \ |
559 | 0 | if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; \ |
560 | 0 | } \ |
561 | 10.1M | } \ |
562 | 10.1M | while (0) |
563 | | |
564 | | |
565 | | /* These macros are used to implement backtracking. They simulate a recursive |
566 | | call to the match() function by means of a vector of frames on the heap (match_data->heapframes) which |
567 | | remember the backtracking points. RMATCH(ra,rb) sets start_ecode to ra, stores rb in the current frame's return_id, jumps to MATCH_RECURSE, and defines the label L_<rb> directly after the jump. RRETURN(ra) stores ra in rrc and jumps to RETURN_SWITCH. */ |
568 | |
569 | | #define RMATCH(ra,rb) \ |
570 | 427M | do { \ |
571 | 427M | start_ecode = ra; \ |
572 | 427M | Freturn_id = rb; \ |
573 | 427M | goto MATCH_RECURSE; \ |
574 | 427M | L_##rb:; \ |
575 | 427M | } \ |
576 | 427M | while (0) |
577 | |
578 | | #define RRETURN(ra) \ |
579 | 427M | do { \ |
580 | 427M | rrc = ra; \ |
581 | 427M | goto RETURN_SWITCH; \ |
582 | 427M | } \ |
583 | 427M | while (0) |
584 | | |
585 | | |
586 | | |
587 | | /************************************************* |
588 | | * Match from current position * |
589 | | *************************************************/ |
590 | | |
591 | | /* This function is called to run one match attempt at a single starting point |
592 | | in the subject. |
593 | | |
594 | | Performance note: It might be tempting to extract commonly used fields from the |
595 | | mb structure (e.g. end_subject) into individual variables to improve |
596 | | performance. Tests using gcc on a SPARC disproved this; in the first case, it |
597 | | made performance worse. |
598 | | |
599 | | Arguments: |
600 | | start_eptr starting character in subject |
601 | | start_ecode starting position in compiled code |
602 | | top_bracket number of capturing parentheses in the pattern |
603 | | frame_size size of each backtracking frame |
604 | | match_data pointer to the match_data block |
605 | | mb pointer to "static" variables block |
606 | | |
607 | | Returns: MATCH_MATCH if matched ) these values are >= 0 |
608 | | MATCH_NOMATCH if failed to match ) |
609 | | negative MATCH_xxx value for PRUNE, SKIP, etc |
610 | | negative PCRE2_ERROR_xxx value if aborted by an error condition |
611 | | (e.g. stopped by repeated call or depth limit) |
612 | | */ |
613 | | |
614 | | static int |
615 | | match(PCRE2_SPTR start_eptr, PCRE2_SPTR start_ecode, uint16_t top_bracket, |
616 | | PCRE2_SIZE frame_size, pcre2_match_data *match_data, match_block *mb) |
617 | 360k | { |
618 | | /* Frame-handling variables */ |
619 | | |
620 | 360k | heapframe *F; /* Current frame pointer */ |
621 | 360k | heapframe *N = NULL; /* Temporary frame pointers */ |
622 | 360k | heapframe *P = NULL; |
623 | | |
624 | 360k | heapframe *frames_top; /* End of frames vector */ |
625 | 360k | heapframe *assert_accept_frame = NULL; /* For passing back a frame with captures */ |
626 | 360k | PCRE2_SIZE frame_copy_size; /* Amount to copy when creating a new frame */ |
627 | | |
628 | | /* Local variables that do not need to be preserved over calls to RMATCH(). */ |
629 | | |
630 | 360k | PCRE2_SPTR branch_end = NULL; |
631 | 360k | PCRE2_SPTR branch_start; |
632 | 360k | PCRE2_SPTR bracode; /* Temp pointer to start of group */ |
633 | 360k | PCRE2_SIZE offset; /* Used for group offsets */ |
634 | 360k | PCRE2_SIZE length; /* Used for various length calculations */ |
635 | | |
636 | 360k | int rrc; /* Return from functions & backtracking "recursions" */ |
637 | 360k | #ifdef SUPPORT_UNICODE |
638 | 360k | int proptype; /* Type of character property */ |
639 | 360k | #endif |
640 | | |
641 | 360k | uint32_t i; /* Used for local loops */ |
642 | 360k | uint32_t fc; /* Character values */ |
643 | 360k | uint32_t number; /* Used for group and other numbers */ |
644 | 360k | uint32_t reptype = 0; /* Type of repetition (0 to avoid compiler warning) */ |
645 | 360k | uint32_t group_frame_type; /* Specifies type for new group frames */ |
646 | | |
647 | 360k | BOOL condition; /* Used in conditional groups */ |
648 | 360k | BOOL cur_is_word; /* Used in "word" tests */ |
649 | 360k | BOOL prev_is_word; /* Used in "word" tests */ |
650 | | |
651 | | /* UTF and UCP flags */ |
652 | | |
653 | 360k | #ifdef SUPPORT_UNICODE |
654 | 360k | BOOL utf = (mb->poptions & PCRE2_UTF) != 0; |
655 | 360k | BOOL ucp = (mb->poptions & PCRE2_UCP) != 0; |
656 | | #else |
657 | | BOOL utf = FALSE; /* Required for convenience even when no Unicode support */ |
658 | | #endif |
659 | | |
660 | | /* This is the length of the last part of a backtracking frame that must be |
661 | | copied when a new frame is created. */ |
662 | | |
663 | 360k | frame_copy_size = frame_size - offsetof(heapframe, eptr); |
664 | | |
665 | | /* Set up the first frame and the end of the frames vector. */ |
666 | | |
667 | 360k | F = match_data->heapframes; |
668 | 360k | frames_top = (heapframe *)((char *)F + match_data->heapframes_size); |
669 | | |
670 | 360k | Frdepth = 0; /* "Recursion" depth */ |
671 | 360k | Fcapture_last = 0; /* Number of most recent capture */ |
672 | 360k | Fcurrent_recurse = RECURSE_UNSET; /* Not pattern recursing. */ |
673 | 360k | Fstart_match = Feptr = start_eptr; /* Current data pointer and start match */ |
674 | 360k | Fmark = NULL; /* Most recent mark */ |
675 | 360k | Foffset_top = 0; /* End of captures within the frame */ |
676 | 360k | Flast_group_offset = PCRE2_UNSET; /* Saved frame of most recent group */ |
677 | 360k | group_frame_type = 0; /* Not a start of group frame */ |
678 | 360k | goto NEW_FRAME; /* Start processing with this frame */ |
679 | | |
680 | | /* Come back here when we want to create a new frame for remembering a |
681 | | backtracking point. */ |
682 | | |
683 | 427M | MATCH_RECURSE: |
684 | | |
685 | | /* Set up a new backtracking frame. If the vector is full, get a new one, |
686 | | doubling the size, but constrained by the heap limit (which is in KiB). */ |
687 | | |
688 | 427M | N = (heapframe *)((char *)F + frame_size); |
689 | 427M | if ((heapframe *)((char *)N + frame_size) >= frames_top) |
690 | 5 | { |
691 | 5 | heapframe *new; |
692 | 5 | PCRE2_SIZE newsize; |
693 | 5 | PCRE2_SIZE usedsize = (char *)N - (char *)(match_data->heapframes); |
694 | | |
695 | 5 | if (match_data->heapframes_size >= PCRE2_SIZE_MAX / 2) |
696 | 0 | { |
697 | 0 | if (match_data->heapframes_size == PCRE2_SIZE_MAX - 1) |
698 | 0 | return PCRE2_ERROR_NOMEMORY; |
699 | 0 | newsize = PCRE2_SIZE_MAX - 1; |
700 | 0 | } |
701 | 5 | else |
702 | 5 | newsize = match_data->heapframes_size * 2; |
703 | | |
704 | 5 | if (newsize / 1024 >= mb->heap_limit) |
705 | 0 | { |
706 | 0 | PCRE2_SIZE old_size = match_data->heapframes_size / 1024; |
707 | 0 | if (mb->heap_limit <= old_size) |
708 | 0 | return PCRE2_ERROR_HEAPLIMIT; |
709 | 0 | else |
710 | 0 | { |
711 | 0 | PCRE2_SIZE max_delta = 1024 * (mb->heap_limit - old_size); |
712 | 0 | int over_bytes = match_data->heapframes_size % 1024; |
713 | 0 | if (over_bytes) max_delta -= (1024 - over_bytes); |
714 | 0 | newsize = match_data->heapframes_size + max_delta; |
715 | 0 | } |
716 | 0 | } |
717 | | |
718 | | /* With a heap limit set, the permitted additional size may not be enough for |
719 | | another frame, so do a final check. */ |
720 | | |
721 | 5 | if (newsize - usedsize < frame_size) return PCRE2_ERROR_HEAPLIMIT; |
722 | 5 | new = match_data->memctl.malloc(newsize, match_data->memctl.memory_data); |
723 | 5 | if (new == NULL) return PCRE2_ERROR_NOMEMORY; |
724 | 5 | memcpy(new, match_data->heapframes, usedsize); |
725 | | |
726 | 5 | N = (heapframe *)((char *)new + usedsize); |
727 | 5 | F = (heapframe *)((char *)N - frame_size); |
728 | | |
729 | 5 | match_data->memctl.free(match_data->heapframes, match_data->memctl.memory_data); |
730 | 5 | match_data->heapframes = new; |
731 | 5 | match_data->heapframes_size = newsize; |
732 | 5 | frames_top = (heapframe *)((char *)new + newsize); |
733 | 5 | } |
734 | | |
735 | | #ifdef DEBUG_SHOW_RMATCH |
736 | | fprintf(stderr, "++ RMATCH %d frame=%d", Freturn_id, Frdepth + 1); |
737 | | if (group_frame_type != 0) |
738 | | { |
739 | | fprintf(stderr, " type=%x ", group_frame_type); |
740 | | switch (GF_IDMASK(group_frame_type)) |
741 | | { |
742 | | case GF_CAPTURE: |
743 | | fprintf(stderr, "capture=%d", GF_DATAMASK(group_frame_type)); |
744 | | break; |
745 | | |
746 | | case GF_NOCAPTURE: |
747 | | fprintf(stderr, "nocapture op=%d", GF_DATAMASK(group_frame_type)); |
748 | | break; |
749 | | |
750 | | case GF_CONDASSERT: |
751 | | fprintf(stderr, "condassert op=%d", GF_DATAMASK(group_frame_type)); |
752 | | break; |
753 | | |
754 | | case GF_RECURSE: |
755 | | fprintf(stderr, "recurse=%d", GF_DATAMASK(group_frame_type)); |
756 | | break; |
757 | | |
758 | | default: |
759 | | fprintf(stderr, "*** unknown ***"); |
760 | | break; |
761 | | } |
762 | | } |
763 | | fprintf(stderr, "\n"); |
764 | | #endif |
765 | | |
766 | | /* Copy those fields that must be copied into the new frame, increase the |
767 | | "recursion" depth (i.e. the new frame's index) and then make the new frame |
768 | | current. */ |
769 | | |
770 | 427M | memcpy((char *)N + offsetof(heapframe, eptr), |
771 | 427M | (char *)F + offsetof(heapframe, eptr), |
772 | 427M | frame_copy_size); |
773 | | |
774 | 427M | N->rdepth = Frdepth + 1; |
775 | 427M | F = N; |
776 | | |
777 | | /* Carry on processing with a new frame. */ |
778 | | |
779 | 427M | NEW_FRAME: |
780 | 427M | Fgroup_frame_type = group_frame_type; |
781 | 427M | Fecode = start_ecode; /* Starting code pointer */ |
782 | 427M | Fback_frame = frame_size; /* Default is go back one frame */ |
783 | | |
784 | | /* If this is a special type of group frame, remember its offset for quick |
785 | | access at the end of the group. If this is a recursion, set a new current |
786 | | recursion value. */ |
787 | | |
788 | 427M | if (group_frame_type != 0) |
789 | 7.03M | { |
790 | 7.03M | Flast_group_offset = (char *)F - (char *)match_data->heapframes; |
791 | 7.03M | if (GF_IDMASK(group_frame_type) == GF_RECURSE) |
792 | 1.00M | Fcurrent_recurse = GF_DATAMASK(group_frame_type); |
793 | 7.03M | group_frame_type = 0; |
794 | 7.03M | } |
795 | | |
796 | | |
797 | | /* ========================================================================= */ |
798 | | /* This is the main processing loop. First check that we haven't recorded too |
799 | | many backtracks (search tree is too large), or that we haven't exceeded the |
800 | | recursive depth limit (used too many backtracking frames). If not, process the |
801 | | opcodes. */ |
802 | | |
803 | 427M | if (mb->match_call_count++ >= mb->match_limit) return PCRE2_ERROR_MATCHLIMIT; |
804 | 427M | if (Frdepth >= mb->match_limit_depth) return PCRE2_ERROR_DEPTHLIMIT; |
805 | | |
806 | | #ifdef DEBUG_SHOW_OPS |
807 | | fprintf(stderr, "\n++ New frame: type=0x%x subject offset %ld\n", |
808 | | GF_IDMASK(Fgroup_frame_type), Feptr - mb->start_subject); |
809 | | #endif |
810 | | |
811 | 427M | for (;;) |
812 | 572M | { |
813 | | #ifdef DEBUG_SHOW_OPS |
814 | | fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode, |
815 | | OP_names[*Fecode]); |
816 | | #endif |
817 | | |
818 | 572M | Fop = (uint8_t)(*Fecode); /* Cast needed for 16-bit and 32-bit modes */ |
819 | 572M | switch(Fop) |
820 | 572M | { |
821 | | /* ===================================================================== */ |
822 | | /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes, to close |
823 | | any currently open capturing brackets. Unlike reaching the end of a group, |
824 | | where we know the starting frame is at the top of the chained frames, in |
825 | | this case we have to search back for the relevant frame in case other types |
826 | | of group that use chained frames have intervened. Multiple OP_CLOSEs always |
827 | | come innermost first, which matches the chain order. We can ignore this in |
828 | | a recursion, because captures are not passed out of recursions. */ |
829 | | |
830 | 0 | case OP_CLOSE: |
831 | 0 | if (Fcurrent_recurse == RECURSE_UNSET) |
832 | 0 | { |
833 | 0 | number = GET2(Fecode, 1); |
834 | 0 | offset = Flast_group_offset; |
835 | 0 | for(;;) |
836 | 0 | { |
837 | | /* Corrupted heapframes? Trigger an assert and return an error. */ |
838 | 0 | PCRE2_ASSERT(offset != PCRE2_UNSET); |
839 | 0 | if (offset == PCRE2_UNSET) return PCRE2_ERROR_INTERNAL; |
840 | | |
841 | 0 | N = (heapframe *)((char *)match_data->heapframes + offset); |
842 | 0 | P = (heapframe *)((char *)N - frame_size); |
843 | 0 | if (N->group_frame_type == (GF_CAPTURE | number)) break; |
844 | 0 | offset = P->last_group_offset; |
845 | 0 | } |
846 | 0 | offset = (number << 1) - 2; |
847 | 0 | Fcapture_last = number; |
848 | 0 | Fovector[offset] = P->eptr - mb->start_subject; |
849 | 0 | Fovector[offset+1] = Feptr - mb->start_subject; |
850 | 0 | if (offset >= Foffset_top) Foffset_top = offset + 2; |
851 | 0 | } |
852 | 0 | Fecode += PRIV(OP_lengths)[*Fecode]; |
853 | 0 | break; |
854 | | |
855 | | |
856 | | /* ===================================================================== */ |
857 | | /* Real or forced end of the pattern, assertion, or recursion. In an |
858 | | assertion ACCEPT, update the last used pointer and remember the current |
859 | | frame so that the captures and mark can be fished out of it. */ |
860 | | |
861 | 0 | case OP_ASSERT_ACCEPT: |
862 | 0 | if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr; |
863 | 0 | assert_accept_frame = F; |
864 | 0 | RRETURN(MATCH_ACCEPT); |
865 | | |
866 | | /* For ACCEPT within a recursion, we have to find the most recent |
867 | | recursion. If not in a recursion, fall through to code that is common with |
868 | | OP_END. */ |
869 | | |
870 | 0 | case OP_ACCEPT: |
871 | 0 | if (Fcurrent_recurse != RECURSE_UNSET) |
872 | 0 | { |
873 | | #ifdef DEBUG_SHOW_OPS |
874 | | fprintf(stderr, "++ Accept within recursion\n"); |
875 | | #endif |
876 | 0 | offset = Flast_group_offset; |
877 | 0 | for(;;) |
878 | 0 | { |
879 | | /* Corrupted heapframes? Trigger an assert and return an error. */ |
880 | 0 | PCRE2_ASSERT(offset != PCRE2_UNSET); |
881 | 0 | if (offset == PCRE2_UNSET) return PCRE2_ERROR_INTERNAL; |
882 | | |
883 | 0 | N = (heapframe *)((char *)match_data->heapframes + offset); |
884 | 0 | P = (heapframe *)((char *)N - frame_size); |
885 | 0 | if (GF_IDMASK(N->group_frame_type) == GF_RECURSE) break; |
886 | 0 | offset = P->last_group_offset; |
887 | 0 | } |
888 | | |
889 | | /* N is now the frame of the recursion; the previous frame is at the |
890 | | OP_RECURSE position. Go back there, copying the current subject position |
891 | | and mark, and the start_match position (\K might have changed it), and |
892 | | then move on past the OP_RECURSE. */ |
893 | | |
894 | 0 | P->eptr = Feptr; |
895 | 0 | P->mark = Fmark; |
896 | 0 | P->start_match = Fstart_match; |
897 | 0 | F = P; |
898 | 0 | Fecode += 1 + LINK_SIZE; |
899 | 0 | continue; |
900 | 0 | } |
901 | | /* Fall through */ |
902 | | |
903 | | /* OP_END itself can never be reached within a recursion because that is |
904 | | picked up when the OP_KET that always precedes OP_END is reached. */ |
905 | | |
906 | 4.40k | case OP_END: |
907 | | |
908 | | /* Fail for an empty string match if either PCRE2_NOTEMPTY is set, or if |
909 | | PCRE2_NOTEMPTY_ATSTART is set and we have matched at the start of the |
910 | | subject. In both cases, backtracking will then try other alternatives, if |
911 | | any. */ |
912 | | |
913 | 4.40k | if (Feptr == Fstart_match && |
914 | 4.40k | ((mb->moptions & PCRE2_NOTEMPTY) != 0 || |
915 | 3.95k | ((mb->moptions & PCRE2_NOTEMPTY_ATSTART) != 0 && |
916 | 3.95k | Fstart_match == mb->start_subject + mb->start_offset))) |
917 | 3.84k | { |
918 | | #ifdef DEBUG_SHOW_OPS |
919 | | fprintf(stderr, "++ Backtrack because empty string\n"); |
920 | | #endif |
921 | 3.84k | RRETURN(MATCH_NOMATCH); |
922 | 3.84k | } |
923 | | |
924 | | /* Fail if PCRE2_ENDANCHORED is set and the end of the match is not |
925 | | the end of the subject. After (*ACCEPT) we fail the entire match (at this |
926 | | position) but backtrack if we've reached the end of the pattern. This |
927 | | applies whether or not we are in a recursion. */ |
928 | | |
929 | 559 | if (Feptr < mb->end_subject && |
930 | 559 | ((mb->moptions | mb->poptions) & PCRE2_ENDANCHORED) != 0) |
931 | 0 | { |
932 | 0 | if (Fop == OP_END) |
933 | 0 | { |
934 | | #ifdef DEBUG_SHOW_OPS |
935 | | fprintf(stderr, "++ Backtrack because not at end (endanchored set)\n"); |
936 | | #endif |
937 | 0 | RRETURN(MATCH_NOMATCH); |
938 | 0 | } |
939 | | |
940 | | #ifdef DEBUG_SHOW_OPS |
941 | | fprintf(stderr, "++ Failed ACCEPT not at end (endanchnored set)\n"); |
942 | | #endif |
943 | 0 | return MATCH_NOMATCH; /* (*ACCEPT) */ |
944 | 0 | } |
945 | | |
946 | | /* We have a successful match of the whole pattern. Record the result and |
947 | | then do a direct return from the function. If there is space in the offset |
948 | | vector, set any pairs that follow the highest-numbered captured string but |
949 | | are less than the number of capturing groups in the pattern to PCRE2_UNSET. |
950 | | It is documented that this happens. "Gaps" are set to PCRE2_UNSET |
951 | | dynamically. It is only those at the end that need setting here. */ |
952 | | |
953 | 559 | mb->end_match_ptr = Feptr; /* Record where we ended */ |
954 | 559 | mb->end_offset_top = Foffset_top; /* and how many extracts were taken */ |
955 | 559 | mb->mark = Fmark; /* and the last success mark */ |
956 | 559 | if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr; |
957 | | |
958 | 559 | match_data->ovector[0] = Fstart_match - mb->start_subject; |
959 | 559 | match_data->ovector[1] = Feptr - mb->start_subject; |
960 | | |
961 | | /* Set i to the smaller of the sizes of the external and frame ovectors. */ |
962 | | |
963 | 559 | i = 2 * ((top_bracket + 1 > match_data->oveccount)? |
964 | 559 | match_data->oveccount : top_bracket + 1); |
965 | 559 | memcpy(match_data->ovector + 2, Fovector, (i - 2) * sizeof(PCRE2_SIZE)); |
966 | 1.07k | while (--i >= Foffset_top + 2) match_data->ovector[i] = PCRE2_UNSET; |
967 | 559 | return MATCH_MATCH; /* Note: NOT RRETURN */ |
968 | | |
969 | | |
970 | | /*===================================================================== */ |
971 | | /* Match any single character type except newline; have to take care with |
972 | | CRLF newlines and partial matching. */ |
973 | | |
974 | 6.95M | case OP_ANY: |
975 | 6.95M | if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH); |
976 | 6.77M | if (mb->partial != 0 && |
977 | 6.77M | Feptr == mb->end_subject - 1 && |
978 | 6.77M | NLBLOCK->nltype == NLTYPE_FIXED && |
979 | 6.77M | NLBLOCK->nllen == 2 && |
980 | 6.77M | UCHAR21TEST(Feptr) == NLBLOCK->nl[0]) |
981 | 0 | { |
982 | 0 | mb->hitend = TRUE; |
983 | 0 | if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; |
984 | 0 | } |
985 | | /* Fall through */ |
986 | | |
987 | | /* Match any single character whatsoever. */ |
988 | | |
989 | 8.13M | case OP_ALLANY: |
990 | 8.13M | if (Feptr >= mb->end_subject) /* DO NOT merge the Feptr++ here; it must */ |
991 | 101k | { /* not be updated before SCHECK_PARTIAL. */ |
992 | 101k | SCHECK_PARTIAL(); |
993 | 101k | RRETURN(MATCH_NOMATCH); |
994 | 101k | } |
995 | 8.03M | Feptr++; |
996 | 8.03M | #ifdef SUPPORT_UNICODE |
997 | 8.03M | if (utf) ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++); |
998 | 8.03M | #endif |
999 | 8.03M | Fecode++; |
1000 | 8.03M | break; |
1001 | | |
1002 | | |
1003 | | /* ===================================================================== */ |
1004 | | /* Match a single code unit, even in UTF mode. This opcode really does |
1005 | | match any code unit, even newline. (It really should be called ANYCODEUNIT, |
1006 | | of course - the byte name is from pre-16 bit days.) */ |
1007 | | |
1008 | 302 | case OP_ANYBYTE: |
1009 | 302 | if (Feptr >= mb->end_subject) /* DO NOT merge the Feptr++ here; it must */ |
1010 | 3 | { /* not be updated before SCHECK_PARTIAL. */ |
1011 | 3 | SCHECK_PARTIAL(); |
1012 | 3 | RRETURN(MATCH_NOMATCH); |
1013 | 3 | } |
1014 | 299 | Feptr++; |
1015 | 299 | Fecode++; |
1016 | 299 | break; |
1017 | | |
1018 | | |
1019 | | /* ===================================================================== */ |
1020 | | /* Match a single character, casefully */ |
1021 | | |
1022 | 127M | case OP_CHAR: |
1023 | 127M | #ifdef SUPPORT_UNICODE |
1024 | 127M | if (utf) |
1025 | 83.4M | { |
1026 | 83.4M | Flength = 1; |
1027 | 83.4M | Fecode++; |
1028 | 83.4M | GETCHARLEN(fc, Fecode, Flength); |
1029 | 83.4M | if (Flength > (PCRE2_SIZE)(mb->end_subject - Feptr)) |
1030 | 988k | { |
1031 | 988k | CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */ |
1032 | 988k | RRETURN(MATCH_NOMATCH); |
1033 | 988k | } |
1034 | 82.4M | for (; Flength > 0; Flength--) |
1035 | 82.4M | { |
1036 | 82.4M | if (*Fecode++ != UCHAR21INC(Feptr)) RRETURN(MATCH_NOMATCH); |
1037 | 82.4M | } |
1038 | 82.4M | } |
1039 | 43.9M | else |
1040 | 43.9M | #endif |
1041 | | |
1042 | | /* Not UTF mode */ |
1043 | 43.9M | { |
1044 | 43.9M | if (mb->end_subject - Feptr < 1) |
1045 | 120k | { |
1046 | 120k | SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */ |
1047 | 120k | RRETURN(MATCH_NOMATCH); |
1048 | 120k | } |
1049 | 43.8M | if (Fecode[1] != *Feptr++) RRETURN(MATCH_NOMATCH); |
1050 | 795k | Fecode += 2; |
1051 | 795k | } |
1052 | 833k | break; |
1053 | | |
1054 | | |
1055 | | /* ===================================================================== */ |
1056 | | /* Match a single character, caselessly. If we are at the end of the |
1057 | | subject, give up immediately. We get here only when the pattern character |
1058 | | has at most one other case. Characters with more than two cases are coded |
1059 | | as OP_PROP with the pseudo-property PT_CLIST. */ |
1060 | | |
1061 | 24.0M | case OP_CHARI: |
1062 | 24.0M | if (Feptr >= mb->end_subject) |
1063 | 220k | { |
1064 | 220k | SCHECK_PARTIAL(); |
1065 | 220k | RRETURN(MATCH_NOMATCH); |
1066 | 220k | } |
1067 | | |
1068 | 23.8M | #ifdef SUPPORT_UNICODE |
1069 | 23.8M | if (utf) |
1070 | 5.09M | { |
1071 | 5.09M | Flength = 1; |
1072 | 5.09M | Fecode++; |
1073 | 5.09M | GETCHARLEN(fc, Fecode, Flength); |
1074 | | |
1075 | | /* If the pattern character's value is < 128, we know that its other case |
1076 | | (if any) is also < 128 (and therefore only one code unit long in all |
1077 | | code-unit widths), so we can use the fast lookup table. We checked above |
1078 | | that there is at least one character left in the subject. */ |
1079 | | |
1080 | 5.09M | if (fc < 128) |
1081 | 5.06M | { |
1082 | 5.06M | uint32_t cc = UCHAR21(Feptr); |
1083 | 5.06M | if (mb->lcc[fc] != TABLE_GET(cc, mb->lcc, cc)) RRETURN(MATCH_NOMATCH); |
1084 | 194k | Fecode++; |
1085 | 194k | Feptr++; |
1086 | 194k | } |
1087 | | |
1088 | | /* Otherwise we must pick up the subject character and use Unicode |
1089 | | property support to test its other case. Note that we cannot use the |
1090 | | value of "Flength" to check for sufficient bytes left, because the other |
1091 | | case of the character may have more or fewer code units. */ |
1092 | | |
1093 | 22.6k | else |
1094 | 22.6k | { |
1095 | 22.6k | uint32_t dc; |
1096 | 22.6k | GETCHARINC(dc, Feptr); |
1097 | 22.6k | Fecode += Flength; |
1098 | 22.6k | if (dc != fc && dc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH); |
1099 | 22.6k | } |
1100 | 5.09M | } |
1101 | | |
1102 | | /* If UCP is set without UTF we must do the same as above, but with one |
1103 | | character per code unit. */ |
1104 | | |
1105 | 18.7M | else if (ucp) |
1106 | 0 | { |
1107 | 0 | uint32_t cc = UCHAR21(Feptr); |
1108 | 0 | fc = Fecode[1]; |
1109 | 0 | if (fc < 128) |
1110 | 0 | { |
1111 | 0 | if (mb->lcc[fc] != TABLE_GET(cc, mb->lcc, cc)) RRETURN(MATCH_NOMATCH); |
1112 | 0 | } |
1113 | 0 | else |
1114 | 0 | { |
1115 | 0 | if (cc != fc && cc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH); |
1116 | 0 | } |
1117 | 0 | Feptr++; |
1118 | 0 | Fecode += 2; |
1119 | 0 | } |
1120 | | |
1121 | 18.7M | else |
1122 | 18.7M | #endif /* SUPPORT_UNICODE */ |
1123 | | |
1124 | | /* Not UTF or UCP mode; use the table for characters < 256. */ |
1125 | 18.7M | { |
1126 | 18.7M | if (TABLE_GET(Fecode[1], mb->lcc, Fecode[1]) |
1127 | 18.7M | != TABLE_GET(*Feptr, mb->lcc, *Feptr)) RRETURN(MATCH_NOMATCH); |
1128 | 266k | Feptr++; |
1129 | 266k | Fecode += 2; |
1130 | 266k | } |
1131 | 461k | break; |
1132 | | |
1133 | | |
1134 | | /* ===================================================================== */ |
1135 | | /* Match not a single character. */ |
1136 | | |
1137 | 461k | case OP_NOT: |
1138 | 686k | case OP_NOTI: |
1139 | 686k | if (Feptr >= mb->end_subject) |
1140 | 2.14k | { |
1141 | 2.14k | SCHECK_PARTIAL(); |
1142 | 2.14k | RRETURN(MATCH_NOMATCH); |
1143 | 2.14k | } |
1144 | | |
1145 | 684k | #ifdef SUPPORT_UNICODE |
1146 | 684k | if (utf) |
1147 | 3.10k | { |
1148 | 3.10k | uint32_t ch; |
1149 | 3.10k | Fecode++; |
1150 | 3.10k | GETCHARINC(ch, Fecode); |
1151 | 3.10k | GETCHARINC(fc, Feptr); |
1152 | 3.10k | if (ch == fc) |
1153 | 120 | { |
1154 | 120 | RRETURN(MATCH_NOMATCH); /* Caseful match */ |
1155 | 120 | } |
1156 | 2.98k | else if (Fop == OP_NOTI) /* If caseless */ |
1157 | 2.97k | { |
1158 | 2.97k | if (ch > 127) |
1159 | 0 | ch = UCD_OTHERCASE(ch); |
1160 | 2.97k | else |
1161 | 2.97k | ch = (mb->fcc)[ch]; |
1162 | 2.97k | if (ch == fc) RRETURN(MATCH_NOMATCH); |
1163 | 2.97k | } |
1164 | 3.10k | } |
1165 | | |
1166 | | /* UCP without UTF is as above, but with one character per code unit. */ |
1167 | | |
1168 | 681k | else if (ucp) |
1169 | 0 | { |
1170 | 0 | uint32_t ch; |
1171 | 0 | fc = UCHAR21INC(Feptr); |
1172 | 0 | ch = Fecode[1]; |
1173 | 0 | Fecode += 2; |
1174 | | |
1175 | 0 | if (ch == fc) |
1176 | 0 | { |
1177 | 0 | RRETURN(MATCH_NOMATCH); /* Caseful match */ |
1178 | 0 | } |
1179 | 0 | else if (Fop == OP_NOTI) /* If caseless */ |
1180 | 0 | { |
1181 | 0 | if (ch > 127) |
1182 | 0 | ch = UCD_OTHERCASE(ch); |
1183 | 0 | else |
1184 | 0 | ch = (mb->fcc)[ch]; |
1185 | 0 | if (ch == fc) RRETURN(MATCH_NOMATCH); |
1186 | 0 | } |
1187 | 0 | } |
1188 | | |
1189 | 681k | else |
1190 | 681k | #endif /* SUPPORT_UNICODE */ |
1191 | | |
1192 | | /* Neither UTF nor UCP is set */ |
1193 | | |
1194 | 681k | { |
1195 | 681k | uint32_t ch = Fecode[1]; |
1196 | 681k | fc = UCHAR21INC(Feptr); |
1197 | 681k | if (ch == fc || (Fop == OP_NOTI && TABLE_GET(ch, mb->fcc, ch) == fc)) |
1198 | 5.24k | RRETURN(MATCH_NOMATCH); |
1199 | 676k | Fecode += 2; |
1200 | 676k | } |
1201 | 679k | break; |
1202 | | |
1203 | | |
1204 | | /* ===================================================================== */ |
1205 | | /* Match a single character repeatedly. */ |
1206 | | |
1207 | 679k | #define Loclength F->temp_size |
1208 | 45.3M | #define Lstart_eptr F->temp_sptr[0] |
1209 | 22.6M | #define Lcharptr F->temp_sptr[1] |
1210 | 152M | #define Lmin F->temp_32[0] |
1211 | 114M | #define Lmax F->temp_32[1] |
1212 | 83.1M | #define Lc F->temp_32[2] |
1213 | 13.1M | #define Loc F->temp_32[3] |
1214 | | |
1215 | 679k | case OP_EXACT: |
1216 | 0 | case OP_EXACTI: |
1217 | 0 | Lmin = Lmax = GET2(Fecode, 1); |
1218 | 0 | Fecode += 1 + IMM2_SIZE; |
1219 | 0 | goto REPEATCHAR; |
1220 | | |
1221 | 0 | case OP_POSUPTO: |
1222 | 0 | case OP_POSUPTOI: |
1223 | 0 | reptype = REPTYPE_POS; |
1224 | 0 | Lmin = 0; |
1225 | 0 | Lmax = GET2(Fecode, 1); |
1226 | 0 | Fecode += 1 + IMM2_SIZE; |
1227 | 0 | goto REPEATCHAR; |
1228 | | |
1229 | 0 | case OP_UPTO: |
1230 | 0 | case OP_UPTOI: |
1231 | 0 | reptype = REPTYPE_MAX; |
1232 | 0 | Lmin = 0; |
1233 | 0 | Lmax = GET2(Fecode, 1); |
1234 | 0 | Fecode += 1 + IMM2_SIZE; |
1235 | 0 | goto REPEATCHAR; |
1236 | | |
1237 | 0 | case OP_MINUPTO: |
1238 | 0 | case OP_MINUPTOI: |
1239 | 0 | reptype = REPTYPE_MIN; |
1240 | 0 | Lmin = 0; |
1241 | 0 | Lmax = GET2(Fecode, 1); |
1242 | 0 | Fecode += 1 + IMM2_SIZE; |
1243 | 0 | goto REPEATCHAR; |
1244 | | |
1245 | 12.1k | case OP_POSSTAR: |
1246 | 60.3k | case OP_POSSTARI: |
1247 | 60.3k | reptype = REPTYPE_POS; |
1248 | 60.3k | Lmin = 0; |
1249 | 60.3k | Lmax = UINT32_MAX; |
1250 | 60.3k | Fecode++; |
1251 | 60.3k | goto REPEATCHAR; |
1252 | | |
1253 | 153k | case OP_POSPLUS: |
1254 | 215k | case OP_POSPLUSI: |
1255 | 215k | reptype = REPTYPE_POS; |
1256 | 215k | Lmin = 1; |
1257 | 215k | Lmax = UINT32_MAX; |
1258 | 215k | Fecode++; |
1259 | 215k | goto REPEATCHAR; |
1260 | | |
1261 | 20.2M | case OP_POSQUERY: |
1262 | 23.1M | case OP_POSQUERYI: |
1263 | 23.1M | reptype = REPTYPE_POS; |
1264 | 23.1M | Lmin = 0; |
1265 | 23.1M | Lmax = 1; |
1266 | 23.1M | Fecode++; |
1267 | 23.1M | goto REPEATCHAR; |
1268 | | |
1269 | 410 | case OP_STAR: |
1270 | 6.80k | case OP_STARI: |
1271 | 7.78k | case OP_MINSTAR: |
1272 | 18.2k | case OP_MINSTARI: |
1273 | 20.9k | case OP_PLUS: |
1274 | 23.7k | case OP_PLUSI: |
1275 | 24.1k | case OP_MINPLUS: |
1276 | 55.3k | case OP_MINPLUSI: |
1277 | 9.22M | case OP_QUERY: |
1278 | 11.0M | case OP_QUERYI: |
1279 | 13.2M | case OP_MINQUERY: |
1280 | 14.8M | case OP_MINQUERYI: |
1281 | 14.8M | fc = *Fecode++ - ((Fop < OP_STARI)? OP_STAR : OP_STARI); |
1282 | 14.8M | Lmin = rep_min[fc]; |
1283 | 14.8M | Lmax = rep_max[fc]; |
1284 | 14.8M | reptype = rep_typ[fc]; |
1285 | | |
1286 | | /* Common code for all repeated single-character matches. We first check |
1287 | | for the minimum number of characters. If the minimum equals the maximum, we |
1288 | | are done. Otherwise, if minimizing, check the rest of the pattern for a |
1289 | | match; if there isn't one, advance up to the maximum, one character at a |
1290 | | time. |
1291 | | |
1292 | | If maximizing, advance up to the maximum number of matching characters, |
1293 | | until Feptr is past the end of the maximum run. If possessive, we are |
1294 | | then done (no backing up). Otherwise, match at this position; anything |
1295 | | other than no match is immediately returned. For nomatch, back up one |
1296 | | character, unless we are matching \R and the last thing matched was |
1297 | | \r\n, in which case, back up two code units until we reach the first |
1298 | | optional character position. |
1299 | | |
1300 | | The various UTF/non-UTF and caseful/caseless cases are handled separately, |
1301 | | for speed. */ |
1302 | | |
1303 | 38.3M | REPEATCHAR: |
1304 | 38.3M | #ifdef SUPPORT_UNICODE |
1305 | 38.3M | if (utf) |
1306 | 22.6M | { |
1307 | 22.6M | Flength = 1; |
1308 | 22.6M | Lcharptr = Fecode; |
1309 | 22.6M | GETCHARLEN(fc, Fecode, Flength); |
1310 | 22.6M | Fecode += Flength; |
1311 | | |
1312 | | /* Handle multi-code-unit character matching, caseful and caseless. */ |
1313 | | |
1314 | 22.6M | if (Flength > 1) |
1315 | 4.37k | { |
1316 | 4.37k | uint32_t othercase; |
1317 | | |
1318 | 4.37k | if (Fop >= OP_STARI && /* Caseless */ |
1319 | 4.37k | (othercase = UCD_OTHERCASE(fc)) != fc) |
1320 | 0 | Loclength = PRIV(ord2utf)(othercase, Foccu); |
1321 | 4.37k | else Loclength = 0; |
1322 | | |
1323 | 4.37k | for (i = 1; i <= Lmin; i++) |
1324 | 4.37k | { |
1325 | 4.37k | if (Feptr <= mb->end_subject - Flength && |
1326 | 4.37k | memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0) Feptr += Flength; |
1327 | 4.37k | else if (Loclength > 0 && |
1328 | 4.37k | Feptr <= mb->end_subject - Loclength && |
1329 | 4.37k | memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0) |
1330 | 0 | Feptr += Loclength; |
1331 | 4.37k | else |
1332 | 4.37k | { |
1333 | 4.37k | CHECK_PARTIAL(); |
1334 | 4.37k | RRETURN(MATCH_NOMATCH); |
1335 | 4.37k | } |
1336 | 4.37k | } |
1337 | | |
1338 | 0 | if (Lmin == Lmax) continue; |
1339 | | |
1340 | 0 | if (reptype == REPTYPE_MIN) |
1341 | 0 | { |
1342 | 0 | for (;;) |
1343 | 0 | { |
1344 | 0 | RMATCH(Fecode, RM202); |
1345 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1346 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
1347 | 0 | if (Feptr <= mb->end_subject - Flength && |
1348 | 0 | memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0) Feptr += Flength; |
1349 | 0 | else if (Loclength > 0 && |
1350 | 0 | Feptr <= mb->end_subject - Loclength && |
1351 | 0 | memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0) |
1352 | 0 | Feptr += Loclength; |
1353 | 0 | else |
1354 | 0 | { |
1355 | 0 | CHECK_PARTIAL(); |
1356 | 0 | RRETURN(MATCH_NOMATCH); |
1357 | 0 | } |
1358 | 0 | } |
1359 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
1360 | 0 | } |
1361 | | |
1362 | 0 | else /* Maximize */ |
1363 | 0 | { |
1364 | 0 | Lstart_eptr = Feptr; |
1365 | 0 | for (i = Lmin; i < Lmax; i++) |
1366 | 0 | { |
1367 | 0 | if (Feptr <= mb->end_subject - Flength && |
1368 | 0 | memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0) |
1369 | 0 | Feptr += Flength; |
1370 | 0 | else if (Loclength > 0 && |
1371 | 0 | Feptr <= mb->end_subject - Loclength && |
1372 | 0 | memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0) |
1373 | 0 | Feptr += Loclength; |
1374 | 0 | else |
1375 | 0 | { |
1376 | 0 | CHECK_PARTIAL(); |
1377 | 0 | break; |
1378 | 0 | } |
1379 | 0 | } |
1380 | | |
1381 | | /* After \C in UTF mode, Lstart_eptr might be in the middle of a |
1382 | | Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't |
1383 | | go too far. */ |
1384 | | |
1385 | 0 | if (reptype != REPTYPE_POS) for(;;) |
1386 | 0 | { |
1387 | 0 | if (Feptr <= Lstart_eptr) break; |
1388 | 0 | RMATCH(Fecode, RM203); |
1389 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1390 | 0 | Feptr--; |
1391 | 0 | BACKCHAR(Feptr); |
1392 | 0 | } |
1393 | 0 | } |
1394 | 0 | break; /* End of repeated wide character handling */ |
1395 | 0 | } |
1396 | | |
1397 | | /* Length of UTF character is 1. Put it into the preserved variable and |
1398 | | fall through to the non-UTF code. */ |
1399 | | |
1400 | 22.6M | Lc = fc; |
1401 | 22.6M | } |
1402 | 15.7M | else |
1403 | 15.7M | #endif /* SUPPORT_UNICODE */ |
1404 | | |
1405 | | /* When not in UTF mode, load a single-code-unit character. Then proceed as |
1406 | | above, using Unicode casing if either UTF or UCP is set. */ |
1407 | | |
1408 | 15.7M | Lc = *Fecode++; |
1409 | | |
1410 | | /* Caseless comparison */ |
1411 | | |
1412 | 38.3M | if (Fop >= OP_STARI) |
1413 | 6.61M | { |
1414 | 6.61M | #if PCRE2_CODE_UNIT_WIDTH == 8 |
1415 | 6.61M | #ifdef SUPPORT_UNICODE |
1416 | 6.61M | if (ucp && !utf && Lc > 127) Loc = UCD_OTHERCASE(Lc); |
1417 | 6.61M | else |
1418 | 6.61M | #endif /* SUPPORT_UNICODE */ |
1419 | | /* Lc will be < 128 in UTF-8 mode. */ |
1420 | 6.61M | Loc = mb->fcc[Lc]; |
1421 | | #else /* 16-bit & 32-bit */ |
1422 | | #ifdef SUPPORT_UNICODE |
1423 | | if ((utf || ucp) && Lc > 127) Loc = UCD_OTHERCASE(Lc); |
1424 | | else |
1425 | | #endif /* SUPPORT_UNICODE */ |
1426 | | Loc = TABLE_GET(Lc, mb->fcc, Lc); |
1427 | | #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */ |
1428 | | |
1429 | 6.61M | for (i = 1; i <= Lmin; i++) |
1430 | 91.7k | { |
1431 | 91.7k | uint32_t cc; /* Faster than PCRE2_UCHAR */ |
1432 | 91.7k | if (Feptr >= mb->end_subject) |
1433 | 2.91k | { |
1434 | 2.91k | SCHECK_PARTIAL(); |
1435 | 2.91k | RRETURN(MATCH_NOMATCH); |
1436 | 2.91k | } |
1437 | 88.8k | cc = UCHAR21TEST(Feptr); |
1438 | 88.8k | if (Lc != cc && Loc != cc) RRETURN(MATCH_NOMATCH); |
1439 | 2.51k | Feptr++; |
1440 | 2.51k | } |
1441 | 6.52M | if (Lmin == Lmax) continue; |
1442 | | |
1443 | 6.52M | if (reptype == REPTYPE_MIN) |
1444 | 1.65M | { |
1445 | 1.65M | for (;;) |
1446 | 1.69M | { |
1447 | 1.69M | uint32_t cc; /* Faster than PCRE2_UCHAR */ |
1448 | 1.69M | RMATCH(Fecode, RM25); |
1449 | 1.69M | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1450 | 1.69M | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
1451 | 1.66M | if (Feptr >= mb->end_subject) |
1452 | 14.9k | { |
1453 | 14.9k | SCHECK_PARTIAL(); |
1454 | 14.9k | RRETURN(MATCH_NOMATCH); |
1455 | 14.9k | } |
1456 | 1.64M | cc = UCHAR21TEST(Feptr); |
1457 | 1.64M | if (Lc != cc && Loc != cc) RRETURN(MATCH_NOMATCH); |
1458 | 40.5k | Feptr++; |
1459 | 40.5k | } |
1460 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
1461 | 0 | } |
1462 | | |
1463 | 4.86M | else /* Maximize */ |
1464 | 4.86M | { |
1465 | 4.86M | Lstart_eptr = Feptr; |
1466 | 4.89M | for (i = Lmin; i < Lmax; i++) |
1467 | 4.87M | { |
1468 | 4.87M | uint32_t cc; /* Faster than PCRE2_UCHAR */ |
1469 | 4.87M | if (Feptr >= mb->end_subject) |
1470 | 23.2k | { |
1471 | 23.2k | SCHECK_PARTIAL(); |
1472 | 23.2k | break; |
1473 | 23.2k | } |
1474 | 4.85M | cc = UCHAR21TEST(Feptr); |
1475 | 4.85M | if (Lc != cc && Loc != cc) break; |
1476 | 22.5k | Feptr++; |
1477 | 22.5k | } |
1478 | 4.86M | if (reptype != REPTYPE_POS) for (;;) |
1479 | 1.87M | { |
1480 | 1.87M | if (Feptr == Lstart_eptr) break; |
1481 | 2.40k | RMATCH(Fecode, RM26); |
1482 | 2.40k | Feptr--; |
1483 | 2.40k | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1484 | 2.40k | } |
1485 | 4.86M | } |
1486 | 6.52M | } |
1487 | | |
1488 | | /* Caseful comparisons (includes all multi-byte characters) */ |
1489 | | |
1490 | 31.7M | else |
1491 | 31.7M | { |
1492 | 31.7M | for (i = 1; i <= Lmin; i++) |
1493 | 156k | { |
1494 | 156k | if (Feptr >= mb->end_subject) |
1495 | 949 | { |
1496 | 949 | SCHECK_PARTIAL(); |
1497 | 949 | RRETURN(MATCH_NOMATCH); |
1498 | 949 | } |
1499 | 155k | if (Lc != UCHAR21INCTEST(Feptr)) RRETURN(MATCH_NOMATCH); |
1500 | 155k | } |
1501 | | |
1502 | 31.5M | if (Lmin == Lmax) continue; |
1503 | | |
1504 | 31.5M | if (reptype == REPTYPE_MIN) |
1505 | 2.15M | { |
1506 | 2.15M | for (;;) |
1507 | 2.15M | { |
1508 | 2.15M | RMATCH(Fecode, RM27); |
1509 | 2.15M | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1510 | 2.15M | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
1511 | 2.15M | if (Feptr >= mb->end_subject) |
1512 | 8.64k | { |
1513 | 8.64k | SCHECK_PARTIAL(); |
1514 | 8.64k | RRETURN(MATCH_NOMATCH); |
1515 | 8.64k | } |
1516 | 2.14M | if (Lc != UCHAR21INCTEST(Feptr)) RRETURN(MATCH_NOMATCH); |
1517 | 2.14M | } |
1518 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
1519 | 0 | } |
1520 | 29.4M | else /* Maximize */ |
1521 | 29.4M | { |
1522 | 29.4M | Lstart_eptr = Feptr; |
1523 | 29.5M | for (i = Lmin; i < Lmax; i++) |
1524 | 29.4M | { |
1525 | 29.4M | if (Feptr >= mb->end_subject) |
1526 | 153k | { |
1527 | 153k | SCHECK_PARTIAL(); |
1528 | 153k | break; |
1529 | 153k | } |
1530 | | |
1531 | 29.3M | if (Lc != UCHAR21TEST(Feptr)) break; |
1532 | 102k | Feptr++; |
1533 | 102k | } |
1534 | | |
1535 | 29.4M | if (reptype != REPTYPE_POS) for (;;) |
1536 | 9.17M | { |
1537 | 9.17M | if (Feptr <= Lstart_eptr) break; |
1538 | 8.95k | RMATCH(Fecode, RM28); |
1539 | 8.93k | Feptr--; |
1540 | 8.93k | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1541 | 8.93k | } |
1542 | 29.4M | } |
1543 | 31.5M | } |
1544 | 34.2M | break; |
1545 | | |
1546 | 34.2M | #undef Loclength |
1547 | 34.2M | #undef Lstart_eptr |
1548 | 34.2M | #undef Lcharptr |
1549 | 34.2M | #undef Lmin |
1550 | 34.2M | #undef Lmax |
1551 | 34.2M | #undef Lc |
1552 | 34.2M | #undef Loc |
1553 | | |
1554 | | |
1555 | | /* ===================================================================== */ |
1556 | | /* Match a negated single one-byte character repeatedly. This is almost a |
1557 | | repeat of the code for a repeated single character, but I haven't found a |
1558 | | nice way of commoning these up that doesn't require a test of the |
1559 | | positive/negative option for each character match. Maybe that wouldn't add |
1560 | | very much to the time taken, but character matching *is* what this is all |
1561 | | about... */ |
1562 | | |
1563 | 34.2M | #define Lstart_eptr F->temp_sptr[0] |
1564 | 34.2M | #define Lmin F->temp_32[0] |
1565 | 34.2M | #define Lmax F->temp_32[1] |
1566 | 34.2M | #define Lc F->temp_32[2] |
1567 | 34.2M | #define Loc F->temp_32[3] |
1568 | | |
1569 | 34.2M | case OP_NOTEXACT: |
1570 | 0 | case OP_NOTEXACTI: |
1571 | 0 | Lmin = Lmax = GET2(Fecode, 1); |
1572 | 0 | Fecode += 1 + IMM2_SIZE; |
1573 | 0 | goto REPEATNOTCHAR; |
1574 | | |
1575 | 0 | case OP_NOTUPTO: |
1576 | 0 | case OP_NOTUPTOI: |
1577 | 0 | Lmin = 0; |
1578 | 0 | Lmax = GET2(Fecode, 1); |
1579 | 0 | reptype = REPTYPE_MAX; |
1580 | 0 | Fecode += 1 + IMM2_SIZE; |
1581 | 0 | goto REPEATNOTCHAR; |
1582 | | |
1583 | 0 | case OP_NOTMINUPTO: |
1584 | 0 | case OP_NOTMINUPTOI: |
1585 | 0 | Lmin = 0; |
1586 | 0 | Lmax = GET2(Fecode, 1); |
1587 | 0 | reptype = REPTYPE_MIN; |
1588 | 0 | Fecode += 1 + IMM2_SIZE; |
1589 | 0 | goto REPEATNOTCHAR; |
1590 | | |
1591 | 0 | case OP_NOTPOSSTAR: |
1592 | 0 | case OP_NOTPOSSTARI: |
1593 | 0 | reptype = REPTYPE_POS; |
1594 | 0 | Lmin = 0; |
1595 | 0 | Lmax = UINT32_MAX; |
1596 | 0 | Fecode++; |
1597 | 0 | goto REPEATNOTCHAR; |
1598 | | |
1599 | 345 | case OP_NOTPOSPLUS: |
1600 | 1.23k | case OP_NOTPOSPLUSI: |
1601 | 1.23k | reptype = REPTYPE_POS; |
1602 | 1.23k | Lmin = 1; |
1603 | 1.23k | Lmax = UINT32_MAX; |
1604 | 1.23k | Fecode++; |
1605 | 1.23k | goto REPEATNOTCHAR; |
1606 | | |
1607 | 32 | case OP_NOTPOSQUERY: |
1608 | 1.20k | case OP_NOTPOSQUERYI: |
1609 | 1.20k | reptype = REPTYPE_POS; |
1610 | 1.20k | Lmin = 0; |
1611 | 1.20k | Lmax = 1; |
1612 | 1.20k | Fecode++; |
1613 | 1.20k | goto REPEATNOTCHAR; |
1614 | | |
1615 | 0 | case OP_NOTPOSUPTO: |
1616 | 0 | case OP_NOTPOSUPTOI: |
1617 | 0 | reptype = REPTYPE_POS; |
1618 | 0 | Lmin = 0; |
1619 | 0 | Lmax = GET2(Fecode, 1); |
1620 | 0 | Fecode += 1 + IMM2_SIZE; |
1621 | 0 | goto REPEATNOTCHAR; |
1622 | | |
1623 | 157 | case OP_NOTSTAR: |
1624 | 970 | case OP_NOTSTARI: |
1625 | 970 | case OP_NOTMINSTAR: |
1626 | 1.28k | case OP_NOTMINSTARI: |
1627 | 2.37k | case OP_NOTPLUS: |
1628 | 122k | case OP_NOTPLUSI: |
1629 | 124k | case OP_NOTMINPLUS: |
1630 | 135k | case OP_NOTMINPLUSI: |
1631 | 136k | case OP_NOTQUERY: |
1632 | 149k | case OP_NOTQUERYI: |
1633 | 149k | case OP_NOTMINQUERY: |
1634 | 285k | case OP_NOTMINQUERYI: |
1635 | 285k | fc = *Fecode++ - ((Fop >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR); |
1636 | 285k | Lmin = rep_min[fc]; |
1637 | 285k | Lmax = rep_max[fc]; |
1638 | 285k | reptype = rep_typ[fc]; |
1639 | | |
1640 | | /* Common code for all repeated single-character non-matches. */ |
1641 | | |
1642 | 288k | REPEATNOTCHAR: |
1643 | 288k | GETCHARINCTEST(Lc, Fecode); |
1644 | | |
1645 | | /* The code is duplicated for the caseless and caseful cases, for speed, |
1646 | | since matching characters is likely to be quite common. First, ensure the |
1647 | | minimum number of matches are present. If Lmin = Lmax, we are done. |
1648 | | Otherwise, if minimizing, keep trying the rest of the expression and |
1649 | | advancing one matching character if failing, up to the maximum. |
1650 | | Alternatively, if maximizing, find the maximum number of characters and |
1651 | | work backwards. */ |
1652 | | |
1653 | 288k | if (Fop >= OP_NOTSTARI) /* Caseless */ |
1654 | 284k | { |
1655 | 284k | #ifdef SUPPORT_UNICODE |
1656 | 284k | if ((utf || ucp) && Lc > 127) |
1657 | 0 | Loc = UCD_OTHERCASE(Lc); |
1658 | 284k | else |
1659 | 284k | #endif /* SUPPORT_UNICODE */ |
1660 | | |
1661 | 284k | Loc = TABLE_GET(Lc, mb->fcc, Lc); /* Other case from table */ |
1662 | | |
1663 | 284k | #ifdef SUPPORT_UNICODE |
1664 | 284k | if (utf) |
1665 | 143k | { |
1666 | 143k | uint32_t d; |
1667 | 143k | for (i = 1; i <= Lmin; i++) |
1668 | 696 | { |
1669 | 696 | if (Feptr >= mb->end_subject) |
1670 | 54 | { |
1671 | 54 | SCHECK_PARTIAL(); |
1672 | 54 | RRETURN(MATCH_NOMATCH); |
1673 | 54 | } |
1674 | 642 | GETCHARINC(d, Feptr); |
1675 | 642 | if (Lc == d || Loc == d) RRETURN(MATCH_NOMATCH); |
1676 | 642 | } |
1677 | 143k | } |
1678 | 140k | else |
1679 | 140k | #endif /* SUPPORT_UNICODE */ |
1680 | | |
1681 | | /* Not UTF mode */ |
1682 | 140k | { |
1683 | 271k | for (i = 1; i <= Lmin; i++) |
1684 | 132k | { |
1685 | 132k | if (Feptr >= mb->end_subject) |
1686 | 682 | { |
1687 | 682 | SCHECK_PARTIAL(); |
1688 | 682 | RRETURN(MATCH_NOMATCH); |
1689 | 682 | } |
1690 | 131k | if (Lc == *Feptr || Loc == *Feptr) RRETURN(MATCH_NOMATCH); |
1691 | 130k | Feptr++; |
1692 | 130k | } |
1693 | 140k | } |
1694 | | |
1695 | 281k | if (Lmin == Lmax) continue; /* Finished for exact count */ |
1696 | | |
1697 | 281k | if (reptype == REPTYPE_MIN) |
1698 | 147k | { |
1699 | 147k | #ifdef SUPPORT_UNICODE |
1700 | 147k | if (utf) |
1701 | 135k | { |
1702 | 135k | uint32_t d; |
1703 | 135k | for (;;) |
1704 | 269k | { |
1705 | 269k | RMATCH(Fecode, RM204); |
1706 | 269k | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1707 | 269k | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
1708 | 142k | if (Feptr >= mb->end_subject) |
1709 | 1.39k | { |
1710 | 1.39k | SCHECK_PARTIAL(); |
1711 | 1.39k | RRETURN(MATCH_NOMATCH); |
1712 | 1.39k | } |
1713 | 141k | GETCHARINC(d, Feptr); |
1714 | 141k | if (Lc == d || Loc == d) RRETURN(MATCH_NOMATCH); |
1715 | 141k | } |
1716 | 135k | } |
1717 | 11.9k | else |
1718 | 11.9k | #endif /*SUPPORT_UNICODE */ |
1719 | | |
1720 | | /* Not UTF mode */ |
1721 | 11.9k | { |
1722 | 11.9k | for (;;) |
1723 | 500k | { |
1724 | 500k | RMATCH(Fecode, RM29); |
1725 | 500k | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1726 | 500k | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
1727 | 500k | if (Feptr >= mb->end_subject) |
1728 | 1.80k | { |
1729 | 1.80k | SCHECK_PARTIAL(); |
1730 | 1.80k | RRETURN(MATCH_NOMATCH); |
1731 | 1.80k | } |
1732 | 498k | if (Lc == *Feptr || Loc == *Feptr) RRETURN(MATCH_NOMATCH); |
1733 | 488k | Feptr++; |
1734 | 488k | } |
1735 | 11.9k | } |
1736 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
1737 | 0 | } |
1738 | | |
1739 | | /* Maximize case */ |
1740 | | |
1741 | 134k | else |
1742 | 134k | { |
1743 | 134k | Lstart_eptr = Feptr; |
1744 | | |
1745 | 134k | #ifdef SUPPORT_UNICODE |
1746 | 134k | if (utf) |
1747 | 7.90k | { |
1748 | 7.90k | uint32_t d; |
1749 | 17.1k | for (i = Lmin; i < Lmax; i++) |
1750 | 10.2k | { |
1751 | 10.2k | int len = 1; |
1752 | 10.2k | if (Feptr >= mb->end_subject) |
1753 | 447 | { |
1754 | 447 | SCHECK_PARTIAL(); |
1755 | 447 | break; |
1756 | 447 | } |
1757 | 9.80k | GETCHARLEN(d, Feptr, len); |
1758 | 9.80k | if (Lc == d || Loc == d) break; |
1759 | 9.28k | Feptr += len; |
1760 | 9.28k | } |
1761 | | |
1762 | | /* After \C in UTF mode, Lstart_eptr might be in the middle of a |
1763 | | Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't |
1764 | | go too far. */ |
1765 | | |
1766 | 7.90k | if (reptype != REPTYPE_POS) for(;;) |
1767 | 15.4k | { |
1768 | 15.4k | if (Feptr <= Lstart_eptr) break; |
1769 | 8.41k | RMATCH(Fecode, RM205); |
1770 | 8.41k | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1771 | 8.41k | Feptr--; |
1772 | 8.41k | BACKCHAR(Feptr); |
1773 | 8.41k | } |
1774 | 7.90k | } |
1775 | 126k | else |
1776 | 126k | #endif /* SUPPORT_UNICODE */ |
1777 | | |
1778 | | /* Not UTF mode */ |
1779 | 126k | { |
1780 | 6.72M | for (i = Lmin; i < Lmax; i++) |
1781 | 6.71M | { |
1782 | 6.71M | if (Feptr >= mb->end_subject) |
1783 | 35.9k | { |
1784 | 35.9k | SCHECK_PARTIAL(); |
1785 | 35.9k | break; |
1786 | 35.9k | } |
1787 | 6.68M | if (Lc == *Feptr || Loc == *Feptr) break; |
1788 | 6.59M | Feptr++; |
1789 | 6.59M | } |
1790 | 126k | if (reptype != REPTYPE_POS) for (;;) |
1791 | 6.71M | { |
1792 | 6.71M | if (Feptr == Lstart_eptr) break; |
1793 | 6.58M | RMATCH(Fecode, RM30); |
1794 | 6.58M | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1795 | 6.58M | Feptr--; |
1796 | 6.58M | } |
1797 | 126k | } |
1798 | 134k | } |
1799 | 281k | } |
1800 | | |
1801 | | /* Caseful comparisons */ |
1802 | | |
1803 | 3.86k | else |
1804 | 3.86k | { |
1805 | 3.86k | #ifdef SUPPORT_UNICODE |
1806 | 3.86k | if (utf) |
1807 | 186 | { |
1808 | 186 | uint32_t d; |
1809 | 186 | for (i = 1; i <= Lmin; i++) |
1810 | 0 | { |
1811 | 0 | if (Feptr >= mb->end_subject) |
1812 | 0 | { |
1813 | 0 | SCHECK_PARTIAL(); |
1814 | 0 | RRETURN(MATCH_NOMATCH); |
1815 | 0 | } |
1816 | 0 | GETCHARINC(d, Feptr); |
1817 | 0 | if (Lc == d) RRETURN(MATCH_NOMATCH); |
1818 | 0 | } |
1819 | 186 | } |
1820 | 3.67k | else |
1821 | 3.67k | #endif |
1822 | | /* Not UTF mode */ |
1823 | 3.67k | { |
1824 | 6.30k | for (i = 1; i <= Lmin; i++) |
1825 | 2.70k | { |
1826 | 2.70k | if (Feptr >= mb->end_subject) |
1827 | 0 | { |
1828 | 0 | SCHECK_PARTIAL(); |
1829 | 0 | RRETURN(MATCH_NOMATCH); |
1830 | 0 | } |
1831 | 2.70k | if (Lc == *Feptr++) RRETURN(MATCH_NOMATCH); |
1832 | 2.70k | } |
1833 | 3.67k | } |
1834 | | |
1835 | 3.77k | if (Lmin == Lmax) continue; |
1836 | | |
1837 | 3.77k | if (reptype == REPTYPE_MIN) |
1838 | 1.25k | { |
1839 | 1.25k | #ifdef SUPPORT_UNICODE |
1840 | 1.25k | if (utf) |
1841 | 0 | { |
1842 | 0 | uint32_t d; |
1843 | 0 | for (;;) |
1844 | 0 | { |
1845 | 0 | RMATCH(Fecode, RM206); |
1846 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1847 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
1848 | 0 | if (Feptr >= mb->end_subject) |
1849 | 0 | { |
1850 | 0 | SCHECK_PARTIAL(); |
1851 | 0 | RRETURN(MATCH_NOMATCH); |
1852 | 0 | } |
1853 | 0 | GETCHARINC(d, Feptr); |
1854 | 0 | if (Lc == d) RRETURN(MATCH_NOMATCH); |
1855 | 0 | } |
1856 | 0 | } |
1857 | 1.25k | else |
1858 | 1.25k | #endif |
1859 | | /* Not UTF mode */ |
1860 | 1.25k | { |
1861 | 1.25k | for (;;) |
1862 | 78.2k | { |
1863 | 78.2k | RMATCH(Fecode, RM31); |
1864 | 78.2k | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1865 | 78.2k | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
1866 | 78.2k | if (Feptr >= mb->end_subject) |
1867 | 351 | { |
1868 | 351 | SCHECK_PARTIAL(); |
1869 | 351 | RRETURN(MATCH_NOMATCH); |
1870 | 351 | } |
1871 | 77.9k | if (Lc == *Feptr++) RRETURN(MATCH_NOMATCH); |
1872 | 77.9k | } |
1873 | 1.25k | } |
1874 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
1875 | 0 | } |
1876 | | |
1877 | | /* Maximize case */ |
1878 | | |
1879 | 2.52k | else |
1880 | 2.52k | { |
1881 | 2.52k | Lstart_eptr = Feptr; |
1882 | | |
1883 | 2.52k | #ifdef SUPPORT_UNICODE |
1884 | 2.52k | if (utf) |
1885 | 186 | { |
1886 | 186 | uint32_t d; |
1887 | 369 | for (i = Lmin; i < Lmax; i++) |
1888 | 186 | { |
1889 | 186 | int len = 1; |
1890 | 186 | if (Feptr >= mb->end_subject) |
1891 | 0 | { |
1892 | 0 | SCHECK_PARTIAL(); |
1893 | 0 | break; |
1894 | 0 | } |
1895 | 186 | GETCHARLEN(d, Feptr, len); |
1896 | 186 | if (Lc == d) break; |
1897 | 183 | Feptr += len; |
1898 | 183 | } |
1899 | | |
1900 | | /* After \C in UTF mode, Lstart_eptr might be in the middle of a |
1901 | | Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't |
1902 | | go too far. */ |
1903 | | |
1904 | 186 | if (reptype != REPTYPE_POS) for(;;) |
1905 | 369 | { |
1906 | 369 | if (Feptr <= Lstart_eptr) break; |
1907 | 183 | RMATCH(Fecode, RM207); |
1908 | 183 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1909 | 183 | Feptr--; |
1910 | 183 | BACKCHAR(Feptr); |
1911 | 183 | } |
1912 | 186 | } |
1913 | 2.34k | else |
1914 | 2.34k | #endif |
1915 | | /* Not UTF mode */ |
1916 | 2.34k | { |
1917 | 31.4k | for (i = Lmin; i < Lmax; i++) |
1918 | 30.6k | { |
1919 | 30.6k | if (Feptr >= mb->end_subject) |
1920 | 490 | { |
1921 | 490 | SCHECK_PARTIAL(); |
1922 | 490 | break; |
1923 | 490 | } |
1924 | 30.1k | if (Lc == *Feptr) break; |
1925 | 29.0k | Feptr++; |
1926 | 29.0k | } |
1927 | 2.34k | if (reptype != REPTYPE_POS) for (;;) |
1928 | 28.6k | { |
1929 | 28.6k | if (Feptr == Lstart_eptr) break; |
1930 | 26.6k | RMATCH(Fecode, RM32); |
1931 | 26.6k | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1932 | 26.6k | Feptr--; |
1933 | 26.6k | } |
1934 | 2.34k | } |
1935 | 2.52k | } |
1936 | 3.77k | } |
1937 | 137k | break; |
1938 | | |
1939 | 137k | #undef Lstart_eptr |
1940 | 137k | #undef Lmin |
1941 | 137k | #undef Lmax |
1942 | 137k | #undef Lc |
1943 | 137k | #undef Loc |
1944 | | |
1945 | | |
1946 | | /* ===================================================================== */ |
1947 | | /* Match a bit-mapped character class, possibly repeatedly. These opcodes |
1948 | | are used when all the characters in the class have values in the range |
1949 | | 0-255, and either the matching is caseful, or the characters are in the |
1950 | | range 0-127 when UTF processing is enabled. The only difference between |
1951 | | OP_CLASS and OP_NCLASS occurs when a data character outside the range is |
1952 | | encountered. */ |
1953 | | |
1954 | 9.49M | #define Lmin F->temp_32[0] |
1955 | 8.63M | #define Lmax F->temp_32[1] |
1956 | 6.35M | #define Lstart_eptr F->temp_sptr[0] |
1957 | 6.46M | #define Lbyte_map_address F->temp_sptr[1] |
1958 | 4.16M | #define Lbyte_map ((const unsigned char *)Lbyte_map_address) |
1959 | | |
1960 | 369k | case OP_NCLASS: |
1961 | 2.29M | case OP_CLASS: |
1962 | 2.29M | { |
1963 | 2.29M | Lbyte_map_address = Fecode + 1; /* Save for matching */ |
1964 | 2.29M | Fecode += 1 + (32 / sizeof(PCRE2_UCHAR)); /* Advance past the item */ |
1965 | | |
1966 | | /* Look past the end of the item to see if there is repeat information |
1967 | | following. Then obey similar code to character type repeats. */ |
1968 | | |
1969 | 2.29M | switch (*Fecode) |
1970 | 2.29M | { |
1971 | 1.29M | case OP_CRSTAR: |
1972 | 1.33M | case OP_CRMINSTAR: |
1973 | 1.36M | case OP_CRPLUS: |
1974 | 1.36M | case OP_CRMINPLUS: |
1975 | 1.63M | case OP_CRQUERY: |
1976 | 1.71M | case OP_CRMINQUERY: |
1977 | 2.03M | case OP_CRPOSSTAR: |
1978 | 2.04M | case OP_CRPOSPLUS: |
1979 | 2.06M | case OP_CRPOSQUERY: |
1980 | 2.06M | fc = *Fecode++ - OP_CRSTAR; |
1981 | 2.06M | Lmin = rep_min[fc]; |
1982 | 2.06M | Lmax = rep_max[fc]; |
1983 | 2.06M | reptype = rep_typ[fc]; |
1984 | 2.06M | break; |
1985 | | |
1986 | 0 | case OP_CRRANGE: |
1987 | 0 | case OP_CRMINRANGE: |
1988 | 0 | case OP_CRPOSRANGE: |
1989 | 0 | Lmin = GET2(Fecode, 1); |
1990 | 0 | Lmax = GET2(Fecode, 1 + IMM2_SIZE); |
1991 | 0 | if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */ |
1992 | 0 | reptype = rep_typ[*Fecode - OP_CRSTAR]; |
1993 | 0 | Fecode += 1 + 2 * IMM2_SIZE; |
1994 | 0 | break; |
1995 | | |
1996 | 233k | default: /* No repeat follows */ |
1997 | 233k | Lmin = Lmax = 1; |
1998 | 233k | break; |
1999 | 2.29M | } |
2000 | | |
2001 | | /* First, ensure the minimum number of matches are present. */ |
2002 | | |
2003 | 2.29M | #ifdef SUPPORT_UNICODE |
2004 | 2.29M | if (utf) |
2005 | 113k | { |
2006 | 146k | for (i = 1; i <= Lmin; i++) |
2007 | 54.6k | { |
2008 | 54.6k | if (Feptr >= mb->end_subject) |
2009 | 612 | { |
2010 | 612 | SCHECK_PARTIAL(); |
2011 | 612 | RRETURN(MATCH_NOMATCH); |
2012 | 612 | } |
2013 | 54.0k | GETCHARINC(fc, Feptr); |
2014 | 54.0k | if (fc > 255) |
2015 | 1.07k | { |
2016 | 1.07k | if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH); |
2017 | 1.07k | } |
2018 | 52.9k | else |
2019 | 52.9k | if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH); |
2020 | 54.0k | } |
2021 | 113k | } |
2022 | 2.17M | else |
2023 | 2.17M | #endif |
2024 | | /* Not UTF mode */ |
2025 | 2.17M | { |
2026 | 2.31M | for (i = 1; i <= Lmin; i++) |
2027 | 217k | { |
2028 | 217k | if (Feptr >= mb->end_subject) |
2029 | 1.43k | { |
2030 | 1.43k | SCHECK_PARTIAL(); |
2031 | 1.43k | RRETURN(MATCH_NOMATCH); |
2032 | 1.43k | } |
2033 | 215k | fc = *Feptr++; |
2034 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
2035 | | if (fc > 255) |
2036 | | { |
2037 | | if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH); |
2038 | | } |
2039 | | else |
2040 | | #endif |
2041 | 215k | if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH); |
2042 | 215k | } |
2043 | 2.17M | } |
2044 | | |
2045 | | /* If Lmax == Lmin we are done. Continue with main loop. */ |
2046 | | |
2047 | 2.19M | if (Lmin == Lmax) continue; |
2048 | | |
2049 | | /* If minimizing, keep testing the rest of the expression and advancing |
2050 | | the pointer while it matches the class. */ |
2051 | | |
2052 | 2.03M | if (reptype == REPTYPE_MIN) |
2053 | 124k | { |
2054 | 124k | #ifdef SUPPORT_UNICODE |
2055 | 124k | if (utf) |
2056 | 38.4k | { |
2057 | 38.4k | for (;;) |
2058 | 59.7k | { |
2059 | 59.7k | RMATCH(Fecode, RM200); |
2060 | 59.7k | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
2061 | 59.7k | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
2062 | 48.1k | if (Feptr >= mb->end_subject) |
2063 | 1.00k | { |
2064 | 1.00k | SCHECK_PARTIAL(); |
2065 | 1.00k | RRETURN(MATCH_NOMATCH); |
2066 | 1.00k | } |
2067 | 47.1k | GETCHARINC(fc, Feptr); |
2068 | 47.1k | if (fc > 255) |
2069 | 1.26k | { |
2070 | 1.26k | if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH); |
2071 | 1.26k | } |
2072 | 45.8k | else |
2073 | 45.8k | if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH); |
2074 | 47.1k | } |
2075 | 38.4k | } |
2076 | 86.5k | else |
2077 | 86.5k | #endif |
2078 | | /* Not UTF mode */ |
2079 | 86.5k | { |
2080 | 86.5k | for (;;) |
2081 | 573k | { |
2082 | 573k | RMATCH(Fecode, RM23); |
2083 | 573k | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
2084 | 573k | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
2085 | 537k | if (Feptr >= mb->end_subject) |
2086 | 6.62k | { |
2087 | 6.62k | SCHECK_PARTIAL(); |
2088 | 6.62k | RRETURN(MATCH_NOMATCH); |
2089 | 6.62k | } |
2090 | 530k | fc = *Feptr++; |
2091 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
2092 | | if (fc > 255) |
2093 | | { |
2094 | | if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH); |
2095 | | } |
2096 | | else |
2097 | | #endif |
2098 | 530k | if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH); |
2099 | 530k | } |
2100 | 86.5k | } |
2101 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
2102 | 0 | } |
2103 | | |
2104 | | /* If maximizing, find the longest possible run, then work backwards. */ |
2105 | | |
2106 | 1.91M | else |
2107 | 1.91M | { |
2108 | 1.91M | Lstart_eptr = Feptr; |
2109 | | |
2110 | 1.91M | #ifdef SUPPORT_UNICODE |
2111 | 1.91M | if (utf) |
2112 | 26.8k | { |
2113 | 243k | for (i = Lmin; i < Lmax; i++) |
2114 | 241k | { |
2115 | 241k | int len = 1; |
2116 | 241k | if (Feptr >= mb->end_subject) |
2117 | 2.39k | { |
2118 | 2.39k | SCHECK_PARTIAL(); |
2119 | 2.39k | break; |
2120 | 2.39k | } |
2121 | 238k | GETCHARLEN(fc, Feptr, len); |
2122 | 238k | if (fc > 255) |
2123 | 9.18k | { |
2124 | 9.18k | if (Fop == OP_CLASS) break; |
2125 | 9.18k | } |
2126 | 229k | else |
2127 | 229k | if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) break; |
2128 | 216k | Feptr += len; |
2129 | 216k | } |
2130 | | |
2131 | 26.8k | if (reptype == REPTYPE_POS) continue; /* No backtracking */ |
2132 | | |
2133 | | /* After \C in UTF mode, Lstart_eptr might be in the middle of a |
2134 | | Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't |
2135 | | go too far. */ |
2136 | | |
2137 | 24.4k | for (;;) |
2138 | 235k | { |
2139 | 235k | RMATCH(Fecode, RM201); |
2140 | 235k | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
2141 | 235k | if (Feptr-- <= Lstart_eptr) break; /* Tried at original position */ |
2142 | 211k | BACKCHAR(Feptr); |
2143 | 211k | } |
2144 | 24.4k | } |
2145 | 1.88M | else |
2146 | 1.88M | #endif |
2147 | | /* Not UTF mode */ |
2148 | 1.88M | { |
2149 | 3.27M | for (i = Lmin; i < Lmax; i++) |
2150 | 3.11M | { |
2151 | 3.11M | if (Feptr >= mb->end_subject) |
2152 | 22.6k | { |
2153 | 22.6k | SCHECK_PARTIAL(); |
2154 | 22.6k | break; |
2155 | 22.6k | } |
2156 | 3.09M | fc = *Feptr; |
2157 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
2158 | | if (fc > 255) |
2159 | | { |
2160 | | if (Fop == OP_CLASS) break; |
2161 | | } |
2162 | | else |
2163 | | #endif |
2164 | 3.09M | if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) break; |
2165 | 1.38M | Feptr++; |
2166 | 1.38M | } |
2167 | | |
2168 | 1.88M | if (reptype == REPTYPE_POS) continue; /* No backtracking */ |
2169 | | |
2170 | 4.20M | while (Feptr >= Lstart_eptr) |
2171 | 2.64M | { |
2172 | 2.64M | RMATCH(Fecode, RM24); |
2173 | 2.64M | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
2174 | 2.64M | Feptr--; |
2175 | 2.64M | } |
2176 | 1.55M | } |
2177 | | |
2178 | 1.58M | RRETURN(MATCH_NOMATCH); |
2179 | 1.58M | } |
2180 | 2.03M | } |
2181 | | |
2182 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
2183 | |
|
2184 | 0 | #undef Lbyte_map_address |
2185 | 0 | #undef Lbyte_map |
2186 | 0 | #undef Lstart_eptr |
2187 | 0 | #undef Lmin |
2188 | 0 | #undef Lmax |
2189 | | |
2190 | | |
2191 | | /* ===================================================================== */ |
2192 | | /* Match an extended character class. In the 8-bit library, this opcode is |
2193 | | encountered only when UTF-8 mode is supported. In the 16-bit and |
2194 | | 32-bit libraries, codepoints greater than 255 may be encountered even when |
2195 | | UTF is not supported. */ |
2196 | |
|
2197 | 751k | #define Lstart_eptr F->temp_sptr[0] |
2198 | 1.13M | #define Lxclass_data F->temp_sptr[1] |
2199 | 1.11M | #define Lmin F->temp_32[0] |
2200 | 1.40M | #define Lmax F->temp_32[1] |
2201 | |
|
2202 | 0 | #ifdef SUPPORT_WIDE_CHARS |
2203 | 270k | case OP_XCLASS: |
2204 | 270k | { |
2205 | 270k | Lxclass_data = Fecode + 1 + LINK_SIZE; /* Save for matching */ |
2206 | 270k | Fecode += GET(Fecode, 1); /* Advance past the item */ |
2207 | | |
2208 | 270k | switch (*Fecode) |
2209 | 270k | { |
2210 | 9.59k | case OP_CRSTAR: |
2211 | 13.0k | case OP_CRMINSTAR: |
2212 | 20.0k | case OP_CRPLUS: |
2213 | 20.9k | case OP_CRMINPLUS: |
2214 | 120k | case OP_CRQUERY: |
2215 | 157k | case OP_CRMINQUERY: |
2216 | 164k | case OP_CRPOSSTAR: |
2217 | 167k | case OP_CRPOSPLUS: |
2218 | 172k | case OP_CRPOSQUERY: |
2219 | 172k | fc = *Fecode++ - OP_CRSTAR; |
2220 | 172k | Lmin = rep_min[fc]; |
2221 | 172k | Lmax = rep_max[fc]; |
2222 | 172k | reptype = rep_typ[fc]; |
2223 | 172k | break; |
2224 | | |
2225 | 0 | case OP_CRRANGE: |
2226 | 0 | case OP_CRMINRANGE: |
2227 | 0 | case OP_CRPOSRANGE: |
2228 | 0 | Lmin = GET2(Fecode, 1); |
2229 | 0 | Lmax = GET2(Fecode, 1 + IMM2_SIZE); |
2230 | 0 | if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */ |
2231 | 0 | reptype = rep_typ[*Fecode - OP_CRSTAR]; |
2232 | 0 | Fecode += 1 + 2 * IMM2_SIZE; |
2233 | 0 | break; |
2234 | | |
2235 | 97.9k | default: /* No repeat follows */ |
2236 | 97.9k | Lmin = Lmax = 1; |
2237 | 97.9k | break; |
2238 | 270k | } |
2239 | | |
2240 | | /* First, ensure the minimum number of matches are present. */ |
2241 | | |
2242 | 351k | for (i = 1; i <= Lmin; i++) |
2243 | 108k | { |
2244 | 108k | if (Feptr >= mb->end_subject) |
2245 | 1.66k | { |
2246 | 1.66k | SCHECK_PARTIAL(); |
2247 | 1.66k | RRETURN(MATCH_NOMATCH); |
2248 | 1.66k | } |
2249 | 107k | GETCHARINCTEST(fc, Feptr); |
2250 | 107k | if (!PRIV(xclass)(fc, Lxclass_data, |
2251 | 107k | (const uint8_t*)mb->start_code, utf)) |
2252 | 26.0k | RRETURN(MATCH_NOMATCH); |
2253 | 107k | } |
2254 | | |
2255 | | /* If Lmax == Lmin we can just continue with the main loop. */ |
2256 | | |
2257 | 242k | if (Lmin == Lmax) continue; |
2258 | | |
2259 | | /* If minimizing, keep testing the rest of the expression and advancing |
2260 | | the pointer while it matches the class. */ |
2261 | | |
2262 | 167k | if (reptype == REPTYPE_MIN) |
2263 | 41.4k | { |
2264 | 41.4k | for (;;) |
2265 | 128k | { |
2266 | 128k | RMATCH(Fecode, RM100); |
2267 | 128k | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
2268 | 128k | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
2269 | 100k | if (Feptr >= mb->end_subject) |
2270 | 485 | { |
2271 | 485 | SCHECK_PARTIAL(); |
2272 | 485 | RRETURN(MATCH_NOMATCH); |
2273 | 485 | } |
2274 | 100k | GETCHARINCTEST(fc, Feptr); |
2275 | 100k | if (!PRIV(xclass)(fc, Lxclass_data, |
2276 | 100k | (const uint8_t*)mb->start_code, utf)) |
2277 | 12.8k | RRETURN(MATCH_NOMATCH); |
2278 | 100k | } |
2279 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
2280 | 0 | } |
2281 | | |
2282 | | /* If maximizing, find the longest possible run, then work backwards. */ |
2283 | | |
2284 | 125k | else |
2285 | 125k | { |
2286 | 125k | Lstart_eptr = Feptr; |
2287 | 758k | for (i = Lmin; i < Lmax; i++) |
2288 | 661k | { |
2289 | 661k | int len = 1; |
2290 | 661k | if (Feptr >= mb->end_subject) |
2291 | 8.65k | { |
2292 | 8.65k | SCHECK_PARTIAL(); |
2293 | 8.65k | break; |
2294 | 8.65k | } |
2295 | 652k | #ifdef SUPPORT_UNICODE |
2296 | 652k | GETCHARLENTEST(fc, Feptr, len); |
2297 | | #else |
2298 | | fc = *Feptr; |
2299 | | #endif |
2300 | 652k | if (!PRIV(xclass)(fc, Lxclass_data, |
2301 | 652k | (const uint8_t*)mb->start_code, utf)) break; |
2302 | 633k | Feptr += len; |
2303 | 633k | } |
2304 | | |
2305 | 125k | if (reptype == REPTYPE_POS) continue; /* No backtracking */ |
2306 | | |
2307 | | /* After \C in UTF mode, Lstart_eptr might be in the middle of a |
2308 | | Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't |
2309 | | go too far. */ |
2310 | | |
2311 | 113k | for(;;) |
2312 | 625k | { |
2313 | 625k | RMATCH(Fecode, RM101); |
2314 | 625k | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
2315 | 625k | if (Feptr-- <= Lstart_eptr) break; /* Tried at original position */ |
2316 | 512k | #ifdef SUPPORT_UNICODE |
2317 | 512k | if (utf) BACKCHAR(Feptr); |
2318 | 512k | #endif |
2319 | 512k | } |
2320 | 113k | RRETURN(MATCH_NOMATCH); |
2321 | 113k | } |
2322 | | |
2323 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
2324 | 0 | } |
2325 | 0 | #endif /* SUPPORT_WIDE_CHARS: end of XCLASS */ |
2326 | | |
2327 | 0 | #undef Lstart_eptr |
2328 | 0 | #undef Lxclass_data |
2329 | 0 | #undef Lmin |
2330 | 0 | #undef Lmax |
2331 | | |
2332 | | |
2333 | | /* ===================================================================== */ |
2334 | | /* Match a complex, set-based character class. This opcode is used when |
2335 | | there is complex nesting or logical operations within the character |
2336 | | class. */ |
2337 | | |
2338 | 0 | #define Lstart_eptr F->temp_sptr[0] |
2339 | 0 | #define Leclass_data F->temp_sptr[1] |
2340 | 0 | #define Leclass_len F->temp_size |
2341 | 0 | #define Lmin F->temp_32[0] |
2342 | 0 | #define Lmax F->temp_32[1] |
2343 | | |
2344 | 0 | #ifdef SUPPORT_WIDE_CHARS |
2345 | 0 | case OP_ECLASS: |
2346 | 0 | { |
2347 | 0 | Leclass_data = Fecode + 1 + LINK_SIZE; /* Save for matching */ |
2348 | 0 | Fecode += GET(Fecode, 1); /* Advance past the item */ |
2349 | 0 | Leclass_len = (PCRE2_SIZE)(Fecode - Leclass_data); |
2350 | |
|
2351 | 0 | switch (*Fecode) |
2352 | 0 | { |
2353 | 0 | case OP_CRSTAR: |
2354 | 0 | case OP_CRMINSTAR: |
2355 | 0 | case OP_CRPLUS: |
2356 | 0 | case OP_CRMINPLUS: |
2357 | 0 | case OP_CRQUERY: |
2358 | 0 | case OP_CRMINQUERY: |
2359 | 0 | case OP_CRPOSSTAR: |
2360 | 0 | case OP_CRPOSPLUS: |
2361 | 0 | case OP_CRPOSQUERY: |
2362 | 0 | fc = *Fecode++ - OP_CRSTAR; |
2363 | 0 | Lmin = rep_min[fc]; |
2364 | 0 | Lmax = rep_max[fc]; |
2365 | 0 | reptype = rep_typ[fc]; |
2366 | 0 | break; |
2367 | | |
2368 | 0 | case OP_CRRANGE: |
2369 | 0 | case OP_CRMINRANGE: |
2370 | 0 | case OP_CRPOSRANGE: |
2371 | 0 | Lmin = GET2(Fecode, 1); |
2372 | 0 | Lmax = GET2(Fecode, 1 + IMM2_SIZE); |
2373 | 0 | if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */ |
2374 | 0 | reptype = rep_typ[*Fecode - OP_CRSTAR]; |
2375 | 0 | Fecode += 1 + 2 * IMM2_SIZE; |
2376 | 0 | break; |
2377 | | |
2378 | 0 | default: /* No repeat follows */ |
2379 | 0 | Lmin = Lmax = 1; |
2380 | 0 | break; |
2381 | 0 | } |
2382 | | |
2383 | | /* First, ensure the minimum number of matches are present. */ |
2384 | | |
2385 | 0 | for (i = 1; i <= Lmin; i++) |
2386 | 0 | { |
2387 | 0 | if (Feptr >= mb->end_subject) |
2388 | 0 | { |
2389 | 0 | SCHECK_PARTIAL(); |
2390 | 0 | RRETURN(MATCH_NOMATCH); |
2391 | 0 | } |
2392 | 0 | GETCHARINCTEST(fc, Feptr); |
2393 | 0 | if (!PRIV(eclass)(fc, Leclass_data, Leclass_data + Leclass_len, |
2394 | 0 | (const uint8_t*)mb->start_code, utf)) |
2395 | 0 | RRETURN(MATCH_NOMATCH); |
2396 | 0 | } |
2397 | | |
2398 | | /* If Lmax == Lmin we can just continue with the main loop. */ |
2399 | | |
2400 | 0 | if (Lmin == Lmax) continue; |
2401 | | |
2402 | | /* If minimizing, keep testing the rest of the expression and advancing |
2403 | | the pointer while it matches the class. */ |
2404 | | |
2405 | 0 | if (reptype == REPTYPE_MIN) |
2406 | 0 | { |
2407 | 0 | for (;;) |
2408 | 0 | { |
2409 | 0 | RMATCH(Fecode, RM102); |
2410 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
2411 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
2412 | 0 | if (Feptr >= mb->end_subject) |
2413 | 0 | { |
2414 | 0 | SCHECK_PARTIAL(); |
2415 | 0 | RRETURN(MATCH_NOMATCH); |
2416 | 0 | } |
2417 | 0 | GETCHARINCTEST(fc, Feptr); |
2418 | 0 | if (!PRIV(eclass)(fc, Leclass_data, Leclass_data + Leclass_len, |
2419 | 0 | (const uint8_t*)mb->start_code, utf)) |
2420 | 0 | RRETURN(MATCH_NOMATCH); |
2421 | 0 | } |
2422 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
2423 | 0 | } |
2424 | | |
2425 | | /* If maximizing, find the longest possible run, then work backwards. */ |
2426 | | |
2427 | 0 | else |
2428 | 0 | { |
2429 | 0 | Lstart_eptr = Feptr; |
2430 | 0 | for (i = Lmin; i < Lmax; i++) |
2431 | 0 | { |
2432 | 0 | int len = 1; |
2433 | 0 | if (Feptr >= mb->end_subject) |
2434 | 0 | { |
2435 | 0 | SCHECK_PARTIAL(); |
2436 | 0 | break; |
2437 | 0 | } |
2438 | 0 | #ifdef SUPPORT_UNICODE |
2439 | 0 | GETCHARLENTEST(fc, Feptr, len); |
2440 | | #else |
2441 | | fc = *Feptr; |
2442 | | #endif |
2443 | 0 | if (!PRIV(eclass)(fc, Leclass_data, Leclass_data + Leclass_len, |
2444 | 0 | (const uint8_t*)mb->start_code, utf)) |
2445 | 0 | break; |
2446 | 0 | Feptr += len; |
2447 | 0 | } |
2448 | | |
2449 | 0 | if (reptype == REPTYPE_POS) continue; /* No backtracking */ |
2450 | | |
2451 | | /* After \C in UTF mode, Lstart_eptr might be in the middle of a |
2452 | | Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't |
2453 | | go too far. */ |
2454 | | |
2455 | 0 | for(;;) |
2456 | 0 | { |
2457 | 0 | RMATCH(Fecode, RM103); |
2458 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
2459 | 0 | if (Feptr-- <= Lstart_eptr) break; /* Tried at original position */ |
2460 | 0 | #ifdef SUPPORT_UNICODE |
2461 | 0 | if (utf) BACKCHAR(Feptr); |
2462 | 0 | #endif |
2463 | 0 | } |
2464 | 0 | RRETURN(MATCH_NOMATCH); |
2465 | 0 | } |
2466 | | |
2467 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
2468 | 0 | } |
2469 | 0 | #endif /* SUPPORT_WIDE_CHARS: end of ECLASS */ |
2470 | | |
2471 | 0 | #undef Lstart_eptr |
2472 | 0 | #undef Leclass_data |
2473 | 0 | #undef Leclass_len |
2474 | 0 | #undef Lmin |
2475 | 0 | #undef Lmax |
2476 | | |
2477 | | |
2478 | | /* ===================================================================== */ |
2479 | | /* Match various character types when PCRE2_UCP is not set. These opcodes |
2480 | | are not generated when PCRE2_UCP is set - instead appropriate property |
2481 | | tests are compiled. */ |
2482 | | |
2483 | 221k | case OP_NOT_DIGIT: |
2484 | 221k | if (Feptr >= mb->end_subject) |
2485 | 7.72k | { |
2486 | 7.72k | SCHECK_PARTIAL(); |
2487 | 7.72k | RRETURN(MATCH_NOMATCH); |
2488 | 7.72k | } |
2489 | 213k | GETCHARINCTEST(fc, Feptr); |
2490 | 213k | if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_digit) != 0) |
2491 | 251 | RRETURN(MATCH_NOMATCH); |
2492 | 213k | Fecode++; |
2493 | 213k | break; |
2494 | | |
2495 | 191k | case OP_DIGIT: |
2496 | 191k | if (Feptr >= mb->end_subject) |
2497 | 7.42k | { |
2498 | 7.42k | SCHECK_PARTIAL(); |
2499 | 7.42k | RRETURN(MATCH_NOMATCH); |
2500 | 7.42k | } |
2501 | 184k | GETCHARINCTEST(fc, Feptr); |
2502 | 184k | if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_digit) == 0) |
2503 | 184k | RRETURN(MATCH_NOMATCH); |
2504 | 464 | Fecode++; |
2505 | 464 | break; |
2506 | | |
2507 | 397k | case OP_NOT_WHITESPACE: |
2508 | 397k | if (Feptr >= mb->end_subject) |
2509 | 1.53k | { |
2510 | 1.53k | SCHECK_PARTIAL(); |
2511 | 1.53k | RRETURN(MATCH_NOMATCH); |
2512 | 1.53k | } |
2513 | 395k | GETCHARINCTEST(fc, Feptr); |
2514 | 395k | if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_space) != 0) |
2515 | 14.4k | RRETURN(MATCH_NOMATCH); |
2516 | 381k | Fecode++; |
2517 | 381k | break; |
2518 | | |
2519 | 14.8k | case OP_WHITESPACE: |
2520 | 14.8k | if (Feptr >= mb->end_subject) |
2521 | 0 | { |
2522 | 0 | SCHECK_PARTIAL(); |
2523 | 0 | RRETURN(MATCH_NOMATCH); |
2524 | 0 | } |
2525 | 14.8k | GETCHARINCTEST(fc, Feptr); |
2526 | 14.8k | if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_space) == 0) |
2527 | 12.7k | RRETURN(MATCH_NOMATCH); |
2528 | 2.09k | Fecode++; |
2529 | 2.09k | break; |
2530 | | |
2531 | 5.82M | case OP_NOT_WORDCHAR: |
2532 | 5.82M | if (Feptr >= mb->end_subject) |
2533 | 91.8k | { |
2534 | 91.8k | SCHECK_PARTIAL(); |
2535 | 91.8k | RRETURN(MATCH_NOMATCH); |
2536 | 91.8k | } |
2537 | 5.73M | GETCHARINCTEST(fc, Feptr); |
2538 | 5.73M | if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0) |
2539 | 438k | RRETURN(MATCH_NOMATCH); |
2540 | 5.29M | Fecode++; |
2541 | 5.29M | break; |
2542 | | |
2543 | 5.77M | case OP_WORDCHAR: |
2544 | 5.77M | if (Feptr >= mb->end_subject) |
2545 | 1.10k | { |
2546 | 1.10k | SCHECK_PARTIAL(); |
2547 | 1.10k | RRETURN(MATCH_NOMATCH); |
2548 | 1.10k | } |
2549 | 5.77M | GETCHARINCTEST(fc, Feptr); |
2550 | 5.77M | if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_word) == 0) |
2551 | 5.07M | RRETURN(MATCH_NOMATCH); |
2552 | 694k | Fecode++; |
2553 | 694k | break; |
2554 | | |
2555 | 272M | case OP_ANYNL: |
2556 | 272M | if (Feptr >= mb->end_subject) |
2557 | 3.04M | { |
2558 | 3.04M | SCHECK_PARTIAL(); |
2559 | 3.04M | RRETURN(MATCH_NOMATCH); |
2560 | 3.04M | } |
2561 | 269M | GETCHARINCTEST(fc, Feptr); |
2562 | 269M | switch(fc) |
2563 | 269M | { |
2564 | 251M | default: RRETURN(MATCH_NOMATCH); |
2565 | | |
2566 | 52.4k | case CHAR_CR: |
2567 | 52.4k | if (Feptr >= mb->end_subject) |
2568 | 474 | { |
2569 | 474 | SCHECK_PARTIAL(); |
2570 | 474 | } |
2571 | 51.9k | else if (UCHAR21TEST(Feptr) == CHAR_LF) Feptr++; |
2572 | 52.4k | break; |
2573 | | |
2574 | 11.7M | case CHAR_LF: |
2575 | 11.7M | break; |
2576 | | |
2577 | 2.94M | case CHAR_VT: |
2578 | 5.89M | case CHAR_FF: |
2579 | 5.93M | case CHAR_NEL: |
2580 | 5.93M | #ifndef EBCDIC |
2581 | 5.93M | case 0x2028: |
2582 | 5.93M | case 0x2029: |
2583 | 5.93M | #endif /* Not EBCDIC */ |
2584 | 5.93M | if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH); |
2585 | 5.93M | break; |
2586 | 269M | } |
2587 | 17.7M | Fecode++; |
2588 | 17.7M | break; |
2589 | | |
2590 | 7.47M | case OP_NOT_HSPACE: |
2591 | 7.47M | if (Feptr >= mb->end_subject) |
2592 | 145k | { |
2593 | 145k | SCHECK_PARTIAL(); |
2594 | 145k | RRETURN(MATCH_NOMATCH); |
2595 | 145k | } |
2596 | 7.33M | GETCHARINCTEST(fc, Feptr); |
2597 | 7.33M | switch(fc) |
2598 | 7.33M | { |
2599 | 3.48M | HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */ |
2600 | 7.13M | default: break; |
2601 | 7.33M | } |
2602 | 7.13M | Fecode++; |
2603 | 7.13M | break; |
2604 | | |
2605 | 51.6k | case OP_HSPACE: |
2606 | 51.6k | if (Feptr >= mb->end_subject) |
2607 | 580 | { |
2608 | 580 | SCHECK_PARTIAL(); |
2609 | 580 | RRETURN(MATCH_NOMATCH); |
2610 | 580 | } |
2611 | 51.1k | GETCHARINCTEST(fc, Feptr); |
2612 | 51.1k | switch(fc) |
2613 | 51.1k | { |
2614 | 880 | HSPACE_CASES: break; /* Byte and multibyte cases */ |
2615 | 50.2k | default: RRETURN(MATCH_NOMATCH); |
2616 | 51.1k | } |
2617 | 880 | Fecode++; |
2618 | 880 | break; |
2619 | | |
2620 | 3.45M | case OP_NOT_VSPACE: |
2621 | 3.45M | if (Feptr >= mb->end_subject) |
2622 | 1.84k | { |
2623 | 1.84k | SCHECK_PARTIAL(); |
2624 | 1.84k | RRETURN(MATCH_NOMATCH); |
2625 | 1.84k | } |
2626 | 3.45M | GETCHARINCTEST(fc, Feptr); |
2627 | 3.45M | switch(fc) |
2628 | 3.45M | { |
2629 | 704k | VSPACE_CASES: RRETURN(MATCH_NOMATCH); |
2630 | 3.33M | default: break; |
2631 | 3.45M | } |
2632 | 3.33M | Fecode++; |
2633 | 3.33M | break; |
2634 | | |
2635 | 1.84M | case OP_VSPACE: |
2636 | 1.84M | if (Feptr >= mb->end_subject) |
2637 | 13.4k | { |
2638 | 13.4k | SCHECK_PARTIAL(); |
2639 | 13.4k | RRETURN(MATCH_NOMATCH); |
2640 | 13.4k | } |
2641 | 1.83M | GETCHARINCTEST(fc, Feptr); |
2642 | 1.83M | switch(fc) |
2643 | 1.83M | { |
2644 | 73.9k | VSPACE_CASES: break; |
2645 | 1.76M | default: RRETURN(MATCH_NOMATCH); |
2646 | 1.83M | } |
2647 | 73.9k | Fecode++; |
2648 | 73.9k | break; |
2649 | | |
2650 | | |
2651 | 0 | #ifdef SUPPORT_UNICODE |
2652 | | |
2653 | | /* ===================================================================== */ |
2654 | | /* Check the next character by Unicode property. We will get here only |
2655 | | if the support is in the binary; otherwise a compile-time error occurs. */ |
2656 | | |
2657 | 283k | case OP_PROP: |
2658 | 335k | case OP_NOTPROP: |
2659 | 335k | if (Feptr >= mb->end_subject) |
2660 | 2.40k | { |
2661 | 2.40k | SCHECK_PARTIAL(); |
2662 | 2.40k | RRETURN(MATCH_NOMATCH); |
2663 | 2.40k | } |
2664 | 333k | GETCHARINCTEST(fc, Feptr); |
2665 | 333k | { |
2666 | 333k | const uint32_t *cp; |
2667 | 333k | uint32_t chartype; |
2668 | 333k | const ucd_record *prop = GET_UCD(fc); |
2669 | 333k | BOOL notmatch = Fop == OP_NOTPROP; |
2670 | | |
2671 | 333k | switch(Fecode[1]) |
2672 | 333k | { |
2673 | 0 | case PT_LAMP: |
2674 | 0 | chartype = prop->chartype; |
2675 | 0 | if ((chartype == ucp_Lu || |
2676 | 0 | chartype == ucp_Ll || |
2677 | 0 | chartype == ucp_Lt) == notmatch) |
2678 | 0 | RRETURN(MATCH_NOMATCH); |
2679 | 0 | break; |
2680 | | |
2681 | 4.06k | case PT_GC: |
2682 | 4.06k | if ((Fecode[2] == PRIV(ucp_gentype)[prop->chartype]) == notmatch) |
2683 | 555 | RRETURN(MATCH_NOMATCH); |
2684 | 3.50k | break; |
2685 | | |
2686 | 3.50k | case PT_PC: |
2687 | 0 | if ((Fecode[2] == prop->chartype) == notmatch) |
2688 | 0 | RRETURN(MATCH_NOMATCH); |
2689 | 0 | break; |
2690 | | |
2691 | 0 | case PT_SC: |
2692 | 0 | if ((Fecode[2] == prop->script) == notmatch) |
2693 | 0 | RRETURN(MATCH_NOMATCH); |
2694 | 0 | break; |
2695 | | |
2696 | 0 | case PT_SCX: |
2697 | 0 | { |
2698 | 0 | BOOL ok = (Fecode[2] == prop->script || |
2699 | 0 | MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), Fecode[2]) != 0); |
2700 | 0 | if (ok == notmatch) RRETURN(MATCH_NOMATCH); |
2701 | 0 | } |
2702 | 0 | break; |
2703 | | |
2704 | | /* These are specials */ |
2705 | | |
2706 | 0 | case PT_ALNUM: |
2707 | 0 | chartype = prop->chartype; |
2708 | 0 | if ((PRIV(ucp_gentype)[chartype] == ucp_L || |
2709 | 0 | PRIV(ucp_gentype)[chartype] == ucp_N) == notmatch) |
2710 | 0 | RRETURN(MATCH_NOMATCH); |
2711 | 0 | break; |
2712 | | |
2713 | | /* Perl space used to exclude VT, but from Perl 5.18 it is included, |
2714 | | which means that Perl space and POSIX space are now identical. PCRE |
2715 | | was changed at release 8.34. */ |
2716 | | |
2717 | 52.5k | case PT_SPACE: /* Perl space */ |
2718 | 52.5k | case PT_PXSPACE: /* POSIX space */ |
2719 | 52.5k | switch(fc) |
2720 | 52.5k | { |
2721 | 122k | HSPACE_CASES: |
2722 | 122k | VSPACE_CASES: |
2723 | 63.2k | if (notmatch) RRETURN(MATCH_NOMATCH); |
2724 | 778 | break; |
2725 | | |
2726 | 43.4k | default: |
2727 | 43.4k | if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == notmatch) |
2728 | 5.51k | RRETURN(MATCH_NOMATCH); |
2729 | 37.9k | break; |
2730 | 52.5k | } |
2731 | 38.7k | break; |
2732 | | |
2733 | 115k | case PT_WORD: |
2734 | 115k | chartype = prop->chartype; |
2735 | 115k | if ((PRIV(ucp_gentype)[chartype] == ucp_L || |
2736 | 115k | PRIV(ucp_gentype)[chartype] == ucp_N || |
2737 | 115k | chartype == ucp_Mn || |
2738 | 115k | chartype == ucp_Pc) == notmatch) |
2739 | 70.4k | RRETURN(MATCH_NOMATCH); |
2740 | 45.1k | break; |
2741 | | |
2742 | 161k | case PT_CLIST: |
2743 | | #if PCRE2_CODE_UNIT_WIDTH == 32 |
2744 | | if (fc > MAX_UTF_CODE_POINT) |
2745 | | { |
2746 | | if (notmatch) break;; |
2747 | | RRETURN(MATCH_NOMATCH); |
2748 | | } |
2749 | | #endif |
2750 | 161k | cp = PRIV(ucd_caseless_sets) + Fecode[2]; |
2751 | 161k | for (;;) |
2752 | 211k | { |
2753 | 211k | if (fc < *cp) |
2754 | 154k | { if (notmatch) break; else { RRETURN(MATCH_NOMATCH); } } |
2755 | 56.5k | if (fc == *cp++) |
2756 | 6.42k | { if (notmatch) { RRETURN(MATCH_NOMATCH); } else break; } |
2757 | 56.5k | } |
2758 | 7.61k | break; |
2759 | | |
2760 | 7.61k | case PT_UCNC: |
2761 | 0 | if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT || |
2762 | 0 | fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) || |
2763 | 0 | fc >= 0xe000) == notmatch) |
2764 | 0 | RRETURN(MATCH_NOMATCH); |
2765 | 0 | break; |
2766 | | |
2767 | 0 | case PT_BIDICL: |
2768 | 0 | if ((UCD_BIDICLASS_PROP(prop) == Fecode[2]) == notmatch) |
2769 | 0 | RRETURN(MATCH_NOMATCH); |
2770 | 0 | break; |
2771 | | |
2772 | 0 | case PT_BOOL: |
2773 | 0 | { |
2774 | 0 | BOOL ok = MAPBIT(PRIV(ucd_boolprop_sets) + |
2775 | 0 | UCD_BPROPS_PROP(prop), Fecode[2]) != 0; |
2776 | 0 | if (ok == notmatch) RRETURN(MATCH_NOMATCH); |
2777 | 0 | } |
2778 | 0 | break; |
2779 | | |
2780 | | /* This should never occur */ |
2781 | | |
2782 | 0 | default: |
2783 | 0 | PCRE2_DEBUG_UNREACHABLE(); |
2784 | 0 | return PCRE2_ERROR_INTERNAL; |
2785 | 333k | } |
2786 | | |
2787 | 94.9k | Fecode += 3; |
2788 | 94.9k | } |
2789 | 0 | break; |
2790 | | |
2791 | | |
2792 | | /* ===================================================================== */ |
2793 | | /* Match an extended Unicode sequence. We will get here only if the support |
2794 | | is in the binary; otherwise a compile-time error occurs. */ |
2795 | | |
2796 | 61.1k | case OP_EXTUNI: |
2797 | 61.1k | if (Feptr >= mb->end_subject) |
2798 | 387 | { |
2799 | 387 | SCHECK_PARTIAL(); |
2800 | 387 | RRETURN(MATCH_NOMATCH); |
2801 | 387 | } |
2802 | 60.7k | else |
2803 | 60.7k | { |
2804 | 60.7k | GETCHARINCTEST(fc, Feptr); |
2805 | 60.7k | Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject, utf, |
2806 | 60.7k | NULL); |
2807 | 60.7k | } |
2808 | 60.7k | CHECK_PARTIAL(); |
2809 | 60.7k | Fecode++; |
2810 | 60.7k | break; |
2811 | | |
2812 | 0 | #endif /* SUPPORT_UNICODE */ |
2813 | | |
2814 | | |
2815 | | /* ===================================================================== */ |
2816 | | /* Match a single character type repeatedly. Note that the property type |
2817 | | does not need to be in a stack frame as it is not used within an RMATCH() |
2818 | | loop. */ |
2819 | | |
2820 | 834M | #define Lstart_eptr F->temp_sptr[0] |
2821 | 282M | #define Lmin F->temp_32[0] |
2822 | 263M | #define Lmax F->temp_32[1] |
2823 | 739M | #define Lctype F->temp_32[2] |
2824 | 2.40M | #define Lpropvalue F->temp_32[3] |
2825 | | |
2826 | 0 | case OP_TYPEEXACT: |
2827 | 0 | Lmin = Lmax = GET2(Fecode, 1); |
2828 | 0 | Fecode += 1 + IMM2_SIZE; |
2829 | 0 | goto REPEATTYPE; |
2830 | | |
2831 | 0 | case OP_TYPEUPTO: |
2832 | 0 | case OP_TYPEMINUPTO: |
2833 | 0 | Lmin = 0; |
2834 | 0 | Lmax = GET2(Fecode, 1); |
2835 | 0 | reptype = (*Fecode == OP_TYPEMINUPTO)? REPTYPE_MIN : REPTYPE_MAX; |
2836 | 0 | Fecode += 1 + IMM2_SIZE; |
2837 | 0 | goto REPEATTYPE; |
2838 | | |
2839 | 7.40k | case OP_TYPEPOSSTAR: |
2840 | 7.40k | reptype = REPTYPE_POS; |
2841 | 7.40k | Lmin = 0; |
2842 | 7.40k | Lmax = UINT32_MAX; |
2843 | 7.40k | Fecode++; |
2844 | 7.40k | goto REPEATTYPE; |
2845 | | |
2846 | 3.59M | case OP_TYPEPOSPLUS: |
2847 | 3.59M | reptype = REPTYPE_POS; |
2848 | 3.59M | Lmin = 1; |
2849 | 3.59M | Lmax = UINT32_MAX; |
2850 | 3.59M | Fecode++; |
2851 | 3.59M | goto REPEATTYPE; |
2852 | | |
2853 | 47.1M | case OP_TYPEPOSQUERY: |
2854 | 47.1M | reptype = REPTYPE_POS; |
2855 | 47.1M | Lmin = 0; |
2856 | 47.1M | Lmax = 1; |
2857 | 47.1M | Fecode++; |
2858 | 47.1M | goto REPEATTYPE; |
2859 | | |
2860 | 0 | case OP_TYPEPOSUPTO: |
2861 | 0 | reptype = REPTYPE_POS; |
2862 | 0 | Lmin = 0; |
2863 | 0 | Lmax = GET2(Fecode, 1); |
2864 | 0 | Fecode += 1 + IMM2_SIZE; |
2865 | 0 | goto REPEATTYPE; |
2866 | | |
2867 | 17.0k | case OP_TYPESTAR: |
2868 | 17.8k | case OP_TYPEMINSTAR: |
2869 | 6.49M | case OP_TYPEPLUS: |
2870 | 6.91M | case OP_TYPEMINPLUS: |
2871 | 13.8M | case OP_TYPEQUERY: |
2872 | 14.0M | case OP_TYPEMINQUERY: |
2873 | 14.0M | fc = *Fecode++ - OP_TYPESTAR; |
2874 | 14.0M | Lmin = rep_min[fc]; |
2875 | 14.0M | Lmax = rep_max[fc]; |
2876 | 14.0M | reptype = rep_typ[fc]; |
2877 | | |
2878 | | /* Common code for all repeated character type matches. */ |
2879 | | |
2880 | 64.7M | REPEATTYPE: |
2881 | 64.7M | Lctype = *Fecode++; /* Code for the character type */ |
2882 | | |
2883 | 64.7M | #ifdef SUPPORT_UNICODE |
2884 | 64.7M | if (Lctype == OP_PROP || Lctype == OP_NOTPROP) |
2885 | 1.83M | { |
2886 | 1.83M | proptype = *Fecode++; |
2887 | 1.83M | Lpropvalue = *Fecode++; |
2888 | 1.83M | } |
2889 | 62.9M | else proptype = -1; |
2890 | 64.7M | #endif |
2891 | | |
2892 | | /* First, ensure the minimum number of matches are present. Use inline |
2893 | | code for maximizing the speed, and do the type test once at the start |
2894 | | (i.e. keep it out of the loops). As there are no calls to RMATCH in the |
2895 | | loops, we can use an ordinary variable for "notmatch". The code for UTF |
2896 | | mode is separated out for tidiness, except for Unicode property tests. */ |
2897 | | |
2898 | 64.7M | if (Lmin > 0) |
2899 | 10.4M | { |
2900 | 10.4M | #ifdef SUPPORT_UNICODE |
2901 | 10.4M | if (proptype >= 0) /* Property tests in all modes */ |
2902 | 1.80M | { |
2903 | 1.80M | BOOL notmatch = Lctype == OP_NOTPROP; |
2904 | 1.80M | switch(proptype) |
2905 | 1.80M | { |
2906 | 0 | case PT_LAMP: |
2907 | 0 | for (i = 1; i <= Lmin; i++) |
2908 | 0 | { |
2909 | 0 | int chartype; |
2910 | 0 | if (Feptr >= mb->end_subject) |
2911 | 0 | { |
2912 | 0 | SCHECK_PARTIAL(); |
2913 | 0 | RRETURN(MATCH_NOMATCH); |
2914 | 0 | } |
2915 | 0 | GETCHARINCTEST(fc, Feptr); |
2916 | 0 | chartype = UCD_CHARTYPE(fc); |
2917 | 0 | if ((chartype == ucp_Lu || |
2918 | 0 | chartype == ucp_Ll || |
2919 | 0 | chartype == ucp_Lt) == notmatch) |
2920 | 0 | RRETURN(MATCH_NOMATCH); |
2921 | 0 | } |
2922 | 0 | break; |
2923 | | |
2924 | 7.89k | case PT_GC: |
2925 | 15.2k | for (i = 1; i <= Lmin; i++) |
2926 | 7.89k | { |
2927 | 7.89k | if (Feptr >= mb->end_subject) |
2928 | 0 | { |
2929 | 0 | SCHECK_PARTIAL(); |
2930 | 0 | RRETURN(MATCH_NOMATCH); |
2931 | 0 | } |
2932 | 7.89k | GETCHARINCTEST(fc, Feptr); |
2933 | 7.89k | if ((UCD_CATEGORY(fc) == Lpropvalue) == notmatch) |
2934 | 566 | RRETURN(MATCH_NOMATCH); |
2935 | 7.89k | } |
2936 | 7.32k | break; |
2937 | | |
2938 | 337k | case PT_PC: |
2939 | 350k | for (i = 1; i <= Lmin; i++) |
2940 | 337k | { |
2941 | 337k | if (Feptr >= mb->end_subject) |
2942 | 0 | { |
2943 | 0 | SCHECK_PARTIAL(); |
2944 | 0 | RRETURN(MATCH_NOMATCH); |
2945 | 0 | } |
2946 | 337k | GETCHARINCTEST(fc, Feptr); |
2947 | 337k | if ((UCD_CHARTYPE(fc) == Lpropvalue) == notmatch) |
2948 | 325k | RRETURN(MATCH_NOMATCH); |
2949 | 337k | } |
2950 | 12.2k | break; |
2951 | | |
2952 | 12.2k | case PT_SC: |
2953 | 0 | for (i = 1; i <= Lmin; i++) |
2954 | 0 | { |
2955 | 0 | if (Feptr >= mb->end_subject) |
2956 | 0 | { |
2957 | 0 | SCHECK_PARTIAL(); |
2958 | 0 | RRETURN(MATCH_NOMATCH); |
2959 | 0 | } |
2960 | 0 | GETCHARINCTEST(fc, Feptr); |
2961 | 0 | if ((UCD_SCRIPT(fc) == Lpropvalue) == notmatch) |
2962 | 0 | RRETURN(MATCH_NOMATCH); |
2963 | 0 | } |
2964 | 0 | break; |
2965 | | |
2966 | 0 | case PT_SCX: |
2967 | 0 | for (i = 1; i <= Lmin; i++) |
2968 | 0 | { |
2969 | 0 | BOOL ok; |
2970 | 0 | const ucd_record *prop; |
2971 | 0 | if (Feptr >= mb->end_subject) |
2972 | 0 | { |
2973 | 0 | SCHECK_PARTIAL(); |
2974 | 0 | RRETURN(MATCH_NOMATCH); |
2975 | 0 | } |
2976 | 0 | GETCHARINCTEST(fc, Feptr); |
2977 | 0 | prop = GET_UCD(fc); |
2978 | 0 | ok = (prop->script == Lpropvalue || |
2979 | 0 | MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), Lpropvalue) != 0); |
2980 | 0 | if (ok == notmatch) |
2981 | 0 | RRETURN(MATCH_NOMATCH); |
2982 | 0 | } |
2983 | 0 | break; |
2984 | | |
2985 | 0 | case PT_ALNUM: |
2986 | 0 | for (i = 1; i <= Lmin; i++) |
2987 | 0 | { |
2988 | 0 | int category; |
2989 | 0 | if (Feptr >= mb->end_subject) |
2990 | 0 | { |
2991 | 0 | SCHECK_PARTIAL(); |
2992 | 0 | RRETURN(MATCH_NOMATCH); |
2993 | 0 | } |
2994 | 0 | GETCHARINCTEST(fc, Feptr); |
2995 | 0 | category = UCD_CATEGORY(fc); |
2996 | 0 | if ((category == ucp_L || category == ucp_N) == notmatch) |
2997 | 0 | RRETURN(MATCH_NOMATCH); |
2998 | 0 | } |
2999 | 0 | break; |
3000 | | |
3001 | | /* Perl space used to exclude VT, but from Perl 5.18 it is included, |
3002 | | which means that Perl space and POSIX space are now identical. PCRE |
3003 | | was changed at release 8.34. */ |
3004 | | |
3005 | 1.39M | case PT_SPACE: /* Perl space */ |
3006 | 1.39M | case PT_PXSPACE: /* POSIX space */ |
3007 | 2.02M | for (i = 1; i <= Lmin; i++) |
3008 | 1.39M | { |
3009 | 1.39M | if (Feptr >= mb->end_subject) |
3010 | 9.21k | { |
3011 | 9.21k | SCHECK_PARTIAL(); |
3012 | 9.21k | RRETURN(MATCH_NOMATCH); |
3013 | 9.21k | } |
3014 | 1.39M | GETCHARINCTEST(fc, Feptr); |
3015 | 1.39M | switch(fc) |
3016 | 1.39M | { |
3017 | 5.97M | HSPACE_CASES: |
3018 | 5.97M | VSPACE_CASES: |
3019 | 2.79M | if (notmatch) RRETURN(MATCH_NOMATCH); |
3020 | 342k | break; |
3021 | | |
3022 | 988k | default: |
3023 | 988k | if ((UCD_CATEGORY(fc) == ucp_Z) == notmatch) |
3024 | 703k | RRETURN(MATCH_NOMATCH); |
3025 | 284k | break; |
3026 | 1.39M | } |
3027 | 1.39M | } |
3028 | 627k | break; |
3029 | | |
3030 | 627k | case PT_WORD: |
3031 | 99.3k | for (i = 1; i <= Lmin; i++) |
3032 | 62.9k | { |
3033 | 62.9k | int chartype, category; |
3034 | 62.9k | if (Feptr >= mb->end_subject) |
3035 | 1.15k | { |
3036 | 1.15k | SCHECK_PARTIAL(); |
3037 | 1.15k | RRETURN(MATCH_NOMATCH); |
3038 | 1.15k | } |
3039 | 61.7k | GETCHARINCTEST(fc, Feptr); |
3040 | 61.7k | chartype = UCD_CHARTYPE(fc); |
3041 | 61.7k | category = PRIV(ucp_gentype)[chartype]; |
3042 | 61.7k | if ((category == ucp_L || category == ucp_N || |
3043 | 61.7k | chartype == ucp_Mn || chartype == ucp_Pc) == notmatch) |
3044 | 25.2k | RRETURN(MATCH_NOMATCH); |
3045 | 61.7k | } |
3046 | 36.4k | break; |
3047 | | |
3048 | 36.4k | case PT_CLIST: |
3049 | 0 | for (i = 1; i <= Lmin; i++) |
3050 | 0 | { |
3051 | 0 | const uint32_t *cp; |
3052 | 0 | if (Feptr >= mb->end_subject) |
3053 | 0 | { |
3054 | 0 | SCHECK_PARTIAL(); |
3055 | 0 | RRETURN(MATCH_NOMATCH); |
3056 | 0 | } |
3057 | 0 | GETCHARINCTEST(fc, Feptr); |
3058 | | #if PCRE2_CODE_UNIT_WIDTH == 32 |
3059 | | if (fc > MAX_UTF_CODE_POINT) |
3060 | | { |
3061 | | if (notmatch) continue; |
3062 | | RRETURN(MATCH_NOMATCH); |
3063 | | } |
3064 | | #endif |
3065 | 0 | cp = PRIV(ucd_caseless_sets) + Lpropvalue; |
3066 | 0 | for (;;) |
3067 | 0 | { |
3068 | 0 | if (fc < *cp) |
3069 | 0 | { |
3070 | 0 | if (notmatch) break; |
3071 | 0 | RRETURN(MATCH_NOMATCH); |
3072 | 0 | } |
3073 | 0 | if (fc == *cp++) |
3074 | 0 | { |
3075 | 0 | if (notmatch) RRETURN(MATCH_NOMATCH); |
3076 | 0 | break; |
3077 | 0 | } |
3078 | 0 | } |
3079 | 0 | } |
3080 | 0 | break; |
3081 | | |
3082 | 0 | case PT_UCNC: |
3083 | 0 | for (i = 1; i <= Lmin; i++) |
3084 | 0 | { |
3085 | 0 | if (Feptr >= mb->end_subject) |
3086 | 0 | { |
3087 | 0 | SCHECK_PARTIAL(); |
3088 | 0 | RRETURN(MATCH_NOMATCH); |
3089 | 0 | } |
3090 | 0 | GETCHARINCTEST(fc, Feptr); |
3091 | 0 | if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT || |
3092 | 0 | fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) || |
3093 | 0 | fc >= 0xe000) == notmatch) |
3094 | 0 | RRETURN(MATCH_NOMATCH); |
3095 | 0 | } |
3096 | 0 | break; |
3097 | | |
3098 | 0 | case PT_BIDICL: |
3099 | 0 | for (i = 1; i <= Lmin; i++) |
3100 | 0 | { |
3101 | 0 | if (Feptr >= mb->end_subject) |
3102 | 0 | { |
3103 | 0 | SCHECK_PARTIAL(); |
3104 | 0 | RRETURN(MATCH_NOMATCH); |
3105 | 0 | } |
3106 | 0 | GETCHARINCTEST(fc, Feptr); |
3107 | 0 | if ((UCD_BIDICLASS(fc) == Lpropvalue) == notmatch) |
3108 | 0 | RRETURN(MATCH_NOMATCH); |
3109 | 0 | } |
3110 | 0 | break; |
3111 | | |
3112 | 0 | case PT_BOOL: |
3113 | 0 | for (i = 1; i <= Lmin; i++) |
3114 | 0 | { |
3115 | 0 | BOOL ok; |
3116 | 0 | const ucd_record *prop; |
3117 | 0 | if (Feptr >= mb->end_subject) |
3118 | 0 | { |
3119 | 0 | SCHECK_PARTIAL(); |
3120 | 0 | RRETURN(MATCH_NOMATCH); |
3121 | 0 | } |
3122 | 0 | GETCHARINCTEST(fc, Feptr); |
3123 | 0 | prop = GET_UCD(fc); |
3124 | 0 | ok = MAPBIT(PRIV(ucd_boolprop_sets) + |
3125 | 0 | UCD_BPROPS_PROP(prop), Lpropvalue) != 0; |
3126 | 0 | if (ok == notmatch) |
3127 | 0 | RRETURN(MATCH_NOMATCH); |
3128 | 0 | } |
3129 | 0 | break; |
3130 | | |
3131 | | /* This should not occur */ |
3132 | | |
3133 | 0 | default: |
3134 | 0 | PCRE2_DEBUG_UNREACHABLE(); |
3135 | 0 | return PCRE2_ERROR_INTERNAL; |
3136 | 1.80M | } |
3137 | 1.80M | } |
3138 | | |
3139 | | /* Match extended Unicode sequences. We will get here only if the |
3140 | | support is in the binary; otherwise a compile-time error occurs. */ |
3141 | | |
3142 | 8.68M | else if (Lctype == OP_EXTUNI) |
3143 | 45.7k | { |
3144 | 91.4k | for (i = 1; i <= Lmin; i++) |
3145 | 45.7k | { |
3146 | 45.7k | if (Feptr >= mb->end_subject) |
3147 | 0 | { |
3148 | 0 | SCHECK_PARTIAL(); |
3149 | 0 | RRETURN(MATCH_NOMATCH); |
3150 | 0 | } |
3151 | 45.7k | else |
3152 | 45.7k | { |
3153 | 45.7k | GETCHARINCTEST(fc, Feptr); |
3154 | 45.7k | Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, |
3155 | 45.7k | mb->end_subject, utf, NULL); |
3156 | 45.7k | } |
3157 | 45.7k | CHECK_PARTIAL(); |
3158 | 45.7k | } |
3159 | 45.7k | } |
3160 | 8.64M | else |
3161 | 8.64M | #endif /* SUPPORT_UNICODE */ |
3162 | | |
3163 | | /* Handle all other cases in UTF mode */ |
3164 | | |
3165 | 8.64M | #ifdef SUPPORT_UNICODE |
3166 | 8.64M | if (utf) switch(Lctype) |
3167 | 4.15M | { |
3168 | 773 | case OP_ANY: |
3169 | 1.54k | for (i = 1; i <= Lmin; i++) |
3170 | 773 | { |
3171 | 773 | if (Feptr >= mb->end_subject) |
3172 | 0 | { |
3173 | 0 | SCHECK_PARTIAL(); |
3174 | 0 | RRETURN(MATCH_NOMATCH); |
3175 | 0 | } |
3176 | 773 | if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH); |
3177 | 769 | if (mb->partial != 0 && |
3178 | 769 | Feptr + 1 >= mb->end_subject && |
3179 | 769 | NLBLOCK->nltype == NLTYPE_FIXED && |
3180 | 769 | NLBLOCK->nllen == 2 && |
3181 | 769 | UCHAR21(Feptr) == NLBLOCK->nl[0]) |
3182 | 0 | { |
3183 | 0 | mb->hitend = TRUE; |
3184 | 0 | if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; |
3185 | 0 | } |
3186 | 769 | Feptr++; |
3187 | 769 | ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++); |
3188 | 769 | } |
3189 | 769 | break; |
3190 | | |
3191 | 4.05M | case OP_ALLANY: |
3192 | 8.08M | for (i = 1; i <= Lmin; i++) |
3193 | 4.05M | { |
3194 | 4.05M | if (Feptr >= mb->end_subject) |
3195 | 18.3k | { |
3196 | 18.3k | SCHECK_PARTIAL(); |
3197 | 18.3k | RRETURN(MATCH_NOMATCH); |
3198 | 18.3k | } |
3199 | 4.03M | Feptr++; |
3200 | 4.03M | ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++); |
3201 | 4.03M | } |
3202 | 4.03M | break; |
3203 | | |
3204 | 4.03M | case OP_ANYBYTE: |
3205 | 49.0k | if (Feptr > mb->end_subject - Lmin) RRETURN(MATCH_NOMATCH); |
3206 | 49.0k | Feptr += Lmin; |
3207 | 49.0k | break; |
3208 | | |
3209 | 0 | case OP_ANYNL: |
3210 | 0 | for (i = 1; i <= Lmin; i++) |
3211 | 0 | { |
3212 | 0 | if (Feptr >= mb->end_subject) |
3213 | 0 | { |
3214 | 0 | SCHECK_PARTIAL(); |
3215 | 0 | RRETURN(MATCH_NOMATCH); |
3216 | 0 | } |
3217 | 0 | GETCHARINC(fc, Feptr); |
3218 | 0 | switch(fc) |
3219 | 0 | { |
3220 | 0 | default: RRETURN(MATCH_NOMATCH); |
3221 | | |
3222 | 0 | case CHAR_CR: |
3223 | 0 | if (Feptr < mb->end_subject && UCHAR21(Feptr) == CHAR_LF) Feptr++; |
3224 | 0 | break; |
3225 | | |
3226 | 0 | case CHAR_LF: |
3227 | 0 | break; |
3228 | | |
3229 | 0 | case CHAR_VT: |
3230 | 0 | case CHAR_FF: |
3231 | 0 | case CHAR_NEL: |
3232 | 0 | #ifndef EBCDIC |
3233 | 0 | case 0x2028: |
3234 | 0 | case 0x2029: |
3235 | 0 | #endif /* Not EBCDIC */ |
3236 | 0 | if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH); |
3237 | 0 | break; |
3238 | 0 | } |
3239 | 0 | } |
3240 | 0 | break; |
3241 | | |
3242 | 30.3k | case OP_NOT_HSPACE: |
3243 | 57.9k | for (i = 1; i <= Lmin; i++) |
3244 | 30.3k | { |
3245 | 30.3k | if (Feptr >= mb->end_subject) |
3246 | 5 | { |
3247 | 5 | SCHECK_PARTIAL(); |
3248 | 5 | RRETURN(MATCH_NOMATCH); |
3249 | 5 | } |
3250 | 30.3k | GETCHARINC(fc, Feptr); |
3251 | 30.3k | switch(fc) |
3252 | 30.3k | { |
3253 | 44.3k | HSPACE_CASES: RRETURN(MATCH_NOMATCH); |
3254 | 27.6k | default: break; |
3255 | 30.3k | } |
3256 | 30.3k | } |
3257 | 27.6k | break; |
3258 | | |
3259 | 27.6k | case OP_HSPACE: |
3260 | 0 | for (i = 1; i <= Lmin; i++) |
3261 | 0 | { |
3262 | 0 | if (Feptr >= mb->end_subject) |
3263 | 0 | { |
3264 | 0 | SCHECK_PARTIAL(); |
3265 | 0 | RRETURN(MATCH_NOMATCH); |
3266 | 0 | } |
3267 | 0 | GETCHARINC(fc, Feptr); |
3268 | 0 | switch(fc) |
3269 | 0 | { |
3270 | 0 | HSPACE_CASES: break; |
3271 | 0 | default: RRETURN(MATCH_NOMATCH); |
3272 | 0 | } |
3273 | 0 | } |
3274 | 0 | break; |
3275 | | |
3276 | 18.8k | case OP_NOT_VSPACE: |
3277 | 36.4k | for (i = 1; i <= Lmin; i++) |
3278 | 18.8k | { |
3279 | 18.8k | if (Feptr >= mb->end_subject) |
3280 | 0 | { |
3281 | 0 | SCHECK_PARTIAL(); |
3282 | 0 | RRETURN(MATCH_NOMATCH); |
3283 | 0 | } |
3284 | 18.8k | GETCHARINC(fc, Feptr); |
3285 | 18.8k | switch(fc) |
3286 | 18.8k | { |
3287 | 8.45k | VSPACE_CASES: RRETURN(MATCH_NOMATCH); |
3288 | 17.5k | default: break; |
3289 | 18.8k | } |
3290 | 18.8k | } |
3291 | 17.5k | break; |
3292 | | |
3293 | 17.5k | case OP_VSPACE: |
3294 | 2.64k | for (i = 1; i <= Lmin; i++) |
3295 | 2.51k | { |
3296 | 2.51k | if (Feptr >= mb->end_subject) |
3297 | 0 | { |
3298 | 0 | SCHECK_PARTIAL(); |
3299 | 0 | RRETURN(MATCH_NOMATCH); |
3300 | 0 | } |
3301 | 2.51k | GETCHARINC(fc, Feptr); |
3302 | 2.51k | switch(fc) |
3303 | 2.51k | { |
3304 | 132 | VSPACE_CASES: break; |
3305 | 2.38k | default: RRETURN(MATCH_NOMATCH); |
3306 | 2.51k | } |
3307 | 2.51k | } |
3308 | 132 | break; |
3309 | | |
3310 | 132 | case OP_NOT_DIGIT: |
3311 | 0 | for (i = 1; i <= Lmin; i++) |
3312 | 0 | { |
3313 | 0 | if (Feptr >= mb->end_subject) |
3314 | 0 | { |
3315 | 0 | SCHECK_PARTIAL(); |
3316 | 0 | RRETURN(MATCH_NOMATCH); |
3317 | 0 | } |
3318 | 0 | GETCHARINC(fc, Feptr); |
3319 | 0 | if (fc < 128 && (mb->ctypes[fc] & ctype_digit) != 0) |
3320 | 0 | RRETURN(MATCH_NOMATCH); |
3321 | 0 | } |
3322 | 0 | break; |
3323 | | |
3324 | 0 | case OP_DIGIT: |
3325 | 0 | for (i = 1; i <= Lmin; i++) |
3326 | 0 | { |
3327 | 0 | uint32_t cc; |
3328 | 0 | if (Feptr >= mb->end_subject) |
3329 | 0 | { |
3330 | 0 | SCHECK_PARTIAL(); |
3331 | 0 | RRETURN(MATCH_NOMATCH); |
3332 | 0 | } |
3333 | 0 | cc = UCHAR21(Feptr); |
3334 | 0 | if (cc >= 128 || (mb->ctypes[cc] & ctype_digit) == 0) |
3335 | 0 | RRETURN(MATCH_NOMATCH); |
3336 | 0 | Feptr++; |
3337 | | /* No need to skip more code units - we know it has only one. */ |
3338 | 0 | } |
3339 | 0 | break; |
3340 | | |
3341 | 0 | case OP_NOT_WHITESPACE: |
3342 | 0 | for (i = 1; i <= Lmin; i++) |
3343 | 0 | { |
3344 | 0 | uint32_t cc; |
3345 | 0 | if (Feptr >= mb->end_subject) |
3346 | 0 | { |
3347 | 0 | SCHECK_PARTIAL(); |
3348 | 0 | RRETURN(MATCH_NOMATCH); |
3349 | 0 | } |
3350 | 0 | cc = UCHAR21(Feptr); |
3351 | 0 | if (cc < 128 && (mb->ctypes[cc] & ctype_space) != 0) |
3352 | 0 | RRETURN(MATCH_NOMATCH); |
3353 | 0 | Feptr++; |
3354 | 0 | ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++); |
3355 | 0 | } |
3356 | 0 | break; |
3357 | | |
3358 | 0 | case OP_WHITESPACE: |
3359 | 0 | for (i = 1; i <= Lmin; i++) |
3360 | 0 | { |
3361 | 0 | uint32_t cc; |
3362 | 0 | if (Feptr >= mb->end_subject) |
3363 | 0 | { |
3364 | 0 | SCHECK_PARTIAL(); |
3365 | 0 | RRETURN(MATCH_NOMATCH); |
3366 | 0 | } |
3367 | 0 | cc = UCHAR21(Feptr); |
3368 | 0 | if (cc >= 128 || (mb->ctypes[cc] & ctype_space) == 0) |
3369 | 0 | RRETURN(MATCH_NOMATCH); |
3370 | 0 | Feptr++; |
3371 | | /* No need to skip more code units - we know it has only one. */ |
3372 | 0 | } |
3373 | 0 | break; |
3374 | | |
3375 | 0 | case OP_NOT_WORDCHAR: |
3376 | 0 | for (i = 1; i <= Lmin; i++) |
3377 | 0 | { |
3378 | 0 | uint32_t cc; |
3379 | 0 | if (Feptr >= mb->end_subject) |
3380 | 0 | { |
3381 | 0 | SCHECK_PARTIAL(); |
3382 | 0 | RRETURN(MATCH_NOMATCH); |
3383 | 0 | } |
3384 | 0 | cc = UCHAR21(Feptr); |
3385 | 0 | if (cc < 128 && (mb->ctypes[cc] & ctype_word) != 0) |
3386 | 0 | RRETURN(MATCH_NOMATCH); |
3387 | 0 | Feptr++; |
3388 | 0 | ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++); |
3389 | 0 | } |
3390 | 0 | break; |
3391 | | |
3392 | 0 | case OP_WORDCHAR: |
3393 | 0 | for (i = 1; i <= Lmin; i++) |
3394 | 0 | { |
3395 | 0 | uint32_t cc; |
3396 | 0 | if (Feptr >= mb->end_subject) |
3397 | 0 | { |
3398 | 0 | SCHECK_PARTIAL(); |
3399 | 0 | RRETURN(MATCH_NOMATCH); |
3400 | 0 | } |
3401 | 0 | cc = UCHAR21(Feptr); |
3402 | 0 | if (cc >= 128 || (mb->ctypes[cc] & ctype_word) == 0) |
3403 | 0 | RRETURN(MATCH_NOMATCH); |
3404 | 0 | Feptr++; |
3405 | | /* No need to skip more code units - we know it has only one. */ |
3406 | 0 | } |
3407 | 0 | break; |
3408 | | |
3409 | 0 | default: |
3410 | 0 | PCRE2_DEBUG_UNREACHABLE(); |
3411 | 0 | return PCRE2_ERROR_INTERNAL; |
3412 | 4.15M | } /* End switch(Lctype) */ |
3413 | | |
3414 | 4.48M | else |
3415 | 4.48M | #endif /* SUPPORT_UNICODE */ |
3416 | | |
3417 | | /* Code for the non-UTF case for minimum matching of operators other |
3418 | | than OP_PROP and OP_NOTPROP. */ |
3419 | | |
3420 | 4.48M | switch(Lctype) |
3421 | 4.48M | { |
3422 | 72.7k | case OP_ANY: |
3423 | 144k | for (i = 1; i <= Lmin; i++) |
3424 | 72.7k | { |
3425 | 72.7k | if (Feptr >= mb->end_subject) |
3426 | 0 | { |
3427 | 0 | SCHECK_PARTIAL(); |
3428 | 0 | RRETURN(MATCH_NOMATCH); |
3429 | 0 | } |
3430 | 72.7k | if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH); |
3431 | 71.4k | if (mb->partial != 0 && |
3432 | 71.4k | Feptr + 1 >= mb->end_subject && |
3433 | 71.4k | NLBLOCK->nltype == NLTYPE_FIXED && |
3434 | 71.4k | NLBLOCK->nllen == 2 && |
3435 | 71.4k | *Feptr == NLBLOCK->nl[0]) |
3436 | 0 | { |
3437 | 0 | mb->hitend = TRUE; |
3438 | 0 | if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; |
3439 | 0 | } |
3440 | 71.4k | Feptr++; |
3441 | 71.4k | } |
3442 | 71.4k | break; |
3443 | | |
3444 | 71.4k | case OP_ALLANY: |
3445 | 11.3k | if (Feptr > mb->end_subject - Lmin) |
3446 | 120 | { |
3447 | 120 | SCHECK_PARTIAL(); |
3448 | 120 | RRETURN(MATCH_NOMATCH); |
3449 | 120 | } |
3450 | 11.1k | Feptr += Lmin; |
3451 | 11.1k | break; |
3452 | | |
3453 | | /* This OP_ANYBYTE case will never be reached because \C gets turned |
3454 | | into OP_ALLANY in non-UTF mode. Cut out the code so that coverage |
3455 | | reports don't complain about its never being used. */ |
3456 | | |
3457 | | /* case OP_ANYBYTE: |
3458 | | * if (Feptr > mb->end_subject - Lmin) |
3459 | | * { |
3460 | | * SCHECK_PARTIAL(); |
3461 | | * RRETURN(MATCH_NOMATCH); |
3462 | | * } |
3463 | | * Feptr += Lmin; |
3464 | | * break; |
3465 | | */ |
3466 | 3.17M | case OP_ANYNL: |
3467 | 3.22M | for (i = 1; i <= Lmin; i++) |
3468 | 3.17M | { |
3469 | 3.17M | if (Feptr >= mb->end_subject) |
3470 | 6.10k | { |
3471 | 6.10k | SCHECK_PARTIAL(); |
3472 | 6.10k | RRETURN(MATCH_NOMATCH); |
3473 | 6.10k | } |
3474 | 3.16M | switch(*Feptr++) |
3475 | 3.16M | { |
3476 | 3.12M | default: RRETURN(MATCH_NOMATCH); |
3477 | | |
3478 | 13.1k | case CHAR_CR: |
3479 | 13.1k | if (Feptr < mb->end_subject && *Feptr == CHAR_LF) Feptr++; |
3480 | 13.1k | break; |
3481 | | |
3482 | 19.0k | case CHAR_LF: |
3483 | 19.0k | break; |
3484 | | |
3485 | 5.15k | case CHAR_VT: |
3486 | 12.9k | case CHAR_FF: |
3487 | 13.8k | case CHAR_NEL: |
3488 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
3489 | | case 0x2028: |
3490 | | case 0x2029: |
3491 | | #endif |
3492 | 13.8k | if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH); |
3493 | 13.8k | break; |
3494 | 3.16M | } |
3495 | 3.16M | } |
3496 | 45.9k | break; |
3497 | | |
3498 | 45.9k | case OP_NOT_HSPACE: |
3499 | 50.1k | for (i = 1; i <= Lmin; i++) |
3500 | 25.4k | { |
3501 | 25.4k | if (Feptr >= mb->end_subject) |
3502 | 0 | { |
3503 | 0 | SCHECK_PARTIAL(); |
3504 | 0 | RRETURN(MATCH_NOMATCH); |
3505 | 0 | } |
3506 | 25.4k | switch(*Feptr++) |
3507 | 25.4k | { |
3508 | 24.6k | default: break; |
3509 | 24.6k | HSPACE_BYTE_CASES: |
3510 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
3511 | | HSPACE_MULTIBYTE_CASES: |
3512 | | #endif |
3513 | 1.70k | RRETURN(MATCH_NOMATCH); |
3514 | 25.4k | } |
3515 | 25.4k | } |
3516 | 24.6k | break; |
3517 | | |
3518 | 24.6k | case OP_HSPACE: |
3519 | 2.16k | for (i = 1; i <= Lmin; i++) |
3520 | 1.28k | { |
3521 | 1.28k | if (Feptr >= mb->end_subject) |
3522 | 0 | { |
3523 | 0 | SCHECK_PARTIAL(); |
3524 | 0 | RRETURN(MATCH_NOMATCH); |
3525 | 0 | } |
3526 | 1.28k | switch(*Feptr++) |
3527 | 1.28k | { |
3528 | 405 | default: RRETURN(MATCH_NOMATCH); |
3529 | 2.58k | HSPACE_BYTE_CASES: |
3530 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
3531 | | HSPACE_MULTIBYTE_CASES: |
3532 | | #endif |
3533 | 2.58k | break; |
3534 | 1.28k | } |
3535 | 1.28k | } |
3536 | 882 | break; |
3537 | | |
3538 | 746k | case OP_NOT_VSPACE: |
3539 | 1.47M | for (i = 1; i <= Lmin; i++) |
3540 | 746k | { |
3541 | 746k | if (Feptr >= mb->end_subject) |
3542 | 4.03k | { |
3543 | 4.03k | SCHECK_PARTIAL(); |
3544 | 4.03k | RRETURN(MATCH_NOMATCH); |
3545 | 4.03k | } |
3546 | 742k | switch(*Feptr++) |
3547 | 742k | { |
3548 | 66.6k | VSPACE_BYTE_CASES: |
3549 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
3550 | | VSPACE_MULTIBYTE_CASES: |
3551 | | #endif |
3552 | 66.6k | RRETURN(MATCH_NOMATCH); |
3553 | 723k | default: break; |
3554 | 742k | } |
3555 | 742k | } |
3556 | 723k | break; |
3557 | | |
3558 | 723k | case OP_VSPACE: |
3559 | 98 | for (i = 1; i <= Lmin; i++) |
3560 | 82 | { |
3561 | 82 | if (Feptr >= mb->end_subject) |
3562 | 0 | { |
3563 | 0 | SCHECK_PARTIAL(); |
3564 | 0 | RRETURN(MATCH_NOMATCH); |
3565 | 0 | } |
3566 | 82 | switch(*Feptr++) |
3567 | 82 | { |
3568 | 66 | default: RRETURN(MATCH_NOMATCH); |
3569 | 80 | VSPACE_BYTE_CASES: |
3570 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
3571 | | VSPACE_MULTIBYTE_CASES: |
3572 | | #endif |
3573 | 80 | break; |
3574 | 82 | } |
3575 | 82 | } |
3576 | 16 | break; |
3577 | | |
3578 | 96.8k | case OP_NOT_DIGIT: |
3579 | 159k | for (i = 1; i <= Lmin; i++) |
3580 | 96.8k | { |
3581 | 96.8k | if (Feptr >= mb->end_subject) |
3582 | 4.32k | { |
3583 | 4.32k | SCHECK_PARTIAL(); |
3584 | 4.32k | RRETURN(MATCH_NOMATCH); |
3585 | 4.32k | } |
3586 | 92.4k | if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_digit) != 0) |
3587 | 30.2k | RRETURN(MATCH_NOMATCH); |
3588 | 62.2k | Feptr++; |
3589 | 62.2k | } |
3590 | 62.2k | break; |
3591 | | |
3592 | 62.2k | case OP_DIGIT: |
3593 | 0 | for (i = 1; i <= Lmin; i++) |
3594 | 0 | { |
3595 | 0 | if (Feptr >= mb->end_subject) |
3596 | 0 | { |
3597 | 0 | SCHECK_PARTIAL(); |
3598 | 0 | RRETURN(MATCH_NOMATCH); |
3599 | 0 | } |
3600 | 0 | if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_digit) == 0) |
3601 | 0 | RRETURN(MATCH_NOMATCH); |
3602 | 0 | Feptr++; |
3603 | 0 | } |
3604 | 0 | break; |
3605 | | |
3606 | 291k | case OP_NOT_WHITESPACE: |
3607 | 576k | for (i = 1; i <= Lmin; i++) |
3608 | 291k | { |
3609 | 291k | if (Feptr >= mb->end_subject) |
3610 | 2.26k | { |
3611 | 2.26k | SCHECK_PARTIAL(); |
3612 | 2.26k | RRETURN(MATCH_NOMATCH); |
3613 | 2.26k | } |
3614 | 289k | if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_space) != 0) |
3615 | 4.80k | RRETURN(MATCH_NOMATCH); |
3616 | 284k | Feptr++; |
3617 | 284k | } |
3618 | 284k | break; |
3619 | | |
3620 | 284k | case OP_WHITESPACE: |
3621 | 68 | for (i = 1; i <= Lmin; i++) |
3622 | 58 | { |
3623 | 58 | if (Feptr >= mb->end_subject) |
3624 | 0 | { |
3625 | 0 | SCHECK_PARTIAL(); |
3626 | 0 | RRETURN(MATCH_NOMATCH); |
3627 | 0 | } |
3628 | 58 | if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_space) == 0) |
3629 | 48 | RRETURN(MATCH_NOMATCH); |
3630 | 10 | Feptr++; |
3631 | 10 | } |
3632 | 10 | break; |
3633 | | |
3634 | 32.8k | case OP_NOT_WORDCHAR: |
3635 | 58.4k | for (i = 1; i <= Lmin; i++) |
3636 | 32.8k | { |
3637 | 32.8k | if (Feptr >= mb->end_subject) |
3638 | 237 | { |
3639 | 237 | SCHECK_PARTIAL(); |
3640 | 237 | RRETURN(MATCH_NOMATCH); |
3641 | 237 | } |
3642 | 32.5k | if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_word) != 0) |
3643 | 7.01k | RRETURN(MATCH_NOMATCH); |
3644 | 25.5k | Feptr++; |
3645 | 25.5k | } |
3646 | 25.5k | break; |
3647 | | |
3648 | 30.5k | case OP_WORDCHAR: |
3649 | 50.1k | for (i = 1; i <= Lmin; i++) |
3650 | 30.5k | { |
3651 | 30.5k | if (Feptr >= mb->end_subject) |
3652 | 21 | { |
3653 | 21 | SCHECK_PARTIAL(); |
3654 | 21 | RRETURN(MATCH_NOMATCH); |
3655 | 21 | } |
3656 | 30.5k | if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_word) == 0) |
3657 | 11.0k | RRETURN(MATCH_NOMATCH); |
3658 | 19.5k | Feptr++; |
3659 | 19.5k | } |
3660 | 19.5k | break; |
3661 | | |
3662 | 19.5k | default: |
3663 | 0 | PCRE2_DEBUG_UNREACHABLE(); |
3664 | 0 | return PCRE2_ERROR_INTERNAL; |
3665 | 4.48M | } |
3666 | 10.4M | } |
3667 | | |
3668 | | /* If Lmin = Lmax we are done. Continue with the main loop. */ |
3669 | | |
3670 | 60.3M | if (Lmin == Lmax) continue; |
3671 | | |
3672 | | /* If minimizing, we have to test the rest of the pattern before each |
3673 | | subsequent match. This means we cannot use a local "notmatch" variable as |
3674 | | in the other cases. As all 4 temporary 32-bit values in the frame are |
3675 | | already in use, just test the type each time. */ |
3676 | | |
3677 | 60.3M | if (reptype == REPTYPE_MIN) |
3678 | 474k | { |
3679 | 474k | #ifdef SUPPORT_UNICODE |
3680 | 474k | if (proptype >= 0) |
3681 | 31.2k | { |
3682 | 31.2k | switch(proptype) |
3683 | 31.2k | { |
3684 | 0 | case PT_LAMP: |
3685 | 0 | for (;;) |
3686 | 0 | { |
3687 | 0 | int chartype; |
3688 | 0 | RMATCH(Fecode, RM208); |
3689 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3690 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
3691 | 0 | if (Feptr >= mb->end_subject) |
3692 | 0 | { |
3693 | 0 | SCHECK_PARTIAL(); |
3694 | 0 | RRETURN(MATCH_NOMATCH); |
3695 | 0 | } |
3696 | 0 | GETCHARINCTEST(fc, Feptr); |
3697 | 0 | chartype = UCD_CHARTYPE(fc); |
3698 | 0 | if ((chartype == ucp_Lu || |
3699 | 0 | chartype == ucp_Ll || |
3700 | 0 | chartype == ucp_Lt) == (Lctype == OP_NOTPROP)) |
3701 | 0 | RRETURN(MATCH_NOMATCH); |
3702 | 0 | } |
3703 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
3704 | |
|
3705 | 6.75k | case PT_GC: |
3706 | 6.75k | for (;;) |
3707 | 127k | { |
3708 | 127k | RMATCH(Fecode, RM209); |
3709 | 127k | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3710 | 127k | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
3711 | 127k | if (Feptr >= mb->end_subject) |
3712 | 744 | { |
3713 | 744 | SCHECK_PARTIAL(); |
3714 | 744 | RRETURN(MATCH_NOMATCH); |
3715 | 744 | } |
3716 | 126k | GETCHARINCTEST(fc, Feptr); |
3717 | 126k | if ((UCD_CATEGORY(fc) == Lpropvalue) == (Lctype == OP_NOTPROP)) |
3718 | 6.01k | RRETURN(MATCH_NOMATCH); |
3719 | 126k | } |
3720 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
3721 | |
|
3722 | 0 | case PT_PC: |
3723 | 0 | for (;;) |
3724 | 0 | { |
3725 | 0 | RMATCH(Fecode, RM210); |
3726 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3727 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
3728 | 0 | if (Feptr >= mb->end_subject) |
3729 | 0 | { |
3730 | 0 | SCHECK_PARTIAL(); |
3731 | 0 | RRETURN(MATCH_NOMATCH); |
3732 | 0 | } |
3733 | 0 | GETCHARINCTEST(fc, Feptr); |
3734 | 0 | if ((UCD_CHARTYPE(fc) == Lpropvalue) == (Lctype == OP_NOTPROP)) |
3735 | 0 | RRETURN(MATCH_NOMATCH); |
3736 | 0 | } |
3737 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
3738 | |
|
3739 | 0 | case PT_SC: |
3740 | 0 | for (;;) |
3741 | 0 | { |
3742 | 0 | RMATCH(Fecode, RM211); |
3743 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3744 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
3745 | 0 | if (Feptr >= mb->end_subject) |
3746 | 0 | { |
3747 | 0 | SCHECK_PARTIAL(); |
3748 | 0 | RRETURN(MATCH_NOMATCH); |
3749 | 0 | } |
3750 | 0 | GETCHARINCTEST(fc, Feptr); |
3751 | 0 | if ((UCD_SCRIPT(fc) == Lpropvalue) == (Lctype == OP_NOTPROP)) |
3752 | 0 | RRETURN(MATCH_NOMATCH); |
3753 | 0 | } |
3754 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
3755 | |
|
3756 | 0 | case PT_SCX: |
3757 | 0 | for (;;) |
3758 | 0 | { |
3759 | 0 | BOOL ok; |
3760 | 0 | const ucd_record *prop; |
3761 | 0 | RMATCH(Fecode, RM224); |
3762 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3763 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
3764 | 0 | if (Feptr >= mb->end_subject) |
3765 | 0 | { |
3766 | 0 | SCHECK_PARTIAL(); |
3767 | 0 | RRETURN(MATCH_NOMATCH); |
3768 | 0 | } |
3769 | 0 | GETCHARINCTEST(fc, Feptr); |
3770 | 0 | prop = GET_UCD(fc); |
3771 | 0 | ok = (prop->script == Lpropvalue |
3772 | 0 | || MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), Lpropvalue) != 0); |
3773 | 0 | if (ok == (Lctype == OP_NOTPROP)) |
3774 | 0 | RRETURN(MATCH_NOMATCH); |
3775 | 0 | } |
3776 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
3777 | |
|
3778 | 0 | case PT_ALNUM: |
3779 | 0 | for (;;) |
3780 | 0 | { |
3781 | 0 | int category; |
3782 | 0 | RMATCH(Fecode, RM212); |
3783 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3784 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
3785 | 0 | if (Feptr >= mb->end_subject) |
3786 | 0 | { |
3787 | 0 | SCHECK_PARTIAL(); |
3788 | 0 | RRETURN(MATCH_NOMATCH); |
3789 | 0 | } |
3790 | 0 | GETCHARINCTEST(fc, Feptr); |
3791 | 0 | category = UCD_CATEGORY(fc); |
3792 | 0 | if ((category == ucp_L || category == ucp_N) == (Lctype == OP_NOTPROP)) |
3793 | 0 | RRETURN(MATCH_NOMATCH); |
3794 | 0 | } |
3795 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
3796 | | |
3797 | | /* Perl space used to exclude VT, but from Perl 5.18 it is included, |
3798 | | which means that Perl space and POSIX space are now identical. PCRE |
3799 | | was changed at release 8.34. */ |
3800 | |
|
3801 | 22.4k | case PT_SPACE: /* Perl space */ |
3802 | 22.4k | case PT_PXSPACE: /* POSIX space */ |
3803 | 22.4k | for (;;) |
3804 | 40.0k | { |
3805 | 40.0k | RMATCH(Fecode, RM213); |
3806 | 40.0k | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3807 | 40.0k | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
3808 | 22.4k | if (Feptr >= mb->end_subject) |
3809 | 0 | { |
3810 | 0 | SCHECK_PARTIAL(); |
3811 | 0 | RRETURN(MATCH_NOMATCH); |
3812 | 0 | } |
3813 | 22.4k | GETCHARINCTEST(fc, Feptr); |
3814 | 22.4k | switch(fc) |
3815 | 22.4k | { |
3816 | 54.9k | HSPACE_CASES: |
3817 | 54.9k | VSPACE_CASES: |
3818 | 32.8k | if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH); |
3819 | 0 | break; |
3820 | | |
3821 | 17.5k | default: |
3822 | 17.5k | if ((UCD_CATEGORY(fc) == ucp_Z) == (Lctype == OP_NOTPROP)) |
3823 | 0 | RRETURN(MATCH_NOMATCH); |
3824 | 17.5k | break; |
3825 | 22.4k | } |
3826 | 22.4k | } |
3827 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
3828 | |
|
3829 | 0 | case PT_WORD: |
3830 | 0 | for (;;) |
3831 | 0 | { |
3832 | 0 | int chartype, category; |
3833 | 0 | RMATCH(Fecode, RM214); |
3834 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3835 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
3836 | 0 | if (Feptr >= mb->end_subject) |
3837 | 0 | { |
3838 | 0 | SCHECK_PARTIAL(); |
3839 | 0 | RRETURN(MATCH_NOMATCH); |
3840 | 0 | } |
3841 | 0 | GETCHARINCTEST(fc, Feptr); |
3842 | 0 | chartype = UCD_CHARTYPE(fc); |
3843 | 0 | category = PRIV(ucp_gentype)[chartype]; |
3844 | 0 | if ((category == ucp_L || |
3845 | 0 | category == ucp_N || |
3846 | 0 | chartype == ucp_Mn || |
3847 | 0 | chartype == ucp_Pc) == (Lctype == OP_NOTPROP)) |
3848 | 0 | RRETURN(MATCH_NOMATCH); |
3849 | 0 | } |
3850 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
3851 | |
|
3852 | 2.04k | case PT_CLIST: |
3853 | 2.04k | for (;;) |
3854 | 2.05k | { |
3855 | 2.05k | const uint32_t *cp; |
3856 | 2.05k | RMATCH(Fecode, RM215); |
3857 | 2.05k | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3858 | 2.05k | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
3859 | 2.04k | if (Feptr >= mb->end_subject) |
3860 | 0 | { |
3861 | 0 | SCHECK_PARTIAL(); |
3862 | 0 | RRETURN(MATCH_NOMATCH); |
3863 | 0 | } |
3864 | 2.04k | GETCHARINCTEST(fc, Feptr); |
3865 | | #if PCRE2_CODE_UNIT_WIDTH == 32 |
3866 | | if (fc > MAX_UTF_CODE_POINT) |
3867 | | { |
3868 | | if (Lctype == OP_NOTPROP) continue; |
3869 | | RRETURN(MATCH_NOMATCH); |
3870 | | } |
3871 | | #endif |
3872 | 2.04k | cp = PRIV(ucd_caseless_sets) + Lpropvalue; |
3873 | 2.04k | for (;;) |
3874 | 3.08k | { |
3875 | 3.08k | if (fc < *cp) |
3876 | 2.04k | { |
3877 | 2.04k | if (Lctype == OP_NOTPROP) break; |
3878 | 2.04k | RRETURN(MATCH_NOMATCH); |
3879 | 2.04k | } |
3880 | 1.04k | if (fc == *cp++) |
3881 | 6 | { |
3882 | 6 | if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH); |
3883 | 6 | break; |
3884 | 6 | } |
3885 | 1.04k | } |
3886 | 2.04k | } |
3887 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
3888 | |
|
3889 | 0 | case PT_UCNC: |
3890 | 0 | for (;;) |
3891 | 0 | { |
3892 | 0 | RMATCH(Fecode, RM216); |
3893 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3894 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
3895 | 0 | if (Feptr >= mb->end_subject) |
3896 | 0 | { |
3897 | 0 | SCHECK_PARTIAL(); |
3898 | 0 | RRETURN(MATCH_NOMATCH); |
3899 | 0 | } |
3900 | 0 | GETCHARINCTEST(fc, Feptr); |
3901 | 0 | if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT || |
3902 | 0 | fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) || |
3903 | 0 | fc >= 0xe000) == (Lctype == OP_NOTPROP)) |
3904 | 0 | RRETURN(MATCH_NOMATCH); |
3905 | 0 | } |
3906 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
3907 | |
|
3908 | 0 | case PT_BIDICL: |
3909 | 0 | for (;;) |
3910 | 0 | { |
3911 | 0 | RMATCH(Fecode, RM223); |
3912 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3913 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
3914 | 0 | if (Feptr >= mb->end_subject) |
3915 | 0 | { |
3916 | 0 | SCHECK_PARTIAL(); |
3917 | 0 | RRETURN(MATCH_NOMATCH); |
3918 | 0 | } |
3919 | 0 | GETCHARINCTEST(fc, Feptr); |
3920 | 0 | if ((UCD_BIDICLASS(fc) == Lpropvalue) == (Lctype == OP_NOTPROP)) |
3921 | 0 | RRETURN(MATCH_NOMATCH); |
3922 | 0 | } |
3923 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
3924 | |
|
3925 | 0 | case PT_BOOL: |
3926 | 0 | for (;;) |
3927 | 0 | { |
3928 | 0 | BOOL ok; |
3929 | 0 | const ucd_record *prop; |
3930 | 0 | RMATCH(Fecode, RM222); |
3931 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3932 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
3933 | 0 | if (Feptr >= mb->end_subject) |
3934 | 0 | { |
3935 | 0 | SCHECK_PARTIAL(); |
3936 | 0 | RRETURN(MATCH_NOMATCH); |
3937 | 0 | } |
3938 | 0 | GETCHARINCTEST(fc, Feptr); |
3939 | 0 | prop = GET_UCD(fc); |
3940 | 0 | ok = MAPBIT(PRIV(ucd_boolprop_sets) + |
3941 | 0 | UCD_BPROPS_PROP(prop), Lpropvalue) != 0; |
3942 | 0 | if (ok == (Lctype == OP_NOTPROP)) |
3943 | 0 | RRETURN(MATCH_NOMATCH); |
3944 | 0 | } |
3945 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
3946 | | |
3947 | | /* This should never occur */ |
3948 | 0 | default: |
3949 | 0 | PCRE2_DEBUG_UNREACHABLE(); |
3950 | 0 | return PCRE2_ERROR_INTERNAL; |
3951 | 31.2k | } |
3952 | 31.2k | } |
3953 | | |
3954 | | /* Match extended Unicode sequences. We will get here only if the |
3955 | | support is in the binary; otherwise a compile-time error occurs. */ |
3956 | | |
3957 | 443k | else if (Lctype == OP_EXTUNI) |
3958 | 18.8k | { |
3959 | 18.8k | for (;;) |
3960 | 2.92M | { |
3961 | 2.92M | RMATCH(Fecode, RM217); |
3962 | 2.92M | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3963 | 2.92M | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
3964 | 2.92M | if (Feptr >= mb->end_subject) |
3965 | 18.2k | { |
3966 | 18.2k | SCHECK_PARTIAL(); |
3967 | 18.2k | RRETURN(MATCH_NOMATCH); |
3968 | 18.2k | } |
3969 | 2.90M | else |
3970 | 2.90M | { |
3971 | 2.90M | GETCHARINCTEST(fc, Feptr); |
3972 | 2.90M | Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject, |
3973 | 2.90M | utf, NULL); |
3974 | 2.90M | } |
3975 | 2.90M | CHECK_PARTIAL(); |
3976 | 2.90M | } |
3977 | 18.8k | } |
3978 | 424k | else |
3979 | 424k | #endif /* SUPPORT_UNICODE */ |
3980 | | |
3981 | | /* UTF mode for non-property testing character types. */ |
3982 | | |
3983 | 424k | #ifdef SUPPORT_UNICODE |
3984 | 424k | if (utf) |
3985 | 94.3k | { |
3986 | 94.3k | for (;;) |
3987 | 6.39M | { |
3988 | 6.39M | RMATCH(Fecode, RM218); |
3989 | 6.39M | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3990 | 6.39M | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
3991 | 6.39M | if (Feptr >= mb->end_subject) |
3992 | 63.0k | { |
3993 | 63.0k | SCHECK_PARTIAL(); |
3994 | 63.0k | RRETURN(MATCH_NOMATCH); |
3995 | 63.0k | } |
3996 | 6.32M | if (Lctype == OP_ANY && IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH); |
3997 | 6.32M | GETCHARINC(fc, Feptr); |
3998 | 6.32M | switch(Lctype) |
3999 | 6.32M | { |
4000 | 39.7k | case OP_ANY: /* This is the non-NL case */ |
4001 | 39.7k | if (mb->partial != 0 && /* Take care with CRLF partial */ |
4002 | 39.7k | Feptr >= mb->end_subject && |
4003 | 39.7k | NLBLOCK->nltype == NLTYPE_FIXED && |
4004 | 39.7k | NLBLOCK->nllen == 2 && |
4005 | 39.7k | fc == NLBLOCK->nl[0]) |
4006 | 0 | { |
4007 | 0 | mb->hitend = TRUE; |
4008 | 0 | if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; |
4009 | 0 | } |
4010 | 39.7k | break; |
4011 | | |
4012 | 1.39M | case OP_ALLANY: |
4013 | 5.84M | case OP_ANYBYTE: |
4014 | 5.84M | break; |
4015 | | |
4016 | 0 | case OP_ANYNL: |
4017 | 0 | switch(fc) |
4018 | 0 | { |
4019 | 0 | default: RRETURN(MATCH_NOMATCH); |
4020 | | |
4021 | 0 | case CHAR_CR: |
4022 | 0 | if (Feptr < mb->end_subject && UCHAR21(Feptr) == CHAR_LF) Feptr++; |
4023 | 0 | break; |
4024 | | |
4025 | 0 | case CHAR_LF: |
4026 | 0 | break; |
4027 | | |
4028 | 0 | case CHAR_VT: |
4029 | 0 | case CHAR_FF: |
4030 | 0 | case CHAR_NEL: |
4031 | 0 | #ifndef EBCDIC |
4032 | 0 | case 0x2028: |
4033 | 0 | case 0x2029: |
4034 | 0 | #endif /* Not EBCDIC */ |
4035 | 0 | if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) |
4036 | 0 | RRETURN(MATCH_NOMATCH); |
4037 | 0 | break; |
4038 | 0 | } |
4039 | 0 | break; |
4040 | | |
4041 | 443k | case OP_NOT_HSPACE: |
4042 | 443k | switch(fc) |
4043 | 443k | { |
4044 | 317k | HSPACE_CASES: RRETURN(MATCH_NOMATCH); |
4045 | 420k | default: break; |
4046 | 443k | } |
4047 | 420k | break; |
4048 | | |
4049 | 420k | case OP_HSPACE: |
4050 | 0 | switch(fc) |
4051 | 0 | { |
4052 | 0 | HSPACE_CASES: break; |
4053 | 0 | default: RRETURN(MATCH_NOMATCH); |
4054 | 0 | } |
4055 | 0 | break; |
4056 | | |
4057 | 0 | case OP_NOT_VSPACE: |
4058 | 0 | switch(fc) |
4059 | 0 | { |
4060 | 0 | VSPACE_CASES: RRETURN(MATCH_NOMATCH); |
4061 | 0 | default: break; |
4062 | 0 | } |
4063 | 0 | break; |
4064 | | |
4065 | 24 | case OP_VSPACE: |
4066 | 24 | switch(fc) |
4067 | 24 | { |
4068 | 9 | VSPACE_CASES: break; |
4069 | 15 | default: RRETURN(MATCH_NOMATCH); |
4070 | 24 | } |
4071 | 9 | break; |
4072 | | |
4073 | 9 | case OP_NOT_DIGIT: |
4074 | 0 | if (fc < 256 && (mb->ctypes[fc] & ctype_digit) != 0) |
4075 | 0 | RRETURN(MATCH_NOMATCH); |
4076 | 0 | break; |
4077 | | |
4078 | 0 | case OP_DIGIT: |
4079 | 0 | if (fc >= 256 || (mb->ctypes[fc] & ctype_digit) == 0) |
4080 | 0 | RRETURN(MATCH_NOMATCH); |
4081 | 0 | break; |
4082 | | |
4083 | 0 | case OP_NOT_WHITESPACE: |
4084 | 0 | if (fc < 256 && (mb->ctypes[fc] & ctype_space) != 0) |
4085 | 0 | RRETURN(MATCH_NOMATCH); |
4086 | 0 | break; |
4087 | | |
4088 | 0 | case OP_WHITESPACE: |
4089 | 0 | if (fc >= 256 || (mb->ctypes[fc] & ctype_space) == 0) |
4090 | 0 | RRETURN(MATCH_NOMATCH); |
4091 | 0 | break; |
4092 | | |
4093 | 0 | case OP_NOT_WORDCHAR: |
4094 | 0 | if (fc < 256 && (mb->ctypes[fc] & ctype_word) != 0) |
4095 | 0 | RRETURN(MATCH_NOMATCH); |
4096 | 0 | break; |
4097 | | |
4098 | 0 | case OP_WORDCHAR: |
4099 | 0 | if (fc >= 256 || (mb->ctypes[fc] & ctype_word) == 0) |
4100 | 0 | RRETURN(MATCH_NOMATCH); |
4101 | 0 | break; |
4102 | | |
4103 | 0 | default: |
4104 | 0 | PCRE2_DEBUG_UNREACHABLE(); |
4105 | 0 | return PCRE2_ERROR_INTERNAL; |
4106 | 6.32M | } |
4107 | 6.32M | } |
4108 | 94.3k | } |
4109 | 330k | else |
4110 | 330k | #endif /* SUPPORT_UNICODE */ |
4111 | | |
4112 | | /* Not UTF mode */ |
4113 | 330k | { |
4114 | 330k | for (;;) |
4115 | 10.9M | { |
4116 | 10.9M | RMATCH(Fecode, RM33); |
4117 | 10.9M | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
4118 | 10.9M | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
4119 | 10.8M | if (Feptr >= mb->end_subject) |
4120 | 122k | { |
4121 | 122k | SCHECK_PARTIAL(); |
4122 | 122k | RRETURN(MATCH_NOMATCH); |
4123 | 122k | } |
4124 | 10.7M | if (Lctype == OP_ANY && IS_NEWLINE(Feptr)) |
4125 | 1.31k | RRETURN(MATCH_NOMATCH); |
4126 | 10.7M | fc = *Feptr++; |
4127 | 10.7M | switch(Lctype) |
4128 | 10.7M | { |
4129 | 354k | case OP_ANY: /* This is the non-NL case */ |
4130 | 354k | if (mb->partial != 0 && /* Take care with CRLF partial */ |
4131 | 354k | Feptr >= mb->end_subject && |
4132 | 354k | NLBLOCK->nltype == NLTYPE_FIXED && |
4133 | 354k | NLBLOCK->nllen == 2 && |
4134 | 354k | fc == NLBLOCK->nl[0]) |
4135 | 0 | { |
4136 | 0 | mb->hitend = TRUE; |
4137 | 0 | if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; |
4138 | 0 | } |
4139 | 354k | break; |
4140 | | |
4141 | 2.34M | case OP_ALLANY: |
4142 | 2.34M | case OP_ANYBYTE: |
4143 | 2.34M | break; |
4144 | | |
4145 | 6.19k | case OP_ANYNL: |
4146 | 6.19k | switch(fc) |
4147 | 6.19k | { |
4148 | 5.54k | default: RRETURN(MATCH_NOMATCH); |
4149 | | |
4150 | 18 | case CHAR_CR: |
4151 | 18 | if (Feptr < mb->end_subject && *Feptr == CHAR_LF) Feptr++; |
4152 | 18 | break; |
4153 | | |
4154 | 520 | case CHAR_LF: |
4155 | 520 | break; |
4156 | | |
4157 | 45 | case CHAR_VT: |
4158 | 45 | case CHAR_FF: |
4159 | 111 | case CHAR_NEL: |
4160 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
4161 | | case 0x2028: |
4162 | | case 0x2029: |
4163 | | #endif |
4164 | 111 | if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) |
4165 | 0 | RRETURN(MATCH_NOMATCH); |
4166 | 111 | break; |
4167 | 6.19k | } |
4168 | 649 | break; |
4169 | | |
4170 | 1.08M | case OP_NOT_HSPACE: |
4171 | 1.08M | switch(fc) |
4172 | 1.08M | { |
4173 | 1.07M | default: break; |
4174 | 1.07M | HSPACE_BYTE_CASES: |
4175 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
4176 | | HSPACE_MULTIBYTE_CASES: |
4177 | | #endif |
4178 | 33.3k | RRETURN(MATCH_NOMATCH); |
4179 | 1.08M | } |
4180 | 1.07M | break; |
4181 | | |
4182 | 1.07M | case OP_HSPACE: |
4183 | 0 | switch(fc) |
4184 | 0 | { |
4185 | 0 | default: RRETURN(MATCH_NOMATCH); |
4186 | 0 | HSPACE_BYTE_CASES: |
4187 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
4188 | | HSPACE_MULTIBYTE_CASES: |
4189 | | #endif |
4190 | 0 | break; |
4191 | 0 | } |
4192 | 0 | break; |
4193 | | |
4194 | 512k | case OP_NOT_VSPACE: |
4195 | 512k | switch(fc) |
4196 | 512k | { |
4197 | 505k | default: break; |
4198 | 505k | VSPACE_BYTE_CASES: |
4199 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
4200 | | VSPACE_MULTIBYTE_CASES: |
4201 | | #endif |
4202 | 25.1k | RRETURN(MATCH_NOMATCH); |
4203 | 512k | } |
4204 | 505k | break; |
4205 | | |
4206 | 505k | case OP_VSPACE: |
4207 | 0 | switch(fc) |
4208 | 0 | { |
4209 | 0 | default: RRETURN(MATCH_NOMATCH); |
4210 | 0 | VSPACE_BYTE_CASES: |
4211 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
4212 | | VSPACE_MULTIBYTE_CASES: |
4213 | | #endif |
4214 | 0 | break; |
4215 | 0 | } |
4216 | 0 | break; |
4217 | | |
4218 | 290k | case OP_NOT_DIGIT: |
4219 | 290k | if (MAX_255(fc) && (mb->ctypes[fc] & ctype_digit) != 0) |
4220 | 53.6k | RRETURN(MATCH_NOMATCH); |
4221 | 236k | break; |
4222 | | |
4223 | 236k | case OP_DIGIT: |
4224 | 498 | if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_digit) == 0) |
4225 | 444 | RRETURN(MATCH_NOMATCH); |
4226 | 54 | break; |
4227 | | |
4228 | 5.92M | case OP_NOT_WHITESPACE: |
4229 | 5.92M | if (MAX_255(fc) && (mb->ctypes[fc] & ctype_space) != 0) |
4230 | 35.7k | RRETURN(MATCH_NOMATCH); |
4231 | 5.89M | break; |
4232 | | |
4233 | 5.89M | case OP_WHITESPACE: |
4234 | 0 | if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_space) == 0) |
4235 | 0 | RRETURN(MATCH_NOMATCH); |
4236 | 0 | break; |
4237 | | |
4238 | 177k | case OP_NOT_WORDCHAR: |
4239 | 177k | if (MAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0) |
4240 | 18.5k | RRETURN(MATCH_NOMATCH); |
4241 | 159k | break; |
4242 | | |
4243 | 159k | case OP_WORDCHAR: |
4244 | 17.4k | if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_word) == 0) |
4245 | 3.77k | RRETURN(MATCH_NOMATCH); |
4246 | 13.6k | break; |
4247 | | |
4248 | 13.6k | default: |
4249 | 0 | PCRE2_DEBUG_UNREACHABLE(); |
4250 | 0 | return PCRE2_ERROR_INTERNAL; |
4251 | 10.7M | } |
4252 | 10.7M | } |
4253 | 330k | } |
4254 | | |
4255 | 0 | PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */ |
4256 | 0 | } |
4257 | | |
4258 | | /* If maximizing, it is worth using inline code for speed, doing the type |
4259 | | test once at the start (i.e. keep it out of the loops). Once again, |
4260 | | "notmatch" can be an ordinary local variable because the loops do not call |
4261 | | RMATCH. */ |
4262 | | |
4263 | 59.9M | else |
4264 | 59.9M | { |
4265 | 59.9M | Lstart_eptr = Feptr; /* Remember where we started */ |
4266 | | |
4267 | 59.9M | #ifdef SUPPORT_UNICODE |
4268 | 59.9M | if (proptype >= 0) |
4269 | 678k | { |
4270 | 678k | BOOL notmatch = Lctype == OP_NOTPROP; |
4271 | 678k | switch(proptype) |
4272 | 678k | { |
4273 | 0 | case PT_LAMP: |
4274 | 0 | for (i = Lmin; i < Lmax; i++) |
4275 | 0 | { |
4276 | 0 | int chartype; |
4277 | 0 | int len = 1; |
4278 | 0 | if (Feptr >= mb->end_subject) |
4279 | 0 | { |
4280 | 0 | SCHECK_PARTIAL(); |
4281 | 0 | break; |
4282 | 0 | } |
4283 | 0 | GETCHARLENTEST(fc, Feptr, len); |
4284 | 0 | chartype = UCD_CHARTYPE(fc); |
4285 | 0 | if ((chartype == ucp_Lu || |
4286 | 0 | chartype == ucp_Ll || |
4287 | 0 | chartype == ucp_Lt) == notmatch) |
4288 | 0 | break; |
4289 | 0 | Feptr+= len; |
4290 | 0 | } |
4291 | 0 | break; |
4292 | | |
4293 | 834 | case PT_GC: |
4294 | 40.5k | for (i = Lmin; i < Lmax; i++) |
4295 | 40.5k | { |
4296 | 40.5k | int len = 1; |
4297 | 40.5k | if (Feptr >= mb->end_subject) |
4298 | 30 | { |
4299 | 30 | SCHECK_PARTIAL(); |
4300 | 30 | break; |
4301 | 30 | } |
4302 | 40.4k | GETCHARLENTEST(fc, Feptr, len); |
4303 | 40.4k | if ((UCD_CATEGORY(fc) == Lpropvalue) == notmatch) break; |
4304 | 39.6k | Feptr+= len; |
4305 | 39.6k | } |
4306 | 834 | break; |
4307 | | |
4308 | 12.2k | case PT_PC: |
4309 | 54.9k | for (i = Lmin; i < Lmax; i++) |
4310 | 54.9k | { |
4311 | 54.9k | int len = 1; |
4312 | 54.9k | if (Feptr >= mb->end_subject) |
4313 | 0 | { |
4314 | 0 | SCHECK_PARTIAL(); |
4315 | 0 | break; |
4316 | 0 | } |
4317 | 54.9k | GETCHARLENTEST(fc, Feptr, len); |
4318 | 54.9k | if ((UCD_CHARTYPE(fc) == Lpropvalue) == notmatch) break; |
4319 | 42.7k | Feptr+= len; |
4320 | 42.7k | } |
4321 | 12.2k | break; |
4322 | | |
4323 | 12.2k | case PT_SC: |
4324 | 0 | for (i = Lmin; i < Lmax; i++) |
4325 | 0 | { |
4326 | 0 | int len = 1; |
4327 | 0 | if (Feptr >= mb->end_subject) |
4328 | 0 | { |
4329 | 0 | SCHECK_PARTIAL(); |
4330 | 0 | break; |
4331 | 0 | } |
4332 | 0 | GETCHARLENTEST(fc, Feptr, len); |
4333 | 0 | if ((UCD_SCRIPT(fc) == Lpropvalue) == notmatch) break; |
4334 | 0 | Feptr+= len; |
4335 | 0 | } |
4336 | 0 | break; |
4337 | | |
4338 | 0 | case PT_SCX: |
4339 | 0 | for (i = Lmin; i < Lmax; i++) |
4340 | 0 | { |
4341 | 0 | BOOL ok; |
4342 | 0 | const ucd_record *prop; |
4343 | 0 | int len = 1; |
4344 | 0 | if (Feptr >= mb->end_subject) |
4345 | 0 | { |
4346 | 0 | SCHECK_PARTIAL(); |
4347 | 0 | break; |
4348 | 0 | } |
4349 | 0 | GETCHARLENTEST(fc, Feptr, len); |
4350 | 0 | prop = GET_UCD(fc); |
4351 | 0 | ok = (prop->script == Lpropvalue || |
4352 | 0 | MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), Lpropvalue) != 0); |
4353 | 0 | if (ok == notmatch) break; |
4354 | 0 | Feptr+= len; |
4355 | 0 | } |
4356 | 0 | break; |
4357 | | |
4358 | 0 | case PT_ALNUM: |
4359 | 0 | for (i = Lmin; i < Lmax; i++) |
4360 | 0 | { |
4361 | 0 | int category; |
4362 | 0 | int len = 1; |
4363 | 0 | if (Feptr >= mb->end_subject) |
4364 | 0 | { |
4365 | 0 | SCHECK_PARTIAL(); |
4366 | 0 | break; |
4367 | 0 | } |
4368 | 0 | GETCHARLENTEST(fc, Feptr, len); |
4369 | 0 | category = UCD_CATEGORY(fc); |
4370 | 0 | if ((category == ucp_L || category == ucp_N) == notmatch) |
4371 | 0 | break; |
4372 | 0 | Feptr+= len; |
4373 | 0 | } |
4374 | 0 | break; |
4375 | | |
4376 | | /* Perl space used to exclude VT, but from Perl 5.18 it is included, |
4377 | | which means that Perl space and POSIX space are now identical. PCRE |
4378 | | was changed at release 8.34. */ |
4379 | | |
4380 | 627k | case PT_SPACE: /* Perl space */ |
4381 | 627k | case PT_PXSPACE: /* POSIX space */ |
4382 | 5.39M | for (i = Lmin; i < Lmax; i++) |
4383 | 5.39M | { |
4384 | 5.39M | int len = 1; |
4385 | 5.39M | if (Feptr >= mb->end_subject) |
4386 | 19.0k | { |
4387 | 19.0k | SCHECK_PARTIAL(); |
4388 | 19.0k | break; |
4389 | 19.0k | } |
4390 | 5.37M | GETCHARLENTEST(fc, Feptr, len); |
4391 | 5.37M | switch(fc) |
4392 | 5.37M | { |
4393 | 18.6M | HSPACE_CASES: |
4394 | 18.6M | VSPACE_CASES: |
4395 | 7.94M | if (notmatch) goto ENDLOOP99; /* Break the loop */ |
4396 | 878k | break; |
4397 | | |
4398 | 4.23M | default: |
4399 | 4.23M | if ((UCD_CATEGORY(fc) == ucp_Z) == notmatch) |
4400 | 342k | goto ENDLOOP99; /* Break the loop */ |
4401 | 3.89M | break; |
4402 | 5.37M | } |
4403 | 4.77M | Feptr+= len; |
4404 | 4.77M | } |
4405 | 627k | ENDLOOP99: |
4406 | 627k | break; |
4407 | | |
4408 | 37.4k | case PT_WORD: |
4409 | 369k | for (i = Lmin; i < Lmax; i++) |
4410 | 369k | { |
4411 | 369k | int chartype, category; |
4412 | 369k | int len = 1; |
4413 | 369k | if (Feptr >= mb->end_subject) |
4414 | 828 | { |
4415 | 828 | SCHECK_PARTIAL(); |
4416 | 828 | break; |
4417 | 828 | } |
4418 | 368k | GETCHARLENTEST(fc, Feptr, len); |
4419 | 368k | chartype = UCD_CHARTYPE(fc); |
4420 | 368k | category = PRIV(ucp_gentype)[chartype]; |
4421 | 368k | if ((category == ucp_L || |
4422 | 368k | category == ucp_N || |
4423 | 368k | chartype == ucp_Mn || |
4424 | 368k | chartype == ucp_Pc) == notmatch) |
4425 | 36.5k | break; |
4426 | 332k | Feptr+= len; |
4427 | 332k | } |
4428 | 37.4k | break; |
4429 | | |
4430 | 37.4k | case PT_CLIST: |
4431 | 276 | for (i = Lmin; i < Lmax; i++) |
4432 | 276 | { |
4433 | 276 | const uint32_t *cp; |
4434 | 276 | int len = 1; |
4435 | 276 | if (Feptr >= mb->end_subject) |
4436 | 0 | { |
4437 | 0 | SCHECK_PARTIAL(); |
4438 | 0 | break; |
4439 | 0 | } |
4440 | 276 | GETCHARLENTEST(fc, Feptr, len); |
4441 | | #if PCRE2_CODE_UNIT_WIDTH == 32 |
4442 | | if (fc > MAX_UTF_CODE_POINT) |
4443 | | { |
4444 | | if (!notmatch) goto GOT_MAX; |
4445 | | } |
4446 | | else |
4447 | | #endif |
4448 | 276 | { |
4449 | 276 | cp = PRIV(ucd_caseless_sets) + Lpropvalue; |
4450 | 276 | for (;;) |
4451 | 541 | { |
4452 | 541 | if (fc < *cp) |
4453 | 276 | { if (notmatch) break; else goto GOT_MAX; } |
4454 | 265 | if (fc == *cp++) |
4455 | 0 | { if (notmatch) goto GOT_MAX; else break; } |
4456 | 265 | } |
4457 | 276 | } |
4458 | | |
4459 | 0 | Feptr += len; |
4460 | 0 | } |
4461 | 276 | GOT_MAX: |
4462 | 276 | break; |
4463 | | |
4464 | 0 | case PT_UCNC: |
4465 | 0 | for (i = Lmin; i < Lmax; i++) |
4466 | 0 | { |
4467 | 0 | int len = 1; |
4468 | 0 | if (Feptr >= mb->end_subject) |
4469 | 0 | { |
4470 | 0 | SCHECK_PARTIAL(); |
4471 | 0 | break; |
4472 | 0 | } |
4473 | 0 | GETCHARLENTEST(fc, Feptr, len); |
4474 | 0 | if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT || |
4475 | 0 | fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) || |
4476 | 0 | fc >= 0xe000) == notmatch) |
4477 | 0 | break; |
4478 | 0 | Feptr += len; |
4479 | 0 | } |
4480 | 0 | break; |
4481 | | |
4482 | 0 | case PT_BIDICL: |
4483 | 0 | for (i = Lmin; i < Lmax; i++) |
4484 | 0 | { |
4485 | 0 | int len = 1; |
4486 | 0 | if (Feptr >= mb->end_subject) |
4487 | 0 | { |
4488 | 0 | SCHECK_PARTIAL(); |
4489 | 0 | break; |
4490 | 0 | } |
4491 | 0 | GETCHARLENTEST(fc, Feptr, len); |
4492 | 0 | if ((UCD_BIDICLASS(fc) == Lpropvalue) == notmatch) break; |
4493 | 0 | Feptr+= len; |
4494 | 0 | } |
4495 | 0 | break; |
4496 | | |
4497 | 0 | case PT_BOOL: |
4498 | 0 | for (i = Lmin; i < Lmax; i++) |
4499 | 0 | { |
4500 | 0 | BOOL ok; |
4501 | 0 | const ucd_record *prop; |
4502 | 0 | int len = 1; |
4503 | 0 | if (Feptr >= mb->end_subject) |
4504 | 0 | { |
4505 | 0 | SCHECK_PARTIAL(); |
4506 | 0 | break; |
4507 | 0 | } |
4508 | 0 | GETCHARLENTEST(fc, Feptr, len); |
4509 | 0 | prop = GET_UCD(fc); |
4510 | 0 | ok = MAPBIT(PRIV(ucd_boolprop_sets) + |
4511 | 0 | UCD_BPROPS_PROP(prop), Lpropvalue) != 0; |
4512 | 0 | if (ok == notmatch) break; |
4513 | 0 | Feptr+= len; |
4514 | 0 | } |
4515 | 0 | break; |
4516 | | |
4517 | 0 | default: |
4518 | 0 | PCRE2_DEBUG_UNREACHABLE(); |
4519 | 0 | return PCRE2_ERROR_INTERNAL; |
4520 | 678k | } |
4521 | | |
4522 | | /* Feptr is now past the end of the maximum run */ |
4523 | | |
4524 | 678k | if (reptype == REPTYPE_POS) continue; /* No backtracking */ |
4525 | | |
4526 | | /* After \C in UTF mode, Lstart_eptr might be in the middle of a |
4527 | | Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't |
4528 | | go too far. */ |
4529 | | |
4530 | 630k | for(;;) |
4531 | 5.46M | { |
4532 | 5.46M | if (Feptr <= Lstart_eptr) break; |
4533 | 4.83M | RMATCH(Fecode, RM221); |
4534 | 4.83M | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
4535 | 4.83M | Feptr--; |
4536 | 4.83M | if (utf) BACKCHAR(Feptr); |
4537 | 4.83M | } |
4538 | 630k | } |
4539 | | |
4540 | | /* Match extended Unicode grapheme clusters. We will get here only if the |
4541 | | support is in the binary; otherwise a compile-time error occurs. */ |
4542 | | |
4543 | 59.2M | else if (Lctype == OP_EXTUNI) |
4544 | 36.0k | { |
4545 | 4.59M | for (i = Lmin; i < Lmax; i++) |
4546 | 4.59M | { |
4547 | 4.59M | if (Feptr >= mb->end_subject) |
4548 | 36.0k | { |
4549 | 36.0k | SCHECK_PARTIAL(); |
4550 | 36.0k | break; |
4551 | 36.0k | } |
4552 | 4.55M | else |
4553 | 4.55M | { |
4554 | 4.55M | GETCHARINCTEST(fc, Feptr); |
4555 | 4.55M | Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject, |
4556 | 4.55M | utf, NULL); |
4557 | 4.55M | } |
4558 | 4.55M | CHECK_PARTIAL(); |
4559 | 4.55M | } |
4560 | | |
4561 | | /* Feptr is now past the end of the maximum run */ |
4562 | | |
4563 | 36.0k | if (reptype == REPTYPE_POS) continue; /* No backtracking */ |
4564 | | |
4565 | | /* We use <= Lstart_eptr rather than == Lstart_eptr to detect the start |
4566 | | of the run while backtracking because the use of \C in UTF mode can |
4567 | | cause BACKCHAR to move back past Lstart_eptr. This is just palliative; |
4568 | | the use of \C in UTF mode is fraught with danger. */ |
4569 | | |
4570 | 36.0k | for(;;) |
4571 | 4.59M | { |
4572 | 4.59M | int lgb, rgb; |
4573 | 4.59M | PCRE2_SPTR fptr; |
4574 | | |
4575 | 4.59M | if (Feptr <= Lstart_eptr) break; /* At start of char run */ |
4576 | 4.55M | RMATCH(Fecode, RM219); |
4577 | 4.55M | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
4578 | | |
4579 | | /* Backtracking over an extended grapheme cluster involves inspecting |
4580 | | the previous two characters (if present) to see if a break is |
4581 | | permitted between them. */ |
4582 | | |
4583 | 4.55M | Feptr--; |
4584 | 4.55M | if (!utf) fc = *Feptr; else |
4585 | 2.70M | { |
4586 | 2.70M | BACKCHAR(Feptr); |
4587 | 2.70M | GETCHAR(fc, Feptr); |
4588 | 2.70M | } |
4589 | 4.55M | rgb = UCD_GRAPHBREAK(fc); |
4590 | | |
4591 | 4.55M | for (;;) |
4592 | 4.56M | { |
4593 | 4.56M | if (Feptr <= Lstart_eptr) break; /* At start of char run */ |
4594 | 4.52M | fptr = Feptr - 1; |
4595 | 4.52M | if (!utf) fc = *fptr; else |
4596 | 2.70M | { |
4597 | 2.70M | BACKCHAR(fptr); |
4598 | 2.70M | GETCHAR(fc, fptr); |
4599 | 2.70M | } |
4600 | 4.52M | lgb = UCD_GRAPHBREAK(fc); |
4601 | 4.52M | if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break; |
4602 | 6.82k | Feptr = fptr; |
4603 | 6.82k | rgb = lgb; |
4604 | 6.82k | } |
4605 | 4.55M | } |
4606 | 36.0k | } |
4607 | | |
4608 | 59.1M | else |
4609 | 59.1M | #endif /* SUPPORT_UNICODE */ |
4610 | | |
4611 | 59.1M | #ifdef SUPPORT_UNICODE |
4612 | 59.1M | if (utf) |
4613 | 51.1M | { |
4614 | 51.1M | switch(Lctype) |
4615 | 51.1M | { |
4616 | 14.5k | case OP_ANY: |
4617 | 30.9k | for (i = Lmin; i < Lmax; i++) |
4618 | 16.3k | { |
4619 | 16.3k | if (Feptr >= mb->end_subject) |
4620 | 3 | { |
4621 | 3 | SCHECK_PARTIAL(); |
4622 | 3 | break; |
4623 | 3 | } |
4624 | 16.3k | if (IS_NEWLINE(Feptr)) break; |
4625 | 16.3k | if (mb->partial != 0 && /* Take care with CRLF partial */ |
4626 | 16.3k | Feptr + 1 >= mb->end_subject && |
4627 | 16.3k | NLBLOCK->nltype == NLTYPE_FIXED && |
4628 | 16.3k | NLBLOCK->nllen == 2 && |
4629 | 16.3k | UCHAR21(Feptr) == NLBLOCK->nl[0]) |
4630 | 0 | { |
4631 | 0 | mb->hitend = TRUE; |
4632 | 0 | if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; |
4633 | 0 | } |
4634 | 16.3k | Feptr++; |
4635 | 16.3k | ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++); |
4636 | 16.3k | } |
4637 | 14.5k | break; |
4638 | | |
4639 | 4.02M | case OP_ALLANY: |
4640 | 4.02M | if (Lmax < UINT32_MAX) |
4641 | 1.13k | { |
4642 | 2.24k | for (i = Lmin; i < Lmax; i++) |
4643 | 1.13k | { |
4644 | 1.13k | if (Feptr >= mb->end_subject) |
4645 | 28 | { |
4646 | 28 | SCHECK_PARTIAL(); |
4647 | 28 | break; |
4648 | 28 | } |
4649 | 1.10k | Feptr++; |
4650 | 1.10k | ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++); |
4651 | 1.10k | } |
4652 | 1.13k | } |
4653 | 4.02M | else |
4654 | 4.02M | { |
4655 | 4.02M | Feptr = mb->end_subject; /* Unlimited UTF-8 repeat */ |
4656 | 4.02M | SCHECK_PARTIAL(); |
4657 | 4.02M | } |
4658 | 4.02M | break; |
4659 | | |
4660 | | /* The "byte" (i.e. "code unit") case is the same as non-UTF */ |
4661 | | |
4662 | 4.02M | case OP_ANYBYTE: |
4663 | 248 | fc = Lmax - Lmin; |
4664 | 248 | if (fc > (uint32_t)(mb->end_subject - Feptr)) |
4665 | 248 | { |
4666 | 248 | Feptr = mb->end_subject; |
4667 | 248 | SCHECK_PARTIAL(); |
4668 | 248 | } |
4669 | 0 | else Feptr += fc; |
4670 | 248 | break; |
4671 | | |
4672 | 47.0M | case OP_ANYNL: |
4673 | 54.7M | for (i = Lmin; i < Lmax; i++) |
4674 | 47.0M | { |
4675 | 47.0M | int len = 1; |
4676 | 47.0M | if (Feptr >= mb->end_subject) |
4677 | 667k | { |
4678 | 667k | SCHECK_PARTIAL(); |
4679 | 667k | break; |
4680 | 667k | } |
4681 | 46.3M | GETCHARLEN(fc, Feptr, len); |
4682 | 46.3M | if (fc == CHAR_CR) |
4683 | 0 | { |
4684 | 0 | if (++Feptr >= mb->end_subject) break; |
4685 | 0 | if (UCHAR21(Feptr) == CHAR_LF) Feptr++; |
4686 | 0 | } |
4687 | 46.3M | else |
4688 | 46.3M | { |
4689 | 46.3M | if (fc != CHAR_LF && |
4690 | 46.3M | (mb->bsr_convention == PCRE2_BSR_ANYCRLF || |
4691 | 43.6M | (fc != CHAR_VT && fc != CHAR_FF && fc != CHAR_NEL |
4692 | 43.6M | #ifndef EBCDIC |
4693 | 43.6M | && fc != 0x2028 && fc != 0x2029 |
4694 | 43.6M | #endif /* Not EBCDIC */ |
4695 | 43.6M | ))) |
4696 | 38.6M | break; |
4697 | 7.72M | Feptr += len; |
4698 | 7.72M | } |
4699 | 46.3M | } |
4700 | 47.0M | break; |
4701 | | |
4702 | 47.0M | case OP_NOT_HSPACE: |
4703 | 3.70k | case OP_HSPACE: |
4704 | 90.9k | for (i = Lmin; i < Lmax; i++) |
4705 | 89.2k | { |
4706 | 89.2k | BOOL gotspace; |
4707 | 89.2k | int len = 1; |
4708 | 89.2k | if (Feptr >= mb->end_subject) |
4709 | 350 | { |
4710 | 350 | SCHECK_PARTIAL(); |
4711 | 350 | break; |
4712 | 350 | } |
4713 | 88.8k | GETCHARLEN(fc, Feptr, len); |
4714 | 88.8k | switch(fc) |
4715 | 88.8k | { |
4716 | 1.60k | HSPACE_CASES: gotspace = TRUE; break; |
4717 | 87.2k | default: gotspace = FALSE; break; |
4718 | 88.8k | } |
4719 | 88.8k | if (gotspace == (Lctype == OP_NOT_HSPACE)) break; |
4720 | 87.2k | Feptr += len; |
4721 | 87.2k | } |
4722 | 3.70k | break; |
4723 | | |
4724 | 17.5k | case OP_NOT_VSPACE: |
4725 | 20.5k | case OP_VSPACE: |
4726 | 294k | for (i = Lmin; i < Lmax; i++) |
4727 | 294k | { |
4728 | 294k | BOOL gotspace; |
4729 | 294k | int len = 1; |
4730 | 294k | if (Feptr >= mb->end_subject) |
4731 | 24 | { |
4732 | 24 | SCHECK_PARTIAL(); |
4733 | 24 | break; |
4734 | 24 | } |
4735 | 294k | GETCHARLEN(fc, Feptr, len); |
4736 | 294k | switch(fc) |
4737 | 294k | { |
4738 | 17.7k | VSPACE_CASES: gotspace = TRUE; break; |
4739 | 276k | default: gotspace = FALSE; break; |
4740 | 294k | } |
4741 | 294k | if (gotspace == (Lctype == OP_NOT_VSPACE)) break; |
4742 | 273k | Feptr += len; |
4743 | 273k | } |
4744 | 20.5k | break; |
4745 | | |
4746 | 20.5k | case OP_NOT_DIGIT: |
4747 | 0 | for (i = Lmin; i < Lmax; i++) |
4748 | 0 | { |
4749 | 0 | int len = 1; |
4750 | 0 | if (Feptr >= mb->end_subject) |
4751 | 0 | { |
4752 | 0 | SCHECK_PARTIAL(); |
4753 | 0 | break; |
4754 | 0 | } |
4755 | 0 | GETCHARLEN(fc, Feptr, len); |
4756 | 0 | if (fc < 256 && (mb->ctypes[fc] & ctype_digit) != 0) break; |
4757 | 0 | Feptr+= len; |
4758 | 0 | } |
4759 | 0 | break; |
4760 | | |
4761 | 0 | case OP_DIGIT: |
4762 | 0 | for (i = Lmin; i < Lmax; i++) |
4763 | 0 | { |
4764 | 0 | int len = 1; |
4765 | 0 | if (Feptr >= mb->end_subject) |
4766 | 0 | { |
4767 | 0 | SCHECK_PARTIAL(); |
4768 | 0 | break; |
4769 | 0 | } |
4770 | 0 | GETCHARLEN(fc, Feptr, len); |
4771 | 0 | if (fc >= 256 ||(mb->ctypes[fc] & ctype_digit) == 0) break; |
4772 | 0 | Feptr+= len; |
4773 | 0 | } |
4774 | 0 | break; |
4775 | | |
4776 | 0 | case OP_NOT_WHITESPACE: |
4777 | 0 | for (i = Lmin; i < Lmax; i++) |
4778 | 0 | { |
4779 | 0 | int len = 1; |
4780 | 0 | if (Feptr >= mb->end_subject) |
4781 | 0 | { |
4782 | 0 | SCHECK_PARTIAL(); |
4783 | 0 | break; |
4784 | 0 | } |
4785 | 0 | GETCHARLEN(fc, Feptr, len); |
4786 | 0 | if (fc < 256 && (mb->ctypes[fc] & ctype_space) != 0) break; |
4787 | 0 | Feptr+= len; |
4788 | 0 | } |
4789 | 0 | break; |
4790 | | |
4791 | 0 | case OP_WHITESPACE: |
4792 | 0 | for (i = Lmin; i < Lmax; i++) |
4793 | 0 | { |
4794 | 0 | int len = 1; |
4795 | 0 | if (Feptr >= mb->end_subject) |
4796 | 0 | { |
4797 | 0 | SCHECK_PARTIAL(); |
4798 | 0 | break; |
4799 | 0 | } |
4800 | 0 | GETCHARLEN(fc, Feptr, len); |
4801 | 0 | if (fc >= 256 ||(mb->ctypes[fc] & ctype_space) == 0) break; |
4802 | 0 | Feptr+= len; |
4803 | 0 | } |
4804 | 0 | break; |
4805 | | |
4806 | 0 | case OP_NOT_WORDCHAR: |
4807 | 0 | for (i = Lmin; i < Lmax; i++) |
4808 | 0 | { |
4809 | 0 | int len = 1; |
4810 | 0 | if (Feptr >= mb->end_subject) |
4811 | 0 | { |
4812 | 0 | SCHECK_PARTIAL(); |
4813 | 0 | break; |
4814 | 0 | } |
4815 | 0 | GETCHARLEN(fc, Feptr, len); |
4816 | 0 | if (fc < 256 && (mb->ctypes[fc] & ctype_word) != 0) break; |
4817 | 0 | Feptr+= len; |
4818 | 0 | } |
4819 | 0 | break; |
4820 | | |
4821 | 0 | case OP_WORDCHAR: |
4822 | 0 | for (i = Lmin; i < Lmax; i++) |
4823 | 0 | { |
4824 | 0 | int len = 1; |
4825 | 0 | if (Feptr >= mb->end_subject) |
4826 | 0 | { |
4827 | 0 | SCHECK_PARTIAL(); |
4828 | 0 | break; |
4829 | 0 | } |
4830 | 0 | GETCHARLEN(fc, Feptr, len); |
4831 | 0 | if (fc >= 256 || (mb->ctypes[fc] & ctype_word) == 0) break; |
4832 | 0 | Feptr+= len; |
4833 | 0 | } |
4834 | 0 | break; |
4835 | | |
4836 | 0 | default: |
4837 | 0 | PCRE2_DEBUG_UNREACHABLE(); |
4838 | 0 | return PCRE2_ERROR_INTERNAL; |
4839 | 51.1M | } |
4840 | | |
4841 | 51.1M | if (reptype == REPTYPE_POS) continue; /* No backtracking */ |
4842 | | |
4843 | | /* After \C in UTF mode, Lstart_eptr might be in the middle of a |
4844 | | Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't go |
4845 | | too far. */ |
4846 | | |
4847 | 4.06M | for(;;) |
4848 | 333M | { |
4849 | 333M | if (Feptr <= Lstart_eptr) break; |
4850 | 329M | RMATCH(Fecode, RM220); |
4851 | 329M | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
4852 | 329M | Feptr--; |
4853 | 329M | BACKCHAR(Feptr); |
4854 | 329M | if (Lctype == OP_ANYNL && Feptr > Lstart_eptr && |
4855 | 329M | UCHAR21(Feptr) == CHAR_NL && UCHAR21(Feptr - 1) == CHAR_CR) |
4856 | 0 | Feptr--; |
4857 | 329M | } |
4858 | 4.06M | } |
4859 | 8.08M | else |
4860 | 8.08M | #endif /* SUPPORT_UNICODE */ |
4861 | | |
4862 | | /* Not UTF mode */ |
4863 | 8.08M | { |
4864 | 8.08M | switch(Lctype) |
4865 | 8.08M | { |
4866 | 2.80M | case OP_ANY: |
4867 | 11.0M | for (i = Lmin; i < Lmax; i++) |
4868 | 8.33M | { |
4869 | 8.33M | if (Feptr >= mb->end_subject) |
4870 | 14.5k | { |
4871 | 14.5k | SCHECK_PARTIAL(); |
4872 | 14.5k | break; |
4873 | 14.5k | } |
4874 | 8.32M | if (IS_NEWLINE(Feptr)) break; |
4875 | 8.26M | if (mb->partial != 0 && /* Take care with CRLF partial */ |
4876 | 8.26M | Feptr + 1 >= mb->end_subject && |
4877 | 8.26M | NLBLOCK->nltype == NLTYPE_FIXED && |
4878 | 8.26M | NLBLOCK->nllen == 2 && |
4879 | 8.26M | *Feptr == NLBLOCK->nl[0]) |
4880 | 0 | { |
4881 | 0 | mb->hitend = TRUE; |
4882 | 0 | if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; |
4883 | 0 | } |
4884 | 8.26M | Feptr++; |
4885 | 8.26M | } |
4886 | 2.80M | break; |
4887 | | |
4888 | 4.19M | case OP_ALLANY: |
4889 | 4.19M | case OP_ANYBYTE: |
4890 | 4.19M | fc = Lmax - Lmin; |
4891 | 4.19M | if (fc > (uint32_t)(mb->end_subject - Feptr)) |
4892 | 32.3k | { |
4893 | 32.3k | Feptr = mb->end_subject; |
4894 | 32.3k | SCHECK_PARTIAL(); |
4895 | 32.3k | } |
4896 | 4.16M | else Feptr += fc; |
4897 | 4.19M | break; |
4898 | | |
4899 | 4.19M | case OP_ANYNL: |
4900 | 94.3k | for (i = Lmin; i < Lmax; i++) |
4901 | 92.7k | { |
4902 | 92.7k | if (Feptr >= mb->end_subject) |
4903 | 6.51k | { |
4904 | 6.51k | SCHECK_PARTIAL(); |
4905 | 6.51k | break; |
4906 | 6.51k | } |
4907 | 86.2k | fc = *Feptr; |
4908 | 86.2k | if (fc == CHAR_CR) |
4909 | 7.66k | { |
4910 | 7.66k | if (++Feptr >= mb->end_subject) break; |
4911 | 1.40k | if (*Feptr == CHAR_LF) Feptr++; |
4912 | 1.40k | } |
4913 | 78.5k | else |
4914 | 78.5k | { |
4915 | 78.5k | if (fc != CHAR_LF && (mb->bsr_convention == PCRE2_BSR_ANYCRLF || |
4916 | 77.1k | (fc != CHAR_VT && fc != CHAR_FF && fc != CHAR_NEL |
4917 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
4918 | | && fc != 0x2028 && fc != 0x2029 |
4919 | | #endif |
4920 | 77.1k | ))) break; |
4921 | 25.1k | Feptr++; |
4922 | 25.1k | } |
4923 | 86.2k | } |
4924 | 67.7k | break; |
4925 | | |
4926 | 67.7k | case OP_NOT_HSPACE: |
4927 | 462k | for (i = Lmin; i < Lmax; i++) |
4928 | 462k | { |
4929 | 462k | if (Feptr >= mb->end_subject) |
4930 | 1.00k | { |
4931 | 1.00k | SCHECK_PARTIAL(); |
4932 | 1.00k | break; |
4933 | 1.00k | } |
4934 | 461k | switch(*Feptr) |
4935 | 461k | { |
4936 | 455k | default: Feptr++; break; |
4937 | 12.2k | HSPACE_BYTE_CASES: |
4938 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
4939 | | HSPACE_MULTIBYTE_CASES: |
4940 | | #endif |
4941 | 12.2k | goto ENDLOOP00; |
4942 | 461k | } |
4943 | 461k | } |
4944 | 7.44k | ENDLOOP00: |
4945 | 7.44k | break; |
4946 | | |
4947 | 1.13k | case OP_HSPACE: |
4948 | 6.79k | for (i = Lmin; i < Lmax; i++) |
4949 | 6.79k | { |
4950 | 6.79k | if (Feptr >= mb->end_subject) |
4951 | 0 | { |
4952 | 0 | SCHECK_PARTIAL(); |
4953 | 0 | break; |
4954 | 0 | } |
4955 | 6.79k | switch(*Feptr) |
4956 | 6.79k | { |
4957 | 882 | default: goto ENDLOOP01; |
4958 | 17.5k | HSPACE_BYTE_CASES: |
4959 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
4960 | | HSPACE_MULTIBYTE_CASES: |
4961 | | #endif |
4962 | 17.5k | Feptr++; break; |
4963 | 6.79k | } |
4964 | 6.79k | } |
4965 | 882 | ENDLOOP01: |
4966 | 882 | break; |
4967 | | |
4968 | 721k | case OP_NOT_VSPACE: |
4969 | 27.7M | for (i = Lmin; i < Lmax; i++) |
4970 | 27.7M | { |
4971 | 27.7M | if (Feptr >= mb->end_subject) |
4972 | 702 | { |
4973 | 702 | SCHECK_PARTIAL(); |
4974 | 702 | break; |
4975 | 702 | } |
4976 | 27.7M | switch(*Feptr) |
4977 | 27.7M | { |
4978 | 27.0M | default: Feptr++; break; |
4979 | 2.61M | VSPACE_BYTE_CASES: |
4980 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
4981 | | VSPACE_MULTIBYTE_CASES: |
4982 | | #endif |
4983 | 2.61M | goto ENDLOOP02; |
4984 | 27.7M | } |
4985 | 27.7M | } |
4986 | 721k | ENDLOOP02: |
4987 | 721k | break; |
4988 | | |
4989 | 5.61k | case OP_VSPACE: |
4990 | 32 | for (i = Lmin; i < Lmax; i++) |
4991 | 32 | { |
4992 | 32 | if (Feptr >= mb->end_subject) |
4993 | 0 | { |
4994 | 0 | SCHECK_PARTIAL(); |
4995 | 0 | break; |
4996 | 0 | } |
4997 | 32 | switch(*Feptr) |
4998 | 32 | { |
4999 | 24 | default: goto ENDLOOP03; |
5000 | 40 | VSPACE_BYTE_CASES: |
5001 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
5002 | | VSPACE_MULTIBYTE_CASES: |
5003 | | #endif |
5004 | 40 | Feptr++; break; |
5005 | 32 | } |
5006 | 32 | } |
5007 | 24 | ENDLOOP03: |
5008 | 24 | break; |
5009 | | |
5010 | 387 | case OP_NOT_DIGIT: |
5011 | 731 | for (i = Lmin; i < Lmax; i++) |
5012 | 417 | { |
5013 | 417 | if (Feptr >= mb->end_subject) |
5014 | 0 | { |
5015 | 0 | SCHECK_PARTIAL(); |
5016 | 0 | break; |
5017 | 0 | } |
5018 | 417 | if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_digit) != 0) |
5019 | 73 | break; |
5020 | 344 | Feptr++; |
5021 | 344 | } |
5022 | 387 | break; |
5023 | | |
5024 | 10.9k | case OP_DIGIT: |
5025 | 11.5k | for (i = Lmin; i < Lmax; i++) |
5026 | 10.9k | { |
5027 | 10.9k | if (Feptr >= mb->end_subject) |
5028 | 282 | { |
5029 | 282 | SCHECK_PARTIAL(); |
5030 | 282 | break; |
5031 | 282 | } |
5032 | 10.7k | if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_digit) == 0) |
5033 | 10.1k | break; |
5034 | 586 | Feptr++; |
5035 | 586 | } |
5036 | 10.9k | break; |
5037 | | |
5038 | 155k | case OP_NOT_WHITESPACE: |
5039 | 4.27M | for (i = Lmin; i < Lmax; i++) |
5040 | 4.27M | { |
5041 | 4.27M | if (Feptr >= mb->end_subject) |
5042 | 32.3k | { |
5043 | 32.3k | SCHECK_PARTIAL(); |
5044 | 32.3k | break; |
5045 | 32.3k | } |
5046 | 4.23M | if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_space) != 0) |
5047 | 123k | break; |
5048 | 4.11M | Feptr++; |
5049 | 4.11M | } |
5050 | 155k | break; |
5051 | | |
5052 | 155k | case OP_WHITESPACE: |
5053 | 16.0k | for (i = Lmin; i < Lmax; i++) |
5054 | 13.2k | { |
5055 | 13.2k | if (Feptr >= mb->end_subject) |
5056 | 16 | { |
5057 | 16 | SCHECK_PARTIAL(); |
5058 | 16 | break; |
5059 | 16 | } |
5060 | 13.2k | if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_space) == 0) |
5061 | 10.3k | break; |
5062 | 2.88k | Feptr++; |
5063 | 2.88k | } |
5064 | 13.2k | break; |
5065 | | |
5066 | 13.2k | case OP_NOT_WORDCHAR: |
5067 | 158k | for (i = Lmin; i < Lmax; i++) |
5068 | 158k | { |
5069 | 158k | if (Feptr >= mb->end_subject) |
5070 | 1.27k | { |
5071 | 1.27k | SCHECK_PARTIAL(); |
5072 | 1.27k | break; |
5073 | 1.27k | } |
5074 | 157k | if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_word) != 0) |
5075 | 5.49k | break; |
5076 | 151k | Feptr++; |
5077 | 151k | } |
5078 | 6.82k | break; |
5079 | | |
5080 | 104k | case OP_WORDCHAR: |
5081 | 476k | for (i = Lmin; i < Lmax; i++) |
5082 | 445k | { |
5083 | 445k | if (Feptr >= mb->end_subject) |
5084 | 372 | { |
5085 | 372 | SCHECK_PARTIAL(); |
5086 | 372 | break; |
5087 | 372 | } |
5088 | 445k | if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_word) == 0) |
5089 | 73.4k | break; |
5090 | 371k | Feptr++; |
5091 | 371k | } |
5092 | 104k | break; |
5093 | | |
5094 | 104k | default: |
5095 | 0 | PCRE2_DEBUG_UNREACHABLE(); |
5096 | 0 | return PCRE2_ERROR_INTERNAL; |
5097 | 8.08M | } |
5098 | | |
5099 | 8.08M | if (reptype == REPTYPE_POS) continue; /* No backtracking */ |
5100 | | |
5101 | 7.91M | for (;;) |
5102 | 51.9M | { |
5103 | 51.9M | if (Feptr == Lstart_eptr) break; |
5104 | 44.0M | RMATCH(Fecode, RM34); |
5105 | 44.0M | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
5106 | 44.0M | Feptr--; |
5107 | 44.0M | if (Lctype == OP_ANYNL && Feptr > Lstart_eptr && *Feptr == CHAR_LF && |
5108 | 44.0M | Feptr[-1] == CHAR_CR) Feptr--; |
5109 | 44.0M | } |
5110 | 7.91M | } |
5111 | 59.9M | } |
5112 | 12.6M | break; /* End of repeat character type processing */ |
5113 | | |
5114 | 12.6M | #undef Lstart_eptr |
5115 | 12.6M | #undef Lmin |
5116 | 12.6M | #undef Lmax |
5117 | 12.6M | #undef Lctype |
5118 | 12.6M | #undef Lpropvalue |
5119 | | |
5120 | | |
5121 | | /* ===================================================================== */ |
5122 | | /* Match a back reference, possibly repeatedly. Look past the end of the |
5123 | | item to see if there is repeat information following. The OP_REF and |
5124 | | OP_REFI opcodes are used for a reference to a numbered group or to a |
5125 | | non-duplicated named group. For a duplicated named group, OP_DNREF and |
5126 | | OP_DNREFI are used. In this case we must scan the list of groups to which |
5127 | | the name refers, and use the first one that is set. */ |
5128 | | |
5129 | 12.6M | #define Lmin F->temp_32[0] |
5130 | 12.6M | #define Lmax F->temp_32[1] |
5131 | 12.6M | #define Lcaseless F->temp_32[2] |
5132 | 12.6M | #define Lcaseopts F->temp_32[3] |
5133 | 12.6M | #define Lstart F->temp_sptr[0] |
5134 | 12.6M | #define Loffset F->temp_size |
5135 | | |
5136 | 12.6M | case OP_DNREF: |
5137 | 0 | case OP_DNREFI: |
5138 | 0 | Lcaseless = (Fop == OP_DNREFI); |
5139 | 0 | Lcaseopts = (Fop == OP_DNREFI)? Fecode[1 + 2*IMM2_SIZE] : 0; |
5140 | 0 | { |
5141 | 0 | int count = GET2(Fecode, 1+IMM2_SIZE); |
5142 | 0 | PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size; |
5143 | 0 | Fecode += 1 + 2*IMM2_SIZE + (Fop == OP_DNREFI? 1 : 0); |
5144 | |
|
5145 | 0 | while (count-- > 0) |
5146 | 0 | { |
5147 | 0 | Loffset = (GET2(slot, 0) << 1) - 2; |
5148 | 0 | if (Loffset < Foffset_top && Fovector[Loffset] != PCRE2_UNSET) break; |
5149 | 0 | slot += mb->name_entry_size; |
5150 | 0 | } |
5151 | 0 | } |
5152 | 0 | goto REF_REPEAT; |
5153 | | |
5154 | 0 | case OP_REF: |
5155 | 0 | case OP_REFI: |
5156 | 0 | Lcaseless = (Fop == OP_REFI); |
5157 | 0 | Lcaseopts = (Fop == OP_REFI)? Fecode[1 + IMM2_SIZE] : 0; |
5158 | 0 | Loffset = (GET2(Fecode, 1) << 1) - 2; |
5159 | 0 | Fecode += 1 + IMM2_SIZE + (Fop == OP_REFI? 1 : 0); |
5160 | | |
5161 | | /* Set up for repetition, or handle the non-repeated case. The maximum and |
5162 | | minimum must be in the heap frame, but as they are short-term values, we |
5163 | | use temporary fields. */ |
5164 | |
|
5165 | 0 | REF_REPEAT: |
5166 | 0 | switch (*Fecode) |
5167 | 0 | { |
5168 | 0 | case OP_CRSTAR: |
5169 | 0 | case OP_CRMINSTAR: |
5170 | 0 | case OP_CRPLUS: |
5171 | 0 | case OP_CRMINPLUS: |
5172 | 0 | case OP_CRQUERY: |
5173 | 0 | case OP_CRMINQUERY: |
5174 | 0 | fc = *Fecode++ - OP_CRSTAR; |
5175 | 0 | Lmin = rep_min[fc]; |
5176 | 0 | Lmax = rep_max[fc]; |
5177 | 0 | reptype = rep_typ[fc]; |
5178 | 0 | break; |
5179 | | |
5180 | 0 | case OP_CRRANGE: |
5181 | 0 | case OP_CRMINRANGE: |
5182 | 0 | Lmin = GET2(Fecode, 1); |
5183 | 0 | Lmax = GET2(Fecode, 1 + IMM2_SIZE); |
5184 | 0 | reptype = rep_typ[*Fecode - OP_CRSTAR]; |
5185 | 0 | if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */ |
5186 | 0 | Fecode += 1 + 2 * IMM2_SIZE; |
5187 | 0 | break; |
5188 | | |
5189 | 0 | default: /* No repeat follows */ |
5190 | 0 | { |
5191 | 0 | rrc = match_ref(Loffset, Lcaseless, Lcaseopts, F, mb, &length); |
5192 | 0 | if (rrc != 0) |
5193 | 0 | { |
5194 | 0 | if (rrc > 0) Feptr = mb->end_subject; /* Partial match */ |
5195 | 0 | CHECK_PARTIAL(); |
5196 | 0 | RRETURN(MATCH_NOMATCH); |
5197 | 0 | } |
5198 | 0 | } |
5199 | 0 | Feptr += length; |
5200 | 0 | continue; /* With the main loop */ |
5201 | 0 | } |
5202 | | |
5203 | | /* Handle repeated back references. If a set group has length zero, just |
5204 | | continue with the main loop, because it matches however many times. For an |
5205 | | unset reference, if the minimum is zero, we can also just continue. We can |
5206 | | also continue if PCRE2_MATCH_UNSET_BACKREF is set, because this makes an unset |
5207 | | group behave as a zero-length group. For any other unset case, carrying |
5208 | | on will result in NOMATCH. */ |
5209 | | |
5210 | 0 | if (Loffset < Foffset_top && Fovector[Loffset] != PCRE2_UNSET) |
5211 | 0 | { |
5212 | 0 | if (Fovector[Loffset] == Fovector[Loffset + 1]) continue; |
5213 | 0 | } |
5214 | 0 | else /* Group is not set */ |
5215 | 0 | { |
5216 | 0 | if (Lmin == 0 || (mb->poptions & PCRE2_MATCH_UNSET_BACKREF) != 0) |
5217 | 0 | continue; |
5218 | 0 | } |
5219 | | |
5220 | | /* First, ensure the minimum number of matches are present. */ |
5221 | | |
5222 | 0 | for (i = 1; i <= Lmin; i++) |
5223 | 0 | { |
5224 | 0 | PCRE2_SIZE slength; |
5225 | 0 | rrc = match_ref(Loffset, Lcaseless, Lcaseopts, F, mb, &slength); |
5226 | 0 | if (rrc != 0) |
5227 | 0 | { |
5228 | 0 | if (rrc > 0) Feptr = mb->end_subject; /* Partial match */ |
5229 | 0 | CHECK_PARTIAL(); |
5230 | 0 | RRETURN(MATCH_NOMATCH); |
5231 | 0 | } |
5232 | 0 | Feptr += slength; |
5233 | 0 | } |
5234 | | |
5235 | | /* If min = max, we are done. They are not both allowed to be zero. */ |
5236 | | |
5237 | 0 | if (Lmin == Lmax) continue; |
5238 | | |
5239 | | /* If minimizing, keep trying and advancing the pointer. */ |
5240 | | |
5241 | 0 | if (reptype == REPTYPE_MIN) |
5242 | 0 | { |
5243 | 0 | for (;;) |
5244 | 0 | { |
5245 | 0 | PCRE2_SIZE slength; |
5246 | 0 | RMATCH(Fecode, RM20); |
5247 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
5248 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
5249 | 0 | rrc = match_ref(Loffset, Lcaseless, Lcaseopts, F, mb, &slength); |
5250 | 0 | if (rrc != 0) |
5251 | 0 | { |
5252 | 0 | if (rrc > 0) Feptr = mb->end_subject; /* Partial match */ |
5253 | 0 | CHECK_PARTIAL(); |
5254 | 0 | RRETURN(MATCH_NOMATCH); |
5255 | 0 | } |
5256 | 0 | Feptr += slength; |
5257 | 0 | } |
5258 | | |
5259 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
5260 | 0 | } |
5261 | | |
5262 | | /* If maximizing, find the longest string and work backwards, as long as |
5263 | | the matched lengths for each iteration are the same. */ |
5264 | | |
5265 | 0 | else |
5266 | 0 | { |
5267 | 0 | BOOL samelengths = TRUE; |
5268 | 0 | Lstart = Feptr; /* Starting position */ |
5269 | 0 | Flength = Fovector[Loffset+1] - Fovector[Loffset]; |
5270 | |
|
5271 | 0 | for (i = Lmin; i < Lmax; i++) |
5272 | 0 | { |
5273 | 0 | PCRE2_SIZE slength; |
5274 | 0 | rrc = match_ref(Loffset, Lcaseless, Lcaseopts, F, mb, &slength); |
5275 | 0 | if (rrc != 0) |
5276 | 0 | { |
5277 | | /* Can't use CHECK_PARTIAL because we don't want to update Feptr in |
5278 | | the soft partial matching case. */ |
5279 | |
|
5280 | 0 | if (rrc > 0 && mb->partial != 0 && |
5281 | 0 | mb->end_subject > mb->start_used_ptr) |
5282 | 0 | { |
5283 | 0 | mb->hitend = TRUE; |
5284 | 0 | if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; |
5285 | 0 | } |
5286 | 0 | break; |
5287 | 0 | } |
5288 | | |
5289 | 0 | if (slength != Flength) samelengths = FALSE; |
5290 | 0 | Feptr += slength; |
5291 | 0 | } |
5292 | | |
5293 | | /* If the length matched for each repetition is the same as the length of |
5294 | | the captured group, we can easily work backwards. This is the normal |
5295 | | case. However, in caseless UTF-8 mode there are pairs of case-equivalent |
5296 | | characters whose lengths (in terms of code units) differ. Because this |
5297 | | is very rare, we handle it by re-matching fewer and fewer times. */ |
5298 | | |
5299 | 0 | if (samelengths) |
5300 | 0 | { |
5301 | 0 | while (Feptr >= Lstart) |
5302 | 0 | { |
5303 | 0 | RMATCH(Fecode, RM21); |
5304 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
5305 | 0 | Feptr -= Flength; |
5306 | 0 | } |
5307 | 0 | } |
5308 | | |
5309 | | /* The rare case of non-matching lengths. Re-scan the repetition for each |
5310 | | iteration. We know that match_ref() will succeed every time. */ |
5311 | | |
5312 | 0 | else |
5313 | 0 | { |
5314 | 0 | Lmax = i; |
5315 | 0 | for (;;) |
5316 | 0 | { |
5317 | 0 | RMATCH(Fecode, RM22); |
5318 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
5319 | 0 | if (Feptr == Lstart) break; /* Failed after minimal repetition */ |
5320 | 0 | Feptr = Lstart; |
5321 | 0 | Lmax--; |
5322 | 0 | for (i = Lmin; i < Lmax; i++) |
5323 | 0 | { |
5324 | 0 | PCRE2_SIZE slength; |
5325 | 0 | (void)match_ref(Loffset, Lcaseless, Lcaseopts, F, mb, &slength); |
5326 | 0 | Feptr += slength; |
5327 | 0 | } |
5328 | 0 | } |
5329 | 0 | } |
5330 | | |
5331 | 0 | RRETURN(MATCH_NOMATCH); |
5332 | 0 | } |
5333 | | |
5334 | 0 | PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */ |
5335 | |
|
5336 | 0 | #undef Lcaseless |
5337 | 0 | #undef Lmin |
5338 | 0 | #undef Lmax |
5339 | 0 | #undef Lstart |
5340 | 0 | #undef Loffset |
5341 | | |
5342 | | |
5343 | | |
5344 | | /* ========================================================================= */ |
5345 | | /* Opcodes for the start of various parenthesized items */ |
5346 | | /* ========================================================================= */ |
5347 | | |
5348 | | /* In all cases, if the result of RMATCH() is MATCH_THEN, check whether the |
5349 | | (*THEN) is within the current branch by comparing the address of OP_THEN |
5350 | | that is passed back with the end of the branch. If (*THEN) is within the |
5351 | | current branch, and the branch is one of two or more alternatives (it |
5352 | | either starts or ends with OP_ALT), we have reached the limit of THEN's |
5353 | | action, so convert the return code to NOMATCH, which will cause normal |
5354 | | backtracking to happen from now on. Otherwise, THEN is passed back to an |
5355 | | outer alternative. This implements Perl's treatment of parenthesized |
5356 | | groups, where a group not containing | does not affect the current |
5357 | | alternative, that is, (X) is NOT the same as (X|(*F)). */ |
5358 | | |
5359 | | |
5360 | | /* ===================================================================== */ |
5361 | | /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a non-possessive |
5362 | | bracket group, indicating that it may occur zero times. It may repeat |
5363 | | infinitely, or not at all - i.e. it could be ()* or ()? or even (){0} in |
5364 | | the pattern. Brackets with fixed upper repeat limits are compiled as a |
5365 | | number of copies, with the optional ones preceded by BRAZERO or BRAMINZERO. |
5366 | | Possessive groups with possible zero repeats are preceded by BRAPOSZERO. */ |
5367 | |
|
5368 | 503 | #define Lnext_ecode F->temp_sptr[0] |
5369 | |
|
5370 | 32 | case OP_BRAZERO: |
5371 | 32 | Lnext_ecode = Fecode + 1; |
5372 | 32 | RMATCH(Lnext_ecode, RM9); |
5373 | 29 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
5374 | 221 | do Lnext_ecode += GET(Lnext_ecode, 1); while (*Lnext_ecode == OP_ALT); |
5375 | 29 | Fecode = Lnext_ecode + 1 + LINK_SIZE; |
5376 | 29 | break; |
5377 | | |
5378 | 0 | case OP_BRAMINZERO: |
5379 | 0 | Lnext_ecode = Fecode + 1; |
5380 | 0 | do Lnext_ecode += GET(Lnext_ecode, 1); while (*Lnext_ecode == OP_ALT); |
5381 | 0 | RMATCH(Lnext_ecode + 1 + LINK_SIZE, RM10); |
5382 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
5383 | 0 | Fecode++; |
5384 | 0 | break; |
5385 | | |
5386 | 0 | #undef Lnext_ecode |
5387 | | |
5388 | 0 | case OP_SKIPZERO: |
5389 | 0 | Fecode++; |
5390 | 0 | do Fecode += GET(Fecode,1); while (*Fecode == OP_ALT); |
5391 | 0 | Fecode += 1 + LINK_SIZE; |
5392 | 0 | break; |
5393 | | |
5394 | | |
5395 | | /* ===================================================================== */ |
5396 | | /* Handle possessive brackets with an unlimited repeat. The end of these |
5397 | | brackets will always be OP_KETRPOS, which returns MATCH_KETRPOS without |
5398 | | going further in the pattern. */ |
5399 | | |
5400 | 3.67k | #define Lframe_type F->temp_32[0] |
5401 | 6.50k | #define Lmatched_once F->temp_32[1] |
5402 | 2.03k | #define Lzero_allowed F->temp_32[2] |
5403 | 3.34k | #define Lstart_eptr F->temp_sptr[0] |
5404 | 1.78k | #define Lstart_group F->temp_sptr[1] |
5405 | | |
5406 | 0 | case OP_BRAPOSZERO: |
5407 | 0 | Lzero_allowed = TRUE; /* Zero repeat is allowed */ |
5408 | 0 | Fecode += 1; |
5409 | 0 | if (*Fecode == OP_CBRAPOS || *Fecode == OP_SCBRAPOS) |
5410 | 0 | goto POSSESSIVE_CAPTURE; |
5411 | 0 | goto POSSESSIVE_NON_CAPTURE; |
5412 | | |
5413 | 0 | case OP_BRAPOS: |
5414 | 0 | case OP_SBRAPOS: |
5415 | 0 | Lzero_allowed = FALSE; /* Zero repeat not allowed */ |
5416 | |
|
5417 | 0 | POSSESSIVE_NON_CAPTURE: |
5418 | 0 | Lframe_type = GF_NOCAPTURE; /* Remembered frame type */ |
5419 | 0 | goto POSSESSIVE_GROUP; |
5420 | | |
5421 | 400 | case OP_CBRAPOS: |
5422 | 1.70k | case OP_SCBRAPOS: |
5423 | 1.70k | Lzero_allowed = FALSE; /* Zero repeat not allowed */ |
5424 | | |
5425 | 1.70k | POSSESSIVE_CAPTURE: |
5426 | 1.70k | number = GET2(Fecode, 1+LINK_SIZE); |
5427 | 1.70k | Lframe_type = GF_CAPTURE | number; /* Remembered frame type */ |
5428 | | |
5429 | 1.70k | POSSESSIVE_GROUP: |
5430 | 1.70k | Lmatched_once = FALSE; /* Never matched */ |
5431 | 1.70k | Lstart_group = Fecode; /* Start of this group */ |
5432 | | |
5433 | 1.70k | for (;;) |
5434 | 1.96k | { |
5435 | 1.96k | Lstart_eptr = Feptr; /* Position at group start */ |
5436 | 1.96k | group_frame_type = Lframe_type; |
5437 | 1.96k | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM8); |
5438 | 1.96k | if (rrc == MATCH_KETRPOS) |
5439 | 1.38k | { |
5440 | 1.38k | Lmatched_once = TRUE; /* Matched at least once */ |
5441 | 1.38k | if (Feptr == Lstart_eptr) /* Empty match; skip to end */ |
5442 | 1.30k | { |
5443 | 1.30k | do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT); |
5444 | 1.30k | break; |
5445 | 1.30k | } |
5446 | | |
5447 | 76 | Fecode = Lstart_group; |
5448 | 76 | continue; |
5449 | 1.38k | } |
5450 | | |
5451 | | /* See comment above about handling THEN. */ |
5452 | | |
5453 | 582 | if (rrc == MATCH_THEN) |
5454 | 0 | { |
5455 | 0 | PCRE2_SPTR next_ecode = Fecode + GET(Fecode,1); |
5456 | 0 | if (mb->verb_ecode_ptr < next_ecode && |
5457 | 0 | (*Fecode == OP_ALT || *next_ecode == OP_ALT)) |
5458 | 0 | rrc = MATCH_NOMATCH; |
5459 | 0 | } |
5460 | | |
5461 | 582 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
5462 | 582 | Fecode += GET(Fecode, 1); |
5463 | 582 | if (*Fecode != OP_ALT) break; |
5464 | 582 | } |
5465 | | |
5466 | | /* Success if matched something or zero repeat allowed */ |
5467 | | |
5468 | 1.70k | if (Lmatched_once || Lzero_allowed) |
5469 | 1.38k | { |
5470 | 1.38k | Fecode += 1 + LINK_SIZE; |
5471 | 1.38k | break; |
5472 | 1.38k | } |
5473 | | |
5474 | 324 | RRETURN(MATCH_NOMATCH); |
5475 | | |
5476 | 0 | #undef Lmatched_once |
5477 | 0 | #undef Lzero_allowed |
5478 | 0 | #undef Lframe_type |
5479 | 0 | #undef Lstart_eptr |
5480 | 0 | #undef Lstart_group |
5481 | | |
5482 | | |
5483 | | /* ===================================================================== */ |
5484 | | /* Handle non-capturing brackets that cannot match an empty string. When we |
5485 | | get to the final alternative within the brackets, as long as there are no |
5486 | | THEN's in the pattern, we can optimize by not recording a new backtracking |
5487 | | point. (Ideally we should test for a THEN within this group, but we don't |
5488 | | have that information.) Don't do this if we are at the very top level, |
5489 | | however, because that would make handling assertions and once-only brackets |
5490 | | messier when there is nothing to go back to. */ |
5491 | | |
5492 | 9.65M | #define Lframe_type F->temp_32[0] /* Set for all that use GROUPLOOP */ |
5493 | 9.44k | #define Lnext_branch F->temp_sptr[0] /* Used only in OP_BRA handling */ |
5494 | | |
5495 | 362k | case OP_BRA: |
5496 | 362k | if (mb->hasthen || Frdepth == 0) |
5497 | 360k | { |
5498 | 360k | Lframe_type = 0; |
5499 | 360k | goto GROUPLOOP; |
5500 | 360k | } |
5501 | | |
5502 | 1.29k | for (;;) |
5503 | 3.57k | { |
5504 | 3.57k | Lnext_branch = Fecode + GET(Fecode, 1); |
5505 | 3.57k | if (*Lnext_branch != OP_ALT) break; |
5506 | | |
5507 | | /* This is never the final branch. We do not need to test for MATCH_THEN |
5508 | | here because this code is not used when there is a THEN in the pattern. */ |
5509 | | |
5510 | 2.29k | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM1); |
5511 | 2.28k | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
5512 | 2.28k | Fecode = Lnext_branch; |
5513 | 2.28k | } |
5514 | | |
5515 | | /* Hit the start of the final branch. Continue at this level. */ |
5516 | | |
5517 | 1.28k | Fecode += PRIV(OP_lengths)[*Fecode]; |
5518 | 1.28k | break; |
5519 | | |
5520 | 0 | #undef Lnext_branch |
5521 | | |
5522 | | |
5523 | | /* ===================================================================== */ |
5524 | | /* Handle a capturing bracket, other than those that are possessive with an |
5525 | | unlimited repeat. */ |
5526 | | |
5527 | 2.03M | case OP_CBRA: |
5528 | 2.03M | case OP_SCBRA: |
5529 | 2.03M | Lframe_type = GF_CAPTURE | GET2(Fecode, 1+LINK_SIZE); |
5530 | 2.03M | goto GROUPLOOP; |
5531 | | |
5532 | | |
5533 | | /* ===================================================================== */ |
5534 | | /* Atomic groups and non-capturing brackets that can match an empty string |
5535 | | must record a backtracking point and also set up a chained frame. */ |
5536 | | |
5537 | 0 | case OP_ONCE: |
5538 | 0 | case OP_SCRIPT_RUN: |
5539 | 20 | case OP_SBRA: |
5540 | 20 | Lframe_type = GF_NOCAPTURE | Fop; |
5541 | | |
5542 | 2.39M | GROUPLOOP: |
5543 | 2.39M | for (;;) |
5544 | 7.25M | { |
5545 | 7.25M | group_frame_type = Lframe_type; |
5546 | 7.25M | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM2); |
5547 | 7.25M | if (rrc == MATCH_THEN) |
5548 | 0 | { |
5549 | 0 | PCRE2_SPTR next_ecode = Fecode + GET(Fecode,1); |
5550 | 0 | if (mb->verb_ecode_ptr < next_ecode && |
5551 | 0 | (*Fecode == OP_ALT || *next_ecode == OP_ALT)) |
5552 | 0 | rrc = MATCH_NOMATCH; |
5553 | 0 | } |
5554 | 7.25M | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
5555 | 7.25M | Fecode += GET(Fecode, 1); |
5556 | 7.25M | if (*Fecode != OP_ALT) RRETURN(MATCH_NOMATCH); |
5557 | 7.25M | } |
5558 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
5559 | |
|
5560 | 0 | #undef Lframe_type |
5561 | | |
5562 | | |
5563 | | /* ===================================================================== */ |
5564 | | /* Pattern recursion either matches the current regex, or some |
5565 | | subexpression. The offset data is the offset to the starting bracket from |
5566 | | the start of the whole pattern. This is so that it works from duplicated |
5567 | | subpatterns. For a whole-pattern recursion, we have to infer the number |
5568 | | zero. */ |
5569 | |
|
5570 | 1.01M | #define Lframe_type F->temp_32[0] |
5571 | 3.01M | #define Lstart_branch F->temp_sptr[0] |
5572 | |
|
5573 | 18.2k | case OP_RECURSE: |
5574 | 18.2k | bracode = mb->start_code + GET(Fecode, 1); |
5575 | 18.2k | number = (bracode == mb->start_code)? 0 : GET2(bracode, 1 + LINK_SIZE); |
5576 | | |
5577 | | /* If we are already in a pattern recursion, check for repeating the same |
5578 | | one without changing the subject pointer or the last referenced character |
5579 | | in the subject. This should catch convoluted mutual recursions; some |
5580 | | simple cases are caught at compile time. However, there are rare cases when |
5581 | | this check needs to be turned off. In this case, actual recursion loops |
5582 | | will be caught by the match or heap limits. */ |
5583 | | |
5584 | 18.2k | if (Fcurrent_recurse != RECURSE_UNSET) |
5585 | 18.2k | { |
5586 | 18.2k | offset = Flast_group_offset; |
5587 | 18.2k | while (offset != PCRE2_UNSET) |
5588 | 18.2k | { |
5589 | 18.2k | N = (heapframe *)((char *)match_data->heapframes + offset); |
5590 | 18.2k | P = (heapframe *)((char *)N - frame_size); |
5591 | 18.2k | if (N->group_frame_type == (GF_RECURSE | number)) |
5592 | 18.2k | { |
5593 | 18.2k | if (Feptr == P->eptr && mb->last_used_ptr == P->recurse_last_used && |
5594 | 18.2k | (mb->moptions & PCRE2_DISABLE_RECURSELOOP_CHECK) == 0) |
5595 | 15 | return PCRE2_ERROR_RECURSELOOP; |
5596 | 18.1k | break; |
5597 | 18.2k | } |
5598 | 43 | offset = P->last_group_offset; |
5599 | 43 | } |
5600 | 18.2k | } |
5601 | | |
5602 | | /* Remember the current last referenced character and then run the |
5603 | | recursion branch by branch. */ |
5604 | | |
5605 | 18.2k | F->recurse_last_used = mb->last_used_ptr; |
5606 | 18.2k | Lstart_branch = bracode; |
5607 | 18.2k | Lframe_type = GF_RECURSE | number; |
5608 | | |
5609 | 18.2k | for (;;) |
5610 | 1.00M | { |
5611 | 1.00M | PCRE2_SPTR next_ecode; |
5612 | | |
5613 | 1.00M | group_frame_type = Lframe_type; |
5614 | 1.00M | RMATCH(Lstart_branch + PRIV(OP_lengths)[*Lstart_branch], RM11); |
5615 | 1.00M | next_ecode = Lstart_branch + GET(Lstart_branch,1); |
5616 | | |
5617 | | /* Handle backtracking verbs, which are defined in a range that can |
5618 | | easily be tested for. PCRE does not allow THEN, SKIP, PRUNE or COMMIT to |
5619 | | escape beyond a recursion; they cause a NOMATCH for the entire recursion. |
5620 | | |
5621 | | When one of these verbs triggers, the current recursion group number is |
5622 | | recorded. If it matches the recursion we are processing, the verb |
5623 | | happened within the recursion and we must deal with it. Otherwise it must |
5624 | | have happened after the recursion completed, and so has to be passed |
5625 | | back. See comment above about handling THEN. */ |
5626 | | |
5627 | 1.00M | if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX && |
5628 | 1.00M | mb->verb_current_recurse == (Lframe_type ^ GF_RECURSE)) |
5629 | 0 | { |
5630 | 0 | if (rrc == MATCH_THEN && mb->verb_ecode_ptr < next_ecode && |
5631 | 0 | (*Lstart_branch == OP_ALT || *next_ecode == OP_ALT)) |
5632 | 0 | rrc = MATCH_NOMATCH; |
5633 | 0 | else RRETURN(MATCH_NOMATCH); |
5634 | 0 | } |
5635 | | |
5636 | | /* Note that carrying on after (*ACCEPT) in a recursion is handled in the |
5637 | | OP_ACCEPT code. Nothing needs to be done here. */ |
5638 | | |
5639 | 1.00M | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
5640 | 1.00M | Lstart_branch = next_ecode; |
5641 | 1.00M | if (*Lstart_branch != OP_ALT) RRETURN(MATCH_NOMATCH); |
5642 | 1.00M | } |
5643 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
5644 | |
|
5645 | 0 | #undef Lframe_type |
5646 | 0 | #undef Lstart_branch |
5647 | | |
5648 | | |
5649 | | /* ===================================================================== */ |
5650 | | /* Positive assertions are like other groups except that PCRE doesn't allow |
5651 | | the effect of (*THEN) to escape beyond an assertion; it is therefore |
5652 | | treated as NOMATCH. (*ACCEPT) is treated as successful assertion, with its |
5653 | | captures and mark retained. Any other return is an error. */ |
5654 | |
|
5655 | 1.84k | #define Lframe_type F->temp_32[0] |
5656 | |
|
5657 | 0 | case OP_ASSERT: |
5658 | 292 | case OP_ASSERTBACK: |
5659 | 292 | case OP_ASSERT_NA: |
5660 | 414 | case OP_ASSERTBACK_NA: |
5661 | 414 | Lframe_type = GF_NOCAPTURE | Fop; |
5662 | 414 | for (;;) |
5663 | 1.42k | { |
5664 | 1.42k | group_frame_type = Lframe_type; |
5665 | 1.42k | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM3); |
5666 | 1.42k | if (rrc == MATCH_ACCEPT) |
5667 | 0 | { |
5668 | 0 | memcpy(Fovector, |
5669 | 0 | (char *)assert_accept_frame + offsetof(heapframe, ovector), |
5670 | 0 | assert_accept_frame->offset_top * sizeof(PCRE2_SIZE)); |
5671 | 0 | Foffset_top = assert_accept_frame->offset_top; |
5672 | 0 | Fmark = assert_accept_frame->mark; |
5673 | 0 | break; |
5674 | 0 | } |
5675 | 1.42k | if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); |
5676 | 1.42k | Fecode += GET(Fecode, 1); |
5677 | 1.42k | if (*Fecode != OP_ALT) RRETURN(MATCH_NOMATCH); |
5678 | 1.42k | } |
5679 | | |
5680 | 0 | do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT); |
5681 | 0 | Fecode += 1 + LINK_SIZE; |
5682 | 0 | break; |
5683 | | |
5684 | 0 | #undef Lframe_type |
5685 | | |
5686 | | |
5687 | | /* ===================================================================== */ |
5688 | | /* Handle negative assertions. Loop for each non-matching branch as for |
5689 | | positive assertions. */ |
5690 | | |
5691 | 112 | #define Lframe_type F->temp_32[0] |
5692 | | |
5693 | 28 | case OP_ASSERT_NOT: |
5694 | 28 | case OP_ASSERTBACK_NOT: |
5695 | 28 | Lframe_type = GF_NOCAPTURE | Fop; |
5696 | | |
5697 | 28 | for (;;) |
5698 | 84 | { |
5699 | 84 | group_frame_type = Lframe_type; |
5700 | 84 | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM4); |
5701 | 84 | switch(rrc) |
5702 | 84 | { |
5703 | 0 | case MATCH_ACCEPT: /* Assertion matched, therefore it fails. */ |
5704 | 28 | case MATCH_MATCH: |
5705 | 28 | RRETURN (MATCH_NOMATCH); |
5706 | | |
5707 | 56 | case MATCH_NOMATCH: /* Branch failed, try next if present. */ |
5708 | 56 | case MATCH_THEN: |
5709 | 56 | Fecode += GET(Fecode, 1); |
5710 | 56 | if (*Fecode != OP_ALT) goto ASSERT_NOT_FAILED; |
5711 | 56 | break; |
5712 | | |
5713 | 56 | case MATCH_COMMIT: /* Assertion forced to fail, therefore continue. */ |
5714 | 0 | case MATCH_SKIP: |
5715 | 0 | case MATCH_PRUNE: |
5716 | 0 | do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT); |
5717 | 0 | goto ASSERT_NOT_FAILED; |
5718 | | |
5719 | 0 | default: /* Pass back any other return */ |
5720 | 0 | RRETURN(rrc); |
5721 | 84 | } |
5722 | 84 | } |
5723 | | |
5724 | | /* None of the branches have matched or there was a backtrack to (*COMMIT), |
5725 | | (*SKIP), (*PRUNE), or (*THEN) in the last branch. This is success for a |
5726 | | negative assertion, so carry on. */ |
5727 | | |
5728 | 0 | ASSERT_NOT_FAILED: |
5729 | 0 | Fecode += 1 + LINK_SIZE; |
5730 | 0 | break; |
5731 | | |
5732 | 0 | #undef Lframe_type |
5733 | | |
5734 | | /* ===================================================================== */ |
5735 | | /* Handle scan substring operation. */ |
5736 | | |
5737 | 0 | #define Lframe_type F->temp_32[0] |
5738 | 0 | #define Lextra_size F->temp_32[1] |
5739 | 0 | #define Lsaved_moptions F->temp_32[2] |
5740 | 0 | #define Lsaved_end_subject F->temp_sptr[0] |
5741 | 0 | #define Lsaved_eptr F->temp_sptr[1] |
5742 | 0 | #define Ltrue_end_extra F->temp_size |
5743 | | |
5744 | 0 | case OP_ASSERT_SCS: |
5745 | 0 | { |
5746 | 0 | PCRE2_SPTR ecode = Fecode + 1 + LINK_SIZE; |
5747 | 0 | uint32_t extra_size = 0; |
5748 | 0 | int count; |
5749 | 0 | PCRE2_SPTR slot; |
5750 | | |
5751 | | /* Disable compiler warning. */ |
5752 | 0 | offset = 0; |
5753 | 0 | (void)offset; |
5754 | |
|
5755 | 0 | for (;;) |
5756 | 0 | { |
5757 | 0 | if (*ecode == OP_CREF) |
5758 | 0 | { |
5759 | 0 | extra_size += 1+IMM2_SIZE; |
5760 | 0 | offset = (GET2(ecode, 1) << 1) - 2; |
5761 | 0 | ecode += 1+IMM2_SIZE; |
5762 | 0 | if (offset < Foffset_top && Fovector[offset] != PCRE2_UNSET) |
5763 | 0 | goto SCS_OFFSET_FOUND; |
5764 | 0 | continue; |
5765 | 0 | } |
5766 | | |
5767 | 0 | if (*ecode != OP_DNCREF) RRETURN(MATCH_NOMATCH); |
5768 | | |
5769 | 0 | count = GET2(ecode, 1 + IMM2_SIZE); |
5770 | 0 | slot = mb->name_table + GET2(ecode, 1) * mb->name_entry_size; |
5771 | 0 | extra_size += 1+2*IMM2_SIZE; |
5772 | 0 | ecode += 1+2*IMM2_SIZE; |
5773 | |
|
5774 | 0 | while (count > 0) |
5775 | 0 | { |
5776 | 0 | offset = (GET2(slot, 0) << 1) - 2; |
5777 | 0 | if (offset < Foffset_top && Fovector[offset] != PCRE2_UNSET) |
5778 | 0 | goto SCS_OFFSET_FOUND; |
5779 | 0 | slot += mb->name_entry_size; |
5780 | 0 | count--; |
5781 | 0 | } |
5782 | 0 | } |
5783 | | |
5784 | 0 | SCS_OFFSET_FOUND: |
5785 | | |
5786 | | /* Skip remaining options. */ |
5787 | 0 | for (;;) |
5788 | 0 | { |
5789 | 0 | if (*ecode == OP_CREF) |
5790 | 0 | { |
5791 | 0 | extra_size += 1+IMM2_SIZE; |
5792 | 0 | ecode += 1+IMM2_SIZE; |
5793 | 0 | } |
5794 | 0 | else if (*ecode == OP_DNCREF) |
5795 | 0 | { |
5796 | 0 | extra_size += 1+2*IMM2_SIZE; |
5797 | 0 | ecode += 1+2*IMM2_SIZE; |
5798 | 0 | } |
5799 | 0 | else break; |
5800 | 0 | } |
5801 | |
|
5802 | 0 | Lextra_size = extra_size; |
5803 | 0 | } |
5804 | | |
5805 | 0 | Lsaved_end_subject = mb->end_subject; |
5806 | 0 | Ltrue_end_extra = mb->true_end_subject - mb->end_subject; |
5807 | 0 | Lsaved_eptr = Feptr; |
5808 | 0 | Lsaved_moptions = mb->moptions; |
5809 | |
|
5810 | 0 | Feptr = mb->start_subject + Fovector[offset]; |
5811 | 0 | mb->true_end_subject = mb->end_subject = |
5812 | 0 | mb->start_subject + Fovector[offset + 1]; |
5813 | 0 | mb->moptions &= ~PCRE2_NOTEOL; |
5814 | |
|
5815 | 0 | Lframe_type = GF_NOCAPTURE | Fop; |
5816 | 0 | for (;;) |
5817 | 0 | { |
5818 | 0 | group_frame_type = Lframe_type; |
5819 | 0 | RMATCH(Fecode + 1 + LINK_SIZE + Lextra_size, RM38); |
5820 | 0 | if (rrc == MATCH_ACCEPT) |
5821 | 0 | { |
5822 | 0 | memcpy(Fovector, |
5823 | 0 | (char *)assert_accept_frame + offsetof(heapframe, ovector), |
5824 | 0 | assert_accept_frame->offset_top * sizeof(PCRE2_SIZE)); |
5825 | 0 | Foffset_top = assert_accept_frame->offset_top; |
5826 | 0 | Fmark = assert_accept_frame->mark; |
5827 | 0 | break; |
5828 | 0 | } |
5829 | | |
5830 | 0 | if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) |
5831 | 0 | { |
5832 | 0 | mb->end_subject = Lsaved_end_subject; |
5833 | 0 | mb->true_end_subject = mb->end_subject + Ltrue_end_extra; |
5834 | 0 | mb->moptions = Lsaved_moptions; |
5835 | 0 | RRETURN(rrc); |
5836 | 0 | } |
5837 | | |
5838 | 0 | Fecode += GET(Fecode, 1); |
5839 | 0 | if (*Fecode != OP_ALT) |
5840 | 0 | { |
5841 | 0 | mb->end_subject = Lsaved_end_subject; |
5842 | 0 | mb->true_end_subject = mb->end_subject + Ltrue_end_extra; |
5843 | 0 | mb->moptions = Lsaved_moptions; |
5844 | 0 | RRETURN(MATCH_NOMATCH); |
5845 | 0 | } |
5846 | 0 | Lextra_size = 0; |
5847 | 0 | } |
5848 | | |
5849 | 0 | do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT); |
5850 | 0 | Fecode += 1 + LINK_SIZE; |
5851 | 0 | Feptr = Lsaved_eptr; |
5852 | 0 | break; |
5853 | | |
5854 | 0 | #undef Lframe_type |
5855 | 0 | #undef Lextra_size |
5856 | 0 | #undef Lsaved_end_subject |
5857 | 0 | #undef Lsaved_eptr |
5858 | 0 | #undef Ltrue_end_extra |
5859 | 0 | #undef Lsaved_moptions |
5860 | | |
5861 | | /* ===================================================================== */ |
5862 | | /* The callout item calls an external function, if one is provided, passing |
5863 | | details of the match so far. This is mainly for debugging, though the |
5864 | | function is able to force a failure. */ |
5865 | | |
5866 | 0 | case OP_CALLOUT: |
5867 | 0 | case OP_CALLOUT_STR: |
5868 | 0 | rrc = do_callout(F, mb, &length); |
5869 | 0 | if (rrc > 0) RRETURN(MATCH_NOMATCH); |
5870 | 0 | if (rrc < 0) RRETURN(rrc); |
5871 | 0 | Fecode += length; |
5872 | 0 | break; |
5873 | | |
5874 | | |
5875 | | /* ===================================================================== */ |
5876 | | /* Conditional group: compilation checked that there are no more than two |
5877 | | branches. If the condition is false, skipping the first branch takes us |
5878 | | past the end of the item if there is only one branch, but that's exactly |
5879 | | what we want. */ |
5880 | | |
5881 | 0 | case OP_COND: |
5882 | 0 | case OP_SCOND: |
5883 | | |
5884 | | /* The variable Flength will be added to Fecode when the condition is |
5885 | | false, to get to the second branch. Setting it to the offset to the ALT or |
5886 | | KET, then incrementing Fecode achieves this effect. However, if the second |
5887 | | branch is non-existent, we must point to the KET so that the end of the |
5888 | | group is correctly processed. We now have Fecode pointing to the condition |
5889 | | or callout. */ |
5890 | |
|
5891 | 0 | Flength = GET(Fecode, 1); /* Offset to the second branch */ |
5892 | 0 | if (Fecode[Flength] != OP_ALT) Flength -= 1 + LINK_SIZE; |
5893 | 0 | Fecode += 1 + LINK_SIZE; /* From this opcode */ |
5894 | | |
5895 | | /* Because of the way auto-callout works during compile, a callout item is |
5896 | | inserted between OP_COND and an assertion condition. Such a callout can |
5897 | | also be inserted manually. */ |
5898 | |
|
5899 | 0 | if (*Fecode == OP_CALLOUT || *Fecode == OP_CALLOUT_STR) |
5900 | 0 | { |
5901 | 0 | rrc = do_callout(F, mb, &length); |
5902 | 0 | if (rrc > 0) RRETURN(MATCH_NOMATCH); |
5903 | 0 | if (rrc < 0) RRETURN(rrc); |
5904 | | |
5905 | | /* Advance Fecode past the callout, so it now points to the condition. We |
5906 | | must adjust Flength so that the value of Fecode+Flength is unchanged. */ |
5907 | | |
5908 | 0 | Fecode += length; |
5909 | 0 | Flength -= length; |
5910 | 0 | } |
5911 | | |
5912 | | /* Test the various possible conditions */ |
5913 | | |
5914 | 0 | condition = FALSE; |
5915 | 0 | switch(*Fecode) |
5916 | 0 | { |
5917 | 0 | case OP_RREF: /* Group recursion test */ |
5918 | 0 | if (Fcurrent_recurse != RECURSE_UNSET) |
5919 | 0 | { |
5920 | 0 | number = GET2(Fecode, 1); |
5921 | 0 | condition = (number == RREF_ANY || number == Fcurrent_recurse); |
5922 | 0 | } |
5923 | 0 | break; |
5924 | | |
5925 | 0 | case OP_DNRREF: /* Duplicate named group recursion test */ |
5926 | 0 | if (Fcurrent_recurse != RECURSE_UNSET) |
5927 | 0 | { |
5928 | 0 | int count = GET2(Fecode, 1 + IMM2_SIZE); |
5929 | 0 | PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size; |
5930 | 0 | while (count-- > 0) |
5931 | 0 | { |
5932 | 0 | number = GET2(slot, 0); |
5933 | 0 | condition = number == Fcurrent_recurse; |
5934 | 0 | if (condition) break; |
5935 | 0 | slot += mb->name_entry_size; |
5936 | 0 | } |
5937 | 0 | } |
5938 | 0 | break; |
5939 | | |
5940 | 0 | case OP_CREF: /* Numbered group used test */ |
5941 | 0 | offset = (GET2(Fecode, 1) << 1) - 2; /* Doubled ref number */ |
5942 | 0 | condition = offset < Foffset_top && Fovector[offset] != PCRE2_UNSET; |
5943 | 0 | break; |
5944 | | |
5945 | 0 | case OP_DNCREF: /* Duplicate named group used test */ |
5946 | 0 | { |
5947 | 0 | int count = GET2(Fecode, 1 + IMM2_SIZE); |
5948 | 0 | PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size; |
5949 | 0 | while (count-- > 0) |
5950 | 0 | { |
5951 | 0 | offset = (GET2(slot, 0) << 1) - 2; |
5952 | 0 | condition = offset < Foffset_top && Fovector[offset] != PCRE2_UNSET; |
5953 | 0 | if (condition) break; |
5954 | 0 | slot += mb->name_entry_size; |
5955 | 0 | } |
5956 | 0 | } |
5957 | 0 | break; |
5958 | | |
5959 | 0 | case OP_FALSE: |
5960 | 0 | case OP_FAIL: /* The assertion (?!) becomes OP_FAIL */ |
5961 | 0 | break; |
5962 | | |
5963 | 0 | case OP_TRUE: |
5964 | 0 | condition = TRUE; |
5965 | 0 | break; |
5966 | | |
5967 | | /* The condition is an assertion. Run code similar to the assertion code |
5968 | | above. */ |
5969 | | |
5970 | 0 | #define Lpositive F->temp_32[0] |
5971 | 0 | #define Lstart_branch F->temp_sptr[0] |
5972 | | |
5973 | 0 | default: |
5974 | 0 | Lpositive = (*Fecode == OP_ASSERT || *Fecode == OP_ASSERTBACK); |
5975 | 0 | Lstart_branch = Fecode; |
5976 | |
|
5977 | 0 | for (;;) |
5978 | 0 | { |
5979 | 0 | group_frame_type = GF_CONDASSERT | *Fecode; |
5980 | 0 | RMATCH(Lstart_branch + PRIV(OP_lengths)[*Lstart_branch], RM5); |
5981 | | |
5982 | 0 | switch(rrc) |
5983 | 0 | { |
5984 | 0 | case MATCH_ACCEPT: /* Save captures */ |
5985 | 0 | memcpy(Fovector, |
5986 | 0 | (char *)assert_accept_frame + offsetof(heapframe, ovector), |
5987 | 0 | assert_accept_frame->offset_top * sizeof(PCRE2_SIZE)); |
5988 | 0 | Foffset_top = assert_accept_frame->offset_top; |
5989 | | |
5990 | | /* Fall through */ |
5991 | | /* In the case of a match, the captures have already been put into |
5992 | | the current frame. */ |
5993 | |
|
5994 | 0 | case MATCH_MATCH: |
5995 | 0 | condition = Lpositive; /* TRUE for positive assertion */ |
5996 | 0 | break; |
5997 | | |
5998 | | /* PCRE doesn't allow the effect of (*THEN) to escape beyond an |
5999 | | assertion; it is therefore always treated as NOMATCH. */ |
6000 | | |
6001 | 0 | case MATCH_NOMATCH: |
6002 | 0 | case MATCH_THEN: |
6003 | 0 | Lstart_branch += GET(Lstart_branch, 1); |
6004 | 0 | if (*Lstart_branch == OP_ALT) continue; /* Try next branch */ |
6005 | 0 | condition = !Lpositive; /* TRUE for negative assertion */ |
6006 | 0 | break; |
6007 | | |
6008 | | /* These force no match without checking other branches. */ |
6009 | | |
6010 | 0 | case MATCH_COMMIT: |
6011 | 0 | case MATCH_SKIP: |
6012 | 0 | case MATCH_PRUNE: |
6013 | 0 | condition = !Lpositive; |
6014 | 0 | break; |
6015 | | |
6016 | 0 | default: |
6017 | 0 | RRETURN(rrc); |
6018 | 0 | } |
6019 | 0 | break; /* Out of the branch loop */ |
6020 | 0 | } |
6021 | | |
6022 | | /* If the condition is true, find the end of the assertion so that |
6023 | | advancing past it gets us to the start of the first branch. */ |
6024 | | |
6025 | 0 | if (condition) |
6026 | 0 | { |
6027 | 0 | do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT); |
6028 | 0 | } |
6029 | 0 | break; /* End of assertion condition */ |
6030 | 0 | } |
6031 | | |
6032 | 0 | #undef Lpositive |
6033 | 0 | #undef Lstart_branch |
6034 | | |
6035 | | /* Choose branch according to the condition. */ |
6036 | | |
6037 | 0 | Fecode += condition? PRIV(OP_lengths)[*Fecode] : Flength; |
6038 | | |
6039 | | /* If the opcode is OP_SCOND it means we are at a repeated conditional |
6040 | | group that might match an empty string. We must therefore descend a level |
6041 | | so that the start is remembered for checking. For OP_COND we can just |
6042 | | continue at this level. */ |
6043 | |
|
6044 | 0 | if (Fop == OP_SCOND) |
6045 | 0 | { |
6046 | 0 | group_frame_type = GF_NOCAPTURE | Fop; |
6047 | 0 | RMATCH(Fecode, RM35); |
6048 | 0 | RRETURN(rrc); |
6049 | 0 | } |
6050 | 0 | break; |
6051 | | |
6052 | | |
6053 | | |
6054 | | /* ========================================================================= */ |
6055 | | /* End of start of parenthesis opcodes */ |
6056 | | /* ========================================================================= */ |
6057 | | |
6058 | | |
6059 | | /* ===================================================================== */ |
6060 | | /* Move the subject pointer back by one fixed amount. This occurs at the |
6061 | | start of each branch that has a fixed length in a lookbehind assertion. If |
6062 | | we are too close to the start to move back, fail. When working with UTF-8 |
6063 | | we move back a number of characters, not bytes. */ |
6064 | | |
6065 | 606 | case OP_REVERSE: |
6066 | 606 | number = GET2(Fecode, 1); |
6067 | 606 | #ifdef SUPPORT_UNICODE |
6068 | 606 | if (utf) |
6069 | 0 | { |
6070 | | /* We used to do a simpler `while (number-- > 0)` but that triggers |
6071 | | clang's unsigned integer overflow sanitizer. */ |
6072 | 0 | while (number > 0) |
6073 | 0 | { |
6074 | 0 | --number; |
6075 | 0 | if (Feptr <= mb->check_subject) RRETURN(MATCH_NOMATCH); |
6076 | 0 | Feptr--; |
6077 | 0 | BACKCHAR(Feptr); |
6078 | 0 | } |
6079 | 0 | } |
6080 | 606 | else |
6081 | 606 | #endif |
6082 | | |
6083 | | /* No UTF support, or not in UTF mode: count is code unit count */ |
6084 | | |
6085 | 606 | { |
6086 | 606 | if ((ptrdiff_t)number > Feptr - mb->start_subject) RRETURN(MATCH_NOMATCH); |
6087 | 486 | Feptr -= number; |
6088 | 486 | } |
6089 | | |
6090 | | /* Save the earliest consulted character, then skip to next opcode */ |
6091 | | |
6092 | 486 | if (Feptr < mb->start_used_ptr) mb->start_used_ptr = Feptr; |
6093 | 486 | Fecode += 1 + IMM2_SIZE; |
6094 | 486 | break; |
6095 | | |
6096 | | |
6097 | | /* ===================================================================== */ |
6098 | | /* Move the subject pointer back by a variable amount. This occurs at the |
6099 | | start of each branch of a lookbehind assertion when the branch has a |
6100 | | variable, but limited, length. A loop is needed to try matching the branch |
6101 | | after moving back different numbers of characters. If we are too close to |
6102 | | the start to move back even the minimum amount, fail. When working with |
6103 | | UTF-8 we move back a number of characters, not bytes. */ |
6104 | | |
6105 | 4.30k | #define Lmin F->temp_32[0] |
6106 | 5.38k | #define Lmax F->temp_32[1] |
6107 | 710 | #define Leptr F->temp_sptr[0] |
6108 | | |
6109 | 710 | case OP_VREVERSE: |
6110 | 710 | Lmin = GET2(Fecode, 1); |
6111 | 710 | Lmax = GET2(Fecode, 1 + IMM2_SIZE); |
6112 | 710 | Leptr = Feptr; |
6113 | | |
6114 | | /* Move back by the maximum branch length and then work forwards. This |
6115 | | ensures that items such as \d{3,5} get the maximum length, which is |
6116 | | relevant for captures, and makes for Perl compatibility. */ |
6117 | | |
6118 | 710 | #ifdef SUPPORT_UNICODE |
6119 | 710 | if (utf) |
6120 | 0 | { |
6121 | 0 | for (i = 0; i < Lmax; i++) |
6122 | 0 | { |
6123 | 0 | if (Feptr == mb->start_subject) |
6124 | 0 | { |
6125 | 0 | if (i < Lmin) RRETURN(MATCH_NOMATCH); |
6126 | 0 | Lmax = i; |
6127 | 0 | break; |
6128 | 0 | } |
6129 | 0 | Feptr--; |
6130 | 0 | BACKCHAR(Feptr); |
6131 | 0 | } |
6132 | 0 | } |
6133 | 710 | else |
6134 | 710 | #endif |
6135 | | |
6136 | | /* No UTF support or not in UTF mode */ |
6137 | | |
6138 | 710 | { |
6139 | 710 | ptrdiff_t diff = Feptr - mb->start_subject; |
6140 | 710 | uint32_t available = (diff > 65535)? 65535 : ((diff > 0)? (int)diff : 0); |
6141 | 710 | if (Lmin > available) RRETURN(MATCH_NOMATCH); |
6142 | 630 | if (Lmax > available) Lmax = available; |
6143 | 630 | Feptr -= Lmax; |
6144 | 630 | } |
6145 | | |
6146 | | /* Now try matching, moving forward one character on failure, until we |
6147 | | reach the minimum back length. */ |
6148 | | |
6149 | 630 | for (;;) |
6150 | 2.88k | { |
6151 | 2.88k | RMATCH(Fecode + 1 + 2 * IMM2_SIZE, RM37); |
6152 | 2.88k | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
6153 | 2.88k | if (Lmax-- <= Lmin) RRETURN(MATCH_NOMATCH); |
6154 | 2.25k | Feptr++; |
6155 | 2.25k | #ifdef SUPPORT_UNICODE |
6156 | 2.25k | if (utf) { FORWARDCHARTEST(Feptr, mb->end_subject); } |
6157 | 2.25k | #endif |
6158 | 2.25k | } |
6159 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
6160 | |
|
6161 | 0 | #undef Lmin |
6162 | 0 | #undef Lmax |
6163 | 0 | #undef Leptr |
6164 | | |
6165 | | /* ===================================================================== */ |
6166 | | /* An alternation is the end of a branch; scan along to find the end of the |
6167 | | bracketed group. */ |
6168 | |
|
6169 | 2.47M | case OP_ALT: |
6170 | 2.47M | branch_end = Fecode; |
6171 | 13.4M | do Fecode += GET(Fecode,1); while (*Fecode == OP_ALT); |
6172 | 2.47M | break; |
6173 | | |
6174 | | |
6175 | | /* ===================================================================== */ |
6176 | | /* The end of a parenthesized group. For all but OP_BRA and OP_COND, the |
6177 | | starting frame was added to the chained frames in order to remember the |
6178 | | starting subject position for the group. (Not true for OP_BRA when it's a |
6179 | | whole-pattern recursion, but that is handled separately below.) */ |
6180 | | |
6181 | 2.48M | case OP_KET: |
6182 | 2.48M | case OP_KETRMIN: |
6183 | 2.48M | case OP_KETRMAX: |
6184 | 2.48M | case OP_KETRPOS: |
6185 | | |
6186 | 2.48M | bracode = Fecode - GET(Fecode, 1); |
6187 | | |
6188 | 2.48M | if (branch_end == NULL) branch_end = Fecode; |
6189 | 2.48M | branch_start = bracode; |
6190 | 14.7M | while (branch_start + GET(branch_start, 1) != branch_end) |
6191 | 12.2M | branch_start += GET(branch_start, 1); |
6192 | 2.48M | branch_end = NULL; |
6193 | | |
6194 | | /* Point N to the frame at the start of the most recent group, and P to its |
6195 | | predecessor. Remember the subject pointer at the start of the group. */ |
6196 | | |
6197 | 2.48M | if (*bracode != OP_BRA && *bracode != OP_COND) |
6198 | 2.48M | { |
6199 | 2.48M | N = (heapframe *)((char *)match_data->heapframes + Flast_group_offset); |
6200 | 2.48M | P = (heapframe *)((char *)N - frame_size); |
6201 | 2.48M | Flast_group_offset = P->last_group_offset; |
6202 | | |
6203 | | #ifdef DEBUG_SHOW_RMATCH |
6204 | | fprintf(stderr, "++ KET for frame=%d type=%x prev char offset=%lu\n", |
6205 | | N->rdepth, N->group_frame_type, |
6206 | | (char *)P->eptr - (char *)mb->start_subject); |
6207 | | #endif |
6208 | | |
6209 | | /* If we are at the end of an assertion that is a condition, first check |
6210 | | to see if we are at the end of a variable-length branch in a lookbehind. |
6211 | | If this is the case and we have not landed on the current character, |
6212 | | return no match. Compare code below for non-condition lookbehinds. In |
6213 | | other cases, return a match, discarding any intermediate backtracking |
6214 | | points. Copy back the mark setting and the captures into the frame before |
6215 | | N so that they are set on return. Doing this for all assertions, both |
6216 | | positive and negative, seems to match what Perl does. */ |
6217 | | |
6218 | 2.48M | if (GF_IDMASK(N->group_frame_type) == GF_CONDASSERT) |
6219 | 0 | { |
6220 | 0 | if ((*bracode == OP_ASSERTBACK || *bracode == OP_ASSERTBACK_NOT) && |
6221 | 0 | branch_start[1 + LINK_SIZE] == OP_VREVERSE && Feptr != P->eptr) |
6222 | 0 | RRETURN(MATCH_NOMATCH); |
6223 | 0 | memcpy((char *)P + offsetof(heapframe, ovector), Fovector, |
6224 | 0 | Foffset_top * sizeof(PCRE2_SIZE)); |
6225 | 0 | P->offset_top = Foffset_top; |
6226 | 0 | P->mark = Fmark; |
6227 | 0 | Fback_frame = (char *)F - (char *)P; |
6228 | 0 | RRETURN(MATCH_MATCH); |
6229 | 0 | } |
6230 | 2.48M | } |
6231 | 4.58k | else P = NULL; /* Indicates starting frame not recorded */ |
6232 | | |
6233 | | /* The group was not a conditional assertion. */ |
6234 | | |
6235 | 2.48M | switch (*bracode) |
6236 | 2.48M | { |
6237 | | /* Whole pattern recursion is handled as a recursion into group 0, but |
6238 | | the entire pattern is wrapped in OP_BRA/OP_KET rather than a capturing |
6239 | | group - a design mistake: it should perhaps have been capture group 0. |
6240 | | Anyway, that means the end of such recursion must be handled here. It is |
6241 | | detected by checking for an immediately following OP_END when we are |
6242 | | recursing in group 0. If this is not the end of a whole-pattern |
6243 | | recursion, there is nothing to be done. */ |
6244 | | |
6245 | 4.58k | case OP_BRA: |
6246 | 4.58k | if (Fcurrent_recurse != 0 || Fecode[1+LINK_SIZE] != OP_END) break; |
6247 | | |
6248 | | /* It is the end of whole-pattern recursion. */ |
6249 | | |
6250 | 0 | offset = Flast_group_offset; |
6251 | | |
6252 | | /* Corrupted heapframes? Trigger an assert and return an error. */ |
6253 | 0 | PCRE2_ASSERT(offset != PCRE2_UNSET); |
6254 | 0 | if (offset == PCRE2_UNSET) return PCRE2_ERROR_INTERNAL; |
6255 | | |
6256 | 0 | N = (heapframe *)((char *)match_data->heapframes + offset); |
6257 | 0 | P = (heapframe *)((char *)N - frame_size); |
6258 | 0 | Flast_group_offset = P->last_group_offset; |
6259 | | |
6260 | | /* Reinstate the previous set of captures and then carry on after the |
6261 | | recursion call. */ |
6262 | |
|
6263 | 0 | memcpy((char *)F + offsetof(heapframe, ovector), P->ovector, |
6264 | 0 | Foffset_top * sizeof(PCRE2_SIZE)); |
6265 | 0 | Foffset_top = P->offset_top; |
6266 | 0 | Fcapture_last = P->capture_last; |
6267 | 0 | Fcurrent_recurse = P->current_recurse; |
6268 | 0 | Fecode = P->ecode + 1 + LINK_SIZE; |
6269 | 0 | continue; /* With next opcode */ |
6270 | | |
6271 | 0 | case OP_COND: /* No need to do anything for these */ |
6272 | 0 | case OP_SCOND: |
6273 | 0 | break; |
6274 | | |
6275 | | /* Non-atomic positive assertions are like OP_BRA, except that the |
6276 | | subject pointer must be put back to where it was at the start of the |
6277 | | assertion. For a variable lookbehind, check its end point. */ |
6278 | | |
6279 | 30 | case OP_ASSERTBACK_NA: |
6280 | 30 | if (branch_start[1 + LINK_SIZE] == OP_VREVERSE && Feptr != P->eptr) |
6281 | 0 | RRETURN(MATCH_NOMATCH); |
6282 | | /* Fall through */ |
6283 | | |
6284 | 30 | case OP_ASSERT_NA: |
6285 | 30 | if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr; |
6286 | 30 | Feptr = P->eptr; |
6287 | 30 | break; |
6288 | | |
6289 | | /* Atomic positive assertions are like OP_ONCE, except that in addition |
6290 | | the subject pointer must be put back to where it was at the start of the |
6291 | | assertion. For a variable lookbehind, check its end point. */ |
6292 | | |
6293 | 112 | case OP_ASSERTBACK: |
6294 | 112 | if (branch_start[1 + LINK_SIZE] == OP_VREVERSE && Feptr != P->eptr) |
6295 | 0 | RRETURN(MATCH_NOMATCH); |
6296 | | /* Fall through */ |
6297 | | |
6298 | 112 | case OP_ASSERT: |
6299 | 112 | if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr; |
6300 | 112 | Feptr = P->eptr; |
6301 | | /* Fall through */ |
6302 | | |
6303 | | /* For an atomic group, discard internal backtracking points. We must |
6304 | | also ensure that any remaining branches within the top-level of the group |
6305 | | are not tried. Do this by adjusting the code pointer within the backtrack |
6306 | | frame so that it points to the final branch. */ |
6307 | | |
6308 | 112 | case OP_ONCE: |
6309 | 112 | Fback_frame = ((char *)F - (char *)P); |
6310 | 112 | for (;;) |
6311 | 602 | { |
6312 | 602 | uint32_t y = GET(P->ecode,1); |
6313 | 602 | if ((P->ecode)[y] != OP_ALT) break; |
6314 | 490 | P->ecode += y; |
6315 | 490 | } |
6316 | 112 | break; |
6317 | | |
6318 | | /* A matching negative assertion returns MATCH, which is turned into |
6319 | | NOMATCH at the assertion level. For a variable lookbehind, check its end |
6320 | | point. */ |
6321 | | |
6322 | 0 | case OP_ASSERTBACK_NOT: |
6323 | 0 | if (branch_start[1 + LINK_SIZE] == OP_VREVERSE && Feptr != P->eptr) |
6324 | 0 | RRETURN(MATCH_NOMATCH); |
6325 | | /* Fall through */ |
6326 | | |
6327 | 28 | case OP_ASSERT_NOT: |
6328 | 28 | RRETURN(MATCH_MATCH); |
6329 | | |
6330 | | /* A scan substring group must preserve the current end_subject, |
6331 | | and restore it before the backtracking is performed into its sub |
6332 | | pattern. */ |
6333 | | |
6334 | 0 | case OP_ASSERT_SCS: |
6335 | 0 | F->temp_sptr[0] = mb->end_subject; |
6336 | 0 | mb->end_subject = P->temp_sptr[0]; |
6337 | 0 | mb->true_end_subject = mb->end_subject + P->temp_size; |
6338 | 0 | Feptr = P->temp_sptr[1]; |
6339 | |
|
6340 | 0 | RMATCH(Fecode + 1 + LINK_SIZE, RM39); |
6341 | | |
6342 | 0 | mb->end_subject = F->temp_sptr[0]; |
6343 | 0 | mb->true_end_subject = mb->end_subject; |
6344 | 0 | RRETURN(rrc); |
6345 | 0 | break; |
6346 | | |
6347 | | /* At the end of a script run, apply the script-checking rules. This code |
6348 | | will never be exercised if Unicode support is not compiled, because in |
6349 | | that environment script runs cause an error at compile time. */ |
6350 | | |
6351 | 0 | case OP_SCRIPT_RUN: |
6352 | 0 | if (!PRIV(script_run)(P->eptr, Feptr, utf)) RRETURN(MATCH_NOMATCH); |
6353 | 0 | break; |
6354 | | |
6355 | | /* Whole-pattern recursion is coded as a recurse into group 0, and is |
6356 | | handled with OP_BRA above. Other recursion is handled here. */ |
6357 | | |
6358 | 2.47M | case OP_CBRA: |
6359 | 2.47M | case OP_CBRAPOS: |
6360 | 2.48M | case OP_SCBRA: |
6361 | 2.48M | case OP_SCBRAPOS: |
6362 | 2.48M | number = GET2(bracode, 1+LINK_SIZE); |
6363 | | |
6364 | | /* Handle a recursively called group. We reinstate the previous set of |
6365 | | captures and then carry on after the recursion call. */ |
6366 | | |
6367 | 2.48M | if (Fcurrent_recurse == number) |
6368 | 19 | { |
6369 | 19 | P = (heapframe *)((char *)N - frame_size); |
6370 | 19 | memcpy((char *)F + offsetof(heapframe, ovector), P->ovector, |
6371 | 19 | Foffset_top * sizeof(PCRE2_SIZE)); |
6372 | 19 | Foffset_top = P->offset_top; |
6373 | 19 | Fcapture_last = P->capture_last; |
6374 | 19 | Fcurrent_recurse = P->current_recurse; |
6375 | 19 | Fecode = P->ecode + 1 + LINK_SIZE; |
6376 | 19 | continue; /* With next opcode */ |
6377 | 19 | } |
6378 | | |
6379 | | /* Deal with actual capturing. */ |
6380 | | |
6381 | 2.48M | offset = (number << 1) - 2; |
6382 | 2.48M | Fcapture_last = number; |
6383 | 2.48M | Fovector[offset] = P->eptr - mb->start_subject; |
6384 | 2.48M | Fovector[offset+1] = Feptr - mb->start_subject; |
6385 | 2.48M | if (offset >= Foffset_top) Foffset_top = offset + 2; |
6386 | 2.48M | break; |
6387 | 2.48M | } /* End actions relating to the starting opcode */ |
6388 | | |
6389 | | /* OP_KETRPOS is a possessive repeating ket. Remember the current position, |
6390 | | and return the MATCH_KETRPOS. This makes it possible to do the repeats one |
6391 | | at a time from the outer level. This must precede the empty string test - |
6392 | | in this case that test is done at the outer level. */ |
6393 | | |
6394 | 2.48M | if (*Fecode == OP_KETRPOS) |
6395 | 1.38k | { |
6396 | 1.38k | memcpy((char *)P + offsetof(heapframe, eptr), |
6397 | 1.38k | (char *)F + offsetof(heapframe, eptr), |
6398 | 1.38k | frame_copy_size); |
6399 | 1.38k | RRETURN(MATCH_KETRPOS); |
6400 | 1.38k | } |
6401 | | |
6402 | | /* Handle the different kinds of closing brackets. A non-repeating ket |
6403 | | needs no special action, just continuing at this level. This also happens |
6404 | | for the repeating kets if the group matched no characters, in order to |
6405 | | forcibly break infinite loops. Otherwise, the repeating kets try the rest |
6406 | | of the pattern or restart from the preceding bracket, in the appropriate |
6407 | | order. */ |
6408 | | |
6409 | 2.48M | if (Fop != OP_KET && (P == NULL || Feptr != P->eptr)) |
6410 | 175 | { |
6411 | 175 | if (Fop == OP_KETRMIN) |
6412 | 0 | { |
6413 | 0 | RMATCH(Fecode + 1 + LINK_SIZE, RM6); |
6414 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
6415 | 0 | Fecode -= GET(Fecode, 1); |
6416 | 0 | break; /* End of ket processing */ |
6417 | 0 | } |
6418 | | |
6419 | | /* Repeat the maximum number of times (KETRMAX) */ |
6420 | | |
6421 | 175 | RMATCH(bracode, RM7); |
6422 | 175 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
6423 | 175 | } |
6424 | | |
6425 | | /* Carry on at this level for a non-repeating ket, or after matching an |
6426 | | empty string, or after repeating for a maximum number of times. */ |
6427 | | |
6428 | 2.48M | Fecode += 1 + LINK_SIZE; |
6429 | 2.48M | break; |
6430 | | |
6431 | | |
6432 | | /* ===================================================================== */ |
6433 | | /* Start and end of line assertions, not multiline mode. */ |
6434 | | |
6435 | 344k | case OP_CIRC: /* Start of line, unless PCRE2_NOTBOL is set. */ |
6436 | 344k | if (Feptr != mb->start_subject || (mb->moptions & PCRE2_NOTBOL) != 0) |
6437 | 344k | RRETURN(MATCH_NOMATCH); |
6438 | 446 | Fecode++; |
6439 | 446 | break; |
6440 | | |
6441 | 243k | case OP_SOD: /* Unconditional start of subject */ |
6442 | 243k | if (Feptr != mb->start_subject) RRETURN(MATCH_NOMATCH); |
6443 | 14 | Fecode++; |
6444 | 14 | break; |
6445 | | |
6446 | | /* When PCRE2_NOTEOL is unset, assert before the subject end, or a |
6447 | | terminating newline unless PCRE2_DOLLAR_ENDONLY is set. */ |
6448 | | |
6449 | 183k | case OP_DOLL: |
6450 | 183k | if ((mb->moptions & PCRE2_NOTEOL) != 0) RRETURN(MATCH_NOMATCH); |
6451 | 183k | if ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0) goto ASSERT_NL_OR_EOS; |
6452 | | |
6453 | | /* Fall through */ |
6454 | | /* Unconditional end of subject assertion (\z). */ |
6455 | | |
6456 | 1.23k | case OP_EOD: |
6457 | 1.23k | if (Feptr < mb->true_end_subject) RRETURN(MATCH_NOMATCH); |
6458 | 270 | if (mb->partial != 0) |
6459 | 0 | { |
6460 | 0 | mb->hitend = TRUE; |
6461 | 0 | if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; |
6462 | 0 | } |
6463 | 270 | Fecode++; |
6464 | 270 | break; |
6465 | | |
6466 | | /* End of subject or ending \n assertion (\Z) */ |
6467 | | |
6468 | 383 | case OP_EODN: |
6469 | 184k | ASSERT_NL_OR_EOS: |
6470 | 184k | if (Feptr < mb->true_end_subject && |
6471 | 184k | (!IS_NEWLINE(Feptr) || Feptr != mb->true_end_subject - mb->nllen)) |
6472 | 184k | { |
6473 | 184k | if (mb->partial != 0 && |
6474 | 184k | Feptr + 1 >= mb->end_subject && |
6475 | 184k | NLBLOCK->nltype == NLTYPE_FIXED && |
6476 | 184k | NLBLOCK->nllen == 2 && |
6477 | 184k | UCHAR21TEST(Feptr) == NLBLOCK->nl[0]) |
6478 | 0 | { |
6479 | 0 | mb->hitend = TRUE; |
6480 | 0 | if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; |
6481 | 0 | } |
6482 | 184k | RRETURN(MATCH_NOMATCH); |
6483 | 184k | } |
6484 | | |
6485 | | /* Either at end of string or \n before end. */ |
6486 | | |
6487 | 24 | if (mb->partial != 0) |
6488 | 0 | { |
6489 | 0 | mb->hitend = TRUE; |
6490 | 0 | if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; |
6491 | 0 | } |
6492 | 24 | Fecode++; |
6493 | 24 | break; |
6494 | | |
6495 | | |
6496 | | /* ===================================================================== */ |
6497 | | /* Start and end of line assertions, multiline mode. */ |
6498 | | |
6499 | | /* Start of subject unless notbol, or after any newline except for one at |
6500 | | the very end, unless PCRE2_ALT_CIRCUMFLEX is set. */ |
6501 | | |
6502 | 7.96k | case OP_CIRCM: |
6503 | 7.96k | if ((mb->moptions & PCRE2_NOTBOL) != 0 && Feptr == mb->start_subject) |
6504 | 0 | RRETURN(MATCH_NOMATCH); |
6505 | 7.96k | if (Feptr != mb->start_subject && |
6506 | 7.96k | ((Feptr == mb->end_subject && |
6507 | 7.92k | (mb->poptions & PCRE2_ALT_CIRCUMFLEX) == 0) || |
6508 | 7.92k | !WAS_NEWLINE(Feptr))) |
6509 | 7.69k | RRETURN(MATCH_NOMATCH); |
6510 | 273 | Fecode++; |
6511 | 273 | break; |
6512 | | |
6513 | | /* Assert before any newline, or before end of subject unless noteol is |
6514 | | set. */ |
6515 | | |
6516 | 158k | case OP_DOLLM: |
6517 | 158k | if (Feptr < mb->end_subject) |
6518 | 158k | { |
6519 | 158k | if (!IS_NEWLINE(Feptr)) |
6520 | 156k | { |
6521 | 156k | if (mb->partial != 0 && |
6522 | 156k | Feptr + 1 >= mb->end_subject && |
6523 | 156k | NLBLOCK->nltype == NLTYPE_FIXED && |
6524 | 156k | NLBLOCK->nllen == 2 && |
6525 | 156k | UCHAR21TEST(Feptr) == NLBLOCK->nl[0]) |
6526 | 0 | { |
6527 | 0 | mb->hitend = TRUE; |
6528 | 0 | if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; |
6529 | 0 | } |
6530 | 156k | RRETURN(MATCH_NOMATCH); |
6531 | 156k | } |
6532 | 158k | } |
6533 | 603 | else |
6534 | 603 | { |
6535 | 603 | if ((mb->moptions & PCRE2_NOTEOL) != 0) RRETURN(MATCH_NOMATCH); |
6536 | 603 | SCHECK_PARTIAL(); |
6537 | 603 | } |
6538 | 2.24k | Fecode++; |
6539 | 2.24k | break; |
6540 | | |
6541 | | |
6542 | | /* ===================================================================== */ |
6543 | | /* Start of match assertion */ |
6544 | | |
6545 | 1.03k | case OP_SOM: |
6546 | 1.03k | if (Feptr != mb->start_subject + mb->start_offset) RRETURN(MATCH_NOMATCH); |
6547 | 11 | Fecode++; |
6548 | 11 | break; |
6549 | | |
6550 | | |
6551 | | /* ===================================================================== */ |
6552 | | /* Reset the start of match point */ |
6553 | | |
6554 | 0 | case OP_SET_SOM: |
6555 | 0 | Fstart_match = Feptr; |
6556 | 0 | Fecode++; |
6557 | 0 | break; |
6558 | | |
6559 | | |
6560 | | /* ===================================================================== */ |
6561 | | /* Word boundary assertions. Find out if the previous and current |
6562 | | characters are "word" characters. It takes a bit more work in UTF mode. |
6563 | | Characters > 255 are assumed to be "non-word" characters when PCRE2_UCP is |
6564 | | not set. When it is set, use Unicode properties if available, even when not |
6565 | | in UTF mode. Remember the earliest and latest consulted characters. */ |
6566 | | |
6567 | 121k | case OP_NOT_WORD_BOUNDARY: |
6568 | 123k | case OP_WORD_BOUNDARY: |
6569 | 136k | case OP_NOT_UCP_WORD_BOUNDARY: |
6570 | 140k | case OP_UCP_WORD_BOUNDARY: |
6571 | 140k | if (Feptr == mb->check_subject) prev_is_word = FALSE; else |
6572 | 139k | { |
6573 | 139k | PCRE2_SPTR lastptr = Feptr - 1; |
6574 | 139k | #ifdef SUPPORT_UNICODE |
6575 | 139k | if (utf) |
6576 | 16.6k | { |
6577 | 16.6k | BACKCHAR(lastptr); |
6578 | 16.6k | GETCHAR(fc, lastptr); |
6579 | 16.6k | } |
6580 | 123k | else |
6581 | 123k | #endif /* SUPPORT_UNICODE */ |
6582 | 123k | fc = *lastptr; |
6583 | 139k | if (lastptr < mb->start_used_ptr) mb->start_used_ptr = lastptr; |
6584 | 139k | #ifdef SUPPORT_UNICODE |
6585 | 139k | if (Fop == OP_UCP_WORD_BOUNDARY || Fop == OP_NOT_UCP_WORD_BOUNDARY) |
6586 | 16.6k | { |
6587 | 16.6k | int chartype = UCD_CHARTYPE(fc); |
6588 | 16.6k | int category = PRIV(ucp_gentype)[chartype]; |
6589 | 16.6k | prev_is_word = (category == ucp_L || category == ucp_N || |
6590 | 16.6k | chartype == ucp_Mn || chartype == ucp_Pc); |
6591 | 16.6k | } |
6592 | 123k | else |
6593 | 123k | #endif /* SUPPORT_UNICODE */ |
6594 | 123k | prev_is_word = CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0; |
6595 | 139k | } |
6596 | | |
6597 | | /* Get status of next character */ |
6598 | | |
6599 | 140k | if (Feptr >= mb->end_subject) |
6600 | 83 | { |
6601 | 83 | SCHECK_PARTIAL(); |
6602 | 83 | cur_is_word = FALSE; |
6603 | 83 | } |
6604 | 140k | else |
6605 | 140k | { |
6606 | 140k | PCRE2_SPTR nextptr = Feptr + 1; |
6607 | 140k | #ifdef SUPPORT_UNICODE |
6608 | 140k | if (utf) |
6609 | 16.8k | { |
6610 | 16.8k | FORWARDCHARTEST(nextptr, mb->end_subject); |
6611 | 16.8k | GETCHAR(fc, Feptr); |
6612 | 16.8k | } |
6613 | 123k | else |
6614 | 123k | #endif /* SUPPORT_UNICODE */ |
6615 | 123k | fc = *Feptr; |
6616 | 140k | if (nextptr > mb->last_used_ptr) mb->last_used_ptr = nextptr; |
6617 | 140k | #ifdef SUPPORT_UNICODE |
6618 | 140k | if (Fop == OP_UCP_WORD_BOUNDARY || Fop == OP_NOT_UCP_WORD_BOUNDARY) |
6619 | 16.8k | { |
6620 | 16.8k | int chartype = UCD_CHARTYPE(fc); |
6621 | 16.8k | int category = PRIV(ucp_gentype)[chartype]; |
6622 | 16.8k | cur_is_word = (category == ucp_L || category == ucp_N || |
6623 | 16.8k | chartype == ucp_Mn || chartype == ucp_Pc); |
6624 | 16.8k | } |
6625 | 123k | else |
6626 | 123k | #endif /* SUPPORT_UNICODE */ |
6627 | 123k | cur_is_word = CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0; |
6628 | 140k | } |
6629 | | |
6630 | | /* Now see if the situation is what we want */ |
6631 | | |
6632 | 140k | if ((*Fecode++ == OP_WORD_BOUNDARY || Fop == OP_UCP_WORD_BOUNDARY)? |
6633 | 134k | cur_is_word == prev_is_word : cur_is_word != prev_is_word) |
6634 | 27.3k | RRETURN(MATCH_NOMATCH); |
6635 | 112k | break; |
6636 | | |
6637 | | |
6638 | | /* ===================================================================== */ |
6639 | | /* Backtracking (*VERB)s, with and without arguments. Note that if the |
6640 | | pattern is successfully matched, we do not come back from RMATCH. */ |
6641 | | |
6642 | 112k | case OP_MARK: |
6643 | 1.47k | Fmark = mb->nomatch_mark = Fecode + 2; |
6644 | 1.47k | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM12); |
6645 | | |
6646 | | /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an |
6647 | | argument, and we must check whether that argument matches this MARK's |
6648 | | argument. It is passed back in mb->verb_skip_ptr. If it does match, we |
6649 | | return MATCH_SKIP with mb->verb_skip_ptr now pointing to the subject |
6650 | | position that corresponds to this mark. Otherwise, pass back the return |
6651 | | code unaltered. */ |
6652 | | |
6653 | 1.47k | if (rrc == MATCH_SKIP_ARG && |
6654 | 1.47k | PRIV(strcmp)(Fecode + 2, mb->verb_skip_ptr) == 0) |
6655 | 0 | { |
6656 | 0 | mb->verb_skip_ptr = Feptr; /* Pass back current position */ |
6657 | 0 | RRETURN(MATCH_SKIP); |
6658 | 0 | } |
6659 | 1.47k | RRETURN(rrc); |
6660 | | |
6661 | 0 | case OP_FAIL: |
6662 | 0 | RRETURN(MATCH_NOMATCH); |
6663 | | |
6664 | | /* Record the current recursing group number in mb->verb_current_recurse |
6665 | | when a backtracking return such as MATCH_COMMIT is given. This enables the |
6666 | | recurse processing to catch verbs from within the recursion. */ |
6667 | | |
6668 | 0 | case OP_COMMIT: |
6669 | 0 | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM13); |
6670 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
6671 | 0 | mb->verb_current_recurse = Fcurrent_recurse; |
6672 | 0 | RRETURN(MATCH_COMMIT); |
6673 | | |
6674 | 0 | case OP_COMMIT_ARG: |
6675 | 0 | Fmark = mb->nomatch_mark = Fecode + 2; |
6676 | 0 | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM36); |
6677 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
6678 | 0 | mb->verb_current_recurse = Fcurrent_recurse; |
6679 | 0 | RRETURN(MATCH_COMMIT); |
6680 | | |
6681 | 0 | case OP_PRUNE: |
6682 | 0 | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM14); |
6683 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
6684 | 0 | mb->verb_current_recurse = Fcurrent_recurse; |
6685 | 0 | RRETURN(MATCH_PRUNE); |
6686 | | |
6687 | 0 | case OP_PRUNE_ARG: |
6688 | 0 | Fmark = mb->nomatch_mark = Fecode + 2; |
6689 | 0 | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM15); |
6690 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
6691 | 0 | mb->verb_current_recurse = Fcurrent_recurse; |
6692 | 0 | RRETURN(MATCH_PRUNE); |
6693 | | |
6694 | 0 | case OP_SKIP: |
6695 | 0 | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM16); |
6696 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
6697 | 0 | mb->verb_skip_ptr = Feptr; /* Pass back current position */ |
6698 | 0 | mb->verb_current_recurse = Fcurrent_recurse; |
6699 | 0 | RRETURN(MATCH_SKIP); |
6700 | | |
6701 | | /* Note that, for Perl compatibility, SKIP with an argument does NOT set |
6702 | | nomatch_mark. When a pattern match ends with a SKIP_ARG for which there was |
6703 | | not a matching mark, we have to re-run the match, ignoring the SKIP_ARG |
6704 | | that failed and any that precede it (either they also failed, or were not |
6705 | | triggered). To do this, we maintain a count of executed SKIP_ARGs. If a |
6706 | | SKIP_ARG gets to top level, the match is re-run with mb->ignore_skip_arg |
6707 | | set to the count of the one that failed. */ |
6708 | | |
6709 | 0 | case OP_SKIP_ARG: |
6710 | 0 | mb->skip_arg_count++; |
6711 | 0 | if (mb->skip_arg_count <= mb->ignore_skip_arg) |
6712 | 0 | { |
6713 | 0 | Fecode += PRIV(OP_lengths)[*Fecode] + Fecode[1]; |
6714 | 0 | break; |
6715 | 0 | } |
6716 | 0 | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM17); |
6717 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
6718 | | |
6719 | | /* Pass back the current skip name and return the special MATCH_SKIP_ARG |
6720 | | return code. This will either be caught by a matching MARK, or get to the |
6721 | | top, where it causes a rematch with mb->ignore_skip_arg set to the value of |
6722 | | mb->skip_arg_count. */ |
6723 | | |
6724 | 0 | mb->verb_skip_ptr = Fecode + 2; |
6725 | 0 | mb->verb_current_recurse = Fcurrent_recurse; |
6726 | 0 | RRETURN(MATCH_SKIP_ARG); |
6727 | | |
6728 | | /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that |
6729 | | the branch in which it occurs can be determined. */ |
6730 | | |
6731 | 0 | case OP_THEN: |
6732 | 0 | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM18); |
6733 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
6734 | 0 | mb->verb_ecode_ptr = Fecode; |
6735 | 0 | mb->verb_current_recurse = Fcurrent_recurse; |
6736 | 0 | RRETURN(MATCH_THEN); |
6737 | | |
6738 | 0 | case OP_THEN_ARG: |
6739 | 0 | Fmark = mb->nomatch_mark = Fecode + 2; |
6740 | 0 | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM19); |
6741 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
6742 | 0 | mb->verb_ecode_ptr = Fecode; |
6743 | 0 | mb->verb_current_recurse = Fcurrent_recurse; |
6744 | 0 | RRETURN(MATCH_THEN); |
6745 | | |
6746 | | |
6747 | | /* ===================================================================== */ |
6748 | | /* There's been some horrible disaster. Arrival here can only mean there is |
6749 | | something seriously wrong in the code above or the OP_xxx definitions. */ |
6750 | | |
6751 | 0 | default: |
6752 | 0 | PCRE2_DEBUG_UNREACHABLE(); |
6753 | 0 | return PCRE2_ERROR_INTERNAL; |
6754 | 572M | } |
6755 | | |
6756 | | /* Do not insert any code in here without much thought; it is assumed |
6757 | | that "continue" in the code above comes out to here to repeat the main |
6758 | | loop. */ |
6759 | | |
6760 | 572M | } /* End of main loop */ |
6761 | | |
6762 | 0 | PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */ |
6763 | | |
6764 | | /* ========================================================================= */ |
6765 | | /* The RRETURN() macro jumps here. The number that is saved in Freturn_id |
6766 | | indicates which label we actually want to return to. The value in Frdepth is |
6767 | | the index number of the frame in the vector. The return value has been placed |
6768 | | in rrc. */ |
6769 | |
|
6770 | 427M | #define LBL(val) case val: goto L_RM##val; |
6771 | |
|
6772 | 427M | RETURN_SWITCH: |
6773 | 427M | if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr; |
6774 | 427M | if (Frdepth == 0) return rrc; /* Exit from the top level */ |
6775 | 427M | F = (heapframe *)((char *)F - Fback_frame); /* Backtrack */ |
6776 | 427M | mb->cb->callout_flags |= PCRE2_CALLOUT_BACKTRACK; /* Note for callouts */ |
6777 | | |
6778 | | #ifdef DEBUG_SHOW_RMATCH |
6779 | | fprintf(stderr, "++ RETURN %d to RM%d\n", rrc, Freturn_id); |
6780 | | #endif |
6781 | | |
6782 | 427M | switch (Freturn_id) |
6783 | 427M | { |
6784 | 7.25M | LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8) |
6785 | 1.00M | LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(16) |
6786 | 2.64M | LBL(17) LBL(18) LBL(19) LBL(20) LBL(21) LBL(22) LBL(23) LBL(24) |
6787 | 6.58M | LBL(25) LBL(26) LBL(27) LBL(28) LBL(29) LBL(30) LBL(31) LBL(32) |
6788 | 44.0M | LBL(33) LBL(34) LBL(35) LBL(36) LBL(37) LBL(38) LBL(39) |
6789 | | |
6790 | 0 | #ifdef SUPPORT_WIDE_CHARS |
6791 | 625k | LBL(100) LBL(101) LBL(102) LBL(103) |
6792 | 0 | #endif |
6793 | | |
6794 | 0 | #ifdef SUPPORT_UNICODE |
6795 | 269k | LBL(200) LBL(201) LBL(202) LBL(203) LBL(204) LBL(205) LBL(206) |
6796 | 127k | LBL(207) LBL(208) LBL(209) LBL(210) LBL(211) LBL(212) LBL(213) |
6797 | 329M | LBL(214) LBL(215) LBL(216) LBL(217) LBL(218) LBL(219) LBL(220) |
6798 | 4.83M | LBL(221) LBL(222) LBL(223) LBL(224) |
6799 | 0 | #endif |
6800 | | |
6801 | 0 | default: |
6802 | 0 | PCRE2_DEBUG_UNREACHABLE(); |
6803 | 0 | return PCRE2_ERROR_INTERNAL; |
6804 | 427M | } |
6805 | 427M | #undef LBL |
6806 | 427M | } |
6807 | | |
6808 | | |
6809 | | /************************************************* |
6810 | | * Match a Regular Expression * |
6811 | | *************************************************/ |
6812 | | |
6813 | | /* This function applies a compiled pattern to a subject string and picks out |
6814 | | portions of the string if it matches. Two elements in the vector are set for |
6815 | | each substring: the offsets to the start and end of the substring. |
6816 | | |
6817 | | Arguments: |
6818 | | code points to the compiled expression |
6819 | | subject points to the subject string |
6820 | | length length of subject string (may contain binary zeros) |
6821 | | start_offset where to start in the subject string |
6822 | | options option bits |
6823 | | match_data points to a match_data block |
6824 | | mcontext points to a PCRE2 context |
6825 | | |
6826 | | Returns: > 0 => success; value is the number of ovector pairs filled |
6827 | | = 0 => success, but ovector is not big enough |
6828 | | = -1 => failed to match (PCRE2_ERROR_NOMATCH) |
6829 | | = -2 => partial match (PCRE2_ERROR_PARTIAL) |
6830 | | < -2 => some kind of unexpected problem |
6831 | | */ |
6832 | | |
6833 | | PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION |
6834 | | pcre2_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, |
6835 | | PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data, |
6836 | | pcre2_match_context *mcontext) |
6837 | 4.32k | { |
6838 | 4.32k | int rc; |
6839 | 4.32k | int was_zero_terminated = 0; |
6840 | 4.32k | const uint8_t *start_bits = NULL; |
6841 | 4.32k | const pcre2_real_code *re = (const pcre2_real_code *)code; |
6842 | | |
6843 | 4.32k | BOOL anchored; |
6844 | 4.32k | BOOL firstline; |
6845 | 4.32k | BOOL has_first_cu = FALSE; |
6846 | 4.32k | BOOL has_req_cu = FALSE; |
6847 | 4.32k | BOOL startline; |
6848 | | |
6849 | 4.32k | #if PCRE2_CODE_UNIT_WIDTH == 8 |
6850 | 4.32k | PCRE2_SPTR memchr_found_first_cu; |
6851 | 4.32k | PCRE2_SPTR memchr_found_first_cu2; |
6852 | 4.32k | #endif |
6853 | | |
6854 | 4.32k | PCRE2_UCHAR first_cu = 0; |
6855 | 4.32k | PCRE2_UCHAR first_cu2 = 0; |
6856 | 4.32k | PCRE2_UCHAR req_cu = 0; |
6857 | 4.32k | PCRE2_UCHAR req_cu2 = 0; |
6858 | | |
6859 | 4.32k | PCRE2_SPTR bumpalong_limit; |
6860 | 4.32k | PCRE2_SPTR end_subject; |
6861 | 4.32k | PCRE2_SPTR true_end_subject; |
6862 | 4.32k | PCRE2_SPTR start_match; |
6863 | 4.32k | PCRE2_SPTR req_cu_ptr; |
6864 | 4.32k | PCRE2_SPTR start_partial; |
6865 | 4.32k | PCRE2_SPTR match_partial; |
6866 | | |
6867 | | #ifdef SUPPORT_JIT |
6868 | | BOOL use_jit; |
6869 | | #endif |
6870 | | |
6871 | | /* This flag is needed even when Unicode is not supported for convenience |
6872 | | (it is used by the IS_NEWLINE macro). */ |
6873 | | |
6874 | 4.32k | BOOL utf = FALSE; |
6875 | | |
6876 | 4.32k | #ifdef SUPPORT_UNICODE |
6877 | 4.32k | BOOL ucp = FALSE; |
6878 | 4.32k | BOOL allow_invalid; |
6879 | 4.32k | uint32_t fragment_options = 0; |
6880 | | #ifdef SUPPORT_JIT |
6881 | | BOOL jit_checked_utf = FALSE; |
6882 | | #endif |
6883 | 4.32k | #endif /* SUPPORT_UNICODE */ |
6884 | | |
6885 | 4.32k | PCRE2_SIZE frame_size; |
6886 | 4.32k | PCRE2_SIZE heapframes_size; |
6887 | | |
6888 | | /* We need to have mb as a pointer to a match block, because the IS_NEWLINE |
6889 | | macro is used below, and it expects NLBLOCK to be defined as a pointer. */ |
6890 | | |
6891 | 4.32k | pcre2_callout_block cb; |
6892 | 4.32k | match_block actual_match_block; |
6893 | 4.32k | match_block *mb = &actual_match_block; |
6894 | | |
6895 | | /* Recognize NULL, length 0 as an empty string. */ |
6896 | | |
6897 | 4.32k | if (subject == NULL && length == 0) subject = (PCRE2_SPTR)""; |
6898 | | |
6899 | | /* Plausibility checks */ |
6900 | | |
6901 | 4.32k | if ((options & ~PUBLIC_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION; |
6902 | 4.32k | if (code == NULL || subject == NULL || match_data == NULL) |
6903 | 0 | return PCRE2_ERROR_NULL; |
6904 | | |
6905 | 4.32k | start_match = subject + start_offset; |
6906 | 4.32k | req_cu_ptr = start_match - 1; |
6907 | 4.32k | if (length == PCRE2_ZERO_TERMINATED) |
6908 | 0 | { |
6909 | 0 | length = PRIV(strlen)(subject); |
6910 | 0 | was_zero_terminated = 1; |
6911 | 0 | } |
6912 | 4.32k | true_end_subject = end_subject = subject + length; |
6913 | | |
6914 | 4.32k | if (start_offset > length) return PCRE2_ERROR_BADOFFSET; |
6915 | | |
6916 | | /* Check that the first field in the block is the magic number. */ |
6917 | | |
6918 | 4.32k | if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC; |
6919 | | |
6920 | | /* Check the code unit width. */ |
6921 | | |
6922 | 4.32k | if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8) |
6923 | 0 | return PCRE2_ERROR_BADMODE; |
6924 | | |
6925 | | /* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the |
6926 | | options variable for this function. Users of PCRE2 who are not calling the |
6927 | | function directly would like to have a way of setting these flags, in the same |
6928 | | way that they can set pcre2_compile() flags like PCRE2_NO_AUTO_POSSESS with |
6929 | | constructions like (*NO_AUTOPOSSESS). To enable this, (*NOTEMPTY) and |
6930 | | (*NOTEMPTY_ATSTART) set bits in the pattern's "flags" field which we now |
6931 | | transfer to the options for this function. The bits are guaranteed to be |
6932 | | adjacent, but do not have the same values. This bit of Boolean trickery assumes |
6933 | | that the match-time bits are not more significant than the flag bits. If by |
6934 | | accident this is not the case, a compile-time division by zero error will |
6935 | | occur. */ |
6936 | | |
6937 | 12.9k | #define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET) |
6938 | 8.65k | #define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART) |
6939 | 4.32k | options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1))); |
6940 | 4.32k | #undef FF |
6941 | 4.32k | #undef OO |
6942 | | |
6943 | | /* If the pattern was successfully studied with JIT support, we will run the |
6944 | | JIT executable instead of the rest of this function. Most options must be set |
6945 | | at compile time for the JIT code to be usable. */ |
6946 | | |
6947 | | #ifdef SUPPORT_JIT |
6948 | | use_jit = (re->executable_jit != NULL && |
6949 | | (options & ~PUBLIC_JIT_MATCH_OPTIONS) == 0); |
6950 | | #endif |
6951 | | |
6952 | | /* Initialize UTF/UCP parameters. */ |
6953 | | |
6954 | 4.32k | #ifdef SUPPORT_UNICODE |
6955 | 4.32k | utf = (re->overall_options & PCRE2_UTF) != 0; |
6956 | 4.32k | allow_invalid = (re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0; |
6957 | 4.32k | ucp = (re->overall_options & PCRE2_UCP) != 0; |
6958 | 4.32k | #endif /* SUPPORT_UNICODE */ |
6959 | | |
6960 | | /* Convert the partial matching flags into an integer. */ |
6961 | | |
6962 | 4.32k | mb->partial = ((options & PCRE2_PARTIAL_HARD) != 0)? 2 : |
6963 | 4.32k | ((options & PCRE2_PARTIAL_SOFT) != 0)? 1 : 0; |
6964 | | |
6965 | | /* Partial matching and PCRE2_ENDANCHORED are currently not allowed at the same |
6966 | | time. */ |
6967 | | |
6968 | 4.32k | if (mb->partial != 0 && |
6969 | 4.32k | ((re->overall_options | options) & PCRE2_ENDANCHORED) != 0) |
6970 | 0 | return PCRE2_ERROR_BADOPTION; |
6971 | | |
6972 | | /* It is an error to set an offset limit without setting the flag at compile |
6973 | | time. */ |
6974 | | |
6975 | 4.32k | if (mcontext != NULL && mcontext->offset_limit != PCRE2_UNSET && |
6976 | 4.32k | (re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0) |
6977 | 0 | return PCRE2_ERROR_BADOFFSETLIMIT; |
6978 | | |
6979 | | /* If the match data block was previously used with PCRE2_COPY_MATCHED_SUBJECT, |
6980 | | free the memory that was obtained. Set the field to NULL for no match cases. */ |
6981 | | |
6982 | 4.32k | if ((match_data->flags & PCRE2_MD_COPIED_SUBJECT) != 0) |
6983 | 0 | { |
6984 | 0 | match_data->memctl.free((void *)match_data->subject, |
6985 | 0 | match_data->memctl.memory_data); |
6986 | 0 | match_data->flags &= ~PCRE2_MD_COPIED_SUBJECT; |
6987 | 0 | } |
6988 | 4.32k | match_data->subject = NULL; |
6989 | | |
6990 | | /* Zero the error offset in case the first code unit is invalid UTF. */ |
6991 | | |
6992 | 4.32k | match_data->startchar = 0; |
6993 | | |
6994 | | |
6995 | | /* ============================= JIT matching ============================== */ |
6996 | | |
6997 | | /* Prepare for JIT matching. Check a UTF string for validity unless no check is |
6998 | | requested or invalid UTF can be handled. We check only the portion of the |
6999 | | subject that might be inspected during matching - from the offset minus the |
7000 | | maximum lookbehind to the given length. This saves time when a small part of a |
7001 | | large subject is being matched by the use of a starting offset. Note that the |
7002 | | maximum lookbehind is a number of characters, not code units. */ |
7003 | | |
7004 | | #ifdef SUPPORT_JIT |
7005 | | if (use_jit) |
7006 | | { |
7007 | | #ifdef SUPPORT_UNICODE |
7008 | | if (utf && (options & PCRE2_NO_UTF_CHECK) == 0 && !allow_invalid) |
7009 | | { |
7010 | | |
7011 | | /* For 8-bit and 16-bit UTF, check that the first code unit is a valid |
7012 | | character start. */ |
7013 | | |
7014 | | #if PCRE2_CODE_UNIT_WIDTH != 32 |
7015 | | if (start_match < end_subject && NOT_FIRSTCU(*start_match)) |
7016 | | { |
7017 | | if (start_offset > 0) return PCRE2_ERROR_BADUTFOFFSET; |
7018 | | #if PCRE2_CODE_UNIT_WIDTH == 8 |
7019 | | return PCRE2_ERROR_UTF8_ERR20; /* Isolated 0x80 byte */ |
7020 | | #else |
7021 | | return PCRE2_ERROR_UTF16_ERR3; /* Isolated low surrogate */ |
7022 | | #endif |
7023 | | } |
7024 | | #endif /* WIDTH != 32 */ |
7025 | | |
7026 | | /* Move back by the maximum lookbehind, just in case it happens at the very |
7027 | | start of matching. */ |
7028 | | |
7029 | | #if PCRE2_CODE_UNIT_WIDTH != 32 |
7030 | | for (unsigned int i = re->max_lookbehind; i > 0 && start_match > subject; i--) |
7031 | | { |
7032 | | start_match--; |
7033 | | while (start_match > subject && |
7034 | | #if PCRE2_CODE_UNIT_WIDTH == 8 |
7035 | | (*start_match & 0xc0) == 0x80) |
7036 | | #else /* 16-bit */ |
7037 | | (*start_match & 0xfc00) == 0xdc00) |
7038 | | #endif |
7039 | | start_match--; |
7040 | | } |
7041 | | #else /* PCRE2_CODE_UNIT_WIDTH != 32 */ |
7042 | | |
7043 | | /* In the 32-bit library, one code unit equals one character. However, |
7044 | | we cannot just subtract the lookbehind and then compare pointers, because |
7045 | | a very large lookbehind could create an invalid pointer. */ |
7046 | | |
7047 | | if (start_offset >= re->max_lookbehind) |
7048 | | start_match -= re->max_lookbehind; |
7049 | | else |
7050 | | start_match = subject; |
7051 | | #endif /* PCRE2_CODE_UNIT_WIDTH != 32 */ |
7052 | | |
7053 | | /* Validate the relevant portion of the subject. Adjust the offset of an |
7054 | | invalid code point to be an absolute offset in the whole string. */ |
7055 | | |
7056 | | match_data->rc = PRIV(valid_utf)(start_match, |
7057 | | length - (start_match - subject), &(match_data->startchar)); |
7058 | | if (match_data->rc != 0) |
7059 | | { |
7060 | | match_data->startchar += start_match - subject; |
7061 | | return match_data->rc; |
7062 | | } |
7063 | | jit_checked_utf = TRUE; |
7064 | | } |
7065 | | #endif /* SUPPORT_UNICODE */ |
7066 | | |
7067 | | /* If JIT returns BADOPTION, which means that the selected complete or |
7068 | | partial matching mode was not compiled, fall through to the interpreter. */ |
7069 | | |
7070 | | rc = pcre2_jit_match(code, subject, length, start_offset, options, |
7071 | | match_data, mcontext); |
7072 | | if (rc != PCRE2_ERROR_JIT_BADOPTION) |
7073 | | { |
7074 | | match_data->subject_length = length; |
7075 | | if (rc >= 0 && (options & PCRE2_COPY_MATCHED_SUBJECT) != 0) |
7076 | | { |
7077 | | length = CU2BYTES(length + was_zero_terminated); |
7078 | | match_data->subject = match_data->memctl.malloc(length, |
7079 | | match_data->memctl.memory_data); |
7080 | | if (match_data->subject == NULL) return PCRE2_ERROR_NOMEMORY; |
7081 | | memcpy((void *)match_data->subject, subject, length); |
7082 | | match_data->flags |= PCRE2_MD_COPIED_SUBJECT; |
7083 | | } |
7084 | | return rc; |
7085 | | } |
7086 | | } |
7087 | | #endif /* SUPPORT_JIT */ |
7088 | | |
7089 | | /* ========================= End of JIT matching ========================== */ |
7090 | | |
7091 | | |
7092 | | /* Proceed with non-JIT matching. The default is to allow lookbehinds to the |
7093 | | start of the subject. A UTF check when there is a non-zero offset may change |
7094 | | this. */ |
7095 | | |
7096 | 4.32k | mb->check_subject = subject; |
7097 | | |
7098 | | /* If a UTF subject string was not checked for validity in the JIT code above, |
7099 | | check it here, and handle support for invalid UTF strings. The check above |
7100 | | happens only when invalid UTF is not supported and PCRE2_NO_UTF_CHECK is unset. |
7101 | | If we get here in those circumstances, it means the subject string is valid, |
7102 | | but for some reason JIT matching was not successful. There is no need to check |
7103 | | the subject again. |
7104 | | |
7105 | | We check only the portion of the subject that might be inspected during |
7106 | | matching - from the offset minus the maximum lookbehind to the given length. |
7107 | | This saves time when a small part of a large subject is being matched by the |
7108 | | use of a starting offset. Note that the maximum lookbehind is a number of |
7109 | | characters, not code units. |
7110 | | |
7111 | | Note also that support for invalid UTF forces a check, overriding the setting |
7112 | | of PCRE2_NO_UTF_CHECK. */ |
7113 | | |
7114 | 4.32k | #ifdef SUPPORT_UNICODE |
7115 | 4.32k | if (utf && |
7116 | | #ifdef SUPPORT_JIT |
7117 | | !jit_checked_utf && |
7118 | | #endif |
7119 | 4.32k | ((options & PCRE2_NO_UTF_CHECK) == 0 || allow_invalid)) |
7120 | 1.23k | { |
7121 | 1.23k | #if PCRE2_CODE_UNIT_WIDTH != 32 |
7122 | 1.23k | BOOL skipped_bad_start = FALSE; |
7123 | 1.23k | #endif |
7124 | | |
7125 | | /* For 8-bit and 16-bit UTF, check that the first code unit is a valid |
7126 | | character start. If we are handling invalid UTF, just skip over such code |
7127 | | units. Otherwise, give an appropriate error. */ |
7128 | | |
7129 | 1.23k | #if PCRE2_CODE_UNIT_WIDTH != 32 |
7130 | 1.23k | if (allow_invalid) |
7131 | 0 | { |
7132 | 0 | while (start_match < end_subject && NOT_FIRSTCU(*start_match)) |
7133 | 0 | { |
7134 | 0 | start_match++; |
7135 | 0 | skipped_bad_start = TRUE; |
7136 | 0 | } |
7137 | 0 | } |
7138 | 1.23k | else if (start_match < end_subject && NOT_FIRSTCU(*start_match)) |
7139 | 0 | { |
7140 | 0 | if (start_offset > 0) return PCRE2_ERROR_BADUTFOFFSET; |
7141 | 0 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
7142 | 0 | return PCRE2_ERROR_UTF8_ERR20; /* Isolated 0x80 byte */ |
7143 | | #else |
7144 | | return PCRE2_ERROR_UTF16_ERR3; /* Isolated low surrogate */ |
7145 | | #endif |
7146 | 0 | } |
7147 | 1.23k | #endif /* WIDTH != 32 */ |
7148 | | |
7149 | | /* The mb->check_subject field points to the start of UTF checking; |
7150 | | lookbehinds can go back no further than this. */ |
7151 | | |
7152 | 1.23k | mb->check_subject = start_match; |
7153 | | |
7154 | | /* Move back by the maximum lookbehind, just in case it happens at the very |
7155 | | start of matching, but don't do this if we skipped bad 8-bit or 16-bit code |
7156 | | units above. */ |
7157 | | |
7158 | 1.23k | #if PCRE2_CODE_UNIT_WIDTH != 32 |
7159 | 1.23k | if (!skipped_bad_start) |
7160 | 1.23k | { |
7161 | 1.23k | unsigned int i; |
7162 | 1.23k | for (i = re->max_lookbehind; i > 0 && mb->check_subject > subject; i--) |
7163 | 0 | { |
7164 | 0 | mb->check_subject--; |
7165 | 0 | while (mb->check_subject > subject && |
7166 | 0 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
7167 | 0 | (*mb->check_subject & 0xc0) == 0x80) |
7168 | | #else /* 16-bit */ |
7169 | | (*mb->check_subject & 0xfc00) == 0xdc00) |
7170 | | #endif |
7171 | 0 | mb->check_subject--; |
7172 | 0 | } |
7173 | 1.23k | } |
7174 | | #else /* PCRE2_CODE_UNIT_WIDTH != 32 */ |
7175 | | |
7176 | | /* In the 32-bit library, one code unit equals one character. However, |
7177 | | we cannot just subtract the lookbehind and then compare pointers, because |
7178 | | a very large lookbehind could create an invalid pointer. */ |
7179 | | |
7180 | | if (start_offset >= re->max_lookbehind) |
7181 | | mb->check_subject -= re->max_lookbehind; |
7182 | | else |
7183 | | mb->check_subject = subject; |
7184 | | #endif /* PCRE2_CODE_UNIT_WIDTH != 32 */ |
7185 | | |
7186 | | /* Validate the relevant portion of the subject. There's a loop in case we |
7187 | | encounter bad UTF in the characters preceding start_match which we are |
7188 | | scanning because of a lookbehind. */ |
7189 | | |
7190 | 1.23k | for (;;) |
7191 | 1.23k | { |
7192 | 1.23k | match_data->rc = PRIV(valid_utf)(mb->check_subject, |
7193 | 1.23k | length - (mb->check_subject - subject), &(match_data->startchar)); |
7194 | | |
7195 | 1.23k | if (match_data->rc == 0) break; /* Valid UTF string */ |
7196 | | |
7197 | | /* Invalid UTF string. Adjust the offset to be an absolute offset in the |
7198 | | whole string. If we are handling invalid UTF strings, set end_subject to |
7199 | | stop before the bad code unit, and set the options to "not end of line". |
7200 | | Otherwise return the error. */ |
7201 | | |
7202 | 140 | match_data->startchar += mb->check_subject - subject; |
7203 | 140 | if (!allow_invalid || match_data->rc > 0) return match_data->rc; |
7204 | 0 | end_subject = subject + match_data->startchar; |
7205 | | |
7206 | | /* If the end precedes start_match, it means there is invalid UTF in the |
7207 | | extra code units we reversed over because of a lookbehind. Advance past the |
7208 | | first bad code unit, and then skip invalid character starting code units in |
7209 | | 8-bit and 16-bit modes, and try again with the original end point. */ |
7210 | |
|
7211 | 0 | if (end_subject < start_match) |
7212 | 0 | { |
7213 | 0 | mb->check_subject = end_subject + 1; |
7214 | 0 | #if PCRE2_CODE_UNIT_WIDTH != 32 |
7215 | 0 | while (mb->check_subject < start_match && NOT_FIRSTCU(*mb->check_subject)) |
7216 | 0 | mb->check_subject++; |
7217 | 0 | #endif |
7218 | 0 | end_subject = true_end_subject; |
7219 | 0 | } |
7220 | | |
7221 | | /* Otherwise, set the not end of line option, and do the match. */ |
7222 | | |
7223 | 0 | else |
7224 | 0 | { |
7225 | 0 | fragment_options = PCRE2_NOTEOL; |
7226 | 0 | break; |
7227 | 0 | } |
7228 | 0 | } |
7229 | 1.23k | } |
7230 | 4.18k | #endif /* SUPPORT_UNICODE */ |
7231 | | |
7232 | | /* A NULL match context means "use a default context", but we take the memory |
7233 | | control functions from the pattern. */ |
7234 | | |
7235 | 4.18k | if (mcontext == NULL) |
7236 | 0 | { |
7237 | 0 | mcontext = (pcre2_match_context *)(&PRIV(default_match_context)); |
7238 | 0 | mb->memctl = re->memctl; |
7239 | 0 | } |
7240 | 4.18k | else mb->memctl = mcontext->memctl; |
7241 | | |
7242 | 4.18k | anchored = ((re->overall_options | options) & PCRE2_ANCHORED) != 0; |
7243 | 4.18k | firstline = !anchored && (re->overall_options & PCRE2_FIRSTLINE) != 0; |
7244 | 4.18k | startline = (re->flags & PCRE2_STARTLINE) != 0; |
7245 | 4.18k | bumpalong_limit = (mcontext->offset_limit == PCRE2_UNSET)? |
7246 | 4.18k | true_end_subject : subject + mcontext->offset_limit; |
7247 | | |
7248 | | /* Initialize and set up the fixed fields in the callout block, with a pointer |
7249 | | in the match block. */ |
7250 | | |
7251 | 4.18k | mb->cb = &cb; |
7252 | 4.18k | cb.version = 2; |
7253 | 4.18k | cb.subject = subject; |
7254 | 4.18k | cb.subject_length = (PCRE2_SIZE)(end_subject - subject); |
7255 | 4.18k | cb.callout_flags = 0; |
7256 | | |
7257 | | /* Fill in the remaining fields in the match block, except for moptions, which |
7258 | | gets set later. */ |
7259 | | |
7260 | 4.18k | mb->callout = mcontext->callout; |
7261 | 4.18k | mb->callout_data = mcontext->callout_data; |
7262 | | |
7263 | 4.18k | mb->start_subject = subject; |
7264 | 4.18k | mb->start_offset = start_offset; |
7265 | 4.18k | mb->end_subject = end_subject; |
7266 | 4.18k | mb->true_end_subject = true_end_subject; |
7267 | 4.18k | mb->hasthen = (re->flags & PCRE2_HASTHEN) != 0; |
7268 | 4.18k | mb->allowemptypartial = (re->max_lookbehind > 0) || |
7269 | 4.18k | (re->flags & PCRE2_MATCH_EMPTY) != 0; |
7270 | 4.18k | mb->poptions = re->overall_options; /* Pattern options */ |
7271 | 4.18k | mb->ignore_skip_arg = 0; |
7272 | 4.18k | mb->mark = mb->nomatch_mark = NULL; /* In case never set */ |
7273 | | |
7274 | | /* The name table is needed for finding all the numbers associated with a |
7275 | | given name, for condition testing. The code follows the name table. */ |
7276 | | |
7277 | 4.18k | mb->name_table = (PCRE2_SPTR)((const uint8_t *)re + sizeof(pcre2_real_code)); |
7278 | 4.18k | mb->name_count = re->name_count; |
7279 | 4.18k | mb->name_entry_size = re->name_entry_size; |
7280 | 4.18k | mb->start_code = (PCRE2_SPTR)((const uint8_t *)re + re->code_start); |
7281 | | |
7282 | | /* Process the \R and newline settings. */ |
7283 | | |
7284 | 4.18k | mb->bsr_convention = re->bsr_convention; |
7285 | 4.18k | mb->nltype = NLTYPE_FIXED; |
7286 | 4.18k | switch(re->newline_convention) |
7287 | 4.18k | { |
7288 | 0 | case PCRE2_NEWLINE_CR: |
7289 | 0 | mb->nllen = 1; |
7290 | 0 | mb->nl[0] = CHAR_CR; |
7291 | 0 | break; |
7292 | | |
7293 | 4.18k | case PCRE2_NEWLINE_LF: |
7294 | 4.18k | mb->nllen = 1; |
7295 | 4.18k | mb->nl[0] = CHAR_NL; |
7296 | 4.18k | break; |
7297 | | |
7298 | 0 | case PCRE2_NEWLINE_NUL: |
7299 | 0 | mb->nllen = 1; |
7300 | 0 | mb->nl[0] = CHAR_NUL; |
7301 | 0 | break; |
7302 | | |
7303 | 0 | case PCRE2_NEWLINE_CRLF: |
7304 | 0 | mb->nllen = 2; |
7305 | 0 | mb->nl[0] = CHAR_CR; |
7306 | 0 | mb->nl[1] = CHAR_NL; |
7307 | 0 | break; |
7308 | | |
7309 | 0 | case PCRE2_NEWLINE_ANY: |
7310 | 0 | mb->nltype = NLTYPE_ANY; |
7311 | 0 | break; |
7312 | | |
7313 | 0 | case PCRE2_NEWLINE_ANYCRLF: |
7314 | 0 | mb->nltype = NLTYPE_ANYCRLF; |
7315 | 0 | break; |
7316 | | |
7317 | 0 | default: |
7318 | 0 | PCRE2_DEBUG_UNREACHABLE(); |
7319 | 0 | return PCRE2_ERROR_INTERNAL; |
7320 | 4.18k | } |
7321 | | |
7322 | | /* The backtracking frames have fixed data at the front, and a PCRE2_SIZE |
7323 | | vector at the end, whose size depends on the number of capturing parentheses in |
7324 | | the pattern. It is not used at all if there are no capturing parentheses. |
7325 | | |
7326 | | frame_size is the total size of each frame |
7327 | | match_data->heapframes is the pointer to the frames vector |
7328 | | match_data->heapframes_size is the allocated size of the vector |
7329 | | |
7330 | | We must pad the frame_size for alignment to ensure subsequent frames are as |
7331 | | aligned as heapframe. Whilst ovector is word-aligned due to being a PCRE2_SIZE |
7332 | | array, that does not guarantee it is suitably aligned for pointers, as some |
7333 | | architectures have pointers that are larger than a size_t. */ |
7334 | | |
7335 | 4.18k | frame_size = (offsetof(heapframe, ovector) + |
7336 | 4.18k | re->top_bracket * 2 * sizeof(PCRE2_SIZE) + HEAPFRAME_ALIGNMENT - 1) & |
7337 | 4.18k | ~(HEAPFRAME_ALIGNMENT - 1); |
7338 | | |
7339 | | /* Limits set in the pattern override the match context only if they are |
7340 | | smaller. */ |
7341 | | |
7342 | 4.18k | mb->heap_limit = ((mcontext->heap_limit < re->limit_heap)? |
7343 | 4.18k | mcontext->heap_limit : re->limit_heap); |
7344 | | |
7345 | 4.18k | mb->match_limit = (mcontext->match_limit < re->limit_match)? |
7346 | 4.18k | mcontext->match_limit : re->limit_match; |
7347 | | |
7348 | 4.18k | mb->match_limit_depth = (mcontext->depth_limit < re->limit_depth)? |
7349 | 4.18k | mcontext->depth_limit : re->limit_depth; |
7350 | | |
7351 | | /* If a pattern has very many capturing parentheses, the frame size may be very |
7352 | | large. Set the initial frame vector size to ensure that there are at least 10 |
7353 | | available frames, but enforce a minimum of START_FRAMES_SIZE. If this is |
7354 | | greater than the heap limit, get as large a vector as possible. */ |
7355 | | |
7356 | 4.18k | heapframes_size = frame_size * 10; |
7357 | 4.18k | if (heapframes_size < START_FRAMES_SIZE) heapframes_size = START_FRAMES_SIZE; |
7358 | 4.18k | if (heapframes_size / 1024 > mb->heap_limit) |
7359 | 0 | { |
7360 | 0 | PCRE2_SIZE max_size = 1024 * mb->heap_limit; |
7361 | 0 | if (max_size < frame_size) return PCRE2_ERROR_HEAPLIMIT; |
7362 | 0 | heapframes_size = max_size; |
7363 | 0 | } |
7364 | | |
7365 | | /* If an existing frame vector in the match_data block is large enough, we can |
7366 | | use it. Otherwise, free any pre-existing vector and get a new one. */ |
7367 | | |
7368 | 4.18k | if (match_data->heapframes_size < heapframes_size) |
7369 | 155 | { |
7370 | 155 | match_data->memctl.free(match_data->heapframes, |
7371 | 155 | match_data->memctl.memory_data); |
7372 | 155 | match_data->heapframes = match_data->memctl.malloc(heapframes_size, |
7373 | 155 | match_data->memctl.memory_data); |
7374 | 155 | if (match_data->heapframes == NULL) |
7375 | 0 | { |
7376 | 0 | match_data->heapframes_size = 0; |
7377 | 0 | return PCRE2_ERROR_NOMEMORY; |
7378 | 0 | } |
7379 | 155 | match_data->heapframes_size = heapframes_size; |
7380 | 155 | } |
7381 | | |
7382 | | /* Write to the ovector within the first frame to mark every capture unset and |
7383 | | to avoid uninitialized memory read errors when it is copied to a new frame. */ |
7384 | | |
7385 | 4.18k | memset((char *)(match_data->heapframes) + offsetof(heapframe, ovector), 0xff, |
7386 | 4.18k | frame_size - offsetof(heapframe, ovector)); |
7387 | | |
7388 | | /* Pointers to the individual character tables */ |
7389 | | |
7390 | 4.18k | mb->lcc = re->tables + lcc_offset; |
7391 | 4.18k | mb->fcc = re->tables + fcc_offset; |
7392 | 4.18k | mb->ctypes = re->tables + ctypes_offset; |
7393 | | |
7394 | | /* Set up the first code unit to match, if available. If there's no first code |
7395 | | unit there may be a bitmap of possible first characters. */ |
7396 | | |
7397 | 4.18k | if ((re->flags & PCRE2_FIRSTSET) != 0) |
7398 | 652 | { |
7399 | 652 | has_first_cu = TRUE; |
7400 | 652 | first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit); |
7401 | 652 | if ((re->flags & PCRE2_FIRSTCASELESS) != 0) |
7402 | 7 | { |
7403 | 7 | first_cu2 = TABLE_GET(first_cu, mb->fcc, first_cu); |
7404 | 7 | #ifdef SUPPORT_UNICODE |
7405 | 7 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
7406 | 7 | if (first_cu > 127 && ucp && !utf) first_cu2 = UCD_OTHERCASE(first_cu); |
7407 | | #else |
7408 | | if (first_cu > 127 && (utf || ucp)) first_cu2 = UCD_OTHERCASE(first_cu); |
7409 | | #endif |
7410 | 7 | #endif /* SUPPORT_UNICODE */ |
7411 | 7 | } |
7412 | 652 | } |
7413 | 3.53k | else |
7414 | 3.53k | if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0) |
7415 | 1.57k | start_bits = re->start_bitmap; |
7416 | | |
7417 | | /* There may also be a "last known required character" set. */ |
7418 | | |
7419 | 4.18k | if ((re->flags & PCRE2_LASTSET) != 0) |
7420 | 578 | { |
7421 | 578 | has_req_cu = TRUE; |
7422 | 578 | req_cu = req_cu2 = (PCRE2_UCHAR)(re->last_codeunit); |
7423 | 578 | if ((re->flags & PCRE2_LASTCASELESS) != 0) |
7424 | 75 | { |
7425 | 75 | req_cu2 = TABLE_GET(req_cu, mb->fcc, req_cu); |
7426 | 75 | #ifdef SUPPORT_UNICODE |
7427 | 75 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
7428 | 75 | if (req_cu > 127 && ucp && !utf) req_cu2 = UCD_OTHERCASE(req_cu); |
7429 | | #else |
7430 | | if (req_cu > 127 && (utf || ucp)) req_cu2 = UCD_OTHERCASE(req_cu); |
7431 | | #endif |
7432 | 75 | #endif /* SUPPORT_UNICODE */ |
7433 | 75 | } |
7434 | 578 | } |
7435 | | |
7436 | | |
7437 | | /* ==========================================================================*/ |
7438 | | |
7439 | | /* Loop for handling unanchored repeated matching attempts; for anchored regexs |
7440 | | the loop runs just once. */ |
7441 | | |
7442 | 4.18k | #ifdef SUPPORT_UNICODE |
7443 | 4.18k | FRAGMENT_RESTART: |
7444 | 4.18k | #endif |
7445 | | |
7446 | 4.18k | start_partial = match_partial = NULL; |
7447 | 4.18k | mb->hitend = FALSE; |
7448 | | |
7449 | 4.18k | #if PCRE2_CODE_UNIT_WIDTH == 8 |
7450 | 4.18k | memchr_found_first_cu = NULL; |
7451 | 4.18k | memchr_found_first_cu2 = NULL; |
7452 | 4.18k | #endif |
7453 | | |
7454 | 4.18k | for(;;) |
7455 | 364k | { |
7456 | 364k | PCRE2_SPTR new_start_match; |
7457 | | |
7458 | | /* ----------------- Start of match optimizations ---------------- */ |
7459 | | |
7460 | | /* There are some optimizations that avoid running the match if a known |
7461 | | starting point is not found, or if a known later code unit is not present. |
7462 | | However, there is an option (settable at compile time) that disables these, |
7463 | | for testing and for ensuring that all callouts do actually occur. */ |
7464 | | |
7465 | 364k | if ((re->optimization_flags & PCRE2_OPTIM_START_OPTIMIZE) != 0) |
7466 | 364k | { |
7467 | | /* If firstline is TRUE, the start of the match is constrained to the first |
7468 | | line of a multiline string. That is, the match must be before or at the |
7469 | | first newline following the start of matching. Temporarily adjust |
7470 | | end_subject so that we stop the scans for a first code unit at a newline. |
7471 | | If the match fails at the newline, later code breaks the loop. */ |
7472 | | |
7473 | 364k | if (firstline) |
7474 | 0 | { |
7475 | 0 | PCRE2_SPTR t = start_match; |
7476 | 0 | #ifdef SUPPORT_UNICODE |
7477 | 0 | if (utf) |
7478 | 0 | { |
7479 | 0 | while (t < end_subject && !IS_NEWLINE(t)) |
7480 | 0 | { |
7481 | 0 | t++; |
7482 | 0 | ACROSSCHAR(t < end_subject, t, t++); |
7483 | 0 | } |
7484 | 0 | } |
7485 | 0 | else |
7486 | 0 | #endif |
7487 | 0 | while (t < end_subject && !IS_NEWLINE(t)) t++; |
7488 | 0 | end_subject = t; |
7489 | 0 | } |
7490 | | |
7491 | | /* Anchored: check the first code unit if one is recorded. This may seem |
7492 | | pointless but it can help in detecting a no match case without scanning for |
7493 | | the required code unit. */ |
7494 | | |
7495 | 364k | if (anchored) |
7496 | 190 | { |
7497 | 190 | if (has_first_cu || start_bits != NULL) |
7498 | 55 | { |
7499 | 55 | BOOL ok = start_match < end_subject; |
7500 | 55 | if (ok) |
7501 | 55 | { |
7502 | 55 | PCRE2_UCHAR c = UCHAR21TEST(start_match); |
7503 | 55 | ok = has_first_cu && (c == first_cu || c == first_cu2); |
7504 | 55 | if (!ok && start_bits != NULL) |
7505 | 35 | { |
7506 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
7507 | | if (c > 255) c = 255; |
7508 | | #endif |
7509 | 35 | ok = (start_bits[c/8] & (1u << (c&7))) != 0; |
7510 | 35 | } |
7511 | 55 | } |
7512 | 55 | if (!ok) |
7513 | 19 | { |
7514 | 19 | rc = MATCH_NOMATCH; |
7515 | 19 | break; |
7516 | 19 | } |
7517 | 55 | } |
7518 | 190 | } |
7519 | | |
7520 | | /* Not anchored. Advance to a unique first code unit if there is one. */ |
7521 | | |
7522 | 364k | else |
7523 | 364k | { |
7524 | 364k | if (has_first_cu) |
7525 | 1.64k | { |
7526 | 1.64k | if (first_cu != first_cu2) /* Caseless */ |
7527 | 13 | { |
7528 | | /* In 16-bit and 32-bit modes we have to do our own search, so can |
7529 | | look for both cases at once. */ |
7530 | | |
7531 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
7532 | | PCRE2_UCHAR smc; |
7533 | | while (start_match < end_subject && |
7534 | | (smc = UCHAR21TEST(start_match)) != first_cu && |
7535 | | smc != first_cu2) |
7536 | | start_match++; |
7537 | | #else |
7538 | | /* In 8-bit mode, the use of memchr() gives a big speed up, even |
7539 | | though we have to call it twice in order to find the earliest |
7540 | | occurrence of the code unit in either of its cases. Caching is used |
7541 | | to remember the positions of previously found code units. This can |
7542 | | make a huge difference when the strings are very long and only one |
7543 | | case is actually present. */ |
7544 | | |
7545 | 13 | PCRE2_SPTR pp1 = NULL; |
7546 | 13 | PCRE2_SPTR pp2 = NULL; |
7547 | 13 | PCRE2_SIZE searchlength = end_subject - start_match; |
7548 | | |
7549 | | /* If we haven't got a previously found position for first_cu, or if |
7550 | | the current starting position is later, we need to do a search. If |
7551 | | the code unit is not found, set it to the end. */ |
7552 | | |
7553 | 13 | if (memchr_found_first_cu == NULL || |
7554 | 13 | start_match > memchr_found_first_cu) |
7555 | 7 | { |
7556 | 7 | pp1 = memchr(start_match, first_cu, searchlength); |
7557 | 7 | memchr_found_first_cu = (pp1 == NULL)? end_subject : pp1; |
7558 | 7 | } |
7559 | | |
7560 | | /* If the start is before a previously found position, use the |
7561 | | previous position, or NULL if a previous search failed. */ |
7562 | | |
7563 | 6 | else pp1 = (memchr_found_first_cu == end_subject)? NULL : |
7564 | 6 | memchr_found_first_cu; |
7565 | | |
7566 | | /* Do the same thing for the other case. */ |
7567 | | |
7568 | 13 | if (memchr_found_first_cu2 == NULL || |
7569 | 13 | start_match > memchr_found_first_cu2) |
7570 | 13 | { |
7571 | 13 | pp2 = memchr(start_match, first_cu2, searchlength); |
7572 | 13 | memchr_found_first_cu2 = (pp2 == NULL)? end_subject : pp2; |
7573 | 13 | } |
7574 | | |
7575 | 0 | else pp2 = (memchr_found_first_cu2 == end_subject)? NULL : |
7576 | 0 | memchr_found_first_cu2; |
7577 | | |
7578 | | /* Set the start to the end of the subject if neither case was found. |
7579 | | Otherwise, use the earlier found point. */ |
7580 | | |
7581 | 13 | if (pp1 == NULL) |
7582 | 10 | start_match = (pp2 == NULL)? end_subject : pp2; |
7583 | 3 | else |
7584 | 3 | start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2; |
7585 | | |
7586 | 13 | #endif /* 8-bit handling */ |
7587 | 13 | } |
7588 | | |
7589 | | /* The caseful case is much simpler. */ |
7590 | | |
7591 | 1.63k | else |
7592 | 1.63k | { |
7593 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
7594 | | while (start_match < end_subject && UCHAR21TEST(start_match) != |
7595 | | first_cu) |
7596 | | start_match++; |
7597 | | #else |
7598 | 1.63k | start_match = memchr(start_match, first_cu, end_subject - start_match); |
7599 | 1.63k | if (start_match == NULL) start_match = end_subject; |
7600 | 1.63k | #endif |
7601 | 1.63k | } |
7602 | | |
7603 | | /* If we can't find the required first code unit, having reached the |
7604 | | true end of the subject, break the bumpalong loop, to force a match |
7605 | | failure, except when doing partial matching, when we let the next cycle |
7606 | | run at the end of the subject. To see why, consider the pattern |
7607 | | /(?<=abc)def/, which partially matches "abc", even though the string |
7608 | | does not contain the starting character "d". If we have not reached the |
7609 | | true end of the subject (PCRE2_FIRSTLINE caused end_subject to be |
7610 | | temporarily modified) we also let the cycle run, because the matching |
7611 | | string is legitimately allowed to start with the first code unit of a |
7612 | | newline. */ |
7613 | | |
7614 | 1.64k | if (mb->partial == 0 && start_match >= mb->end_subject) |
7615 | 278 | { |
7616 | 278 | rc = MATCH_NOMATCH; |
7617 | 278 | break; |
7618 | 278 | } |
7619 | 1.64k | } |
7620 | | |
7621 | | /* If there's no first code unit, advance to just after a linebreak for a |
7622 | | multiline match if required. */ |
7623 | | |
7624 | 362k | else if (startline) |
7625 | 0 | { |
7626 | 0 | if (start_match > mb->start_subject + start_offset) |
7627 | 0 | { |
7628 | 0 | #ifdef SUPPORT_UNICODE |
7629 | 0 | if (utf) |
7630 | 0 | { |
7631 | 0 | while (start_match < end_subject && !WAS_NEWLINE(start_match)) |
7632 | 0 | { |
7633 | 0 | start_match++; |
7634 | 0 | ACROSSCHAR(start_match < end_subject, start_match, start_match++); |
7635 | 0 | } |
7636 | 0 | } |
7637 | 0 | else |
7638 | 0 | #endif |
7639 | 0 | while (start_match < end_subject && !WAS_NEWLINE(start_match)) |
7640 | 0 | start_match++; |
7641 | | |
7642 | | /* If we have just passed a CR and the newline option is ANY or |
7643 | | ANYCRLF, and we are now at a LF, advance the match position by one |
7644 | | more code unit. */ |
7645 | |
|
7646 | 0 | if (start_match[-1] == CHAR_CR && |
7647 | 0 | (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) && |
7648 | 0 | start_match < end_subject && |
7649 | 0 | UCHAR21TEST(start_match) == CHAR_NL) |
7650 | 0 | start_match++; |
7651 | 0 | } |
7652 | 0 | } |
7653 | | |
7654 | | /* If there's no first code unit or a requirement for a multiline line |
7655 | | start, advance to a non-unique first code unit if any have been |
7656 | | identified. The bitmap contains only 256 bits. When code units are 16 or |
7657 | | 32 bits wide, all code units greater than 254 set the 255 bit. */ |
7658 | | |
7659 | 362k | else if (start_bits != NULL) |
7660 | 68.3k | { |
7661 | 113k | while (start_match < end_subject) |
7662 | 113k | { |
7663 | 113k | uint32_t c = UCHAR21TEST(start_match); |
7664 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
7665 | | if (c > 255) c = 255; |
7666 | | #endif |
7667 | 113k | if ((start_bits[c/8] & (1u << (c&7))) != 0) break; |
7668 | 45.6k | start_match++; |
7669 | 45.6k | } |
7670 | | |
7671 | | /* See comment above in first_cu checking about the next few lines. */ |
7672 | | |
7673 | 68.3k | if (mb->partial == 0 && start_match >= mb->end_subject) |
7674 | 762 | { |
7675 | 762 | rc = MATCH_NOMATCH; |
7676 | 762 | break; |
7677 | 762 | } |
7678 | 68.3k | } |
7679 | 364k | } /* End first code unit handling */ |
7680 | | |
7681 | | /* Restore fudged end_subject */ |
7682 | | |
7683 | 363k | end_subject = mb->end_subject; |
7684 | | |
7685 | | /* The following two optimizations must be disabled for partial matching. */ |
7686 | | |
7687 | 363k | if (mb->partial == 0) |
7688 | 363k | { |
7689 | 363k | PCRE2_SPTR p; |
7690 | | |
7691 | | /* The minimum matching length is a lower bound; no string of that length |
7692 | | may actually match the pattern. Although the value is, strictly, in |
7693 | | characters, we treat it as code units to avoid spending too much time in |
7694 | | this optimization. */ |
7695 | | |
7696 | 363k | if (end_subject - start_match < re->minlength) |
7697 | 2.21k | { |
7698 | 2.21k | rc = MATCH_NOMATCH; |
7699 | 2.21k | break; |
7700 | 2.21k | } |
7701 | | |
7702 | | /* If req_cu is set, we know that that code unit must appear in the |
7703 | | subject for the (non-partial) match to succeed. If the first code unit is |
7704 | | set, req_cu must be later in the subject; otherwise the test starts at |
7705 | | the match point. This optimization can save a huge amount of backtracking |
7706 | | in patterns with nested unlimited repeats that aren't going to match. |
7707 | | Writing separate code for caseful/caseless versions makes it go faster, |
7708 | | as does using an autoincrement and backing off on a match. As in the case |
7709 | | of the first code unit, using memchr() in the 8-bit library gives a big |
7710 | | speed up. Unlike the first_cu check above, we do not need to call |
7711 | | memchr() twice in the caseless case because we only need to check for the |
7712 | | presence of the character in either case, not find the first occurrence. |
7713 | | |
7714 | | The search can be skipped if the code unit was found later than the |
7715 | | current starting point in a previous iteration of the bumpalong loop. |
7716 | | |
7717 | | HOWEVER: when the subject string is very, very long, searching to its end |
7718 | | can take a long time, and give bad performance on quite ordinary |
7719 | | anchored patterns. This showed up when somebody was matching something |
7720 | | like /^\d+C/ on a 32-megabyte string... so we don't do this when the |
7721 | | string is sufficiently long, but it's worth searching a lot more for |
7722 | | unanchored patterns. */ |
7723 | | |
7724 | 361k | p = start_match + (has_first_cu? 1:0); |
7725 | 361k | if (has_req_cu && p > req_cu_ptr) |
7726 | 1.43k | { |
7727 | 1.43k | PCRE2_SIZE check_length = end_subject - start_match; |
7728 | | |
7729 | 1.43k | if (check_length < REQ_CU_MAX || |
7730 | 1.43k | (!anchored && check_length < REQ_CU_MAX * 1000)) |
7731 | 1.43k | { |
7732 | 1.43k | if (req_cu != req_cu2) /* Caseless */ |
7733 | 66 | { |
7734 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
7735 | | while (p < end_subject) |
7736 | | { |
7737 | | uint32_t pp = UCHAR21INCTEST(p); |
7738 | | if (pp == req_cu || pp == req_cu2) { p--; break; } |
7739 | | } |
7740 | | #else /* 8-bit code units */ |
7741 | 66 | PCRE2_SPTR pp = p; |
7742 | 66 | p = memchr(pp, req_cu, end_subject - pp); |
7743 | 66 | if (p == NULL) |
7744 | 17 | { |
7745 | 17 | p = memchr(pp, req_cu2, end_subject - pp); |
7746 | 17 | if (p == NULL) p = end_subject; |
7747 | 17 | } |
7748 | 66 | #endif /* PCRE2_CODE_UNIT_WIDTH != 8 */ |
7749 | 66 | } |
7750 | | |
7751 | | /* The caseful case */ |
7752 | | |
7753 | 1.36k | else |
7754 | 1.36k | { |
7755 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
7756 | | while (p < end_subject) |
7757 | | { |
7758 | | if (UCHAR21INCTEST(p) == req_cu) { p--; break; } |
7759 | | } |
7760 | | |
7761 | | #else /* 8-bit code units */ |
7762 | 1.36k | p = memchr(p, req_cu, end_subject - p); |
7763 | 1.36k | if (p == NULL) p = end_subject; |
7764 | 1.36k | #endif |
7765 | 1.36k | } |
7766 | | |
7767 | | /* If we can't find the required code unit, break the bumpalong loop, |
7768 | | forcing a match failure. */ |
7769 | | |
7770 | 1.43k | if (p >= end_subject) |
7771 | 171 | { |
7772 | 171 | rc = MATCH_NOMATCH; |
7773 | 171 | break; |
7774 | 171 | } |
7775 | | |
7776 | | /* If we have found the required code unit, save the point where we |
7777 | | found it, so that we don't search again next time round the bumpalong |
7778 | | loop if the start hasn't yet passed this code unit. */ |
7779 | | |
7780 | 1.26k | req_cu_ptr = p; |
7781 | 1.26k | } |
7782 | 1.43k | } |
7783 | 361k | } |
7784 | 363k | } |
7785 | | |
7786 | | /* ------------ End of start of match optimizations ------------ */ |
7787 | | |
7788 | | /* Give no match if we have passed the bumpalong limit. */ |
7789 | | |
7790 | 360k | if (start_match > bumpalong_limit) |
7791 | 0 | { |
7792 | 0 | rc = MATCH_NOMATCH; |
7793 | 0 | break; |
7794 | 0 | } |
7795 | | |
7796 | | /* OK, we can now run the match. If "hitend" is set afterwards, remember the |
7797 | | first starting point for which a partial match was found. */ |
7798 | | |
7799 | 360k | cb.start_match = (PCRE2_SIZE)(start_match - subject); |
7800 | 360k | cb.callout_flags |= PCRE2_CALLOUT_STARTMATCH; |
7801 | | |
7802 | 360k | mb->start_used_ptr = start_match; |
7803 | 360k | mb->last_used_ptr = start_match; |
7804 | 360k | #ifdef SUPPORT_UNICODE |
7805 | 360k | mb->moptions = options | fragment_options; |
7806 | | #else |
7807 | | mb->moptions = options; |
7808 | | #endif |
7809 | 360k | mb->match_call_count = 0; |
7810 | 360k | mb->end_offset_top = 0; |
7811 | 360k | mb->skip_arg_count = 0; |
7812 | | |
7813 | | #ifdef DEBUG_SHOW_OPS |
7814 | | fprintf(stderr, "++ Calling match()\n"); |
7815 | | #endif |
7816 | | |
7817 | 360k | rc = match(start_match, mb->start_code, re->top_bracket, frame_size, |
7818 | 360k | match_data, mb); |
7819 | | |
7820 | | #ifdef DEBUG_SHOW_OPS |
7821 | | fprintf(stderr, "++ match() returned %d\n\n", rc); |
7822 | | #endif |
7823 | | |
7824 | 360k | if (mb->hitend && start_partial == NULL) |
7825 | 0 | { |
7826 | 0 | start_partial = mb->start_used_ptr; |
7827 | 0 | match_partial = start_match; |
7828 | 0 | } |
7829 | | |
7830 | 360k | switch(rc) |
7831 | 360k | { |
7832 | | /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched |
7833 | | the SKIP's arg was not found. In this circumstance, Perl ignores the SKIP |
7834 | | entirely. The only way we can do that is to re-do the match at the same |
7835 | | point, with a flag to force SKIP with an argument to be ignored. Just |
7836 | | treating this case as NOMATCH does not work because it does not check other |
7837 | | alternatives in patterns such as A(*SKIP:A)B|AC when the subject is AC. */ |
7838 | | |
7839 | 0 | case MATCH_SKIP_ARG: |
7840 | 0 | new_start_match = start_match; |
7841 | 0 | mb->ignore_skip_arg = mb->skip_arg_count; |
7842 | 0 | break; |
7843 | | |
7844 | | /* SKIP passes back the next starting point explicitly, but if it is no |
7845 | | greater than the match we have just done, treat it as NOMATCH. */ |
7846 | | |
7847 | 0 | case MATCH_SKIP: |
7848 | 0 | if (mb->verb_skip_ptr > start_match) |
7849 | 0 | { |
7850 | 0 | new_start_match = mb->verb_skip_ptr; |
7851 | 0 | break; |
7852 | 0 | } |
7853 | | /* Fall through */ |
7854 | | |
7855 | | /* NOMATCH and PRUNE advance by one character. THEN at this level acts |
7856 | | exactly like PRUNE. Unset ignore SKIP-with-argument. */ |
7857 | | |
7858 | 360k | case MATCH_NOMATCH: |
7859 | 360k | case MATCH_PRUNE: |
7860 | 360k | case MATCH_THEN: |
7861 | 360k | mb->ignore_skip_arg = 0; |
7862 | 360k | new_start_match = start_match + 1; |
7863 | 360k | #ifdef SUPPORT_UNICODE |
7864 | 360k | if (utf) |
7865 | 104k | ACROSSCHAR(new_start_match < end_subject, new_start_match, |
7866 | 360k | new_start_match++); |
7867 | 360k | #endif |
7868 | 360k | break; |
7869 | | |
7870 | | /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */ |
7871 | | |
7872 | 0 | case MATCH_COMMIT: |
7873 | 0 | rc = MATCH_NOMATCH; |
7874 | 0 | goto ENDLOOP; |
7875 | | |
7876 | | /* Any other return is either a match, or some kind of error. */ |
7877 | | |
7878 | 589 | default: |
7879 | 589 | goto ENDLOOP; |
7880 | 360k | } |
7881 | | |
7882 | | /* Control reaches here for the various types of "no match at this point" |
7883 | | result. Reset the code to MATCH_NOMATCH for subsequent checking. */ |
7884 | | |
7885 | 360k | rc = MATCH_NOMATCH; |
7886 | | |
7887 | | /* If PCRE2_FIRSTLINE is set, the match must happen before or at the first |
7888 | | newline in the subject (though it may continue over the newline). Therefore, |
7889 | | if we have just failed to match, starting at a newline, do not continue. */ |
7890 | | |
7891 | 360k | if (firstline && IS_NEWLINE(start_match)) break; |
7892 | | |
7893 | | /* Advance to new matching position */ |
7894 | | |
7895 | 360k | start_match = new_start_match; |
7896 | | |
7897 | | /* Break the loop if the pattern is anchored or if we have passed the end of |
7898 | | the subject. */ |
7899 | | |
7900 | 360k | if (anchored || start_match > end_subject) break; |
7901 | | |
7902 | | /* If we have just passed a CR and we are now at a LF, and the pattern does |
7903 | | not contain any explicit matches for \r or \n, and the newline option is CRLF |
7904 | | or ANY or ANYCRLF, advance the match position by one more code unit. In |
7905 | | normal matching start_match will always be greater than the first position at |
7906 | | this stage, but a failed *SKIP can cause a return at the same point, which is |
7907 | | why the first test exists. */ |
7908 | | |
7909 | 360k | if (start_match > subject + start_offset && |
7910 | 360k | start_match[-1] == CHAR_CR && |
7911 | 360k | start_match < end_subject && |
7912 | 360k | *start_match == CHAR_NL && |
7913 | 360k | (re->flags & PCRE2_HASCRORLF) == 0 && |
7914 | 360k | (mb->nltype == NLTYPE_ANY || |
7915 | 180 | mb->nltype == NLTYPE_ANYCRLF || |
7916 | 180 | mb->nllen == 2)) |
7917 | 0 | start_match++; |
7918 | | |
7919 | 360k | mb->mark = NULL; /* Reset for start of next match attempt */ |
7920 | 360k | } /* End of for(;;) "bumpalong" loop */ |
7921 | | |
7922 | | /* ==========================================================================*/ |
7923 | | |
7924 | | /* When we reach here, one of the following stopping conditions is true: |
7925 | | |
7926 | | (1) The match succeeded, either completely, or partially; |
7927 | | |
7928 | | (2) The pattern is anchored or the match was failed after (*COMMIT); |
7929 | | |
7930 | | (3) We are past the end of the subject or the bumpalong limit; |
7931 | | |
7932 | | (4) PCRE2_FIRSTLINE is set and we have failed to match at a newline, because |
7933 | | this option requests that a match occur at or before the first newline in |
7934 | | the subject. |
7935 | | |
7936 | | (5) Some kind of error occurred. |
7937 | | |
7938 | | */ |
7939 | | |
7940 | 4.18k | ENDLOOP: |
7941 | | |
7942 | | /* If end_subject != true_end_subject, it means we are handling invalid UTF, |
7943 | | and have just processed a non-terminal fragment. If this resulted in no match |
7944 | | or a partial match we must carry on to the next fragment (a partial match is |
7945 | | returned to the caller only at the very end of the subject). A loop is used to |
7946 | | avoid trying to match against empty fragments; if the pattern can match an |
7947 | | empty string it would have done so already. */ |
7948 | | |
7949 | 4.18k | #ifdef SUPPORT_UNICODE |
7950 | 4.18k | if (utf && end_subject != true_end_subject && |
7951 | 4.18k | (rc == MATCH_NOMATCH || rc == PCRE2_ERROR_PARTIAL)) |
7952 | 0 | { |
7953 | 0 | for (;;) |
7954 | 0 | { |
7955 | | /* Advance past the first bad code unit, and then skip invalid character |
7956 | | starting code units in 8-bit and 16-bit modes. */ |
7957 | |
|
7958 | 0 | start_match = end_subject + 1; |
7959 | |
|
7960 | 0 | #if PCRE2_CODE_UNIT_WIDTH != 32 |
7961 | 0 | while (start_match < true_end_subject && NOT_FIRSTCU(*start_match)) |
7962 | 0 | start_match++; |
7963 | 0 | #endif |
7964 | | |
7965 | | /* If we have hit the end of the subject, there isn't another non-empty |
7966 | | fragment, so give up. */ |
7967 | |
|
7968 | 0 | if (start_match >= true_end_subject) |
7969 | 0 | { |
7970 | 0 | rc = MATCH_NOMATCH; /* In case it was partial */ |
7971 | 0 | match_partial = NULL; |
7972 | 0 | break; |
7973 | 0 | } |
7974 | | |
7975 | | /* Check the rest of the subject */ |
7976 | | |
7977 | 0 | mb->check_subject = start_match; |
7978 | 0 | rc = PRIV(valid_utf)(start_match, length - (start_match - subject), |
7979 | 0 | &(match_data->startchar)); |
7980 | | |
7981 | | /* The rest of the subject is valid UTF. */ |
7982 | |
|
7983 | 0 | if (rc == 0) |
7984 | 0 | { |
7985 | 0 | mb->end_subject = end_subject = true_end_subject; |
7986 | 0 | fragment_options = PCRE2_NOTBOL; |
7987 | 0 | goto FRAGMENT_RESTART; |
7988 | 0 | } |
7989 | | |
7990 | | /* A subsequent UTF error has been found; if the next fragment is |
7991 | | non-empty, set up to process it. Otherwise, let the loop advance. */ |
7992 | | |
7993 | 0 | else if (rc < 0) |
7994 | 0 | { |
7995 | 0 | mb->end_subject = end_subject = start_match + match_data->startchar; |
7996 | 0 | if (end_subject > start_match) |
7997 | 0 | { |
7998 | 0 | fragment_options = PCRE2_NOTBOL|PCRE2_NOTEOL; |
7999 | 0 | goto FRAGMENT_RESTART; |
8000 | 0 | } |
8001 | 0 | } |
8002 | 0 | } |
8003 | 0 | } |
8004 | 4.18k | #endif /* SUPPORT_UNICODE */ |
8005 | | |
8006 | | /* Fill in fields that are always returned in the match data. */ |
8007 | | |
8008 | 4.18k | match_data->code = re; |
8009 | 4.18k | match_data->mark = mb->mark; |
8010 | 4.18k | match_data->matchedby = PCRE2_MATCHEDBY_INTERPRETER; |
8011 | | |
8012 | | /* Handle a fully successful match. Set the return code to the number of |
8013 | | captured strings, or 0 if there were too many to fit into the ovector, and then |
8014 | | set the remaining returned values before returning. Make a copy of the subject |
8015 | | string if requested. */ |
8016 | | |
8017 | 4.18k | if (rc == MATCH_MATCH) |
8018 | 559 | { |
8019 | 559 | match_data->rc = ((int)mb->end_offset_top >= 2 * match_data->oveccount)? |
8020 | 559 | 0 : (int)mb->end_offset_top/2 + 1; |
8021 | 559 | match_data->subject_length = length; |
8022 | 559 | match_data->startchar = start_match - subject; |
8023 | 559 | match_data->leftchar = mb->start_used_ptr - subject; |
8024 | 559 | match_data->rightchar = ((mb->last_used_ptr > mb->end_match_ptr)? |
8025 | 390 | mb->last_used_ptr : mb->end_match_ptr) - subject; |
8026 | 559 | if ((options & PCRE2_COPY_MATCHED_SUBJECT) != 0) |
8027 | 0 | { |
8028 | 0 | length = CU2BYTES(length + was_zero_terminated); |
8029 | 0 | match_data->subject = match_data->memctl.malloc(length, |
8030 | 0 | match_data->memctl.memory_data); |
8031 | 0 | if (match_data->subject == NULL) return PCRE2_ERROR_NOMEMORY; |
8032 | 0 | memcpy((void *)match_data->subject, subject, length); |
8033 | 0 | match_data->flags |= PCRE2_MD_COPIED_SUBJECT; |
8034 | 0 | } |
8035 | 559 | else match_data->subject = subject; |
8036 | | |
8037 | 559 | return match_data->rc; |
8038 | 559 | } |
8039 | | |
8040 | | /* Control gets here if there has been a partial match, an error, or if the |
8041 | | overall match attempt has failed at all permitted starting positions. Any mark |
8042 | | data is in the nomatch_mark field. */ |
8043 | | |
8044 | 3.62k | match_data->mark = mb->nomatch_mark; |
8045 | | |
8046 | | /* For anything other than nomatch or partial match, just return the code. */ |
8047 | | |
8048 | 3.62k | if (rc != MATCH_NOMATCH && rc != PCRE2_ERROR_PARTIAL) match_data->rc = rc; |
8049 | | |
8050 | | /* Handle a partial match. If a "soft" partial match was requested, searching |
8051 | | for a complete match will have continued, and the value of rc at this point |
8052 | | will be MATCH_NOMATCH. For a "hard" partial match, it will already be |
8053 | | PCRE2_ERROR_PARTIAL. */ |
8054 | | |
8055 | 3.59k | else if (match_partial != NULL) |
8056 | 0 | { |
8057 | 0 | match_data->subject = subject; |
8058 | 0 | match_data->subject_length = length; |
8059 | 0 | match_data->ovector[0] = match_partial - subject; |
8060 | 0 | match_data->ovector[1] = end_subject - subject; |
8061 | 0 | match_data->startchar = match_partial - subject; |
8062 | 0 | match_data->leftchar = start_partial - subject; |
8063 | 0 | match_data->rightchar = end_subject - subject; |
8064 | 0 | match_data->rc = PCRE2_ERROR_PARTIAL; |
8065 | 0 | } |
8066 | | |
8067 | | /* Else this is the classic nomatch case. */ |
8068 | | |
8069 | 3.59k | else match_data->rc = PCRE2_ERROR_NOMATCH; |
8070 | | |
8071 | 3.62k | return match_data->rc; |
8072 | 4.18k | } |
8073 | | |
8074 | | /* These #undefs are here to enable unity builds with CMake. */ |
8075 | | |
8076 | | #undef NLBLOCK /* Block containing newline information */ |
8077 | | #undef PSSTART /* Field containing processed string start */ |
8078 | | #undef PSEND /* Field containing processed string end */ |
8079 | | |
8080 | | /* End of pcre2_match.c */ |