/src/libgit2/deps/pcre2/pcre2_match.c
Line | Count | Source |
1 | | /************************************************* |
2 | | * Perl-Compatible Regular Expressions * |
3 | | *************************************************/ |
4 | | |
5 | | /* PCRE is a library of functions to support regular expressions whose syntax |
6 | | and semantics are as close as possible to those of the Perl 5 language. |
7 | | |
8 | | Written by Philip Hazel |
9 | | Original API code Copyright (c) 1997-2012 University of Cambridge |
10 | | New API code Copyright (c) 2015-2024 University of Cambridge |
11 | | |
12 | | ----------------------------------------------------------------------------- |
13 | | Redistribution and use in source and binary forms, with or without |
14 | | modification, are permitted provided that the following conditions are met: |
15 | | |
16 | | * Redistributions of source code must retain the above copyright notice, |
17 | | this list of conditions and the following disclaimer. |
18 | | |
19 | | * Redistributions in binary form must reproduce the above copyright |
20 | | notice, this list of conditions and the following disclaimer in the |
21 | | documentation and/or other materials provided with the distribution. |
22 | | |
23 | | * Neither the name of the University of Cambridge nor the names of its |
24 | | contributors may be used to endorse or promote products derived from |
25 | | this software without specific prior written permission. |
26 | | |
27 | | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
28 | | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
29 | | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
30 | | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
31 | | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
32 | | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
33 | | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
34 | | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
35 | | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
36 | | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
37 | | POSSIBILITY OF SUCH DAMAGE. |
38 | | ----------------------------------------------------------------------------- |
39 | | */ |
40 | | |
41 | | |
42 | | #include "pcre2_internal.h" |
43 | | |
44 | | |
45 | | |
46 | | /* These defines enable debugging code */ |
47 | | |
48 | | /* #define DEBUG_FRAMES_DISPLAY */ |
49 | | /* #define DEBUG_SHOW_OPS */ |
50 | | /* #define DEBUG_SHOW_RMATCH */ |
51 | | |
52 | | #ifdef DEBUG_FRAMES_DISPLAY |
53 | | #include <stdarg.h> |
54 | | #endif |
55 | | |
56 | | #ifdef DEBUG_SHOW_OPS |
57 | | static const char *OP_names[] = { OP_NAME_LIST }; |
58 | | #endif |
59 | | |
60 | | /* These defines identify the name of the block containing "static" |
61 | | information, and fields within it. */ |
62 | | |
63 | 0 | #define NLBLOCK mb /* Block containing newline information */ |
64 | 0 | #define PSSTART start_subject /* Field containing processed string start */ |
65 | 0 | #define PSEND end_subject /* Field containing processed string end */ |
66 | | |
67 | 0 | #define RECURSE_UNSET 0xffffffffu /* Bigger than max group number */ |
68 | | |
69 | | /* Masks for identifying the public options that are permitted at match time. */ |
70 | | |
71 | | #define PUBLIC_MATCH_OPTIONS \ |
72 | 0 | (PCRE2_ANCHORED|PCRE2_ENDANCHORED|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY| \ |
73 | 0 | PCRE2_NOTEMPTY_ATSTART|PCRE2_NO_UTF_CHECK|PCRE2_PARTIAL_HARD| \ |
74 | 0 | PCRE2_PARTIAL_SOFT|PCRE2_NO_JIT|PCRE2_COPY_MATCHED_SUBJECT| \ |
75 | 0 | PCRE2_DISABLE_RECURSELOOP_CHECK) |
76 | | |
77 | | #define PUBLIC_JIT_MATCH_OPTIONS \ |
78 | | (PCRE2_NO_UTF_CHECK|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY|\ |
79 | | PCRE2_NOTEMPTY_ATSTART|PCRE2_PARTIAL_SOFT|PCRE2_PARTIAL_HARD|\ |
80 | | PCRE2_COPY_MATCHED_SUBJECT) |
81 | | |
82 | | /* Non-error returns from and within the match() function. Error returns are |
83 | | externally defined PCRE2_ERROR_xxx codes, which are all negative. */ |
84 | | |
85 | 0 | #define MATCH_MATCH 1 |
86 | 0 | #define MATCH_NOMATCH 0 |
87 | | |
88 | | /* Special internal returns used in the match() function. Make them |
89 | | sufficiently negative to avoid the external error codes. */ |
90 | | |
91 | 0 | #define MATCH_ACCEPT (-999) |
92 | 0 | #define MATCH_KETRPOS (-998) |
93 | | /* The next 5 must be kept together and in sequence so that a test that checks |
94 | | for any one of them can use a range. */ |
95 | 0 | #define MATCH_COMMIT (-997) |
96 | 0 | #define MATCH_PRUNE (-996) |
97 | 0 | #define MATCH_SKIP (-995) |
98 | 0 | #define MATCH_SKIP_ARG (-994) |
99 | 0 | #define MATCH_THEN (-993) |
100 | 0 | #define MATCH_BACKTRACK_MAX MATCH_THEN |
101 | 0 | #define MATCH_BACKTRACK_MIN MATCH_COMMIT |
102 | | |
103 | | /* Group frame type values. Zero means the frame is not a group frame. The |
104 | | lower 16 bits are used for data (e.g. the capture number). Group frames are |
105 | | used for most groups so that information about the start is easily available at |
106 | | the end without having to scan back through intermediate frames (backtrack |
107 | | points). */ |
108 | | |
109 | 0 | #define GF_CAPTURE 0x00010000u |
110 | 0 | #define GF_NOCAPTURE 0x00020000u |
111 | 0 | #define GF_CONDASSERT 0x00030000u |
112 | 0 | #define GF_RECURSE 0x00040000u |
113 | | |
114 | | /* Masks for the identity and data parts of the group frame type. */ |
115 | | |
116 | 0 | #define GF_IDMASK(a) ((a) & 0xffff0000u) |
117 | 0 | #define GF_DATAMASK(a) ((a) & 0x0000ffffu) |
118 | | |
119 | | /* Repetition types */ |
120 | | |
121 | | enum { REPTYPE_MIN, REPTYPE_MAX, REPTYPE_POS }; |
122 | | |
123 | | /* Min and max values for the common repeats; a maximum of UINT32_MAX => |
124 | | infinity. */ |
125 | | |
126 | | static const uint32_t rep_min[] = { |
127 | | 0, 0, /* * and *? */ |
128 | | 1, 1, /* + and +? */ |
129 | | 0, 0, /* ? and ?? */ |
130 | | 0, 0, /* dummy placefillers for OP_CR[MIN]RANGE */ |
131 | | 0, 1, 0 }; /* OP_CRPOS{STAR, PLUS, QUERY} */ |
132 | | |
133 | | static const uint32_t rep_max[] = { |
134 | | UINT32_MAX, UINT32_MAX, /* * and *? */ |
135 | | UINT32_MAX, UINT32_MAX, /* + and +? */ |
136 | | 1, 1, /* ? and ?? */ |
137 | | 0, 0, /* dummy placefillers for OP_CR[MIN]RANGE */ |
138 | | UINT32_MAX, UINT32_MAX, 1 }; /* OP_CRPOS{STAR, PLUS, QUERY} */ |
139 | | |
140 | | /* Repetition types - must include OP_CRPOSRANGE (not needed above) */ |
141 | | |
142 | | static const uint32_t rep_typ[] = { |
143 | | REPTYPE_MAX, REPTYPE_MIN, /* * and *? */ |
144 | | REPTYPE_MAX, REPTYPE_MIN, /* + and +? */ |
145 | | REPTYPE_MAX, REPTYPE_MIN, /* ? and ?? */ |
146 | | REPTYPE_MAX, REPTYPE_MIN, /* OP_CRRANGE and OP_CRMINRANGE */ |
147 | | REPTYPE_POS, REPTYPE_POS, /* OP_CRPOSSTAR, OP_CRPOSPLUS */ |
148 | | REPTYPE_POS, REPTYPE_POS }; /* OP_CRPOSQUERY, OP_CRPOSRANGE */ |
149 | | |
150 | | /* Numbers for RMATCH calls at backtracking points. When these lists are |
151 | | changed, the code at RETURN_SWITCH below must be updated in sync. */ |
152 | | |
153 | | enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10, |
154 | | RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20, |
155 | | RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30, |
156 | | RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39 }; |
157 | | |
158 | | #ifdef SUPPORT_WIDE_CHARS |
159 | | enum { RM100=100, RM101, RM102, RM103 }; |
160 | | #endif |
161 | | |
162 | | #ifdef SUPPORT_UNICODE |
163 | | enum { RM200=200, RM201, RM202, RM203, RM204, RM205, RM206, RM207, |
164 | | RM208, RM209, RM210, RM211, RM212, RM213, RM214, RM215, |
165 | | RM216, RM217, RM218, RM219, RM220, RM221, RM222, RM223, |
166 | | RM224 }; |
167 | | #endif |
168 | | |
169 | | /* Define short names for general fields in the current backtrack frame, which |
170 | | is always pointed to by the F variable. Occasional references to fields in |
171 | | other frames are written out explicitly. There are also some fields in the |
172 | | current frame whose names start with "temp" that are used for short-term, |
173 | | localised backtracking memory. These are #defined with Lxxx names at the point |
174 | | of use and undefined afterwards. */ |
175 | | |
176 | 0 | #define Fback_frame F->back_frame |
177 | 0 | #define Fcapture_last F->capture_last |
178 | 0 | #define Fcurrent_recurse F->current_recurse |
179 | 0 | #define Fecode F->ecode |
180 | 0 | #define Feptr F->eptr |
181 | 0 | #define Fgroup_frame_type F->group_frame_type |
182 | 0 | #define Flast_group_offset F->last_group_offset |
183 | 0 | #define Flength F->length |
184 | 0 | #define Fmark F->mark |
185 | 0 | #define Frdepth F->rdepth |
186 | 0 | #define Fstart_match F->start_match |
187 | 0 | #define Foffset_top F->offset_top |
188 | 0 | #define Foccu F->occu |
189 | 0 | #define Fop F->op |
190 | 0 | #define Fovector F->ovector |
191 | 0 | #define Freturn_id F->return_id |
192 | | |
193 | | |
194 | | #ifdef DEBUG_FRAMES_DISPLAY |
195 | | /************************************************* |
196 | | * Display current frames and contents * |
197 | | *************************************************/ |
198 | | |
199 | | /* This debugging function displays the current set of frames and their |
200 | | contents. It is not called automatically from anywhere, the intention being |
201 | | that calls can be inserted where necessary when debugging frame-related |
202 | | problems. Every frame from match_data->heapframes up to and including F is printed, one line per frame. NOTE(review): the formats below pair %d with the uint32_t loop index and %lu with offset values whose types are not visible here; confirm that they match on the target platform. |
203 | |
204 | | Arguments: |
205 | | f the file to write to |
206 | | F the current top frame (the last frame that is displayed) |
207 | | P a previous frame of interest (may be NULL; otherwise its frame index is printed) |
208 | | frame_size the frame size |
209 | | mb points to the match block |
210 | | match_data points to the match data block |
211 | | s identification text (used as a printf-style format; the variadic arguments are passed to vfprintf) |
212 | |
213 | | Returns: nothing |
214 | | */ |
215 | |
216 | | static void |
217 | | display_frames(FILE *f, heapframe *F, heapframe *P, PCRE2_SIZE frame_size, |
218 | | match_block *mb, pcre2_match_data *match_data, const char *s, ...) |
219 | | { |
220 | | uint32_t i; |
221 | | heapframe *Q; |
222 | | va_list ap; |
223 | | va_start(ap, s); |
224 | |
225 | | fprintf(f, "FRAMES "); |
226 | | vfprintf(f, s, ap); |
227 | | va_end(ap); |
228 | |
229 | | if (P != NULL) fprintf(f, " P=%lu", |
230 | | ((char *)P - (char *)(match_data->heapframes))/frame_size); |
231 | | fprintf(f, "\n"); |
232 | |
233 | | for (i = 0, Q = match_data->heapframes; |
234 | | Q <= F; |
235 | | i++, Q = (heapframe *)((char *)Q + frame_size)) |
236 | | { |
237 | | fprintf(f, "Frame %d type=%x subj=%lu code=%d back=%lu id=%d", |
238 | | i, Q->group_frame_type, Q->eptr - mb->start_subject, *(Q->ecode), |
239 | | Q->back_frame, Q->return_id); |
240 | |
241 | | if (Q->last_group_offset == PCRE2_UNSET) |
242 | | fprintf(f, " lgoffset=unset\n"); |
243 | | else |
244 | | fprintf(f, " lgoffset=%lu\n", Q->last_group_offset/frame_size); |
245 | | } |
246 | | } |
247 | | |
248 | | #endif |
249 | | |
250 | | |
251 | | |
252 | | /************************************************* |
253 | | * Process a callout * |
254 | | *************************************************/ |
255 | | |
256 | | /* This function is called for all callouts, whether "standalone" or at the |
257 | | start of a conditional group. Fecode will be pointing to either OP_CALLOUT or |
258 | | OP_CALLOUT_STR. A callout block is allocated in pcre2_match() and initialized |
259 | | with fixed values. After the callout function returns, cb->callout_flags is reset to zero. |
260 | |
261 | | Arguments: |
262 | | F points to the current backtracking frame |
263 | | mb points to the match block |
264 | | lengthptr where to return the length of the callout item (always set, even when no callout function exists) |
265 | |
266 | | Returns: the return from the callout |
267 | | or 0 if no callout function exists |
268 | | */ |
269 | |
270 | | static int |
271 | | do_callout(heapframe *F, match_block *mb, PCRE2_SIZE *lengthptr) |
272 | 0 | { |
273 | 0 | int rc; |
274 | 0 | PCRE2_SIZE save0, save1; |
275 | 0 | PCRE2_SIZE *callout_ovector; |
276 | 0 | pcre2_callout_block *cb; |
277 |

278 | 0 | *lengthptr = (*Fecode == OP_CALLOUT)? |
279 | 0 | PRIV(OP_lengths)[OP_CALLOUT] : GET(Fecode, 1 + 2*LINK_SIZE); |
280 |

281 | 0 | if (mb->callout == NULL) return 0; /* No callout function provided */ |
282 | |
283 | | /* The original matching code (pre 10.30) worked directly with the ovector |
284 | | passed by the user, and this was passed to callouts. Now that the working |
285 | | ovector is in the backtracking frame, it no longer needs to reserve space for |
286 | | the overall match offsets (which would waste space in the frame). For backward |
287 | | compatibility, however, we pass capture_top and offset_vector to the callout as |
288 | | if for the extended ovector, and we ensure that the first two slots are unset |
289 | | by preserving and restoring their current contents. Picky compilers complain if |
290 | | references such as Fovector[-2] are used directly, so we set up a separate |
291 | | pointer. */ |
292 | |
293 | 0 | callout_ovector = (PCRE2_SIZE *)(Fovector) - 2; |
294 | |
295 | | /* The cb->version, cb->subject, cb->subject_length, and cb->start_match fields |
296 | | are set externally. The first 3 never change; the last is updated for each |
297 | | bumpalong. */ |
298 |

299 | 0 | cb = mb->cb; |
300 | 0 | cb->capture_top = (uint32_t)Foffset_top/2 + 1; |
301 | 0 | cb->capture_last = Fcapture_last; |
302 | 0 | cb->offset_vector = callout_ovector; |
303 | 0 | cb->mark = mb->nomatch_mark; |
304 | 0 | cb->current_position = (PCRE2_SIZE)(Feptr - mb->start_subject); |
305 | 0 | cb->pattern_position = GET(Fecode, 1); |
306 | 0 | cb->next_item_length = GET(Fecode, 1 + LINK_SIZE); |
307 |

308 | 0 | if (*Fecode == OP_CALLOUT) /* Numerical callout */ |
309 | 0 | { |
310 | 0 | cb->callout_number = Fecode[1 + 2*LINK_SIZE]; |
311 | 0 | cb->callout_string_offset = 0; |
312 | 0 | cb->callout_string = NULL; |
313 | 0 | cb->callout_string_length = 0; |
314 | 0 | } |
315 | 0 | else /* String callout */ |
316 | 0 | { |
317 | 0 | cb->callout_number = 0; |
318 | 0 | cb->callout_string_offset = GET(Fecode, 1 + 3*LINK_SIZE); |
319 | 0 | cb->callout_string = Fecode + (1 + 4*LINK_SIZE) + 1; |
320 | 0 | cb->callout_string_length = |
321 | 0 | *lengthptr - (1 + 4*LINK_SIZE) - 2; |
322 | 0 | } |
323 |

324 | 0 | save0 = callout_ovector[0]; |
325 | 0 | save1 = callout_ovector[1]; |
326 | 0 | callout_ovector[0] = callout_ovector[1] = PCRE2_UNSET; |
327 | 0 | rc = mb->callout(cb, mb->callout_data); |
328 | 0 | callout_ovector[0] = save0; |
329 | 0 | callout_ovector[1] = save1; |
330 | 0 | cb->callout_flags = 0; |
331 | 0 | return rc; |
332 | 0 | } |
333 | | |
334 | | |
335 | | |
336 | | /************************************************* |
337 | | * Match a back-reference * |
338 | | *************************************************/ |
339 | | |
340 | | /* This function is called only when it is known that the offset lies within |
341 | | the offsets that have so far been used in the match. Note that in caseless |
342 | | UTF-8 mode, the number of subject bytes matched may be different to the number |
343 | | of reference bytes. (In theory this could also happen in UTF-16 mode, but it |
344 | | seems unlikely.) |
345 | |
346 | | Arguments: |
347 | | offset index into the offset vector |
348 | | caseless TRUE if caseless |
349 | | caseopts bitmask of REFI_FLAG_XYZ values |
350 | | F the current backtracking frame pointer |
351 | | mb points to match block |
352 | | lengthptr pointer for returning the length matched (written only when 0 is returned) |
353 | |
354 | | Returns: = 0 successful match; number of code units matched is set |
355 | | < 0 no match |
356 | | > 0 partial match |
357 | | */ |
358 | |
359 | | static int |
360 | | match_ref(PCRE2_SIZE offset, BOOL caseless, int caseopts, heapframe *F, |
361 | | match_block *mb, PCRE2_SIZE *lengthptr) |
362 | 0 | { |
363 | 0 | PCRE2_SPTR p; |
364 | 0 | PCRE2_SIZE length; |
365 | 0 | PCRE2_SPTR eptr; |
366 | 0 | PCRE2_SPTR eptr_start; |
367 |

368 | | #ifndef SUPPORT_UNICODE |
369 | | (void)caseopts; /* Avoid compiler warning. */ |
370 | | #endif |
371 | |
372 | | /* Deal with an unset group. The default is no match, but there is an option to |
373 | | match an empty string. */ |
374 |

375 | 0 | if (offset >= Foffset_top || Fovector[offset] == PCRE2_UNSET) |
376 | 0 | { |
377 | 0 | if ((mb->poptions & PCRE2_MATCH_UNSET_BACKREF) != 0) |
378 | 0 | { |
379 | 0 | *lengthptr = 0; |
380 | 0 | return 0; /* Match */ |
381 | 0 | } |
382 | 0 | else return -1; /* No match */ |
383 | 0 | } |
384 | |
385 | | /* Separate the caseless and UTF cases for speed. */ |
386 | |
387 | 0 | eptr = eptr_start = Feptr; |
388 | 0 | p = mb->start_subject + Fovector[offset]; |
389 | 0 | length = Fovector[offset+1] - Fovector[offset]; |
390 | 0 | PCRE2_ASSERT(eptr <= mb->end_subject); |
391 |

392 | 0 | if (caseless) |
393 | 0 | { |
394 | 0 | #if defined SUPPORT_UNICODE |
395 | 0 | BOOL utf = (mb->poptions & PCRE2_UTF) != 0; |
396 | 0 | BOOL caseless_restrict = (caseopts & REFI_FLAG_CASELESS_RESTRICT) != 0; |
397 | 0 | BOOL turkish_casing = !caseless_restrict && (caseopts & REFI_FLAG_TURKISH_CASING) != 0; |
398 |

399 | 0 | if (utf || (mb->poptions & PCRE2_UCP) != 0) |
400 | 0 | { |
401 | 0 | PCRE2_SPTR endptr = p + length; |
402 | |
403 | | /* Match characters up to the end of the reference. NOTE: the number of |
404 | | code units matched may differ, because in UTF-8 there are some characters |
405 | | whose upper and lower case codes have different numbers of bytes. For |
406 | | example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65 (3 |
407 | | bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a |
408 | | sequence of two of the latter. It is important, therefore, to check the |
409 | | length along the reference, not along the subject (earlier code did this |
410 | | wrong). UCP uses Unicode properties but without UTF encoding. */ |
411 |

412 | 0 | while (p < endptr) |
413 | 0 | { |
414 | 0 | uint32_t c, d; |
415 | 0 | const ucd_record *ur; |
416 | 0 | if (eptr >= mb->end_subject) return 1; /* Partial match */ |
417 | |
418 | 0 | if (utf) |
419 | 0 | { |
420 | 0 | GETCHARINC(c, eptr); |
421 | 0 | GETCHARINC(d, p); |
422 | 0 | } |
423 | 0 | else |
424 | 0 | { |
425 | 0 | c = *eptr++; |
426 | 0 | d = *p++; |
427 | 0 | } |
428 |

429 | 0 | if (turkish_casing && UCD_ANY_I(d)) |
430 | 0 | { |
431 | 0 | c = UCD_FOLD_I_TURKISH(c); |
432 | 0 | d = UCD_FOLD_I_TURKISH(d); |
433 | 0 | if (c != d) return -1; /* No match */ |
434 | 0 | } |
435 | 0 | else if (c != d && c != (uint32_t)((int)d + (ur = GET_UCD(d))->other_case)) |
436 | 0 | { |
437 | 0 | const uint32_t *pp = PRIV(ucd_caseless_sets) + ur->caseset; |
438 | |
439 | | /* When PCRE2_EXTRA_CASELESS_RESTRICT is set, ignore any caseless sets |
440 | | that start with an ASCII character. */ |
441 | 0 | if (caseless_restrict && *pp < 128) return -1; /* No match */ |
442 | |
443 | 0 | for (;;) |
444 | 0 | { |
445 | 0 | if (c < *pp) return -1; /* No match */ |
446 | 0 | if (c == *pp++) break; |
447 | 0 | } |
448 | 0 | } |
449 | 0 | } |
450 | 0 | } |
451 | 0 | else |
452 | 0 | #endif |
453 | |
454 | | /* Not in UTF or UCP mode */ |
455 | 0 | { |
456 | 0 | for (; length > 0; length--) |
457 | 0 | { |
458 | 0 | uint32_t cc, cp; |
459 | 0 | if (eptr >= mb->end_subject) return 1; /* Partial match */ |
460 | 0 | cc = UCHAR21TEST(eptr); |
461 | 0 | cp = UCHAR21TEST(p); |
462 | 0 | if (TABLE_GET(cp, mb->lcc, cp) != TABLE_GET(cc, mb->lcc, cc)) |
463 | 0 | return -1; /* No match */ |
464 | 0 | p++; |
465 | 0 | eptr++; |
466 | 0 | } |
467 | 0 | } |
468 | 0 | } |
469 | |
470 | | /* In the caseful case, we can just compare the code units, whether or not we |
471 | | are in UTF and/or UCP mode. When partial matching, we have to do this unit by |
472 | | unit. */ |
473 | |
474 | 0 | else |
475 | 0 | { |
476 | 0 | if (mb->partial != 0) |
477 | 0 | { |
478 | 0 | for (; length > 0; length--) |
479 | 0 | { |
480 | 0 | if (eptr >= mb->end_subject) return 1; /* Partial match */ |
481 | 0 | if (UCHAR21INCTEST(p) != UCHAR21INCTEST(eptr)) return -1; /* No match */ |
482 | 0 | } |
483 | 0 | } |
484 | |
485 | | /* Not partial matching */ |
486 | |
487 | 0 | else |
488 | 0 | { |
489 | 0 | if ((PCRE2_SIZE)(mb->end_subject - eptr) < length || |
490 | 0 | memcmp(p, eptr, CU2BYTES(length)) != 0) return -1; /* No match */ |
491 | 0 | eptr += length; |
492 | 0 | } |
493 | 0 | } |
494 | |
495 | 0 | *lengthptr = eptr - eptr_start; |
496 | 0 | return 0; /* Match */ |
497 | 0 | } |
498 | | |
499 | | |
500 | | |
501 | | /************************************************* |
502 | | * Restore offsets after a recurse * |
503 | | *************************************************/ |
504 | | |
505 | | /* This function restores the ovector values when |
506 | | a recursive block reaches its end, and the triggering |
507 | | recurse has an argument list. Values are copied from P's ovector into F's ovector; the advanced code pointer is then stored back into Fecode and Foffset_top is recomputed. |
508 | |
509 | | Arguments: |
510 | | F the current backtracking frame pointer |
511 | | P the previous backtracking frame pointer |
512 | | */ |
513 | |
514 | | static void |
515 | | recurse_update_offsets(heapframe *F, heapframe *P) |
516 | 0 | { |
517 | 0 | PCRE2_SIZE *dst = F->ovector; |
518 | 0 | PCRE2_SIZE *src = P->ovector; |
519 | | /* The first bracket has offset 2, because |
520 | | offset 0 is reserved for the full match. */ |
521 | 0 | PCRE2_SIZE offset = 2; |
522 | 0 | PCRE2_SIZE offset_top = Foffset_top + 2; |
523 | 0 | PCRE2_SIZE diff; |
524 | 0 | PCRE2_SPTR ecode = Fecode; |
525 |

526 | 0 | do |
527 | 0 | { |
528 | 0 | diff = (GET2(ecode, 1) << 1) - offset; |
529 | 0 | ecode += 1 + IMM2_SIZE; |
530 |

531 | 0 | if (offset + diff >= offset_top) |
532 | 0 | { |
533 | | /* Some OP_CREF opcodes are not |
534 | | processed, they must be skipped. */ |
535 | 0 | while (*ecode == OP_CREF) ecode += 1 + IMM2_SIZE; |
536 | 0 | break; |
537 | 0 | } |
538 | |
539 | 0 | if (diff == 2) |
540 | 0 | { |
541 | 0 | dst[0] = src[0]; |
542 | 0 | dst[1] = src[1]; |
543 | 0 | } |
544 | 0 | else if (diff >= 4) |
545 | 0 | memcpy(dst, src, diff * sizeof(PCRE2_SIZE)); |
546 | |
547 | | /* Skip the unmodified entry. */ |
548 | 0 | diff += 2; |
549 | 0 | offset += diff; |
550 | 0 | dst += diff; |
551 | 0 | src += diff; |
552 | 0 | } |
553 | 0 | while (*ecode == OP_CREF); |
554 |

555 | 0 | diff = offset_top - offset; |
556 | 0 | if (diff == 2) |
557 | 0 | { |
558 | 0 | dst[0] = src[0]; |
559 | 0 | dst[1] = src[1]; |
560 | 0 | } |
561 | 0 | else if (diff >= 4) |
562 | 0 | memcpy(dst, src, diff * sizeof(PCRE2_SIZE)); |
563 |

564 | 0 | Fecode = ecode; |
565 | 0 | Foffset_top = (offset <= P->offset_top) ? P->offset_top : (offset - 2); |
566 | 0 | } |
567 | | |
568 | | |
569 | | |
570 | | /****************************************************************************** |
571 | | ******************************************************************************* |
572 | | "Recursion" in the match() function |
573 | | |
574 | | The original match() function was highly recursive, but this proved to be the |
575 | | source of a number of problems over the years, mostly because of the relatively |
576 | | small system stacks that are commonly found. As new features were added to |
577 | | patterns, various kludges were invented to reduce the amount of stack used, |
578 | | making the code hard to understand in places. |
579 | | |
580 | | A version did exist that used individual frames on the heap instead of calling |
581 | | match() recursively, but this ran substantially slower. The current version is |
582 | | a refactoring that uses a vector of frames to remember backtracking points. |
583 | | This runs no slower, and possibly even a bit faster than the original recursive |
584 | | implementation. |
585 | | |
586 | | At first, an initial vector of size START_FRAMES_SIZE (enough for maybe 50 |
587 | | frames) was allocated on the system stack. If this was not big enough, the heap |
588 | | was used for a larger vector. However, it turns out that there are environments |
589 | | where taking as little as 20KiB from the system stack is an embarrassment. |
590 | | After another refactoring, the heap is used exclusively, but a pointer to the |
591 | | frames vector and its size are cached in the match_data block, so that there is |
592 | | no new memory allocation if the same match_data block is used for multiple |
593 | | matches (unless the frames vector has to be extended). |
594 | | ******************************************************************************* |
595 | | ******************************************************************************/ |
596 | | |
597 | | |
598 | | |
599 | | |
600 | | /************************************************* |
601 | | * Macros for the match() function * |
602 | | *************************************************/ |
603 | | |
604 | | /* These macros pack up tests that are used for partial matching several times |
605 | | in the code. The second one is used when we already know we are past the end of |
606 | | the subject. We set the "hit end" flag if the pointer is at the end of the |
607 | | subject and either (a) the pointer is past the earliest inspected character |
608 | | (i.e. something has been matched, even if not part of the actual matched |
609 | | string), or (b) the pattern contains a lookbehind. These are the conditions for |
610 | | which adding more characters may allow the current match to continue. |
611 | | |
612 | | For hard partial matching, we immediately return a partial match. Otherwise, |
613 | | carrying on means that a complete match on the current subject will be sought. |
614 | | A partial match is returned only if no complete match can be found. */ |
615 | | |
616 | | #define CHECK_PARTIAL() \ |
617 | 0 | do { \ |
618 | 0 | if (Feptr >= mb->end_subject) \ |
619 | 0 | { \ |
620 | 0 | SCHECK_PARTIAL(); \ |
621 | 0 | } \ |
622 | 0 | } \ |
623 | 0 | while (0) |
624 | | |
625 | | #define SCHECK_PARTIAL() \ |
626 | 0 | do { \ |
627 | 0 | if (mb->partial != 0 && \ |
628 | 0 | (Feptr > mb->start_used_ptr || mb->allowemptypartial)) \ |
629 | 0 | { \ |
630 | 0 | mb->hitend = TRUE; \ |
631 | 0 | if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; \ |
632 | 0 | } \ |
633 | 0 | } \ |
634 | 0 | while (0) |
635 | | |
636 | | |
637 | | /* These macros are used to implement backtracking. They simulate a recursive |
638 | | call to the match() function by means of a local vector of frames which |
639 | | remember the backtracking points. */ |
640 | | |
641 | | #define RMATCH(ra,rb) \ |
642 | 0 | do { \ |
643 | 0 | start_ecode = ra; \ |
644 | 0 | Freturn_id = rb; \ |
645 | 0 | goto MATCH_RECURSE; \ |
646 | 0 | L_##rb:; \ |
647 | 0 | } \ |
648 | 0 | while (0) |
649 | | |
650 | | #define RRETURN(ra) \ |
651 | 0 | do { \ |
652 | 0 | rrc = ra; \ |
653 | 0 | goto RETURN_SWITCH; \ |
654 | 0 | } \ |
655 | 0 | while (0) |
656 | | |
657 | | |
658 | | |
659 | | /************************************************* |
660 | | * Match from current position * |
661 | | *************************************************/ |
662 | | |
663 | | /* This function is called to run one match attempt at a single starting point |
664 | | in the subject. |
665 | | |
666 | | Performance note: It might be tempting to extract commonly used fields from the |
667 | | mb structure (e.g. end_subject) into individual variables to improve |
668 | | performance. Tests using gcc on a SPARC disproved this; in the first case, it |
669 | | made performance worse. |
670 | | |
671 | | Arguments: |
672 | | start_eptr starting character in subject |
673 | | start_ecode starting position in compiled code |
674 | | top_bracket number of capturing parentheses in the pattern |
675 | | frame_size size of each backtracking frame |
676 | | match_data pointer to the match_data block |
677 | | mb pointer to "static" variables block |
678 | | |
679 | | Returns: MATCH_MATCH if matched ) these values are >= 0 |
680 | | MATCH_NOMATCH if failed to match ) |
681 | | negative MATCH_xxx value for PRUNE, SKIP, etc |
682 | | negative PCRE2_ERROR_xxx value if aborted by an error condition |
683 | | (e.g. stopped by repeated call or depth limit) |
684 | | */ |
685 | | |
686 | | static int |
687 | | match(PCRE2_SPTR start_eptr, PCRE2_SPTR start_ecode, uint16_t top_bracket, |
688 | | PCRE2_SIZE frame_size, pcre2_match_data *match_data, match_block *mb) |
689 | 0 | { |
690 | | /* Frame-handling variables */ |
691 | |
|
692 | 0 | heapframe *F; /* Current frame pointer */ |
693 | 0 | heapframe *N = NULL; /* Temporary frame pointers */ |
694 | 0 | heapframe *P = NULL; |
695 | |
|
696 | 0 | heapframe *frames_top; /* End of frames vector */ |
697 | 0 | heapframe *assert_accept_frame = NULL; /* For passing back a frame with captures */ |
698 | 0 | PCRE2_SIZE frame_copy_size; /* Amount to copy when creating a new frame */ |
699 | | |
700 | | /* Local variables that do not need to be preserved over calls to RMATCH(). */ |
701 | |
|
702 | 0 | PCRE2_SPTR branch_end = NULL; |
703 | 0 | PCRE2_SPTR branch_start; |
704 | 0 | PCRE2_SPTR bracode; /* Temp pointer to start of group */ |
705 | 0 | PCRE2_SIZE offset; /* Used for group offsets */ |
706 | 0 | PCRE2_SIZE length; /* Used for various length calculations */ |
707 | |
|
708 | 0 | int rrc; /* Return from functions & backtracking "recursions" */ |
709 | 0 | #ifdef SUPPORT_UNICODE |
710 | 0 | int proptype; /* Type of character property */ |
711 | 0 | #endif |
712 | |
|
713 | 0 | uint32_t i; /* Used for local loops */ |
714 | 0 | uint32_t fc; /* Character values */ |
715 | 0 | uint32_t number; /* Used for group and other numbers */ |
716 | 0 | uint32_t reptype = 0; /* Type of repetition (0 to avoid compiler warning) */ |
717 | 0 | uint32_t group_frame_type; /* Specifies type for new group frames */ |
718 | |
|
719 | 0 | BOOL condition; /* Used in conditional groups */ |
720 | 0 | BOOL cur_is_word; /* Used in "word" tests */ |
721 | 0 | BOOL prev_is_word; /* Used in "word" tests */ |
722 | | |
723 | | /* UTF and UCP flags */ |
724 | |
|
725 | 0 | #ifdef SUPPORT_UNICODE |
726 | 0 | BOOL utf = (mb->poptions & PCRE2_UTF) != 0; |
727 | 0 | BOOL ucp = (mb->poptions & PCRE2_UCP) != 0; |
728 | | #else |
729 | | BOOL utf = FALSE; /* Required for convenience even when no Unicode support */ |
730 | | #endif |
731 | | |
732 | | /* This is the length of the last part of a backtracking frame that must be |
733 | | copied when a new frame is created. */ |
734 | |
|
735 | 0 | frame_copy_size = frame_size - offsetof(heapframe, eptr); |
736 | | |
737 | | /* Set up the first frame and the end of the frames vector. */ |
738 | |
|
739 | 0 | F = match_data->heapframes; |
740 | 0 | frames_top = (heapframe *)((char *)F + match_data->heapframes_size); |
741 | |
|
742 | 0 | Frdepth = 0; /* "Recursion" depth */ |
743 | 0 | Fcapture_last = 0; /* Number of most recent capture */ |
744 | 0 | Fcurrent_recurse = RECURSE_UNSET; /* Not pattern recursing. */ |
745 | 0 | Fstart_match = Feptr = start_eptr; /* Current data pointer and start match */ |
746 | 0 | Fmark = NULL; /* Most recent mark */ |
747 | 0 | Foffset_top = 0; /* End of captures within the frame */ |
748 | 0 | Flast_group_offset = PCRE2_UNSET; /* Saved frame of most recent group */ |
749 | 0 | group_frame_type = 0; /* Not a start of group frame */ |
750 | 0 | goto NEW_FRAME; /* Start processing with this frame */ |
751 | | |
752 | | /* Come back here when we want to create a new frame for remembering a |
753 | | backtracking point. */ |
754 | | |
755 | 0 | MATCH_RECURSE: |
756 | | |
757 | | /* Set up a new backtracking frame. If the vector is full, get a new one, |
758 | | doubling the size, but constrained by the heap limit (which is in KiB). */ |
759 | |
|
760 | 0 | N = (heapframe *)((char *)F + frame_size); |
761 | 0 | if ((heapframe *)((char *)N + frame_size) >= frames_top) |
762 | 0 | { |
763 | 0 | heapframe *new; |
764 | 0 | PCRE2_SIZE newsize; |
765 | 0 | PCRE2_SIZE usedsize = (char *)N - (char *)(match_data->heapframes); |
766 | |
|
767 | 0 | if (match_data->heapframes_size >= PCRE2_SIZE_MAX / 2) |
768 | 0 | { |
769 | 0 | if (match_data->heapframes_size == PCRE2_SIZE_MAX - 1) |
770 | 0 | return PCRE2_ERROR_NOMEMORY; |
771 | 0 | newsize = PCRE2_SIZE_MAX - 1; |
772 | 0 | } |
773 | 0 | else |
774 | 0 | newsize = match_data->heapframes_size * 2; |
775 | | |
776 | 0 | if (newsize / 1024 >= mb->heap_limit) |
777 | 0 | { |
778 | 0 | PCRE2_SIZE old_size = match_data->heapframes_size / 1024; |
779 | 0 | if (mb->heap_limit <= old_size) |
780 | 0 | return PCRE2_ERROR_HEAPLIMIT; |
781 | 0 | else |
782 | 0 | { |
783 | 0 | PCRE2_SIZE max_delta = 1024 * (mb->heap_limit - old_size); |
784 | 0 | int over_bytes = match_data->heapframes_size % 1024; |
785 | 0 | if (over_bytes) max_delta -= (1024 - over_bytes); |
786 | 0 | newsize = match_data->heapframes_size + max_delta; |
787 | 0 | } |
788 | 0 | } |
789 | | |
790 | | /* With a heap limit set, the permitted additional size may not be enough for |
791 | | another frame, so do a final check. */ |
792 | | |
793 | 0 | if (newsize - usedsize < frame_size) return PCRE2_ERROR_HEAPLIMIT; |
794 | 0 | new = match_data->memctl.malloc(newsize, match_data->memctl.memory_data); |
795 | 0 | if (new == NULL) return PCRE2_ERROR_NOMEMORY; |
796 | 0 | memcpy(new, match_data->heapframes, usedsize); |
797 | |
|
798 | 0 | N = (heapframe *)((char *)new + usedsize); |
799 | 0 | F = (heapframe *)((char *)N - frame_size); |
800 | |
|
801 | 0 | match_data->memctl.free(match_data->heapframes, match_data->memctl.memory_data); |
802 | 0 | match_data->heapframes = new; |
803 | 0 | match_data->heapframes_size = newsize; |
804 | 0 | frames_top = (heapframe *)((char *)new + newsize); |
805 | 0 | } |
806 | | |
807 | | #ifdef DEBUG_SHOW_RMATCH |
808 | | fprintf(stderr, "++ RMATCH %d frame=%d", Freturn_id, Frdepth + 1); |
809 | | if (group_frame_type != 0) |
810 | | { |
811 | | fprintf(stderr, " type=%x ", group_frame_type); |
812 | | switch (GF_IDMASK(group_frame_type)) |
813 | | { |
814 | | case GF_CAPTURE: |
815 | | fprintf(stderr, "capture=%d", GF_DATAMASK(group_frame_type)); |
816 | | break; |
817 | | |
818 | | case GF_NOCAPTURE: |
819 | | fprintf(stderr, "nocapture op=%d", GF_DATAMASK(group_frame_type)); |
820 | | break; |
821 | | |
822 | | case GF_CONDASSERT: |
823 | | fprintf(stderr, "condassert op=%d", GF_DATAMASK(group_frame_type)); |
824 | | break; |
825 | | |
826 | | case GF_RECURSE: |
827 | | fprintf(stderr, "recurse=%d", GF_DATAMASK(group_frame_type)); |
828 | | break; |
829 | | |
830 | | default: |
831 | | fprintf(stderr, "*** unknown ***"); |
832 | | break; |
833 | | } |
834 | | } |
835 | | fprintf(stderr, "\n"); |
836 | | #endif |
837 | | |
838 | | /* Copy those fields that must be copied into the new frame, increase the |
839 | | "recursion" depth (i.e. the new frame's index) and then make the new frame |
840 | | current. */ |
841 | | |
842 | 0 | memcpy((char *)N + offsetof(heapframe, eptr), |
843 | 0 | (char *)F + offsetof(heapframe, eptr), |
844 | 0 | frame_copy_size); |
845 | |
|
846 | 0 | N->rdepth = Frdepth + 1; |
847 | 0 | F = N; |
848 | | |
849 | | /* Carry on processing with a new frame. */ |
850 | |
|
851 | 0 | NEW_FRAME: |
852 | 0 | Fgroup_frame_type = group_frame_type; |
853 | 0 | Fecode = start_ecode; /* Starting code pointer */ |
854 | 0 | Fback_frame = frame_size; /* Default is go back one frame */ |
855 | | |
856 | | /* If this is a special type of group frame, remember its offset for quick |
857 | | access at the end of the group. If this is a recursion, set a new current |
858 | | recursion value. */ |
859 | |
|
860 | 0 | if (group_frame_type != 0) |
861 | 0 | { |
862 | 0 | Flast_group_offset = (char *)F - (char *)match_data->heapframes; |
863 | 0 | if (GF_IDMASK(group_frame_type) == GF_RECURSE) |
864 | 0 | Fcurrent_recurse = GF_DATAMASK(group_frame_type); |
865 | 0 | group_frame_type = 0; |
866 | 0 | } |
867 | | |
868 | | |
869 | | /* ========================================================================= */ |
870 | | /* This is the main processing loop. First check that we haven't recorded too |
871 | | many backtracks (search tree is too large), or that we haven't exceeded the |
872 | | recursive depth limit (used too many backtracking frames). If not, process the |
873 | | opcodes. */ |
874 | |
|
875 | 0 | if (mb->match_call_count++ >= mb->match_limit) return PCRE2_ERROR_MATCHLIMIT; |
876 | 0 | if (Frdepth >= mb->match_limit_depth) return PCRE2_ERROR_DEPTHLIMIT; |
877 | | |
878 | | #ifdef DEBUG_SHOW_OPS |
879 | | fprintf(stderr, "\n++ New frame: type=0x%x subject offset %ld\n", |
880 | | GF_IDMASK(Fgroup_frame_type), Feptr - mb->start_subject); |
881 | | #endif |
882 | | |
883 | 0 | for (;;) |
884 | 0 | { |
885 | | #ifdef DEBUG_SHOW_OPS |
886 | | fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode, |
887 | | OP_names[*Fecode]); |
888 | | #endif |
889 | |
|
890 | 0 | Fop = (uint8_t)(*Fecode); /* Cast needed for 16-bit and 32-bit modes */ |
891 | 0 | switch(Fop) |
892 | 0 | { |
893 | | /* ===================================================================== */ |
894 | | /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes, to close |
895 | | any currently open capturing brackets. Unlike reaching the end of a group, |
896 | | where we know the starting frame is at the top of the chained frames, in |
897 | | this case we have to search back for the relevant frame in case other types |
898 | | of group that use chained frames have intervened. Multiple OP_CLOSEs always |
899 | | come innermost first, which matches the chain order. We can ignore this in |
900 | | a recursion, because captures are not passed out of recursions. */ |
901 | | |
902 | 0 | case OP_CLOSE: |
903 | 0 | if (Fcurrent_recurse == RECURSE_UNSET) |
904 | 0 | { |
905 | 0 | number = GET2(Fecode, 1); |
906 | 0 | offset = Flast_group_offset; |
907 | 0 | for(;;) |
908 | 0 | { |
909 | | /* Corrupted heapframes? Trigger an assert and return an error. */ |
910 | 0 | PCRE2_ASSERT(offset != PCRE2_UNSET); |
911 | 0 | if (offset == PCRE2_UNSET) return PCRE2_ERROR_INTERNAL; |
912 | | |
913 | 0 | N = (heapframe *)((char *)match_data->heapframes + offset); |
914 | 0 | P = (heapframe *)((char *)N - frame_size); |
915 | 0 | if (N->group_frame_type == (GF_CAPTURE | number)) break; |
916 | 0 | offset = P->last_group_offset; |
917 | 0 | } |
918 | 0 | offset = (number << 1) - 2; |
919 | 0 | Fcapture_last = number; |
920 | 0 | Fovector[offset] = P->eptr - mb->start_subject; |
921 | 0 | Fovector[offset+1] = Feptr - mb->start_subject; |
922 | 0 | if (offset >= Foffset_top) Foffset_top = offset + 2; |
923 | 0 | } |
924 | 0 | Fecode += PRIV(OP_lengths)[*Fecode]; |
925 | 0 | break; |
926 | | |
927 | | |
928 | | /* ===================================================================== */ |
929 | | /* Real or forced end of the pattern, assertion, or recursion. In an |
930 | | assertion ACCEPT, update the last used pointer and remember the current |
931 | | frame so that the captures and mark can be fished out of it. */ |
932 | | |
933 | 0 | case OP_ASSERT_ACCEPT: |
934 | 0 | if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr; |
935 | 0 | assert_accept_frame = F; |
936 | 0 | RRETURN(MATCH_ACCEPT); |
937 | | |
938 | | /* For ACCEPT within a recursion, we have to find the most recent |
939 | | recursion. If not in a recursion, fall through to code that is common with |
940 | | OP_END. */ |
941 | | |
942 | 0 | case OP_ACCEPT: |
943 | 0 | if (Fcurrent_recurse != RECURSE_UNSET) |
944 | 0 | { |
945 | | #ifdef DEBUG_SHOW_OPS |
946 | | fprintf(stderr, "++ Accept within recursion\n"); |
947 | | #endif |
948 | 0 | offset = Flast_group_offset; |
949 | 0 | for(;;) |
950 | 0 | { |
951 | | /* Corrupted heapframes? Trigger an assert and return an error. */ |
952 | 0 | PCRE2_ASSERT(offset != PCRE2_UNSET); |
953 | 0 | if (offset == PCRE2_UNSET) return PCRE2_ERROR_INTERNAL; |
954 | | |
955 | 0 | N = (heapframe *)((char *)match_data->heapframes + offset); |
956 | 0 | P = (heapframe *)((char *)N - frame_size); |
957 | 0 | if (GF_IDMASK(N->group_frame_type) == GF_RECURSE) break; |
958 | 0 | offset = P->last_group_offset; |
959 | 0 | } |
960 | | |
961 | | /* N is now the frame of the recursion; the previous frame is at the |
962 | | OP_RECURSE position. Go back there, copying the current subject position |
963 | | and mark, and the start_match position (\K might have changed it), and |
964 | | then move on past the OP_RECURSE. */ |
965 | | |
966 | 0 | P->eptr = Feptr; |
967 | 0 | P->mark = Fmark; |
968 | 0 | P->start_match = Fstart_match; |
969 | 0 | F = P; |
970 | 0 | Fecode += 1 + LINK_SIZE; |
971 | 0 | continue; |
972 | 0 | } |
973 | 0 | PCRE2_FALLTHROUGH /* Fall through */ |
974 | 0 |
|
975 | 0 | /* OP_END itself can never be reached within a recursion because that is |
976 | 0 | picked up when the OP_KET that always precedes OP_END is reached. */ |
977 | 0 |
|
978 | 0 | case OP_END: |
979 | | |
980 | | /* Fail for an empty string match if either PCRE2_NOTEMPTY is set, or if |
981 | | PCRE2_NOTEMPTY_ATSTART is set and we have matched at the start of the |
982 | | subject. In both cases, backtracking will then try other alternatives, if |
983 | | any. */ |
984 | |
|
985 | 0 | if (Feptr == Fstart_match && |
986 | 0 | ((mb->moptions & PCRE2_NOTEMPTY) != 0 || |
987 | 0 | ((mb->moptions & PCRE2_NOTEMPTY_ATSTART) != 0 && |
988 | 0 | Fstart_match == mb->start_subject + mb->start_offset))) |
989 | 0 | { |
990 | | #ifdef DEBUG_SHOW_OPS |
991 | | fprintf(stderr, "++ Backtrack because empty string\n"); |
992 | | #endif |
993 | 0 | RRETURN(MATCH_NOMATCH); |
994 | 0 | } |
995 | | |
996 | | /* Fail if PCRE2_ENDANCHORED is set and the end of the match is not |
997 | | the end of the subject. After (*ACCEPT) we fail the entire match (at this |
998 | | position) but backtrack if we've reached the end of the pattern. This |
999 | | applies whether or not we are in a recursion. */ |
1000 | | |
1001 | 0 | if (Feptr < mb->end_subject && |
1002 | 0 | ((mb->moptions | mb->poptions) & PCRE2_ENDANCHORED) != 0) |
1003 | 0 | { |
1004 | 0 | if (Fop == OP_END) |
1005 | 0 | { |
1006 | | #ifdef DEBUG_SHOW_OPS |
1007 | | fprintf(stderr, "++ Backtrack because not at end (endanchored set)\n"); |
1008 | | #endif |
1009 | 0 | RRETURN(MATCH_NOMATCH); |
1010 | 0 | } |
1011 | | |
1012 | | #ifdef DEBUG_SHOW_OPS |
1013 | | fprintf(stderr, "++ Failed ACCEPT not at end (endanchored set)\n"); |
1014 | | #endif |
1015 | 0 | return MATCH_NOMATCH; /* (*ACCEPT) */ |
1016 | 0 | } |
1017 | | |
1018 | | /* Fail if we detect that the start position was moved to be either after |
1019 | | the end position (\K in lookahead) or before the start offset (\K in |
1020 | | lookbehind). If this occurs, the pattern must have used \K in a somewhat |
1021 | | sneaky way (e.g. by pattern recursion), because if the \K is actually |
1022 | | syntactically inside the lookaround, it's blocked at compile-time. */ |
1023 | | |
1024 | 0 | if (Fstart_match < mb->start_subject + mb->start_offset || |
1025 | 0 | Fstart_match > Feptr) |
1026 | 0 | { |
1027 | | /* The \K expression is fairly rare. We assert it was used so that we |
1028 | | catch any unexpected invalid data in start_match. */ |
1029 | 0 | PCRE2_ASSERT(mb->hasbsk); |
1030 | |
|
1031 | 0 | if (!mb->allowlookaroundbsk) |
1032 | 0 | return PCRE2_ERROR_BAD_BACKSLASH_K; |
1033 | 0 | } |
1034 | | |
1035 | | /* We have a successful match of the whole pattern. Record the result and |
1036 | | then do a direct return from the function. If there is space in the offset |
1037 | | vector, set any pairs that follow the highest-numbered captured string but |
1038 | | are less than the number of capturing groups in the pattern to PCRE2_UNSET. |
1039 | | It is documented that this happens. "Gaps" are set to PCRE2_UNSET |
1040 | | dynamically. It is only those at the end that need setting here. */ |
1041 | | |
1042 | 0 | mb->end_match_ptr = Feptr; /* Record where we ended */ |
1043 | 0 | mb->end_offset_top = Foffset_top; /* and how many extracts were taken */ |
1044 | 0 | mb->mark = Fmark; /* and the last success mark */ |
1045 | 0 | if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr; |
1046 | |
|
1047 | 0 | match_data->ovector[0] = Fstart_match - mb->start_subject; |
1048 | 0 | match_data->ovector[1] = Feptr - mb->start_subject; |
1049 | | |
1050 | | /* Set i to the smaller of the sizes of the external and frame ovectors. */ |
1051 | |
|
1052 | 0 | i = 2 * ((top_bracket + 1 > match_data->oveccount)? |
1053 | 0 | match_data->oveccount : top_bracket + 1); |
1054 | 0 | memcpy(match_data->ovector + 2, Fovector, (i - 2) * sizeof(PCRE2_SIZE)); |
1055 | 0 | while (--i >= Foffset_top + 2) match_data->ovector[i] = PCRE2_UNSET; |
1056 | 0 | return MATCH_MATCH; /* Note: NOT RRETURN */ |
1057 | | |
1058 | | |
1059 | | /*===================================================================== */ |
1060 | | /* Match any single character type except newline; have to take care with |
1061 | | CRLF newlines and partial matching. */ |
1062 | | |
1063 | 0 | case OP_ANY: |
1064 | 0 | if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH); |
1065 | 0 | if (mb->partial != 0 && |
1066 | 0 | Feptr == mb->end_subject - 1 && |
1067 | 0 | NLBLOCK->nltype == NLTYPE_FIXED && |
1068 | 0 | NLBLOCK->nllen == 2 && |
1069 | 0 | UCHAR21TEST(Feptr) == NLBLOCK->nl[0]) |
1070 | 0 | { |
1071 | 0 | mb->hitend = TRUE; |
1072 | 0 | if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; |
1073 | 0 | } |
1074 | 0 | PCRE2_FALLTHROUGH /* Fall through */ |
1075 | 0 |
|
1076 | 0 | /* Match any single character whatsoever. */ |
1077 | 0 |
|
1078 | 0 | case OP_ALLANY: |
1079 | 0 | if (Feptr >= mb->end_subject) /* DO NOT merge the Feptr++ here; it must */ |
1080 | 0 | { /* not be updated before SCHECK_PARTIAL. */ |
1081 | 0 | SCHECK_PARTIAL(); |
1082 | 0 | RRETURN(MATCH_NOMATCH); |
1083 | 0 | } |
1084 | 0 | Feptr++; |
1085 | 0 | #ifdef SUPPORT_UNICODE |
1086 | 0 | if (utf) ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++); |
1087 | 0 | #endif |
1088 | 0 | Fecode++; |
1089 | 0 | break; |
1090 | | |
1091 | | |
1092 | | /* ===================================================================== */ |
1093 | | /* Match a single code unit, even in UTF mode. This opcode really does |
1094 | | match any code unit, even newline. (It really should be called ANYCODEUNIT, |
1095 | | of course - the byte name is from pre-16 bit days.) */ |
1096 | | |
1097 | 0 | case OP_ANYBYTE: |
1098 | 0 | if (Feptr >= mb->end_subject) /* DO NOT merge the Feptr++ here; it must */ |
1099 | 0 | { /* not be updated before SCHECK_PARTIAL. */ |
1100 | 0 | SCHECK_PARTIAL(); |
1101 | 0 | RRETURN(MATCH_NOMATCH); |
1102 | 0 | } |
1103 | 0 | Feptr++; |
1104 | 0 | Fecode++; |
1105 | 0 | break; |
1106 | | |
1107 | | |
1108 | | /* ===================================================================== */ |
1109 | | /* Match a single character, casefully */ |
1110 | | |
1111 | 0 | case OP_CHAR: |
1112 | 0 | #ifdef SUPPORT_UNICODE |
1113 | 0 | if (utf) |
1114 | 0 | { |
1115 | 0 | Flength = 1; |
1116 | 0 | Fecode++; |
1117 | 0 | GETCHARLEN(fc, Fecode, Flength); |
1118 | 0 | if (Flength > (PCRE2_SIZE)(mb->end_subject - Feptr)) |
1119 | 0 | { |
1120 | 0 | CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */ |
1121 | 0 | RRETURN(MATCH_NOMATCH); |
1122 | 0 | } |
1123 | 0 | for (; Flength > 0; Flength--) |
1124 | 0 | { |
1125 | 0 | if (*Fecode++ != UCHAR21INC(Feptr)) RRETURN(MATCH_NOMATCH); |
1126 | 0 | } |
1127 | 0 | } |
1128 | 0 | else |
1129 | 0 | #endif |
1130 | | |
1131 | | /* Not UTF mode */ |
1132 | 0 | { |
1133 | 0 | if (mb->end_subject - Feptr < 1) |
1134 | 0 | { |
1135 | 0 | SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */ |
1136 | 0 | RRETURN(MATCH_NOMATCH); |
1137 | 0 | } |
1138 | 0 | if (Fecode[1] != *Feptr++) RRETURN(MATCH_NOMATCH); |
1139 | 0 | Fecode += 2; |
1140 | 0 | } |
1141 | 0 | break; |
1142 | | |
1143 | | |
1144 | | /* ===================================================================== */ |
1145 | | /* Match a single character, caselessly. If we are at the end of the |
1146 | | subject, give up immediately. We get here only when the pattern character |
1147 | | has at most one other case. Characters with more than two cases are coded |
1148 | | as OP_PROP with the pseudo-property PT_CLIST. */ |
1149 | | |
1150 | 0 | case OP_CHARI: |
1151 | 0 | if (Feptr >= mb->end_subject) |
1152 | 0 | { |
1153 | 0 | SCHECK_PARTIAL(); |
1154 | 0 | RRETURN(MATCH_NOMATCH); |
1155 | 0 | } |
1156 | | |
1157 | 0 | #ifdef SUPPORT_UNICODE |
1158 | 0 | if (utf) |
1159 | 0 | { |
1160 | 0 | Flength = 1; |
1161 | 0 | Fecode++; |
1162 | 0 | GETCHARLEN(fc, Fecode, Flength); |
1163 | | |
1164 | | /* If the pattern character's value is < 128, we know that its other case |
1165 | | (if any) is also < 128 (and therefore only one code unit long in all |
1166 | | code-unit widths), so we can use the fast lookup table. We checked above |
1167 | | that there is at least one character left in the subject. */ |
1168 | |
|
1169 | 0 | if (fc < 128) |
1170 | 0 | { |
1171 | 0 | uint32_t cc = UCHAR21(Feptr); |
1172 | 0 | if (mb->lcc[fc] != TABLE_GET(cc, mb->lcc, cc)) RRETURN(MATCH_NOMATCH); |
1173 | 0 | Fecode++; |
1174 | 0 | Feptr++; |
1175 | 0 | } |
1176 | | |
1177 | | /* Otherwise we must pick up the subject character and use Unicode |
1178 | | property support to test its other case. Note that we cannot use the |
1179 | | value of "Flength" to check for sufficient bytes left, because the other |
1180 | | case of the character may have more or fewer code units. */ |
1181 | | |
1182 | 0 | else |
1183 | 0 | { |
1184 | 0 | uint32_t dc; |
1185 | 0 | GETCHARINC(dc, Feptr); |
1186 | 0 | Fecode += Flength; |
1187 | 0 | if (dc != fc && dc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH); |
1188 | 0 | } |
1189 | 0 | } |
1190 | | |
1191 | | /* If UCP is set without UTF we must do the same as above, but with one |
1192 | | character per code unit. */ |
1193 | | |
1194 | 0 | else if (ucp) |
1195 | 0 | { |
1196 | 0 | uint32_t cc = UCHAR21(Feptr); |
1197 | 0 | fc = Fecode[1]; |
1198 | 0 | if (fc < 128) |
1199 | 0 | { |
1200 | 0 | if (mb->lcc[fc] != TABLE_GET(cc, mb->lcc, cc)) RRETURN(MATCH_NOMATCH); |
1201 | 0 | } |
1202 | 0 | else |
1203 | 0 | { |
1204 | 0 | if (cc != fc && cc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH); |
1205 | 0 | } |
1206 | 0 | Feptr++; |
1207 | 0 | Fecode += 2; |
1208 | 0 | } |
1209 | | |
1210 | 0 | else |
1211 | 0 | #endif /* SUPPORT_UNICODE */ |
1212 | | |
1213 | | /* Not UTF or UCP mode; use the table for characters < 256. */ |
1214 | 0 | { |
1215 | 0 | if (TABLE_GET(Fecode[1], mb->lcc, Fecode[1]) |
1216 | 0 | != TABLE_GET(*Feptr, mb->lcc, *Feptr)) RRETURN(MATCH_NOMATCH); |
1217 | 0 | Feptr++; |
1218 | 0 | Fecode += 2; |
1219 | 0 | } |
1220 | 0 | break; |
1221 | | |
1222 | | |
1223 | | /* ===================================================================== */ |
1224 | | /* Match not a single character. */ |
1225 | | |
1226 | 0 | case OP_NOT: |
1227 | 0 | case OP_NOTI: |
1228 | 0 | if (Feptr >= mb->end_subject) |
1229 | 0 | { |
1230 | 0 | SCHECK_PARTIAL(); |
1231 | 0 | RRETURN(MATCH_NOMATCH); |
1232 | 0 | } |
1233 | | |
1234 | 0 | #ifdef SUPPORT_UNICODE |
1235 | 0 | if (utf) |
1236 | 0 | { |
1237 | 0 | uint32_t ch; |
1238 | 0 | Fecode++; |
1239 | 0 | GETCHARINC(ch, Fecode); |
1240 | 0 | GETCHARINC(fc, Feptr); |
1241 | 0 | if (ch == fc) |
1242 | 0 | { |
1243 | 0 | RRETURN(MATCH_NOMATCH); /* Caseful match */ |
1244 | 0 | } |
1245 | 0 | else if (Fop == OP_NOTI) /* If caseless */ |
1246 | 0 | { |
1247 | 0 | if (ch > 127) |
1248 | 0 | ch = UCD_OTHERCASE(ch); |
1249 | 0 | else |
1250 | 0 | ch = (mb->fcc)[ch]; |
1251 | 0 | if (ch == fc) RRETURN(MATCH_NOMATCH); |
1252 | 0 | } |
1253 | 0 | } |
1254 | | |
1255 | | /* UCP without UTF is as above, but with one character per code unit. */ |
1256 | | |
1257 | 0 | else if (ucp) |
1258 | 0 | { |
1259 | 0 | uint32_t ch; |
1260 | 0 | fc = UCHAR21INC(Feptr); |
1261 | 0 | ch = Fecode[1]; |
1262 | 0 | Fecode += 2; |
1263 | |
|
1264 | 0 | if (ch == fc) |
1265 | 0 | { |
1266 | 0 | RRETURN(MATCH_NOMATCH); /* Caseful match */ |
1267 | 0 | } |
1268 | 0 | else if (Fop == OP_NOTI) /* If caseless */ |
1269 | 0 | { |
1270 | 0 | if (ch > 127) |
1271 | 0 | ch = UCD_OTHERCASE(ch); |
1272 | 0 | else |
1273 | 0 | ch = (mb->fcc)[ch]; |
1274 | 0 | if (ch == fc) RRETURN(MATCH_NOMATCH); |
1275 | 0 | } |
1276 | 0 | } |
1277 | | |
1278 | 0 | else |
1279 | 0 | #endif /* SUPPORT_UNICODE */ |
1280 | | |
1281 | | /* Neither UTF nor UCP is set */ |
1282 | | |
1283 | 0 | { |
1284 | 0 | uint32_t ch = Fecode[1]; |
1285 | 0 | fc = UCHAR21INC(Feptr); |
1286 | 0 | if (ch == fc || (Fop == OP_NOTI && TABLE_GET(ch, mb->fcc, ch) == fc)) |
1287 | 0 | RRETURN(MATCH_NOMATCH); |
1288 | 0 | Fecode += 2; |
1289 | 0 | } |
1290 | 0 | break; |
1291 | | |
1292 | | |
1293 | | /* ===================================================================== */ |
1294 | | /* Match a single character repeatedly. */ |
1295 | | |
1296 | 0 | #define Loclength F->temp_size |
1297 | 0 | #define Lstart_eptr F->temp_sptr[0] |
1298 | 0 | #define Lcharptr F->temp_sptr[1] |
1299 | 0 | #define Lmin F->temp_32[0] |
1300 | 0 | #define Lmax F->temp_32[1] |
1301 | 0 | #define Lc F->temp_32[2] |
1302 | 0 | #define Loc F->temp_32[3] |
1303 | | |
1304 | 0 | case OP_EXACT: |
1305 | 0 | case OP_EXACTI: |
1306 | 0 | Lmin = Lmax = GET2(Fecode, 1); |
1307 | 0 | Fecode += 1 + IMM2_SIZE; |
1308 | 0 | goto REPEATCHAR; |
1309 | | |
1310 | 0 | case OP_POSUPTO: |
1311 | 0 | case OP_POSUPTOI: |
1312 | 0 | reptype = REPTYPE_POS; |
1313 | 0 | Lmin = 0; |
1314 | 0 | Lmax = GET2(Fecode, 1); |
1315 | 0 | Fecode += 1 + IMM2_SIZE; |
1316 | 0 | goto REPEATCHAR; |
1317 | | |
1318 | 0 | case OP_UPTO: |
1319 | 0 | case OP_UPTOI: |
1320 | 0 | reptype = REPTYPE_MAX; |
1321 | 0 | Lmin = 0; |
1322 | 0 | Lmax = GET2(Fecode, 1); |
1323 | 0 | Fecode += 1 + IMM2_SIZE; |
1324 | 0 | goto REPEATCHAR; |
1325 | | |
1326 | 0 | case OP_MINUPTO: |
1327 | 0 | case OP_MINUPTOI: |
1328 | 0 | reptype = REPTYPE_MIN; |
1329 | 0 | Lmin = 0; |
1330 | 0 | Lmax = GET2(Fecode, 1); |
1331 | 0 | Fecode += 1 + IMM2_SIZE; |
1332 | 0 | goto REPEATCHAR; |
1333 | | |
1334 | 0 | case OP_POSSTAR: |
1335 | 0 | case OP_POSSTARI: |
1336 | 0 | reptype = REPTYPE_POS; |
1337 | 0 | Lmin = 0; |
1338 | 0 | Lmax = UINT32_MAX; |
1339 | 0 | Fecode++; |
1340 | 0 | goto REPEATCHAR; |
1341 | | |
1342 | 0 | case OP_POSPLUS: |
1343 | 0 | case OP_POSPLUSI: |
1344 | 0 | reptype = REPTYPE_POS; |
1345 | 0 | Lmin = 1; |
1346 | 0 | Lmax = UINT32_MAX; |
1347 | 0 | Fecode++; |
1348 | 0 | goto REPEATCHAR; |
1349 | | |
1350 | 0 | case OP_POSQUERY: |
1351 | 0 | case OP_POSQUERYI: |
1352 | 0 | reptype = REPTYPE_POS; |
1353 | 0 | Lmin = 0; |
1354 | 0 | Lmax = 1; |
1355 | 0 | Fecode++; |
1356 | 0 | goto REPEATCHAR; |
1357 | | |
1358 | 0 | case OP_STAR: |
1359 | 0 | case OP_STARI: |
1360 | 0 | case OP_MINSTAR: |
1361 | 0 | case OP_MINSTARI: |
1362 | 0 | case OP_PLUS: |
1363 | 0 | case OP_PLUSI: |
1364 | 0 | case OP_MINPLUS: |
1365 | 0 | case OP_MINPLUSI: |
1366 | 0 | case OP_QUERY: |
1367 | 0 | case OP_QUERYI: |
1368 | 0 | case OP_MINQUERY: |
1369 | 0 | case OP_MINQUERYI: |
1370 | 0 | fc = *Fecode++ - ((Fop < OP_STARI)? OP_STAR : OP_STARI); |
1371 | 0 | Lmin = rep_min[fc]; |
1372 | 0 | Lmax = rep_max[fc]; |
1373 | 0 | reptype = rep_typ[fc]; |
1374 | | |
1375 | | /* Common code for all repeated single-character matches. We first check |
1376 | | for the minimum number of characters. If the minimum equals the maximum, we |
1377 | | are done. Otherwise, if minimizing, check the rest of the pattern for a |
1378 | | match; if there isn't one, advance up to the maximum, one character at a |
1379 | | time. |
1380 | | |
1381 | | If maximizing, advance up to the maximum number of matching characters, |
1382 | | until Feptr is past the end of the maximum run. If possessive, we are |
1383 | | then done (no backing up). Otherwise, match at this position; anything |
1384 | | other than no match is immediately returned. For nomatch, back up one |
1385 | | character, unless we are matching \R and the last thing matched was |
1386 | | \r\n, in which case, back up two code units until we reach the first |
1387 | | optional character position. |
1388 | | |
1389 | | The various UTF/non-UTF and caseful/caseless cases are handled separately, |
1390 | | for speed. */ |
1391 | |
|
1392 | 0 | REPEATCHAR: |
1393 | 0 | #ifdef SUPPORT_UNICODE |
1394 | 0 | if (utf) |
1395 | 0 | { |
1396 | 0 | Flength = 1; |
1397 | 0 | Lcharptr = Fecode; |
1398 | 0 | GETCHARLEN(fc, Fecode, Flength); |
1399 | 0 | Fecode += Flength; |
1400 | | |
1401 | | /* Handle multi-code-unit character matching, caseful and caseless. */ |
1402 | |
|
1403 | 0 | if (Flength > 1) |
1404 | 0 | { |
1405 | 0 | uint32_t othercase; |
1406 | |
|
1407 | 0 | if (Fop >= OP_STARI && /* Caseless */ |
1408 | 0 | (othercase = UCD_OTHERCASE(fc)) != fc) |
1409 | 0 | Loclength = PRIV(ord2utf)(othercase, Foccu); |
1410 | 0 | else Loclength = 0; |
1411 | |
|
1412 | 0 | for (i = 1; i <= Lmin; i++) |
1413 | 0 | { |
1414 | 0 | if (Feptr <= mb->end_subject - Flength && |
1415 | 0 | memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0) Feptr += Flength; |
1416 | 0 | else if (Loclength > 0 && |
1417 | 0 | Feptr <= mb->end_subject - Loclength && |
1418 | 0 | memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0) |
1419 | 0 | Feptr += Loclength; |
1420 | 0 | else |
1421 | 0 | { |
1422 | 0 | CHECK_PARTIAL(); |
1423 | 0 | RRETURN(MATCH_NOMATCH); |
1424 | 0 | } |
1425 | 0 | } |
1426 | | |
1427 | 0 | if (Lmin == Lmax) continue; |
1428 | | |
1429 | 0 | if (reptype == REPTYPE_MIN) |
1430 | 0 | { |
1431 | 0 | for (;;) |
1432 | 0 | { |
1433 | 0 | RMATCH(Fecode, RM202); |
1434 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1435 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
1436 | 0 | if (Feptr <= mb->end_subject - Flength && |
1437 | 0 | memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0) Feptr += Flength; |
1438 | 0 | else if (Loclength > 0 && |
1439 | 0 | Feptr <= mb->end_subject - Loclength && |
1440 | 0 | memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0) |
1441 | 0 | Feptr += Loclength; |
1442 | 0 | else |
1443 | 0 | { |
1444 | 0 | CHECK_PARTIAL(); |
1445 | 0 | RRETURN(MATCH_NOMATCH); |
1446 | 0 | } |
1447 | 0 | } |
1448 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
1449 | 0 | } |
1450 | | |
1451 | 0 | else /* Maximize */ |
1452 | 0 | { |
1453 | 0 | Lstart_eptr = Feptr; |
1454 | 0 | for (i = Lmin; i < Lmax; i++) |
1455 | 0 | { |
1456 | 0 | if (Feptr <= mb->end_subject - Flength && |
1457 | 0 | memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0) |
1458 | 0 | Feptr += Flength; |
1459 | 0 | else if (Loclength > 0 && |
1460 | 0 | Feptr <= mb->end_subject - Loclength && |
1461 | 0 | memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0) |
1462 | 0 | Feptr += Loclength; |
1463 | 0 | else |
1464 | 0 | { |
1465 | 0 | CHECK_PARTIAL(); |
1466 | 0 | break; |
1467 | 0 | } |
1468 | 0 | } |
1469 | | |
1470 | | /* After \C in UTF mode, Lstart_eptr might be in the middle of a |
1471 | | Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't |
1472 | | go too far. */ |
1473 | | |
1474 | 0 | if (reptype != REPTYPE_POS) for(;;) |
1475 | 0 | { |
1476 | 0 | if (Feptr <= Lstart_eptr) break; |
1477 | 0 | RMATCH(Fecode, RM203); |
1478 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1479 | 0 | Feptr--; |
1480 | 0 | BACKCHAR(Feptr); |
1481 | 0 | } |
1482 | 0 | } |
1483 | 0 | break; /* End of repeated wide character handling */ |
1484 | 0 | } |
1485 | | |
1486 | | /* Length of UTF character is 1. Put it into the preserved variable and |
1487 | | fall through to the non-UTF code. */ |
1488 | | |
1489 | 0 | Lc = fc; |
1490 | 0 | } |
1491 | 0 | else |
1492 | 0 | #endif /* SUPPORT_UNICODE */ |
1493 | | |
1494 | | /* When not in UTF mode, load a single-code-unit character. Then proceed as |
1495 | | above, using Unicode casing if either UTF or UCP is set. */ |
1496 | | |
1497 | 0 | Lc = *Fecode++; |
1498 | | |
1499 | | /* Caseless comparison */ |
1500 | | |
1501 | 0 | if (Fop >= OP_STARI) |
1502 | 0 | { |
1503 | 0 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
1504 | 0 | #ifdef SUPPORT_UNICODE |
1505 | 0 | if (ucp && !utf && Lc > 127) Loc = UCD_OTHERCASE(Lc); |
1506 | 0 | else |
1507 | 0 | #endif /* SUPPORT_UNICODE */ |
1508 | | /* Lc will be < 128 in UTF-8 mode. */ |
1509 | 0 | Loc = mb->fcc[Lc]; |
1510 | | #else /* 16-bit & 32-bit */ |
1511 | | #ifdef SUPPORT_UNICODE |
1512 | | if ((utf || ucp) && Lc > 127) Loc = UCD_OTHERCASE(Lc); |
1513 | | else |
1514 | | #endif /* SUPPORT_UNICODE */ |
1515 | | Loc = TABLE_GET(Lc, mb->fcc, Lc); |
1516 | | #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */ |
1517 | |
|
1518 | 0 | for (i = 1; i <= Lmin; i++) |
1519 | 0 | { |
1520 | 0 | uint32_t cc; /* Faster than PCRE2_UCHAR */ |
1521 | 0 | if (Feptr >= mb->end_subject) |
1522 | 0 | { |
1523 | 0 | SCHECK_PARTIAL(); |
1524 | 0 | RRETURN(MATCH_NOMATCH); |
1525 | 0 | } |
1526 | 0 | cc = UCHAR21TEST(Feptr); |
1527 | 0 | if (Lc != cc && Loc != cc) RRETURN(MATCH_NOMATCH); |
1528 | 0 | Feptr++; |
1529 | 0 | } |
1530 | 0 | if (Lmin == Lmax) continue; |
1531 | | |
1532 | 0 | if (reptype == REPTYPE_MIN) |
1533 | 0 | { |
1534 | 0 | for (;;) |
1535 | 0 | { |
1536 | 0 | uint32_t cc; /* Faster than PCRE2_UCHAR */ |
1537 | 0 | RMATCH(Fecode, RM25); |
1538 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1539 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
1540 | 0 | if (Feptr >= mb->end_subject) |
1541 | 0 | { |
1542 | 0 | SCHECK_PARTIAL(); |
1543 | 0 | RRETURN(MATCH_NOMATCH); |
1544 | 0 | } |
1545 | 0 | cc = UCHAR21TEST(Feptr); |
1546 | 0 | if (Lc != cc && Loc != cc) RRETURN(MATCH_NOMATCH); |
1547 | 0 | Feptr++; |
1548 | 0 | } |
1549 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
1550 | 0 | } |
1551 | | |
1552 | 0 | else /* Maximize */ |
1553 | 0 | { |
1554 | 0 | Lstart_eptr = Feptr; |
1555 | 0 | for (i = Lmin; i < Lmax; i++) |
1556 | 0 | { |
1557 | 0 | uint32_t cc; /* Faster than PCRE2_UCHAR */ |
1558 | 0 | if (Feptr >= mb->end_subject) |
1559 | 0 | { |
1560 | 0 | SCHECK_PARTIAL(); |
1561 | 0 | break; |
1562 | 0 | } |
1563 | 0 | cc = UCHAR21TEST(Feptr); |
1564 | 0 | if (Lc != cc && Loc != cc) break; |
1565 | 0 | Feptr++; |
1566 | 0 | } |
1567 | 0 | if (reptype != REPTYPE_POS) for (;;) |
1568 | 0 | { |
1569 | 0 | if (Feptr == Lstart_eptr) break; |
1570 | 0 | RMATCH(Fecode, RM26); |
1571 | 0 | Feptr--; |
1572 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1573 | 0 | } |
1574 | 0 | } |
1575 | 0 | } |
1576 | | |
1577 | | /* Caseful comparisons (includes all multi-byte characters) */ |
1578 | | |
1579 | 0 | else |
1580 | 0 | { |
1581 | 0 | for (i = 1; i <= Lmin; i++) |
1582 | 0 | { |
1583 | 0 | if (Feptr >= mb->end_subject) |
1584 | 0 | { |
1585 | 0 | SCHECK_PARTIAL(); |
1586 | 0 | RRETURN(MATCH_NOMATCH); |
1587 | 0 | } |
1588 | 0 | if (Lc != UCHAR21INCTEST(Feptr)) RRETURN(MATCH_NOMATCH); |
1589 | 0 | } |
1590 | | |
1591 | 0 | if (Lmin == Lmax) continue; |
1592 | | |
1593 | 0 | if (reptype == REPTYPE_MIN) |
1594 | 0 | { |
1595 | 0 | for (;;) |
1596 | 0 | { |
1597 | 0 | RMATCH(Fecode, RM27); |
1598 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1599 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
1600 | 0 | if (Feptr >= mb->end_subject) |
1601 | 0 | { |
1602 | 0 | SCHECK_PARTIAL(); |
1603 | 0 | RRETURN(MATCH_NOMATCH); |
1604 | 0 | } |
1605 | 0 | if (Lc != UCHAR21INCTEST(Feptr)) RRETURN(MATCH_NOMATCH); |
1606 | 0 | } |
1607 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
1608 | 0 | } |
1609 | 0 | else /* Maximize */ |
1610 | 0 | { |
1611 | 0 | Lstart_eptr = Feptr; |
1612 | 0 | for (i = Lmin; i < Lmax; i++) |
1613 | 0 | { |
1614 | 0 | if (Feptr >= mb->end_subject) |
1615 | 0 | { |
1616 | 0 | SCHECK_PARTIAL(); |
1617 | 0 | break; |
1618 | 0 | } |
1619 | | |
1620 | 0 | if (Lc != UCHAR21TEST(Feptr)) break; |
1621 | 0 | Feptr++; |
1622 | 0 | } |
1623 | | |
1624 | 0 | if (reptype != REPTYPE_POS) for (;;) |
1625 | 0 | { |
1626 | 0 | if (Feptr <= Lstart_eptr) break; |
1627 | 0 | RMATCH(Fecode, RM28); |
1628 | 0 | Feptr--; |
1629 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1630 | 0 | } |
1631 | 0 | } |
1632 | 0 | } |
1633 | 0 | break; |
1634 | | |
1635 | 0 | #undef Loclength |
1636 | 0 | #undef Lstart_eptr |
1637 | 0 | #undef Lcharptr |
1638 | 0 | #undef Lmin |
1639 | 0 | #undef Lmax |
1640 | 0 | #undef Lc |
1641 | 0 | #undef Loc |
1642 | | |
1643 | | |
1644 | | /* ===================================================================== */ |
1645 | | /* Match a negated single one-byte character repeatedly. This is almost a |
1646 | | repeat of the code for a repeated single character, but I haven't found a |
1647 | | nice way of commoning these up that doesn't require a test of the |
1648 | | positive/negative option for each character match. Maybe that wouldn't add |
1649 | | very much to the time taken, but character matching *is* what this is all |
1650 | | about... */ |
1651 | | |
1652 | 0 | #define Lstart_eptr F->temp_sptr[0] |
1653 | 0 | #define Lmin F->temp_32[0] |
1654 | 0 | #define Lmax F->temp_32[1] |
1655 | 0 | #define Lc F->temp_32[2] |
1656 | 0 | #define Loc F->temp_32[3] |
1657 | | |
1658 | 0 | case OP_NOTEXACT: |
1659 | 0 | case OP_NOTEXACTI: |
1660 | 0 | Lmin = Lmax = GET2(Fecode, 1); |
1661 | 0 | Fecode += 1 + IMM2_SIZE; |
1662 | 0 | goto REPEATNOTCHAR; |
1663 | | |
1664 | 0 | case OP_NOTUPTO: |
1665 | 0 | case OP_NOTUPTOI: |
1666 | 0 | Lmin = 0; |
1667 | 0 | Lmax = GET2(Fecode, 1); |
1668 | 0 | reptype = REPTYPE_MAX; |
1669 | 0 | Fecode += 1 + IMM2_SIZE; |
1670 | 0 | goto REPEATNOTCHAR; |
1671 | | |
1672 | 0 | case OP_NOTMINUPTO: |
1673 | 0 | case OP_NOTMINUPTOI: |
1674 | 0 | Lmin = 0; |
1675 | 0 | Lmax = GET2(Fecode, 1); |
1676 | 0 | reptype = REPTYPE_MIN; |
1677 | 0 | Fecode += 1 + IMM2_SIZE; |
1678 | 0 | goto REPEATNOTCHAR; |
1679 | | |
1680 | 0 | case OP_NOTPOSSTAR: |
1681 | 0 | case OP_NOTPOSSTARI: |
1682 | 0 | reptype = REPTYPE_POS; |
1683 | 0 | Lmin = 0; |
1684 | 0 | Lmax = UINT32_MAX; |
1685 | 0 | Fecode++; |
1686 | 0 | goto REPEATNOTCHAR; |
1687 | | |
1688 | 0 | case OP_NOTPOSPLUS: |
1689 | 0 | case OP_NOTPOSPLUSI: |
1690 | 0 | reptype = REPTYPE_POS; |
1691 | 0 | Lmin = 1; |
1692 | 0 | Lmax = UINT32_MAX; |
1693 | 0 | Fecode++; |
1694 | 0 | goto REPEATNOTCHAR; |
1695 | | |
1696 | 0 | case OP_NOTPOSQUERY: |
1697 | 0 | case OP_NOTPOSQUERYI: |
1698 | 0 | reptype = REPTYPE_POS; |
1699 | 0 | Lmin = 0; |
1700 | 0 | Lmax = 1; |
1701 | 0 | Fecode++; |
1702 | 0 | goto REPEATNOTCHAR; |
1703 | | |
1704 | 0 | case OP_NOTPOSUPTO: |
1705 | 0 | case OP_NOTPOSUPTOI: |
1706 | 0 | reptype = REPTYPE_POS; |
1707 | 0 | Lmin = 0; |
1708 | 0 | Lmax = GET2(Fecode, 1); |
1709 | 0 | Fecode += 1 + IMM2_SIZE; |
1710 | 0 | goto REPEATNOTCHAR; |
1711 | | |
1712 | 0 | case OP_NOTSTAR: |
1713 | 0 | case OP_NOTSTARI: |
1714 | 0 | case OP_NOTMINSTAR: |
1715 | 0 | case OP_NOTMINSTARI: |
1716 | 0 | case OP_NOTPLUS: |
1717 | 0 | case OP_NOTPLUSI: |
1718 | 0 | case OP_NOTMINPLUS: |
1719 | 0 | case OP_NOTMINPLUSI: |
1720 | 0 | case OP_NOTQUERY: |
1721 | 0 | case OP_NOTQUERYI: |
1722 | 0 | case OP_NOTMINQUERY: |
1723 | 0 | case OP_NOTMINQUERYI: |
1724 | 0 | fc = *Fecode++ - ((Fop >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR); |
1725 | 0 | Lmin = rep_min[fc]; |
1726 | 0 | Lmax = rep_max[fc]; |
1727 | 0 | reptype = rep_typ[fc]; |
1728 | | |
1729 | | /* Common code for all repeated single-character non-matches. */ |
1730 | |
|
1731 | 0 | REPEATNOTCHAR: |
1732 | 0 | GETCHARINCTEST(Lc, Fecode); |
1733 | | |
1734 | | /* The code is duplicated for the caseless and caseful cases, for speed, |
1735 | | since matching characters is likely to be quite common. First, ensure the |
1736 | | minimum number of matches are present. If Lmin = Lmax, we are done. |
1737 | | Otherwise, if minimizing, keep trying the rest of the expression and |
1738 | | advancing one matching character if failing, up to the maximum. |
1739 | | Alternatively, if maximizing, find the maximum number of characters and |
1740 | | work backwards. */ |
1741 | |
|
1742 | 0 | if (Fop >= OP_NOTSTARI) /* Caseless */ |
1743 | 0 | { |
1744 | 0 | #ifdef SUPPORT_UNICODE |
1745 | 0 | if ((utf || ucp) && Lc > 127) |
1746 | 0 | Loc = UCD_OTHERCASE(Lc); |
1747 | 0 | else |
1748 | 0 | #endif /* SUPPORT_UNICODE */ |
1749 | | |
1750 | 0 | Loc = TABLE_GET(Lc, mb->fcc, Lc); /* Other case from table */ |
1751 | |
|
1752 | 0 | #ifdef SUPPORT_UNICODE |
1753 | 0 | if (utf) |
1754 | 0 | { |
1755 | 0 | uint32_t d; |
1756 | 0 | for (i = 1; i <= Lmin; i++) |
1757 | 0 | { |
1758 | 0 | if (Feptr >= mb->end_subject) |
1759 | 0 | { |
1760 | 0 | SCHECK_PARTIAL(); |
1761 | 0 | RRETURN(MATCH_NOMATCH); |
1762 | 0 | } |
1763 | 0 | GETCHARINC(d, Feptr); |
1764 | 0 | if (Lc == d || Loc == d) RRETURN(MATCH_NOMATCH); |
1765 | 0 | } |
1766 | 0 | } |
1767 | 0 | else |
1768 | 0 | #endif /* SUPPORT_UNICODE */ |
1769 | | |
1770 | | /* Not UTF mode */ |
1771 | 0 | { |
1772 | 0 | for (i = 1; i <= Lmin; i++) |
1773 | 0 | { |
1774 | 0 | if (Feptr >= mb->end_subject) |
1775 | 0 | { |
1776 | 0 | SCHECK_PARTIAL(); |
1777 | 0 | RRETURN(MATCH_NOMATCH); |
1778 | 0 | } |
1779 | 0 | if (Lc == *Feptr || Loc == *Feptr) RRETURN(MATCH_NOMATCH); |
1780 | 0 | Feptr++; |
1781 | 0 | } |
1782 | 0 | } |
1783 | | |
1784 | 0 | if (Lmin == Lmax) continue; /* Finished for exact count */ |
1785 | | |
1786 | 0 | if (reptype == REPTYPE_MIN) |
1787 | 0 | { |
1788 | 0 | #ifdef SUPPORT_UNICODE |
1789 | 0 | if (utf) |
1790 | 0 | { |
1791 | 0 | uint32_t d; |
1792 | 0 | for (;;) |
1793 | 0 | { |
1794 | 0 | RMATCH(Fecode, RM204); |
1795 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1796 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
1797 | 0 | if (Feptr >= mb->end_subject) |
1798 | 0 | { |
1799 | 0 | SCHECK_PARTIAL(); |
1800 | 0 | RRETURN(MATCH_NOMATCH); |
1801 | 0 | } |
1802 | 0 | GETCHARINC(d, Feptr); |
1803 | 0 | if (Lc == d || Loc == d) RRETURN(MATCH_NOMATCH); |
1804 | 0 | } |
1805 | 0 | } |
1806 | 0 | else |
1807 | 0 | #endif /*SUPPORT_UNICODE */ |
1808 | | |
1809 | | /* Not UTF mode */ |
1810 | 0 | { |
1811 | 0 | for (;;) |
1812 | 0 | { |
1813 | 0 | RMATCH(Fecode, RM29); |
1814 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1815 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
1816 | 0 | if (Feptr >= mb->end_subject) |
1817 | 0 | { |
1818 | 0 | SCHECK_PARTIAL(); |
1819 | 0 | RRETURN(MATCH_NOMATCH); |
1820 | 0 | } |
1821 | 0 | if (Lc == *Feptr || Loc == *Feptr) RRETURN(MATCH_NOMATCH); |
1822 | 0 | Feptr++; |
1823 | 0 | } |
1824 | 0 | } |
1825 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
1826 | 0 | } |
1827 | | |
1828 | | /* Maximize case */ |
1829 | | |
1830 | 0 | else |
1831 | 0 | { |
1832 | 0 | Lstart_eptr = Feptr; |
1833 | |
|
1834 | 0 | #ifdef SUPPORT_UNICODE |
1835 | 0 | if (utf) |
1836 | 0 | { |
1837 | 0 | uint32_t d; |
1838 | 0 | for (i = Lmin; i < Lmax; i++) |
1839 | 0 | { |
1840 | 0 | int len = 1; |
1841 | 0 | if (Feptr >= mb->end_subject) |
1842 | 0 | { |
1843 | 0 | SCHECK_PARTIAL(); |
1844 | 0 | break; |
1845 | 0 | } |
1846 | 0 | GETCHARLEN(d, Feptr, len); |
1847 | 0 | if (Lc == d || Loc == d) break; |
1848 | 0 | Feptr += len; |
1849 | 0 | } |
1850 | | |
1851 | | /* After \C in UTF mode, Lstart_eptr might be in the middle of a |
1852 | | Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't |
1853 | | go too far. */ |
1854 | | |
1855 | 0 | if (reptype != REPTYPE_POS) for(;;) |
1856 | 0 | { |
1857 | 0 | if (Feptr <= Lstart_eptr) break; |
1858 | 0 | RMATCH(Fecode, RM205); |
1859 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1860 | 0 | Feptr--; |
1861 | 0 | BACKCHAR(Feptr); |
1862 | 0 | } |
1863 | 0 | } |
1864 | 0 | else |
1865 | 0 | #endif /* SUPPORT_UNICODE */ |
1866 | | |
1867 | | /* Not UTF mode */ |
1868 | 0 | { |
1869 | 0 | for (i = Lmin; i < Lmax; i++) |
1870 | 0 | { |
1871 | 0 | if (Feptr >= mb->end_subject) |
1872 | 0 | { |
1873 | 0 | SCHECK_PARTIAL(); |
1874 | 0 | break; |
1875 | 0 | } |
1876 | 0 | if (Lc == *Feptr || Loc == *Feptr) break; |
1877 | 0 | Feptr++; |
1878 | 0 | } |
1879 | 0 | if (reptype != REPTYPE_POS) for (;;) |
1880 | 0 | { |
1881 | 0 | if (Feptr == Lstart_eptr) break; |
1882 | 0 | RMATCH(Fecode, RM30); |
1883 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1884 | 0 | Feptr--; |
1885 | 0 | } |
1886 | 0 | } |
1887 | 0 | } |
1888 | 0 | } |
1889 | | |
1890 | | /* Caseful comparisons */ |
1891 | | |
1892 | 0 | else |
1893 | 0 | { |
1894 | 0 | #ifdef SUPPORT_UNICODE |
1895 | 0 | if (utf) |
1896 | 0 | { |
1897 | 0 | uint32_t d; |
1898 | 0 | for (i = 1; i <= Lmin; i++) |
1899 | 0 | { |
1900 | 0 | if (Feptr >= mb->end_subject) |
1901 | 0 | { |
1902 | 0 | SCHECK_PARTIAL(); |
1903 | 0 | RRETURN(MATCH_NOMATCH); |
1904 | 0 | } |
1905 | 0 | GETCHARINC(d, Feptr); |
1906 | 0 | if (Lc == d) RRETURN(MATCH_NOMATCH); |
1907 | 0 | } |
1908 | 0 | } |
1909 | 0 | else |
1910 | 0 | #endif |
1911 | | /* Not UTF mode */ |
1912 | 0 | { |
1913 | 0 | for (i = 1; i <= Lmin; i++) |
1914 | 0 | { |
1915 | 0 | if (Feptr >= mb->end_subject) |
1916 | 0 | { |
1917 | 0 | SCHECK_PARTIAL(); |
1918 | 0 | RRETURN(MATCH_NOMATCH); |
1919 | 0 | } |
1920 | 0 | if (Lc == *Feptr++) RRETURN(MATCH_NOMATCH); |
1921 | 0 | } |
1922 | 0 | } |
1923 | | |
1924 | 0 | if (Lmin == Lmax) continue; |
1925 | | |
1926 | 0 | if (reptype == REPTYPE_MIN) |
1927 | 0 | { |
1928 | 0 | #ifdef SUPPORT_UNICODE |
1929 | 0 | if (utf) |
1930 | 0 | { |
1931 | 0 | uint32_t d; |
1932 | 0 | for (;;) |
1933 | 0 | { |
1934 | 0 | RMATCH(Fecode, RM206); |
1935 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1936 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
1937 | 0 | if (Feptr >= mb->end_subject) |
1938 | 0 | { |
1939 | 0 | SCHECK_PARTIAL(); |
1940 | 0 | RRETURN(MATCH_NOMATCH); |
1941 | 0 | } |
1942 | 0 | GETCHARINC(d, Feptr); |
1943 | 0 | if (Lc == d) RRETURN(MATCH_NOMATCH); |
1944 | 0 | } |
1945 | 0 | } |
1946 | 0 | else |
1947 | 0 | #endif |
1948 | | /* Not UTF mode */ |
1949 | 0 | { |
1950 | 0 | for (;;) |
1951 | 0 | { |
1952 | 0 | RMATCH(Fecode, RM31); |
1953 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1954 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
1955 | 0 | if (Feptr >= mb->end_subject) |
1956 | 0 | { |
1957 | 0 | SCHECK_PARTIAL(); |
1958 | 0 | RRETURN(MATCH_NOMATCH); |
1959 | 0 | } |
1960 | 0 | if (Lc == *Feptr++) RRETURN(MATCH_NOMATCH); |
1961 | 0 | } |
1962 | 0 | } |
1963 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
1964 | 0 | } |
1965 | | |
1966 | | /* Maximize case */ |
1967 | | |
1968 | 0 | else |
1969 | 0 | { |
1970 | 0 | Lstart_eptr = Feptr; |
1971 | |
|
1972 | 0 | #ifdef SUPPORT_UNICODE |
1973 | 0 | if (utf) |
1974 | 0 | { |
1975 | 0 | uint32_t d; |
1976 | 0 | for (i = Lmin; i < Lmax; i++) |
1977 | 0 | { |
1978 | 0 | int len = 1; |
1979 | 0 | if (Feptr >= mb->end_subject) |
1980 | 0 | { |
1981 | 0 | SCHECK_PARTIAL(); |
1982 | 0 | break; |
1983 | 0 | } |
1984 | 0 | GETCHARLEN(d, Feptr, len); |
1985 | 0 | if (Lc == d) break; |
1986 | 0 | Feptr += len; |
1987 | 0 | } |
1988 | | |
1989 | | /* After \C in UTF mode, Lstart_eptr might be in the middle of a |
1990 | | Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't |
1991 | | go too far. */ |
1992 | | |
1993 | 0 | if (reptype != REPTYPE_POS) for(;;) |
1994 | 0 | { |
1995 | 0 | if (Feptr <= Lstart_eptr) break; |
1996 | 0 | RMATCH(Fecode, RM207); |
1997 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1998 | 0 | Feptr--; |
1999 | 0 | BACKCHAR(Feptr); |
2000 | 0 | } |
2001 | 0 | } |
2002 | 0 | else |
2003 | 0 | #endif |
2004 | | /* Not UTF mode */ |
2005 | 0 | { |
2006 | 0 | for (i = Lmin; i < Lmax; i++) |
2007 | 0 | { |
2008 | 0 | if (Feptr >= mb->end_subject) |
2009 | 0 | { |
2010 | 0 | SCHECK_PARTIAL(); |
2011 | 0 | break; |
2012 | 0 | } |
2013 | 0 | if (Lc == *Feptr) break; |
2014 | 0 | Feptr++; |
2015 | 0 | } |
2016 | 0 | if (reptype != REPTYPE_POS) for (;;) |
2017 | 0 | { |
2018 | 0 | if (Feptr == Lstart_eptr) break; |
2019 | 0 | RMATCH(Fecode, RM32); |
2020 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
2021 | 0 | Feptr--; |
2022 | 0 | } |
2023 | 0 | } |
2024 | 0 | } |
2025 | 0 | } |
2026 | 0 | break; |
2027 | | |
2028 | 0 | #undef Lstart_eptr |
2029 | 0 | #undef Lmin |
2030 | 0 | #undef Lmax |
2031 | 0 | #undef Lc |
2032 | 0 | #undef Loc |
2033 | | |
2034 | | |
2035 | | /* ===================================================================== */ |
2036 | | /* Match a bit-mapped character class, possibly repeatedly. These opcodes |
2037 | | are used when all the characters in the class have values in the range |
2038 | | 0-255, and either the matching is caseful, or the characters are in the |
2039 | | range 0-127 when UTF processing is enabled. The only difference between |
2040 | | OP_CLASS and OP_NCLASS occurs when a data character outside the range is |
2041 | | encountered. */ |
2042 | | |
2043 | 0 | #define Lmin F->temp_32[0] |
2044 | 0 | #define Lmax F->temp_32[1] |
2045 | 0 | #define Lstart_eptr F->temp_sptr[0] |
2046 | 0 | #define Lbyte_map_address F->temp_sptr[1] |
2047 | 0 | #define Lbyte_map ((const unsigned char *)Lbyte_map_address) |
2048 | | |
2049 | 0 | case OP_NCLASS: |
2050 | 0 | case OP_CLASS: |
2051 | 0 | { |
2052 | 0 | Lbyte_map_address = Fecode + 1; /* Save for matching */ |
2053 | 0 | Fecode += 1 + (32 / sizeof(PCRE2_UCHAR)); /* Advance past the item */ |
2054 | | |
2055 | | /* Look past the end of the item to see if there is repeat information |
2056 | | following. Then obey similar code to character type repeats. */ |
2057 | |
|
2058 | 0 | switch (*Fecode) |
2059 | 0 | { |
2060 | 0 | case OP_CRSTAR: |
2061 | 0 | case OP_CRMINSTAR: |
2062 | 0 | case OP_CRPLUS: |
2063 | 0 | case OP_CRMINPLUS: |
2064 | 0 | case OP_CRQUERY: |
2065 | 0 | case OP_CRMINQUERY: |
2066 | 0 | case OP_CRPOSSTAR: |
2067 | 0 | case OP_CRPOSPLUS: |
2068 | 0 | case OP_CRPOSQUERY: |
2069 | 0 | fc = *Fecode++ - OP_CRSTAR; |
2070 | 0 | Lmin = rep_min[fc]; |
2071 | 0 | Lmax = rep_max[fc]; |
2072 | 0 | reptype = rep_typ[fc]; |
2073 | 0 | break; |
2074 | | |
2075 | 0 | case OP_CRRANGE: |
2076 | 0 | case OP_CRMINRANGE: |
2077 | 0 | case OP_CRPOSRANGE: |
2078 | 0 | Lmin = GET2(Fecode, 1); |
2079 | 0 | Lmax = GET2(Fecode, 1 + IMM2_SIZE); |
2080 | 0 | if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */ |
2081 | 0 | reptype = rep_typ[*Fecode - OP_CRSTAR]; |
2082 | 0 | Fecode += 1 + 2 * IMM2_SIZE; |
2083 | 0 | break; |
2084 | | |
2085 | 0 | default: /* No repeat follows */ |
2086 | 0 | Lmin = Lmax = 1; |
2087 | 0 | break; |
2088 | 0 | } |
2089 | | |
2090 | | /* First, ensure the minimum number of matches are present. */ |
2091 | | |
2092 | 0 | #ifdef SUPPORT_UNICODE |
2093 | 0 | if (utf) |
2094 | 0 | { |
2095 | 0 | for (i = 1; i <= Lmin; i++) |
2096 | 0 | { |
2097 | 0 | if (Feptr >= mb->end_subject) |
2098 | 0 | { |
2099 | 0 | SCHECK_PARTIAL(); |
2100 | 0 | RRETURN(MATCH_NOMATCH); |
2101 | 0 | } |
2102 | 0 | GETCHARINC(fc, Feptr); |
2103 | 0 | if (fc > 255) |
2104 | 0 | { |
2105 | 0 | if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH); |
2106 | 0 | } |
2107 | 0 | else |
2108 | 0 | if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH); |
2109 | 0 | } |
2110 | 0 | } |
2111 | 0 | else |
2112 | 0 | #endif |
2113 | | /* Not UTF mode */ |
2114 | 0 | { |
2115 | 0 | for (i = 1; i <= Lmin; i++) |
2116 | 0 | { |
2117 | 0 | if (Feptr >= mb->end_subject) |
2118 | 0 | { |
2119 | 0 | SCHECK_PARTIAL(); |
2120 | 0 | RRETURN(MATCH_NOMATCH); |
2121 | 0 | } |
2122 | 0 | fc = *Feptr++; |
2123 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
2124 | | if (fc > 255) |
2125 | | { |
2126 | | if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH); |
2127 | | } |
2128 | | else |
2129 | | #endif |
2130 | 0 | if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH); |
2131 | 0 | } |
2132 | 0 | } |
2133 | | |
2134 | | /* If Lmax == Lmin we are done. Continue with main loop. */ |
2135 | | |
2136 | 0 | if (Lmin == Lmax) continue; |
2137 | | |
2138 | | /* If minimizing, keep testing the rest of the expression and advancing |
2139 | | the pointer while it matches the class. */ |
2140 | | |
2141 | 0 | if (reptype == REPTYPE_MIN) |
2142 | 0 | { |
2143 | 0 | #ifdef SUPPORT_UNICODE |
2144 | 0 | if (utf) |
2145 | 0 | { |
2146 | 0 | for (;;) |
2147 | 0 | { |
2148 | 0 | RMATCH(Fecode, RM200); |
2149 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
2150 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
2151 | 0 | if (Feptr >= mb->end_subject) |
2152 | 0 | { |
2153 | 0 | SCHECK_PARTIAL(); |
2154 | 0 | RRETURN(MATCH_NOMATCH); |
2155 | 0 | } |
2156 | 0 | GETCHARINC(fc, Feptr); |
2157 | 0 | if (fc > 255) |
2158 | 0 | { |
2159 | 0 | if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH); |
2160 | 0 | } |
2161 | 0 | else |
2162 | 0 | if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH); |
2163 | 0 | } |
2164 | 0 | } |
2165 | 0 | else |
2166 | 0 | #endif |
2167 | | /* Not UTF mode */ |
2168 | 0 | { |
2169 | 0 | for (;;) |
2170 | 0 | { |
2171 | 0 | RMATCH(Fecode, RM23); |
2172 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
2173 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
2174 | 0 | if (Feptr >= mb->end_subject) |
2175 | 0 | { |
2176 | 0 | SCHECK_PARTIAL(); |
2177 | 0 | RRETURN(MATCH_NOMATCH); |
2178 | 0 | } |
2179 | 0 | fc = *Feptr++; |
2180 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
2181 | | if (fc > 255) |
2182 | | { |
2183 | | if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH); |
2184 | | } |
2185 | | else |
2186 | | #endif |
2187 | 0 | if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH); |
2188 | 0 | } |
2189 | 0 | } |
2190 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
2191 | 0 | } |
2192 | | |
2193 | | /* If maximizing, find the longest possible run, then work backwards. */ |
2194 | | |
2195 | 0 | else |
2196 | 0 | { |
2197 | 0 | Lstart_eptr = Feptr; |
2198 | |
|
2199 | 0 | #ifdef SUPPORT_UNICODE |
2200 | 0 | if (utf) |
2201 | 0 | { |
2202 | 0 | for (i = Lmin; i < Lmax; i++) |
2203 | 0 | { |
2204 | 0 | int len = 1; |
2205 | 0 | if (Feptr >= mb->end_subject) |
2206 | 0 | { |
2207 | 0 | SCHECK_PARTIAL(); |
2208 | 0 | break; |
2209 | 0 | } |
2210 | 0 | GETCHARLEN(fc, Feptr, len); |
2211 | 0 | if (fc > 255) |
2212 | 0 | { |
2213 | 0 | if (Fop == OP_CLASS) break; |
2214 | 0 | } |
2215 | 0 | else |
2216 | 0 | if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) break; |
2217 | 0 | Feptr += len; |
2218 | 0 | } |
2219 | | |
2220 | 0 | if (reptype == REPTYPE_POS) continue; /* No backtracking */ |
2221 | | |
2222 | | /* After \C in UTF mode, Lstart_eptr might be in the middle of a |
2223 | | Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't |
2224 | | go too far. */ |
2225 | | |
2226 | 0 | for (;;) |
2227 | 0 | { |
2228 | 0 | RMATCH(Fecode, RM201); |
2229 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
2230 | 0 | if (Feptr-- <= Lstart_eptr) break; /* Tried at original position */ |
2231 | 0 | BACKCHAR(Feptr); |
2232 | 0 | } |
2233 | 0 | } |
2234 | 0 | else |
2235 | 0 | #endif |
2236 | | /* Not UTF mode */ |
2237 | 0 | { |
2238 | 0 | for (i = Lmin; i < Lmax; i++) |
2239 | 0 | { |
2240 | 0 | if (Feptr >= mb->end_subject) |
2241 | 0 | { |
2242 | 0 | SCHECK_PARTIAL(); |
2243 | 0 | break; |
2244 | 0 | } |
2245 | 0 | fc = *Feptr; |
2246 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
2247 | | if (fc > 255) |
2248 | | { |
2249 | | if (Fop == OP_CLASS) break; |
2250 | | } |
2251 | | else |
2252 | | #endif |
2253 | 0 | if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) break; |
2254 | 0 | Feptr++; |
2255 | 0 | } |
2256 | | |
2257 | 0 | if (reptype == REPTYPE_POS) continue; /* No backtracking */ |
2258 | | |
2259 | 0 | while (Feptr >= Lstart_eptr) |
2260 | 0 | { |
2261 | 0 | RMATCH(Fecode, RM24); |
2262 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
2263 | 0 | Feptr--; |
2264 | 0 | } |
2265 | 0 | } |
2266 | | |
2267 | 0 | RRETURN(MATCH_NOMATCH); |
2268 | 0 | } |
2269 | 0 | } |
2270 | | |
2271 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
2272 | | |
2273 | 0 | #undef Lbyte_map_address |
2274 | 0 | #undef Lbyte_map |
2275 | 0 | #undef Lstart_eptr |
2276 | 0 | #undef Lmin |
2277 | 0 | #undef Lmax |
2278 | | |
2279 | | |
2280 | | /* ===================================================================== */ |
2281 | | /* Match an extended character class. In the 8-bit library, this opcode is |
2282 | | encountered only when UTF-8 mode is supported. In the 16-bit and |
2283 | | 32-bit libraries, codepoints greater than 255 may be encountered even when |
2284 | | UTF is not supported. */ |
2285 | | |
2286 | 0 | #define Lstart_eptr F->temp_sptr[0] |
2287 | 0 | #define Lxclass_data F->temp_sptr[1] |
2288 | 0 | #define Lmin F->temp_32[0] |
2289 | 0 | #define Lmax F->temp_32[1] |
2290 | | |
2291 | 0 | #ifdef SUPPORT_WIDE_CHARS |
2292 | 0 | case OP_XCLASS: |
2293 | 0 | { |
2294 | 0 | Lxclass_data = Fecode + 1 + LINK_SIZE; /* Save for matching */ |
2295 | 0 | Fecode += GET(Fecode, 1); /* Advance past the item */ |
2296 | |
|
2297 | 0 | switch (*Fecode) |
2298 | 0 | { |
2299 | 0 | case OP_CRSTAR: |
2300 | 0 | case OP_CRMINSTAR: |
2301 | 0 | case OP_CRPLUS: |
2302 | 0 | case OP_CRMINPLUS: |
2303 | 0 | case OP_CRQUERY: |
2304 | 0 | case OP_CRMINQUERY: |
2305 | 0 | case OP_CRPOSSTAR: |
2306 | 0 | case OP_CRPOSPLUS: |
2307 | 0 | case OP_CRPOSQUERY: |
2308 | 0 | fc = *Fecode++ - OP_CRSTAR; |
2309 | 0 | Lmin = rep_min[fc]; |
2310 | 0 | Lmax = rep_max[fc]; |
2311 | 0 | reptype = rep_typ[fc]; |
2312 | 0 | break; |
2313 | | |
2314 | 0 | case OP_CRRANGE: |
2315 | 0 | case OP_CRMINRANGE: |
2316 | 0 | case OP_CRPOSRANGE: |
2317 | 0 | Lmin = GET2(Fecode, 1); |
2318 | 0 | Lmax = GET2(Fecode, 1 + IMM2_SIZE); |
2319 | 0 | if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */ |
2320 | 0 | reptype = rep_typ[*Fecode - OP_CRSTAR]; |
2321 | 0 | Fecode += 1 + 2 * IMM2_SIZE; |
2322 | 0 | break; |
2323 | | |
2324 | 0 | default: /* No repeat follows */ |
2325 | 0 | Lmin = Lmax = 1; |
2326 | 0 | break; |
2327 | 0 | } |
2328 | | |
2329 | | /* First, ensure the minimum number of matches are present. */ |
2330 | | |
2331 | 0 | for (i = 1; i <= Lmin; i++) |
2332 | 0 | { |
2333 | 0 | if (Feptr >= mb->end_subject) |
2334 | 0 | { |
2335 | 0 | SCHECK_PARTIAL(); |
2336 | 0 | RRETURN(MATCH_NOMATCH); |
2337 | 0 | } |
2338 | 0 | GETCHARINCTEST(fc, Feptr); |
2339 | 0 | if (!PRIV(xclass)(fc, Lxclass_data, |
2340 | 0 | (const uint8_t*)mb->start_code, utf)) |
2341 | 0 | RRETURN(MATCH_NOMATCH); |
2342 | 0 | } |
2343 | | |
2344 | | /* If Lmax == Lmin we can just continue with the main loop. */ |
2345 | | |
2346 | 0 | if (Lmin == Lmax) continue; |
2347 | | |
2348 | | /* If minimizing, keep testing the rest of the expression and advancing |
2349 | | the pointer while it matches the class. */ |
2350 | | |
2351 | 0 | if (reptype == REPTYPE_MIN) |
2352 | 0 | { |
2353 | 0 | for (;;) |
2354 | 0 | { |
2355 | 0 | RMATCH(Fecode, RM100); |
2356 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
2357 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
2358 | 0 | if (Feptr >= mb->end_subject) |
2359 | 0 | { |
2360 | 0 | SCHECK_PARTIAL(); |
2361 | 0 | RRETURN(MATCH_NOMATCH); |
2362 | 0 | } |
2363 | 0 | GETCHARINCTEST(fc, Feptr); |
2364 | 0 | if (!PRIV(xclass)(fc, Lxclass_data, |
2365 | 0 | (const uint8_t*)mb->start_code, utf)) |
2366 | 0 | RRETURN(MATCH_NOMATCH); |
2367 | 0 | } |
2368 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
2369 | 0 | } |
2370 | | |
2371 | | /* If maximizing, find the longest possible run, then work backwards. */ |
2372 | | |
2373 | 0 | else |
2374 | 0 | { |
2375 | 0 | Lstart_eptr = Feptr; |
2376 | 0 | for (i = Lmin; i < Lmax; i++) |
2377 | 0 | { |
2378 | 0 | int len = 1; |
2379 | 0 | if (Feptr >= mb->end_subject) |
2380 | 0 | { |
2381 | 0 | SCHECK_PARTIAL(); |
2382 | 0 | break; |
2383 | 0 | } |
2384 | 0 | #ifdef SUPPORT_UNICODE |
2385 | 0 | GETCHARLENTEST(fc, Feptr, len); |
2386 | | #else |
2387 | | fc = *Feptr; |
2388 | | #endif |
2389 | 0 | if (!PRIV(xclass)(fc, Lxclass_data, |
2390 | 0 | (const uint8_t*)mb->start_code, utf)) break; |
2391 | 0 | Feptr += len; |
2392 | 0 | } |
2393 | | |
2394 | 0 | if (reptype == REPTYPE_POS) continue; /* No backtracking */ |
2395 | | |
2396 | | /* After \C in UTF mode, Lstart_eptr might be in the middle of a |
2397 | | Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't |
2398 | | go too far. */ |
2399 | | |
2400 | 0 | for(;;) |
2401 | 0 | { |
2402 | 0 | RMATCH(Fecode, RM101); |
2403 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
2404 | 0 | if (Feptr-- <= Lstart_eptr) break; /* Tried at original position */ |
2405 | 0 | #ifdef SUPPORT_UNICODE |
2406 | 0 | if (utf) BACKCHAR(Feptr); |
2407 | 0 | #endif |
2408 | 0 | } |
2409 | 0 | RRETURN(MATCH_NOMATCH); |
2410 | 0 | } |
2411 | | |
2412 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
2413 | 0 | } |
2414 | 0 | #endif /* SUPPORT_WIDE_CHARS: end of XCLASS */ |
2415 | | |
2416 | 0 | #undef Lstart_eptr |
2417 | 0 | #undef Lxclass_data |
2418 | 0 | #undef Lmin |
2419 | 0 | #undef Lmax |
2420 | | |
2421 | | |
2422 | | /* ===================================================================== */ |
2423 | | /* Match a complex, set-based character class. This opcode is used when |
2424 | | there is complex nesting or logical operations within the character |
2425 | | class. */ |
2426 | | |
2427 | 0 | #define Lstart_eptr F->temp_sptr[0] |
2428 | 0 | #define Leclass_data F->temp_sptr[1] |
2429 | 0 | #define Leclass_len F->temp_size |
2430 | 0 | #define Lmin F->temp_32[0] |
2431 | 0 | #define Lmax F->temp_32[1] |
2432 | | |
2433 | 0 | #ifdef SUPPORT_WIDE_CHARS |
2434 | 0 | case OP_ECLASS: |
2435 | 0 | { |
2436 | 0 | Leclass_data = Fecode + 1 + LINK_SIZE; /* Save for matching */ |
2437 | 0 | Fecode += GET(Fecode, 1); /* Advance past the item */ |
2438 | 0 | Leclass_len = (PCRE2_SIZE)(Fecode - Leclass_data); |
2439 | |
|
2440 | 0 | switch (*Fecode) |
2441 | 0 | { |
2442 | 0 | case OP_CRSTAR: |
2443 | 0 | case OP_CRMINSTAR: |
2444 | 0 | case OP_CRPLUS: |
2445 | 0 | case OP_CRMINPLUS: |
2446 | 0 | case OP_CRQUERY: |
2447 | 0 | case OP_CRMINQUERY: |
2448 | 0 | case OP_CRPOSSTAR: |
2449 | 0 | case OP_CRPOSPLUS: |
2450 | 0 | case OP_CRPOSQUERY: |
2451 | 0 | fc = *Fecode++ - OP_CRSTAR; |
2452 | 0 | Lmin = rep_min[fc]; |
2453 | 0 | Lmax = rep_max[fc]; |
2454 | 0 | reptype = rep_typ[fc]; |
2455 | 0 | break; |
2456 | | |
2457 | 0 | case OP_CRRANGE: |
2458 | 0 | case OP_CRMINRANGE: |
2459 | 0 | case OP_CRPOSRANGE: |
2460 | 0 | Lmin = GET2(Fecode, 1); |
2461 | 0 | Lmax = GET2(Fecode, 1 + IMM2_SIZE); |
2462 | 0 | if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */ |
2463 | 0 | reptype = rep_typ[*Fecode - OP_CRSTAR]; |
2464 | 0 | Fecode += 1 + 2 * IMM2_SIZE; |
2465 | 0 | break; |
2466 | | |
2467 | 0 | default: /* No repeat follows */ |
2468 | 0 | Lmin = Lmax = 1; |
2469 | 0 | break; |
2470 | 0 | } |
2471 | | |
2472 | | /* First, ensure the minimum number of matches are present. */ |
2473 | | |
2474 | 0 | for (i = 1; i <= Lmin; i++) |
2475 | 0 | { |
2476 | 0 | if (Feptr >= mb->end_subject) |
2477 | 0 | { |
2478 | 0 | SCHECK_PARTIAL(); |
2479 | 0 | RRETURN(MATCH_NOMATCH); |
2480 | 0 | } |
2481 | 0 | GETCHARINCTEST(fc, Feptr); |
2482 | 0 | if (!PRIV(eclass)(fc, Leclass_data, Leclass_data + Leclass_len, |
2483 | 0 | (const uint8_t*)mb->start_code, utf)) |
2484 | 0 | RRETURN(MATCH_NOMATCH); |
2485 | 0 | } |
2486 | | |
2487 | | /* If Lmax == Lmin we can just continue with the main loop. */ |
2488 | | |
2489 | 0 | if (Lmin == Lmax) continue; |
2490 | | |
2491 | | /* If minimizing, keep testing the rest of the expression and advancing |
2492 | | the pointer while it matches the class. */ |
2493 | | |
2494 | 0 | if (reptype == REPTYPE_MIN) |
2495 | 0 | { |
2496 | 0 | for (;;) |
2497 | 0 | { |
2498 | 0 | RMATCH(Fecode, RM102); |
2499 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
2500 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
2501 | 0 | if (Feptr >= mb->end_subject) |
2502 | 0 | { |
2503 | 0 | SCHECK_PARTIAL(); |
2504 | 0 | RRETURN(MATCH_NOMATCH); |
2505 | 0 | } |
2506 | 0 | GETCHARINCTEST(fc, Feptr); |
2507 | 0 | if (!PRIV(eclass)(fc, Leclass_data, Leclass_data + Leclass_len, |
2508 | 0 | (const uint8_t*)mb->start_code, utf)) |
2509 | 0 | RRETURN(MATCH_NOMATCH); |
2510 | 0 | } |
2511 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
2512 | 0 | } |
2513 | | |
2514 | | /* If maximizing, find the longest possible run, then work backwards. */ |
2515 | | |
2516 | 0 | else |
2517 | 0 | { |
2518 | 0 | Lstart_eptr = Feptr; |
2519 | 0 | for (i = Lmin; i < Lmax; i++) |
2520 | 0 | { |
2521 | 0 | int len = 1; |
2522 | 0 | if (Feptr >= mb->end_subject) |
2523 | 0 | { |
2524 | 0 | SCHECK_PARTIAL(); |
2525 | 0 | break; |
2526 | 0 | } |
2527 | 0 | #ifdef SUPPORT_UNICODE |
2528 | 0 | GETCHARLENTEST(fc, Feptr, len); |
2529 | | #else |
2530 | | fc = *Feptr; |
2531 | | #endif |
2532 | 0 | if (!PRIV(eclass)(fc, Leclass_data, Leclass_data + Leclass_len, |
2533 | 0 | (const uint8_t*)mb->start_code, utf)) |
2534 | 0 | break; |
2535 | 0 | Feptr += len; |
2536 | 0 | } |
2537 | | |
2538 | 0 | if (reptype == REPTYPE_POS) continue; /* No backtracking */ |
2539 | | |
2540 | | /* After \C in UTF mode, Lstart_eptr might be in the middle of a |
2541 | | Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't |
2542 | | go too far. */ |
2543 | | |
2544 | 0 | for(;;) |
2545 | 0 | { |
2546 | 0 | RMATCH(Fecode, RM103); |
2547 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
2548 | 0 | if (Feptr-- <= Lstart_eptr) break; /* Tried at original position */ |
2549 | 0 | #ifdef SUPPORT_UNICODE |
2550 | 0 | if (utf) BACKCHAR(Feptr); |
2551 | 0 | #endif |
2552 | 0 | } |
2553 | 0 | RRETURN(MATCH_NOMATCH); |
2554 | 0 | } |
2555 | | |
2556 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
2557 | 0 | } |
2558 | 0 | #endif /* SUPPORT_WIDE_CHARS: end of ECLASS */ |
2559 | | |
2560 | 0 | #undef Lstart_eptr |
2561 | 0 | #undef Leclass_data |
2562 | 0 | #undef Leclass_len |
2563 | 0 | #undef Lmin |
2564 | 0 | #undef Lmax |
2565 | | |
2566 | | |
2567 | | /* ===================================================================== */ |
2568 | | /* Match various character types when PCRE2_UCP is not set. These opcodes |
2569 | | are not generated when PCRE2_UCP is set - instead appropriate property |
2570 | | tests are compiled. */ |
2571 | | |
2572 | 0 | case OP_NOT_DIGIT: |
2573 | 0 | if (Feptr >= mb->end_subject) |
2574 | 0 | { |
2575 | 0 | SCHECK_PARTIAL(); |
2576 | 0 | RRETURN(MATCH_NOMATCH); |
2577 | 0 | } |
2578 | 0 | GETCHARINCTEST(fc, Feptr); |
2579 | 0 | if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_digit) != 0) |
2580 | 0 | RRETURN(MATCH_NOMATCH); |
2581 | 0 | Fecode++; |
2582 | 0 | break; |
2583 | | |
2584 | 0 | case OP_DIGIT: |
2585 | 0 | if (Feptr >= mb->end_subject) |
2586 | 0 | { |
2587 | 0 | SCHECK_PARTIAL(); |
2588 | 0 | RRETURN(MATCH_NOMATCH); |
2589 | 0 | } |
2590 | 0 | GETCHARINCTEST(fc, Feptr); |
2591 | 0 | if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_digit) == 0) |
2592 | 0 | RRETURN(MATCH_NOMATCH); |
2593 | 0 | Fecode++; |
2594 | 0 | break; |
2595 | | |
2596 | 0 | case OP_NOT_WHITESPACE: |
2597 | 0 | if (Feptr >= mb->end_subject) |
2598 | 0 | { |
2599 | 0 | SCHECK_PARTIAL(); |
2600 | 0 | RRETURN(MATCH_NOMATCH); |
2601 | 0 | } |
2602 | 0 | GETCHARINCTEST(fc, Feptr); |
2603 | 0 | if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_space) != 0) |
2604 | 0 | RRETURN(MATCH_NOMATCH); |
2605 | 0 | Fecode++; |
2606 | 0 | break; |
2607 | | |
2608 | 0 | case OP_WHITESPACE: |
2609 | 0 | if (Feptr >= mb->end_subject) |
2610 | 0 | { |
2611 | 0 | SCHECK_PARTIAL(); |
2612 | 0 | RRETURN(MATCH_NOMATCH); |
2613 | 0 | } |
2614 | 0 | GETCHARINCTEST(fc, Feptr); |
2615 | 0 | if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_space) == 0) |
2616 | 0 | RRETURN(MATCH_NOMATCH); |
2617 | 0 | Fecode++; |
2618 | 0 | break; |
2619 | | |
2620 | 0 | case OP_NOT_WORDCHAR: |
2621 | 0 | if (Feptr >= mb->end_subject) |
2622 | 0 | { |
2623 | 0 | SCHECK_PARTIAL(); |
2624 | 0 | RRETURN(MATCH_NOMATCH); |
2625 | 0 | } |
2626 | 0 | GETCHARINCTEST(fc, Feptr); |
2627 | 0 | if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0) |
2628 | 0 | RRETURN(MATCH_NOMATCH); |
2629 | 0 | Fecode++; |
2630 | 0 | break; |
2631 | | |
2632 | 0 | case OP_WORDCHAR: |
2633 | 0 | if (Feptr >= mb->end_subject) |
2634 | 0 | { |
2635 | 0 | SCHECK_PARTIAL(); |
2636 | 0 | RRETURN(MATCH_NOMATCH); |
2637 | 0 | } |
2638 | 0 | GETCHARINCTEST(fc, Feptr); |
2639 | 0 | if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_word) == 0) |
2640 | 0 | RRETURN(MATCH_NOMATCH); |
2641 | 0 | Fecode++; |
2642 | 0 | break; |
2643 | | |
2644 | 0 | case OP_ANYNL: |
2645 | 0 | if (Feptr >= mb->end_subject) |
2646 | 0 | { |
2647 | 0 | SCHECK_PARTIAL(); |
2648 | 0 | RRETURN(MATCH_NOMATCH); |
2649 | 0 | } |
2650 | 0 | GETCHARINCTEST(fc, Feptr); |
2651 | 0 | switch(fc) |
2652 | 0 | { |
2653 | 0 | default: RRETURN(MATCH_NOMATCH); |
2654 | | |
2655 | 0 | case CHAR_CR: |
2656 | 0 | if (Feptr >= mb->end_subject) |
2657 | 0 | { |
2658 | 0 | SCHECK_PARTIAL(); |
2659 | 0 | } |
2660 | 0 | else if (UCHAR21TEST(Feptr) == CHAR_LF) Feptr++; |
2661 | 0 | break; |
2662 | | |
2663 | 0 | case CHAR_LF: |
2664 | 0 | break; |
2665 | | |
2666 | 0 | case CHAR_VT: |
2667 | 0 | case CHAR_FF: |
2668 | 0 | case CHAR_NEL: |
2669 | 0 | #ifndef EBCDIC |
2670 | 0 | case 0x2028: |
2671 | 0 | case 0x2029: |
2672 | 0 | #endif /* Not EBCDIC */ |
2673 | 0 | if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH); |
2674 | 0 | break; |
2675 | 0 | } |
2676 | 0 | Fecode++; |
2677 | 0 | break; |
2678 | | |
2679 | 0 | case OP_NOT_HSPACE: |
2680 | 0 | if (Feptr >= mb->end_subject) |
2681 | 0 | { |
2682 | 0 | SCHECK_PARTIAL(); |
2683 | 0 | RRETURN(MATCH_NOMATCH); |
2684 | 0 | } |
2685 | 0 | GETCHARINCTEST(fc, Feptr); |
2686 | 0 | switch(fc) |
2687 | 0 | { |
2688 | 0 | HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */ |
2689 | 0 | default: break; |
2690 | 0 | } |
2691 | 0 | Fecode++; |
2692 | 0 | break; |
2693 | | |
2694 | 0 | case OP_HSPACE: |
2695 | 0 | if (Feptr >= mb->end_subject) |
2696 | 0 | { |
2697 | 0 | SCHECK_PARTIAL(); |
2698 | 0 | RRETURN(MATCH_NOMATCH); |
2699 | 0 | } |
2700 | 0 | GETCHARINCTEST(fc, Feptr); |
2701 | 0 | switch(fc) |
2702 | 0 | { |
2703 | 0 | HSPACE_CASES: break; /* Byte and multibyte cases */ |
2704 | 0 | default: RRETURN(MATCH_NOMATCH); |
2705 | 0 | } |
2706 | 0 | Fecode++; |
2707 | 0 | break; |
2708 | | |
2709 | 0 | case OP_NOT_VSPACE: |
2710 | 0 | if (Feptr >= mb->end_subject) |
2711 | 0 | { |
2712 | 0 | SCHECK_PARTIAL(); |
2713 | 0 | RRETURN(MATCH_NOMATCH); |
2714 | 0 | } |
2715 | 0 | GETCHARINCTEST(fc, Feptr); |
2716 | 0 | switch(fc) |
2717 | 0 | { |
2718 | 0 | VSPACE_CASES: RRETURN(MATCH_NOMATCH); |
2719 | 0 | default: break; |
2720 | 0 | } |
2721 | 0 | Fecode++; |
2722 | 0 | break; |
2723 | | |
2724 | 0 | case OP_VSPACE: |
2725 | 0 | if (Feptr >= mb->end_subject) |
2726 | 0 | { |
2727 | 0 | SCHECK_PARTIAL(); |
2728 | 0 | RRETURN(MATCH_NOMATCH); |
2729 | 0 | } |
2730 | 0 | GETCHARINCTEST(fc, Feptr); |
2731 | 0 | switch(fc) |
2732 | 0 | { |
2733 | 0 | VSPACE_CASES: break; |
2734 | 0 | default: RRETURN(MATCH_NOMATCH); |
2735 | 0 | } |
2736 | 0 | Fecode++; |
2737 | 0 | break; |
2738 | | |
2739 | | |
2740 | 0 | #ifdef SUPPORT_UNICODE |
2741 | | |
2742 | | /* ===================================================================== */ |
2743 | | /* Check the next character by Unicode property. We will get here only |
2744 | | if the support is in the binary; otherwise a compile-time error occurs. */ |
2745 | | |
2746 | 0 | case OP_PROP: |
2747 | 0 | case OP_NOTPROP: |
2748 | 0 | if (Feptr >= mb->end_subject) |
2749 | 0 | { |
2750 | 0 | SCHECK_PARTIAL(); |
2751 | 0 | RRETURN(MATCH_NOMATCH); |
2752 | 0 | } |
2753 | 0 | GETCHARINCTEST(fc, Feptr); |
2754 | 0 | { |
2755 | 0 | const uint32_t *cp; |
2756 | 0 | uint32_t chartype; |
2757 | 0 | const ucd_record *prop = GET_UCD(fc); |
2758 | 0 | BOOL notmatch = Fop == OP_NOTPROP; |
2759 | |
|
2760 | 0 | switch(Fecode[1]) |
2761 | 0 | { |
2762 | 0 | case PT_LAMP: |
2763 | 0 | chartype = prop->chartype; |
2764 | 0 | if ((chartype == ucp_Lu || |
2765 | 0 | chartype == ucp_Ll || |
2766 | 0 | chartype == ucp_Lt) == notmatch) |
2767 | 0 | RRETURN(MATCH_NOMATCH); |
2768 | 0 | break; |
2769 | | |
2770 | 0 | case PT_GC: |
2771 | 0 | if ((Fecode[2] == PRIV(ucp_gentype)[prop->chartype]) == notmatch) |
2772 | 0 | RRETURN(MATCH_NOMATCH); |
2773 | 0 | break; |
2774 | | |
2775 | 0 | case PT_PC: |
2776 | 0 | if ((Fecode[2] == prop->chartype) == notmatch) |
2777 | 0 | RRETURN(MATCH_NOMATCH); |
2778 | 0 | break; |
2779 | | |
2780 | 0 | case PT_SC: |
2781 | 0 | if ((Fecode[2] == prop->script) == notmatch) |
2782 | 0 | RRETURN(MATCH_NOMATCH); |
2783 | 0 | break; |
2784 | | |
2785 | 0 | case PT_SCX: |
2786 | 0 | { |
2787 | 0 | BOOL ok = (Fecode[2] == prop->script || |
2788 | 0 | MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), Fecode[2]) != 0); |
2789 | 0 | if (ok == notmatch) RRETURN(MATCH_NOMATCH); |
2790 | 0 | } |
2791 | 0 | break; |
2792 | | |
2793 | | /* These are specials */ |
2794 | | |
2795 | 0 | case PT_ALNUM: |
2796 | 0 | chartype = prop->chartype; |
2797 | 0 | if ((PRIV(ucp_gentype)[chartype] == ucp_L || |
2798 | 0 | PRIV(ucp_gentype)[chartype] == ucp_N) == notmatch) |
2799 | 0 | RRETURN(MATCH_NOMATCH); |
2800 | 0 | break; |
2801 | | |
2802 | | /* Perl space used to exclude VT, but from Perl 5.18 it is included, |
2803 | | which means that Perl space and POSIX space are now identical. PCRE |
2804 | | was changed at release 8.34. */ |
2805 | | |
2806 | 0 | case PT_SPACE: /* Perl space */ |
2807 | 0 | case PT_PXSPACE: /* POSIX space */ |
2808 | 0 | switch(fc) |
2809 | 0 | { |
2810 | 0 | HSPACE_CASES: |
2811 | 0 | VSPACE_CASES: |
2812 | 0 | if (notmatch) RRETURN(MATCH_NOMATCH); |
2813 | 0 | break; |
2814 | | |
2815 | 0 | default: |
2816 | 0 | if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == notmatch) |
2817 | 0 | RRETURN(MATCH_NOMATCH); |
2818 | 0 | break; |
2819 | 0 | } |
2820 | 0 | break; |
2821 | | |
2822 | 0 | case PT_WORD: |
2823 | 0 | chartype = prop->chartype; |
2824 | 0 | if ((PRIV(ucp_gentype)[chartype] == ucp_L || |
2825 | 0 | PRIV(ucp_gentype)[chartype] == ucp_N || |
2826 | 0 | chartype == ucp_Mn || |
2827 | 0 | chartype == ucp_Pc) == notmatch) |
2828 | 0 | RRETURN(MATCH_NOMATCH); |
2829 | 0 | break; |
2830 | | |
2831 | 0 | case PT_CLIST: |
2832 | | #if PCRE2_CODE_UNIT_WIDTH == 32 |
2833 | | if (fc > MAX_UTF_CODE_POINT) |
2834 | | { |
2835 | | if (notmatch) break;; |
2836 | | RRETURN(MATCH_NOMATCH); |
2837 | | } |
2838 | | #endif |
2839 | 0 | cp = PRIV(ucd_caseless_sets) + Fecode[2]; |
2840 | 0 | for (;;) |
2841 | 0 | { |
2842 | 0 | if (fc < *cp) |
2843 | 0 | { if (notmatch) break; else { RRETURN(MATCH_NOMATCH); } } |
2844 | 0 | if (fc == *cp++) |
2845 | 0 | { if (notmatch) { RRETURN(MATCH_NOMATCH); } else break; } |
2846 | 0 | } |
2847 | 0 | break; |
2848 | | |
2849 | 0 | case PT_UCNC: |
2850 | 0 | if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT || |
2851 | 0 | fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) || |
2852 | 0 | fc >= 0xe000) == notmatch) |
2853 | 0 | RRETURN(MATCH_NOMATCH); |
2854 | 0 | break; |
2855 | | |
2856 | 0 | case PT_BIDICL: |
2857 | 0 | if ((UCD_BIDICLASS_PROP(prop) == Fecode[2]) == notmatch) |
2858 | 0 | RRETURN(MATCH_NOMATCH); |
2859 | 0 | break; |
2860 | | |
2861 | 0 | case PT_BOOL: |
2862 | 0 | { |
2863 | 0 | BOOL ok = MAPBIT(PRIV(ucd_boolprop_sets) + |
2864 | 0 | UCD_BPROPS_PROP(prop), Fecode[2]) != 0; |
2865 | 0 | if (ok == notmatch) RRETURN(MATCH_NOMATCH); |
2866 | 0 | } |
2867 | 0 | break; |
2868 | | |
2869 | | /* This should never occur */ |
2870 | | |
2871 | | /* LCOV_EXCL_START */ |
2872 | 0 | default: |
2873 | 0 | PCRE2_DEBUG_UNREACHABLE(); |
2874 | 0 | return PCRE2_ERROR_INTERNAL; |
2875 | | /* LCOV_EXCL_STOP */ |
2876 | 0 | } |
2877 | | |
2878 | 0 | Fecode += 3; |
2879 | 0 | } |
2880 | 0 | break; |
2881 | | |
2882 | | |
2883 | | /* ===================================================================== */ |
2884 | | /* Match an extended Unicode sequence. We will get here only if the support |
2885 | | is in the binary; otherwise a compile-time error occurs. */ |
2886 | | |
2887 | 0 | case OP_EXTUNI: |
2888 | 0 | if (Feptr >= mb->end_subject) |
2889 | 0 | { |
2890 | 0 | SCHECK_PARTIAL(); |
2891 | 0 | RRETURN(MATCH_NOMATCH); |
2892 | 0 | } |
2893 | 0 | else |
2894 | 0 | { |
2895 | 0 | GETCHARINCTEST(fc, Feptr); |
2896 | 0 | Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject, utf, |
2897 | 0 | NULL); |
2898 | 0 | } |
2899 | 0 | CHECK_PARTIAL(); |
2900 | 0 | Fecode++; |
2901 | 0 | break; |
2902 | | |
2903 | 0 | #endif /* SUPPORT_UNICODE */ |
2904 | | |
2905 | | |
2906 | | /* ===================================================================== */ |
2907 | | /* Match a single character type repeatedly. Note that the property type |
2908 | | does not need to be in a stack frame as it is not used within an RMATCH() |
2909 | | loop. */ |
2910 | | |
2911 | 0 | #define Lstart_eptr F->temp_sptr[0] |
2912 | 0 | #define Lmin F->temp_32[0] |
2913 | 0 | #define Lmax F->temp_32[1] |
2914 | 0 | #define Lctype F->temp_32[2] |
2915 | 0 | #define Lpropvalue F->temp_32[3] |
2916 | | |
2917 | 0 | case OP_TYPEEXACT: |
2918 | 0 | Lmin = Lmax = GET2(Fecode, 1); |
2919 | 0 | Fecode += 1 + IMM2_SIZE; |
2920 | 0 | goto REPEATTYPE; |
2921 | | |
2922 | 0 | case OP_TYPEUPTO: |
2923 | 0 | case OP_TYPEMINUPTO: |
2924 | 0 | Lmin = 0; |
2925 | 0 | Lmax = GET2(Fecode, 1); |
2926 | 0 | reptype = (*Fecode == OP_TYPEMINUPTO)? REPTYPE_MIN : REPTYPE_MAX; |
2927 | 0 | Fecode += 1 + IMM2_SIZE; |
2928 | 0 | goto REPEATTYPE; |
2929 | | |
2930 | 0 | case OP_TYPEPOSSTAR: |
2931 | 0 | reptype = REPTYPE_POS; |
2932 | 0 | Lmin = 0; |
2933 | 0 | Lmax = UINT32_MAX; |
2934 | 0 | Fecode++; |
2935 | 0 | goto REPEATTYPE; |
2936 | | |
2937 | 0 | case OP_TYPEPOSPLUS: |
2938 | 0 | reptype = REPTYPE_POS; |
2939 | 0 | Lmin = 1; |
2940 | 0 | Lmax = UINT32_MAX; |
2941 | 0 | Fecode++; |
2942 | 0 | goto REPEATTYPE; |
2943 | | |
2944 | 0 | case OP_TYPEPOSQUERY: |
2945 | 0 | reptype = REPTYPE_POS; |
2946 | 0 | Lmin = 0; |
2947 | 0 | Lmax = 1; |
2948 | 0 | Fecode++; |
2949 | 0 | goto REPEATTYPE; |
2950 | | |
2951 | 0 | case OP_TYPEPOSUPTO: |
2952 | 0 | reptype = REPTYPE_POS; |
2953 | 0 | Lmin = 0; |
2954 | 0 | Lmax = GET2(Fecode, 1); |
2955 | 0 | Fecode += 1 + IMM2_SIZE; |
2956 | 0 | goto REPEATTYPE; |
2957 | | |
2958 | 0 | case OP_TYPESTAR: |
2959 | 0 | case OP_TYPEMINSTAR: |
2960 | 0 | case OP_TYPEPLUS: |
2961 | 0 | case OP_TYPEMINPLUS: |
2962 | 0 | case OP_TYPEQUERY: |
2963 | 0 | case OP_TYPEMINQUERY: |
2964 | 0 | fc = *Fecode++ - OP_TYPESTAR; |
2965 | 0 | Lmin = rep_min[fc]; |
2966 | 0 | Lmax = rep_max[fc]; |
2967 | 0 | reptype = rep_typ[fc]; |
2968 | | |
2969 | | /* Common code for all repeated character type matches. */ |
2970 | |
|
2971 | 0 | REPEATTYPE: |
2972 | 0 | Lctype = *Fecode++; /* Code for the character type */ |
2973 | |
|
2974 | 0 | #ifdef SUPPORT_UNICODE |
2975 | 0 | if (Lctype == OP_PROP || Lctype == OP_NOTPROP) |
2976 | 0 | { |
2977 | 0 | proptype = *Fecode++; |
2978 | 0 | Lpropvalue = *Fecode++; |
2979 | 0 | } |
2980 | 0 | else proptype = -1; |
2981 | 0 | #endif |
2982 | | |
2983 | | /* First, ensure the minimum number of matches are present. Use inline |
2984 | | code for maximizing the speed, and do the type test once at the start |
2985 | | (i.e. keep it out of the loops). As there are no calls to RMATCH in the |
2986 | | loops, we can use an ordinary variable for "notmatch". The code for UTF |
2987 | | mode is separated out for tidiness, except for Unicode property tests. */ |
2988 | |
|
2989 | 0 | if (Lmin > 0) |
2990 | 0 | { |
2991 | 0 | #ifdef SUPPORT_UNICODE |
2992 | 0 | if (proptype >= 0) /* Property tests in all modes */ |
2993 | 0 | { |
2994 | 0 | BOOL notmatch = Lctype == OP_NOTPROP; |
2995 | 0 | switch(proptype) |
2996 | 0 | { |
2997 | 0 | case PT_LAMP: |
2998 | 0 | for (i = 1; i <= Lmin; i++) |
2999 | 0 | { |
3000 | 0 | int chartype; |
3001 | 0 | if (Feptr >= mb->end_subject) |
3002 | 0 | { |
3003 | 0 | SCHECK_PARTIAL(); |
3004 | 0 | RRETURN(MATCH_NOMATCH); |
3005 | 0 | } |
3006 | 0 | GETCHARINCTEST(fc, Feptr); |
3007 | 0 | chartype = UCD_CHARTYPE(fc); |
3008 | 0 | if ((chartype == ucp_Lu || |
3009 | 0 | chartype == ucp_Ll || |
3010 | 0 | chartype == ucp_Lt) == notmatch) |
3011 | 0 | RRETURN(MATCH_NOMATCH); |
3012 | 0 | } |
3013 | 0 | break; |
3014 | | |
3015 | 0 | case PT_GC: |
3016 | 0 | for (i = 1; i <= Lmin; i++) |
3017 | 0 | { |
3018 | 0 | if (Feptr >= mb->end_subject) |
3019 | 0 | { |
3020 | 0 | SCHECK_PARTIAL(); |
3021 | 0 | RRETURN(MATCH_NOMATCH); |
3022 | 0 | } |
3023 | 0 | GETCHARINCTEST(fc, Feptr); |
3024 | 0 | if ((UCD_CATEGORY(fc) == Lpropvalue) == notmatch) |
3025 | 0 | RRETURN(MATCH_NOMATCH); |
3026 | 0 | } |
3027 | 0 | break; |
3028 | | |
3029 | 0 | case PT_PC: |
3030 | 0 | for (i = 1; i <= Lmin; i++) |
3031 | 0 | { |
3032 | 0 | if (Feptr >= mb->end_subject) |
3033 | 0 | { |
3034 | 0 | SCHECK_PARTIAL(); |
3035 | 0 | RRETURN(MATCH_NOMATCH); |
3036 | 0 | } |
3037 | 0 | GETCHARINCTEST(fc, Feptr); |
3038 | 0 | if ((UCD_CHARTYPE(fc) == Lpropvalue) == notmatch) |
3039 | 0 | RRETURN(MATCH_NOMATCH); |
3040 | 0 | } |
3041 | 0 | break; |
3042 | | |
3043 | 0 | case PT_SC: |
3044 | 0 | for (i = 1; i <= Lmin; i++) |
3045 | 0 | { |
3046 | 0 | if (Feptr >= mb->end_subject) |
3047 | 0 | { |
3048 | 0 | SCHECK_PARTIAL(); |
3049 | 0 | RRETURN(MATCH_NOMATCH); |
3050 | 0 | } |
3051 | 0 | GETCHARINCTEST(fc, Feptr); |
3052 | 0 | if ((UCD_SCRIPT(fc) == Lpropvalue) == notmatch) |
3053 | 0 | RRETURN(MATCH_NOMATCH); |
3054 | 0 | } |
3055 | 0 | break; |
3056 | | |
3057 | 0 | case PT_SCX: |
3058 | 0 | for (i = 1; i <= Lmin; i++) |
3059 | 0 | { |
3060 | 0 | BOOL ok; |
3061 | 0 | const ucd_record *prop; |
3062 | 0 | if (Feptr >= mb->end_subject) |
3063 | 0 | { |
3064 | 0 | SCHECK_PARTIAL(); |
3065 | 0 | RRETURN(MATCH_NOMATCH); |
3066 | 0 | } |
3067 | 0 | GETCHARINCTEST(fc, Feptr); |
3068 | 0 | prop = GET_UCD(fc); |
3069 | 0 | ok = (prop->script == Lpropvalue || |
3070 | 0 | MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), Lpropvalue) != 0); |
3071 | 0 | if (ok == notmatch) |
3072 | 0 | RRETURN(MATCH_NOMATCH); |
3073 | 0 | } |
3074 | 0 | break; |
3075 | | |
3076 | 0 | case PT_ALNUM: |
3077 | 0 | for (i = 1; i <= Lmin; i++) |
3078 | 0 | { |
3079 | 0 | int category; |
3080 | 0 | if (Feptr >= mb->end_subject) |
3081 | 0 | { |
3082 | 0 | SCHECK_PARTIAL(); |
3083 | 0 | RRETURN(MATCH_NOMATCH); |
3084 | 0 | } |
3085 | 0 | GETCHARINCTEST(fc, Feptr); |
3086 | 0 | category = UCD_CATEGORY(fc); |
3087 | 0 | if ((category == ucp_L || category == ucp_N) == notmatch) |
3088 | 0 | RRETURN(MATCH_NOMATCH); |
3089 | 0 | } |
3090 | 0 | break; |
3091 | | |
3092 | | /* Perl space used to exclude VT, but from Perl 5.18 it is included, |
3093 | | which means that Perl space and POSIX space are now identical. PCRE |
3094 | | was changed at release 8.34. */ |
3095 | | |
3096 | 0 | case PT_SPACE: /* Perl space */ |
3097 | 0 | case PT_PXSPACE: /* POSIX space */ |
3098 | 0 | for (i = 1; i <= Lmin; i++) |
3099 | 0 | { |
3100 | 0 | if (Feptr >= mb->end_subject) |
3101 | 0 | { |
3102 | 0 | SCHECK_PARTIAL(); |
3103 | 0 | RRETURN(MATCH_NOMATCH); |
3104 | 0 | } |
3105 | 0 | GETCHARINCTEST(fc, Feptr); |
3106 | 0 | switch(fc) |
3107 | 0 | { |
3108 | 0 | HSPACE_CASES: |
3109 | 0 | VSPACE_CASES: |
3110 | 0 | if (notmatch) RRETURN(MATCH_NOMATCH); |
3111 | 0 | break; |
3112 | | |
3113 | 0 | default: |
3114 | 0 | if ((UCD_CATEGORY(fc) == ucp_Z) == notmatch) |
3115 | 0 | RRETURN(MATCH_NOMATCH); |
3116 | 0 | break; |
3117 | 0 | } |
3118 | 0 | } |
3119 | 0 | break; |
3120 | | |
3121 | 0 | case PT_WORD: |
3122 | 0 | for (i = 1; i <= Lmin; i++) |
3123 | 0 | { |
3124 | 0 | int chartype, category; |
3125 | 0 | if (Feptr >= mb->end_subject) |
3126 | 0 | { |
3127 | 0 | SCHECK_PARTIAL(); |
3128 | 0 | RRETURN(MATCH_NOMATCH); |
3129 | 0 | } |
3130 | 0 | GETCHARINCTEST(fc, Feptr); |
3131 | 0 | chartype = UCD_CHARTYPE(fc); |
3132 | 0 | category = PRIV(ucp_gentype)[chartype]; |
3133 | 0 | if ((category == ucp_L || category == ucp_N || |
3134 | 0 | chartype == ucp_Mn || chartype == ucp_Pc) == notmatch) |
3135 | 0 | RRETURN(MATCH_NOMATCH); |
3136 | 0 | } |
3137 | 0 | break; |
3138 | | |
3139 | 0 | case PT_CLIST: |
3140 | 0 | for (i = 1; i <= Lmin; i++) |
3141 | 0 | { |
3142 | 0 | const uint32_t *cp; |
3143 | 0 | if (Feptr >= mb->end_subject) |
3144 | 0 | { |
3145 | 0 | SCHECK_PARTIAL(); |
3146 | 0 | RRETURN(MATCH_NOMATCH); |
3147 | 0 | } |
3148 | 0 | GETCHARINCTEST(fc, Feptr); |
3149 | | #if PCRE2_CODE_UNIT_WIDTH == 32 |
3150 | | if (fc > MAX_UTF_CODE_POINT) |
3151 | | { |
3152 | | if (notmatch) continue; |
3153 | | RRETURN(MATCH_NOMATCH); |
3154 | | } |
3155 | | #endif |
3156 | 0 | cp = PRIV(ucd_caseless_sets) + Lpropvalue; |
3157 | 0 | for (;;) |
3158 | 0 | { |
3159 | 0 | if (fc < *cp) |
3160 | 0 | { |
3161 | 0 | if (notmatch) break; |
3162 | 0 | RRETURN(MATCH_NOMATCH); |
3163 | 0 | } |
3164 | 0 | if (fc == *cp++) |
3165 | 0 | { |
3166 | 0 | if (notmatch) RRETURN(MATCH_NOMATCH); |
3167 | 0 | break; |
3168 | 0 | } |
3169 | 0 | } |
3170 | 0 | } |
3171 | 0 | break; |
3172 | | |
3173 | 0 | case PT_UCNC: |
3174 | 0 | for (i = 1; i <= Lmin; i++) |
3175 | 0 | { |
3176 | 0 | if (Feptr >= mb->end_subject) |
3177 | 0 | { |
3178 | 0 | SCHECK_PARTIAL(); |
3179 | 0 | RRETURN(MATCH_NOMATCH); |
3180 | 0 | } |
3181 | 0 | GETCHARINCTEST(fc, Feptr); |
3182 | 0 | if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT || |
3183 | 0 | fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) || |
3184 | 0 | fc >= 0xe000) == notmatch) |
3185 | 0 | RRETURN(MATCH_NOMATCH); |
3186 | 0 | } |
3187 | 0 | break; |
3188 | | |
3189 | 0 | case PT_BIDICL: |
3190 | 0 | for (i = 1; i <= Lmin; i++) |
3191 | 0 | { |
3192 | 0 | if (Feptr >= mb->end_subject) |
3193 | 0 | { |
3194 | 0 | SCHECK_PARTIAL(); |
3195 | 0 | RRETURN(MATCH_NOMATCH); |
3196 | 0 | } |
3197 | 0 | GETCHARINCTEST(fc, Feptr); |
3198 | 0 | if ((UCD_BIDICLASS(fc) == Lpropvalue) == notmatch) |
3199 | 0 | RRETURN(MATCH_NOMATCH); |
3200 | 0 | } |
3201 | 0 | break; |
3202 | | |
3203 | 0 | case PT_BOOL: |
3204 | 0 | for (i = 1; i <= Lmin; i++) |
3205 | 0 | { |
3206 | 0 | BOOL ok; |
3207 | 0 | const ucd_record *prop; |
3208 | 0 | if (Feptr >= mb->end_subject) |
3209 | 0 | { |
3210 | 0 | SCHECK_PARTIAL(); |
3211 | 0 | RRETURN(MATCH_NOMATCH); |
3212 | 0 | } |
3213 | 0 | GETCHARINCTEST(fc, Feptr); |
3214 | 0 | prop = GET_UCD(fc); |
3215 | 0 | ok = MAPBIT(PRIV(ucd_boolprop_sets) + |
3216 | 0 | UCD_BPROPS_PROP(prop), Lpropvalue) != 0; |
3217 | 0 | if (ok == notmatch) |
3218 | 0 | RRETURN(MATCH_NOMATCH); |
3219 | 0 | } |
3220 | 0 | break; |
3221 | | |
3222 | | /* This should not occur */ |
3223 | | |
3224 | | /* LCOV_EXCL_START */ |
3225 | 0 | default: |
3226 | 0 | PCRE2_DEBUG_UNREACHABLE(); |
3227 | 0 | return PCRE2_ERROR_INTERNAL; |
3228 | | /* LCOV_EXCL_STOP */ |
3229 | 0 | } |
3230 | 0 | } |
3231 | | |
3232 | | /* Match extended Unicode sequences. We will get here only if the |
3233 | | support is in the binary; otherwise a compile-time error occurs. */ |
3234 | | |
3235 | 0 | else if (Lctype == OP_EXTUNI) |
3236 | 0 | { |
3237 | 0 | for (i = 1; i <= Lmin; i++) |
3238 | 0 | { |
3239 | 0 | if (Feptr >= mb->end_subject) |
3240 | 0 | { |
3241 | 0 | SCHECK_PARTIAL(); |
3242 | 0 | RRETURN(MATCH_NOMATCH); |
3243 | 0 | } |
3244 | 0 | else |
3245 | 0 | { |
3246 | 0 | GETCHARINCTEST(fc, Feptr); |
3247 | 0 | Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, |
3248 | 0 | mb->end_subject, utf, NULL); |
3249 | 0 | } |
3250 | 0 | CHECK_PARTIAL(); |
3251 | 0 | } |
3252 | 0 | } |
3253 | 0 | else |
3254 | 0 | #endif /* SUPPORT_UNICODE */ |
3255 | | |
3256 | | /* Handle all other cases in UTF mode */ |
3257 | | |
3258 | 0 | #ifdef SUPPORT_UNICODE |
3259 | 0 | if (utf) switch(Lctype) |
3260 | 0 | { |
3261 | 0 | case OP_ANY: |
3262 | 0 | for (i = 1; i <= Lmin; i++) |
3263 | 0 | { |
3264 | 0 | if (Feptr >= mb->end_subject) |
3265 | 0 | { |
3266 | 0 | SCHECK_PARTIAL(); |
3267 | 0 | RRETURN(MATCH_NOMATCH); |
3268 | 0 | } |
3269 | 0 | if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH); |
3270 | 0 | if (mb->partial != 0 && |
3271 | 0 | Feptr + 1 >= mb->end_subject && |
3272 | 0 | NLBLOCK->nltype == NLTYPE_FIXED && |
3273 | 0 | NLBLOCK->nllen == 2 && |
3274 | 0 | UCHAR21(Feptr) == NLBLOCK->nl[0]) |
3275 | 0 | { |
3276 | 0 | mb->hitend = TRUE; |
3277 | 0 | if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; |
3278 | 0 | } |
3279 | 0 | Feptr++; |
3280 | 0 | ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++); |
3281 | 0 | } |
3282 | 0 | break; |
3283 | | |
3284 | 0 | case OP_ALLANY: |
3285 | 0 | for (i = 1; i <= Lmin; i++) |
3286 | 0 | { |
3287 | 0 | if (Feptr >= mb->end_subject) |
3288 | 0 | { |
3289 | 0 | SCHECK_PARTIAL(); |
3290 | 0 | RRETURN(MATCH_NOMATCH); |
3291 | 0 | } |
3292 | 0 | Feptr++; |
3293 | 0 | ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++); |
3294 | 0 | } |
3295 | 0 | break; |
3296 | | |
3297 | 0 | case OP_ANYBYTE: |
3298 | 0 | if (Feptr > mb->end_subject - Lmin) RRETURN(MATCH_NOMATCH); |
3299 | 0 | Feptr += Lmin; |
3300 | 0 | break; |
3301 | | |
3302 | 0 | case OP_ANYNL: |
3303 | 0 | for (i = 1; i <= Lmin; i++) |
3304 | 0 | { |
3305 | 0 | if (Feptr >= mb->end_subject) |
3306 | 0 | { |
3307 | 0 | SCHECK_PARTIAL(); |
3308 | 0 | RRETURN(MATCH_NOMATCH); |
3309 | 0 | } |
3310 | 0 | GETCHARINC(fc, Feptr); |
3311 | 0 | switch(fc) |
3312 | 0 | { |
3313 | 0 | default: RRETURN(MATCH_NOMATCH); |
3314 | | |
3315 | 0 | case CHAR_CR: |
3316 | 0 | if (Feptr < mb->end_subject && UCHAR21(Feptr) == CHAR_LF) Feptr++; |
3317 | 0 | break; |
3318 | | |
3319 | 0 | case CHAR_LF: |
3320 | 0 | break; |
3321 | | |
3322 | 0 | case CHAR_VT: |
3323 | 0 | case CHAR_FF: |
3324 | 0 | case CHAR_NEL: |
3325 | 0 | #ifndef EBCDIC |
3326 | 0 | case 0x2028: |
3327 | 0 | case 0x2029: |
3328 | 0 | #endif /* Not EBCDIC */ |
3329 | 0 | if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH); |
3330 | 0 | break; |
3331 | 0 | } |
3332 | 0 | } |
3333 | 0 | break; |
3334 | | |
3335 | 0 | case OP_NOT_HSPACE: |
3336 | 0 | for (i = 1; i <= Lmin; i++) |
3337 | 0 | { |
3338 | 0 | if (Feptr >= mb->end_subject) |
3339 | 0 | { |
3340 | 0 | SCHECK_PARTIAL(); |
3341 | 0 | RRETURN(MATCH_NOMATCH); |
3342 | 0 | } |
3343 | 0 | GETCHARINC(fc, Feptr); |
3344 | 0 | switch(fc) |
3345 | 0 | { |
3346 | 0 | HSPACE_CASES: RRETURN(MATCH_NOMATCH); |
3347 | 0 | default: break; |
3348 | 0 | } |
3349 | 0 | } |
3350 | 0 | break; |
3351 | | |
3352 | 0 | case OP_HSPACE: |
3353 | 0 | for (i = 1; i <= Lmin; i++) |
3354 | 0 | { |
3355 | 0 | if (Feptr >= mb->end_subject) |
3356 | 0 | { |
3357 | 0 | SCHECK_PARTIAL(); |
3358 | 0 | RRETURN(MATCH_NOMATCH); |
3359 | 0 | } |
3360 | 0 | GETCHARINC(fc, Feptr); |
3361 | 0 | switch(fc) |
3362 | 0 | { |
3363 | 0 | HSPACE_CASES: break; |
3364 | 0 | default: RRETURN(MATCH_NOMATCH); |
3365 | 0 | } |
3366 | 0 | } |
3367 | 0 | break; |
3368 | | |
3369 | 0 | case OP_NOT_VSPACE: |
3370 | 0 | for (i = 1; i <= Lmin; i++) |
3371 | 0 | { |
3372 | 0 | if (Feptr >= mb->end_subject) |
3373 | 0 | { |
3374 | 0 | SCHECK_PARTIAL(); |
3375 | 0 | RRETURN(MATCH_NOMATCH); |
3376 | 0 | } |
3377 | 0 | GETCHARINC(fc, Feptr); |
3378 | 0 | switch(fc) |
3379 | 0 | { |
3380 | 0 | VSPACE_CASES: RRETURN(MATCH_NOMATCH); |
3381 | 0 | default: break; |
3382 | 0 | } |
3383 | 0 | } |
3384 | 0 | break; |
3385 | | |
3386 | 0 | case OP_VSPACE: |
3387 | 0 | for (i = 1; i <= Lmin; i++) |
3388 | 0 | { |
3389 | 0 | if (Feptr >= mb->end_subject) |
3390 | 0 | { |
3391 | 0 | SCHECK_PARTIAL(); |
3392 | 0 | RRETURN(MATCH_NOMATCH); |
3393 | 0 | } |
3394 | 0 | GETCHARINC(fc, Feptr); |
3395 | 0 | switch(fc) |
3396 | 0 | { |
3397 | 0 | VSPACE_CASES: break; |
3398 | 0 | default: RRETURN(MATCH_NOMATCH); |
3399 | 0 | } |
3400 | 0 | } |
3401 | 0 | break; |
3402 | | |
3403 | 0 | case OP_NOT_DIGIT: |
3404 | 0 | for (i = 1; i <= Lmin; i++) |
3405 | 0 | { |
3406 | 0 | if (Feptr >= mb->end_subject) |
3407 | 0 | { |
3408 | 0 | SCHECK_PARTIAL(); |
3409 | 0 | RRETURN(MATCH_NOMATCH); |
3410 | 0 | } |
3411 | 0 | GETCHARINC(fc, Feptr); |
3412 | 0 | if (fc < 128 && (mb->ctypes[fc] & ctype_digit) != 0) |
3413 | 0 | RRETURN(MATCH_NOMATCH); |
3414 | 0 | } |
3415 | 0 | break; |
3416 | | |
3417 | 0 | case OP_DIGIT: |
3418 | 0 | for (i = 1; i <= Lmin; i++) |
3419 | 0 | { |
3420 | 0 | uint32_t cc; |
3421 | 0 | if (Feptr >= mb->end_subject) |
3422 | 0 | { |
3423 | 0 | SCHECK_PARTIAL(); |
3424 | 0 | RRETURN(MATCH_NOMATCH); |
3425 | 0 | } |
3426 | 0 | cc = UCHAR21(Feptr); |
3427 | 0 | if (cc >= 128 || (mb->ctypes[cc] & ctype_digit) == 0) |
3428 | 0 | RRETURN(MATCH_NOMATCH); |
3429 | 0 | Feptr++; |
3430 | | /* No need to skip more code units - we know it has only one. */ |
3431 | 0 | } |
3432 | 0 | break; |
3433 | | |
3434 | 0 | case OP_NOT_WHITESPACE: |
3435 | 0 | for (i = 1; i <= Lmin; i++) |
3436 | 0 | { |
3437 | 0 | uint32_t cc; |
3438 | 0 | if (Feptr >= mb->end_subject) |
3439 | 0 | { |
3440 | 0 | SCHECK_PARTIAL(); |
3441 | 0 | RRETURN(MATCH_NOMATCH); |
3442 | 0 | } |
3443 | 0 | cc = UCHAR21(Feptr); |
3444 | 0 | if (cc < 128 && (mb->ctypes[cc] & ctype_space) != 0) |
3445 | 0 | RRETURN(MATCH_NOMATCH); |
3446 | 0 | Feptr++; |
3447 | 0 | ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++); |
3448 | 0 | } |
3449 | 0 | break; |
3450 | | |
3451 | 0 | case OP_WHITESPACE: |
3452 | 0 | for (i = 1; i <= Lmin; i++) |
3453 | 0 | { |
3454 | 0 | uint32_t cc; |
3455 | 0 | if (Feptr >= mb->end_subject) |
3456 | 0 | { |
3457 | 0 | SCHECK_PARTIAL(); |
3458 | 0 | RRETURN(MATCH_NOMATCH); |
3459 | 0 | } |
3460 | 0 | cc = UCHAR21(Feptr); |
3461 | 0 | if (cc >= 128 || (mb->ctypes[cc] & ctype_space) == 0) |
3462 | 0 | RRETURN(MATCH_NOMATCH); |
3463 | 0 | Feptr++; |
3464 | | /* No need to skip more code units - we know it has only one. */ |
3465 | 0 | } |
3466 | 0 | break; |
3467 | | |
3468 | 0 | case OP_NOT_WORDCHAR: |
3469 | 0 | for (i = 1; i <= Lmin; i++) |
3470 | 0 | { |
3471 | 0 | uint32_t cc; |
3472 | 0 | if (Feptr >= mb->end_subject) |
3473 | 0 | { |
3474 | 0 | SCHECK_PARTIAL(); |
3475 | 0 | RRETURN(MATCH_NOMATCH); |
3476 | 0 | } |
3477 | 0 | cc = UCHAR21(Feptr); |
3478 | 0 | if (cc < 128 && (mb->ctypes[cc] & ctype_word) != 0) |
3479 | 0 | RRETURN(MATCH_NOMATCH); |
3480 | 0 | Feptr++; |
3481 | 0 | ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++); |
3482 | 0 | } |
3483 | 0 | break; |
3484 | | |
3485 | 0 | case OP_WORDCHAR: |
3486 | 0 | for (i = 1; i <= Lmin; i++) |
3487 | 0 | { |
3488 | 0 | uint32_t cc; |
3489 | 0 | if (Feptr >= mb->end_subject) |
3490 | 0 | { |
3491 | 0 | SCHECK_PARTIAL(); |
3492 | 0 | RRETURN(MATCH_NOMATCH); |
3493 | 0 | } |
3494 | 0 | cc = UCHAR21(Feptr); |
3495 | 0 | if (cc >= 128 || (mb->ctypes[cc] & ctype_word) == 0) |
3496 | 0 | RRETURN(MATCH_NOMATCH); |
3497 | 0 | Feptr++; |
3498 | | /* No need to skip more code units - we know it has only one. */ |
3499 | 0 | } |
3500 | 0 | break; |
3501 | | |
3502 | | /* LCOV_EXCL_START */ |
3503 | 0 | default: |
3504 | 0 | PCRE2_DEBUG_UNREACHABLE(); |
3505 | 0 | return PCRE2_ERROR_INTERNAL; |
3506 | | /* LCOV_EXCL_STOP */ |
3507 | 0 | } /* End switch(Lctype) */ |
3508 | | |
3509 | 0 | else |
3510 | 0 | #endif /* SUPPORT_UNICODE */ |
3511 | | |
3512 | | /* Code for the non-UTF case for minimum matching of operators other |
3513 | | than OP_PROP and OP_NOTPROP. */ |
3514 | | |
3515 | 0 | switch(Lctype) |
3516 | 0 | { |
3517 | 0 | case OP_ANY: |
3518 | 0 | for (i = 1; i <= Lmin; i++) |
3519 | 0 | { |
3520 | 0 | if (Feptr >= mb->end_subject) |
3521 | 0 | { |
3522 | 0 | SCHECK_PARTIAL(); |
3523 | 0 | RRETURN(MATCH_NOMATCH); |
3524 | 0 | } |
3525 | 0 | if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH); |
3526 | 0 | if (mb->partial != 0 && |
3527 | 0 | Feptr + 1 >= mb->end_subject && |
3528 | 0 | NLBLOCK->nltype == NLTYPE_FIXED && |
3529 | 0 | NLBLOCK->nllen == 2 && |
3530 | 0 | *Feptr == NLBLOCK->nl[0]) |
3531 | 0 | { |
3532 | 0 | mb->hitend = TRUE; |
3533 | 0 | if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; |
3534 | 0 | } |
3535 | 0 | Feptr++; |
3536 | 0 | } |
3537 | 0 | break; |
3538 | | |
3539 | 0 | case OP_ALLANY: |
3540 | 0 | if (Feptr > mb->end_subject - Lmin) |
3541 | 0 | { |
3542 | 0 | SCHECK_PARTIAL(); |
3543 | 0 | RRETURN(MATCH_NOMATCH); |
3544 | 0 | } |
3545 | 0 | Feptr += Lmin; |
3546 | 0 | break; |
3547 | | |
3548 | | /* This OP_ANYBYTE case will never be reached because \C gets turned |
3549 | | into OP_ALLANY in non-UTF mode. Cut out the code so that coverage |
3550 | | reports don't complain about its never being used. */ |
3551 | | |
3552 | | /* case OP_ANYBYTE: |
3553 | | * if (Feptr > mb->end_subject - Lmin) |
3554 | | * { |
3555 | | * SCHECK_PARTIAL(); |
3556 | | * RRETURN(MATCH_NOMATCH); |
3557 | | * } |
3558 | | * Feptr += Lmin; |
3559 | | * break; |
3560 | | */ |
3561 | 0 | case OP_ANYNL: |
3562 | 0 | for (i = 1; i <= Lmin; i++) |
3563 | 0 | { |
3564 | 0 | if (Feptr >= mb->end_subject) |
3565 | 0 | { |
3566 | 0 | SCHECK_PARTIAL(); |
3567 | 0 | RRETURN(MATCH_NOMATCH); |
3568 | 0 | } |
3569 | 0 | switch(*Feptr++) |
3570 | 0 | { |
3571 | 0 | default: RRETURN(MATCH_NOMATCH); |
3572 | | |
3573 | 0 | case CHAR_CR: |
3574 | 0 | if (Feptr < mb->end_subject && *Feptr == CHAR_LF) Feptr++; |
3575 | 0 | break; |
3576 | | |
3577 | 0 | case CHAR_LF: |
3578 | 0 | break; |
3579 | | |
3580 | 0 | case CHAR_VT: |
3581 | 0 | case CHAR_FF: |
3582 | 0 | case CHAR_NEL: |
3583 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
3584 | | case 0x2028: |
3585 | | case 0x2029: |
3586 | | #endif |
3587 | 0 | if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH); |
3588 | 0 | break; |
3589 | 0 | } |
3590 | 0 | } |
3591 | 0 | break; |
3592 | | |
3593 | 0 | case OP_NOT_HSPACE: |
3594 | 0 | for (i = 1; i <= Lmin; i++) |
3595 | 0 | { |
3596 | 0 | if (Feptr >= mb->end_subject) |
3597 | 0 | { |
3598 | 0 | SCHECK_PARTIAL(); |
3599 | 0 | RRETURN(MATCH_NOMATCH); |
3600 | 0 | } |
3601 | 0 | switch(*Feptr++) |
3602 | 0 | { |
3603 | 0 | default: break; |
3604 | 0 | HSPACE_BYTE_CASES: |
3605 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
3606 | | HSPACE_MULTIBYTE_CASES: |
3607 | | #endif |
3608 | 0 | RRETURN(MATCH_NOMATCH); |
3609 | 0 | } |
3610 | 0 | } |
3611 | 0 | break; |
3612 | | |
3613 | 0 | case OP_HSPACE: |
3614 | 0 | for (i = 1; i <= Lmin; i++) |
3615 | 0 | { |
3616 | 0 | if (Feptr >= mb->end_subject) |
3617 | 0 | { |
3618 | 0 | SCHECK_PARTIAL(); |
3619 | 0 | RRETURN(MATCH_NOMATCH); |
3620 | 0 | } |
3621 | 0 | switch(*Feptr++) |
3622 | 0 | { |
3623 | 0 | default: RRETURN(MATCH_NOMATCH); |
3624 | 0 | HSPACE_BYTE_CASES: |
3625 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
3626 | | HSPACE_MULTIBYTE_CASES: |
3627 | | #endif |
3628 | 0 | break; |
3629 | 0 | } |
3630 | 0 | } |
3631 | 0 | break; |
3632 | | |
3633 | 0 | case OP_NOT_VSPACE: |
3634 | 0 | for (i = 1; i <= Lmin; i++) |
3635 | 0 | { |
3636 | 0 | if (Feptr >= mb->end_subject) |
3637 | 0 | { |
3638 | 0 | SCHECK_PARTIAL(); |
3639 | 0 | RRETURN(MATCH_NOMATCH); |
3640 | 0 | } |
3641 | 0 | switch(*Feptr++) |
3642 | 0 | { |
3643 | 0 | VSPACE_BYTE_CASES: |
3644 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
3645 | | VSPACE_MULTIBYTE_CASES: |
3646 | | #endif |
3647 | 0 | RRETURN(MATCH_NOMATCH); |
3648 | 0 | default: break; |
3649 | 0 | } |
3650 | 0 | } |
3651 | 0 | break; |
3652 | | |
3653 | 0 | case OP_VSPACE: |
3654 | 0 | for (i = 1; i <= Lmin; i++) |
3655 | 0 | { |
3656 | 0 | if (Feptr >= mb->end_subject) |
3657 | 0 | { |
3658 | 0 | SCHECK_PARTIAL(); |
3659 | 0 | RRETURN(MATCH_NOMATCH); |
3660 | 0 | } |
3661 | 0 | switch(*Feptr++) |
3662 | 0 | { |
3663 | 0 | default: RRETURN(MATCH_NOMATCH); |
3664 | 0 | VSPACE_BYTE_CASES: |
3665 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
3666 | | VSPACE_MULTIBYTE_CASES: |
3667 | | #endif |
3668 | 0 | break; |
3669 | 0 | } |
3670 | 0 | } |
3671 | 0 | break; |
3672 | | |
3673 | 0 | case OP_NOT_DIGIT: |
3674 | 0 | for (i = 1; i <= Lmin; i++) |
3675 | 0 | { |
3676 | 0 | if (Feptr >= mb->end_subject) |
3677 | 0 | { |
3678 | 0 | SCHECK_PARTIAL(); |
3679 | 0 | RRETURN(MATCH_NOMATCH); |
3680 | 0 | } |
3681 | 0 | if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_digit) != 0) |
3682 | 0 | RRETURN(MATCH_NOMATCH); |
3683 | 0 | Feptr++; |
3684 | 0 | } |
3685 | 0 | break; |
3686 | | |
3687 | 0 | case OP_DIGIT: |
3688 | 0 | for (i = 1; i <= Lmin; i++) |
3689 | 0 | { |
3690 | 0 | if (Feptr >= mb->end_subject) |
3691 | 0 | { |
3692 | 0 | SCHECK_PARTIAL(); |
3693 | 0 | RRETURN(MATCH_NOMATCH); |
3694 | 0 | } |
3695 | 0 | if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_digit) == 0) |
3696 | 0 | RRETURN(MATCH_NOMATCH); |
3697 | 0 | Feptr++; |
3698 | 0 | } |
3699 | 0 | break; |
3700 | | |
3701 | 0 | case OP_NOT_WHITESPACE: |
3702 | 0 | for (i = 1; i <= Lmin; i++) |
3703 | 0 | { |
3704 | 0 | if (Feptr >= mb->end_subject) |
3705 | 0 | { |
3706 | 0 | SCHECK_PARTIAL(); |
3707 | 0 | RRETURN(MATCH_NOMATCH); |
3708 | 0 | } |
3709 | 0 | if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_space) != 0) |
3710 | 0 | RRETURN(MATCH_NOMATCH); |
3711 | 0 | Feptr++; |
3712 | 0 | } |
3713 | 0 | break; |
3714 | | |
3715 | 0 | case OP_WHITESPACE: |
3716 | 0 | for (i = 1; i <= Lmin; i++) |
3717 | 0 | { |
3718 | 0 | if (Feptr >= mb->end_subject) |
3719 | 0 | { |
3720 | 0 | SCHECK_PARTIAL(); |
3721 | 0 | RRETURN(MATCH_NOMATCH); |
3722 | 0 | } |
3723 | 0 | if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_space) == 0) |
3724 | 0 | RRETURN(MATCH_NOMATCH); |
3725 | 0 | Feptr++; |
3726 | 0 | } |
3727 | 0 | break; |
3728 | | |
3729 | 0 | case OP_NOT_WORDCHAR: |
3730 | 0 | for (i = 1; i <= Lmin; i++) |
3731 | 0 | { |
3732 | 0 | if (Feptr >= mb->end_subject) |
3733 | 0 | { |
3734 | 0 | SCHECK_PARTIAL(); |
3735 | 0 | RRETURN(MATCH_NOMATCH); |
3736 | 0 | } |
3737 | 0 | if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_word) != 0) |
3738 | 0 | RRETURN(MATCH_NOMATCH); |
3739 | 0 | Feptr++; |
3740 | 0 | } |
3741 | 0 | break; |
3742 | | |
3743 | 0 | case OP_WORDCHAR: |
3744 | 0 | for (i = 1; i <= Lmin; i++) |
3745 | 0 | { |
3746 | 0 | if (Feptr >= mb->end_subject) |
3747 | 0 | { |
3748 | 0 | SCHECK_PARTIAL(); |
3749 | 0 | RRETURN(MATCH_NOMATCH); |
3750 | 0 | } |
3751 | 0 | if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_word) == 0) |
3752 | 0 | RRETURN(MATCH_NOMATCH); |
3753 | 0 | Feptr++; |
3754 | 0 | } |
3755 | 0 | break; |
3756 | | |
3757 | | /* LCOV_EXCL_START */ |
3758 | 0 | default: |
3759 | 0 | PCRE2_DEBUG_UNREACHABLE(); |
3760 | 0 | return PCRE2_ERROR_INTERNAL; |
3761 | | /* LCOV_EXCL_STOP */ |
3762 | 0 | } |
3763 | 0 | } |
3764 | | |
3765 | | /* If Lmin = Lmax we are done. Continue with the main loop. */ |
3766 | | |
3767 | 0 | if (Lmin == Lmax) continue; |
3768 | | |
3769 | | /* If minimizing, we have to test the rest of the pattern before each |
3770 | | subsequent match. This means we cannot use a local "notmatch" variable as |
3771 | | in the other cases. As all 4 temporary 32-bit values in the frame are |
3772 | | already in use, just test the type each time. */ |
3773 | | |
3774 | 0 | if (reptype == REPTYPE_MIN) |
3775 | 0 | { |
3776 | 0 | #ifdef SUPPORT_UNICODE |
3777 | 0 | if (proptype >= 0) |
3778 | 0 | { |
3779 | 0 | switch(proptype) |
3780 | 0 | { |
3781 | 0 | case PT_LAMP: |
3782 | 0 | for (;;) |
3783 | 0 | { |
3784 | 0 | int chartype; |
3785 | 0 | RMATCH(Fecode, RM208); |
3786 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3787 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
3788 | 0 | if (Feptr >= mb->end_subject) |
3789 | 0 | { |
3790 | 0 | SCHECK_PARTIAL(); |
3791 | 0 | RRETURN(MATCH_NOMATCH); |
3792 | 0 | } |
3793 | 0 | GETCHARINCTEST(fc, Feptr); |
3794 | 0 | chartype = UCD_CHARTYPE(fc); |
3795 | 0 | if ((chartype == ucp_Lu || |
3796 | 0 | chartype == ucp_Ll || |
3797 | 0 | chartype == ucp_Lt) == (Lctype == OP_NOTPROP)) |
3798 | 0 | RRETURN(MATCH_NOMATCH); |
3799 | 0 | } |
3800 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
3801 | | |
3802 | 0 | case PT_GC: |
3803 | 0 | for (;;) |
3804 | 0 | { |
3805 | 0 | RMATCH(Fecode, RM209); |
3806 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3807 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
3808 | 0 | if (Feptr >= mb->end_subject) |
3809 | 0 | { |
3810 | 0 | SCHECK_PARTIAL(); |
3811 | 0 | RRETURN(MATCH_NOMATCH); |
3812 | 0 | } |
3813 | 0 | GETCHARINCTEST(fc, Feptr); |
3814 | 0 | if ((UCD_CATEGORY(fc) == Lpropvalue) == (Lctype == OP_NOTPROP)) |
3815 | 0 | RRETURN(MATCH_NOMATCH); |
3816 | 0 | } |
3817 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
3818 | | |
3819 | 0 | case PT_PC: |
3820 | 0 | for (;;) |
3821 | 0 | { |
3822 | 0 | RMATCH(Fecode, RM210); |
3823 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3824 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
3825 | 0 | if (Feptr >= mb->end_subject) |
3826 | 0 | { |
3827 | 0 | SCHECK_PARTIAL(); |
3828 | 0 | RRETURN(MATCH_NOMATCH); |
3829 | 0 | } |
3830 | 0 | GETCHARINCTEST(fc, Feptr); |
3831 | 0 | if ((UCD_CHARTYPE(fc) == Lpropvalue) == (Lctype == OP_NOTPROP)) |
3832 | 0 | RRETURN(MATCH_NOMATCH); |
3833 | 0 | } |
3834 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
3835 | | |
3836 | 0 | case PT_SC: |
3837 | 0 | for (;;) |
3838 | 0 | { |
3839 | 0 | RMATCH(Fecode, RM211); |
3840 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3841 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
3842 | 0 | if (Feptr >= mb->end_subject) |
3843 | 0 | { |
3844 | 0 | SCHECK_PARTIAL(); |
3845 | 0 | RRETURN(MATCH_NOMATCH); |
3846 | 0 | } |
3847 | 0 | GETCHARINCTEST(fc, Feptr); |
3848 | 0 | if ((UCD_SCRIPT(fc) == Lpropvalue) == (Lctype == OP_NOTPROP)) |
3849 | 0 | RRETURN(MATCH_NOMATCH); |
3850 | 0 | } |
3851 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
3852 | | |
3853 | 0 | case PT_SCX: |
3854 | 0 | for (;;) |
3855 | 0 | { |
3856 | 0 | BOOL ok; |
3857 | 0 | const ucd_record *prop; |
3858 | 0 | RMATCH(Fecode, RM224); |
3859 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3860 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
3861 | 0 | if (Feptr >= mb->end_subject) |
3862 | 0 | { |
3863 | 0 | SCHECK_PARTIAL(); |
3864 | 0 | RRETURN(MATCH_NOMATCH); |
3865 | 0 | } |
3866 | 0 | GETCHARINCTEST(fc, Feptr); |
3867 | 0 | prop = GET_UCD(fc); |
3868 | 0 | ok = (prop->script == Lpropvalue |
3869 | 0 | || MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), Lpropvalue) != 0); |
3870 | 0 | if (ok == (Lctype == OP_NOTPROP)) |
3871 | 0 | RRETURN(MATCH_NOMATCH); |
3872 | 0 | } |
3873 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
3874 | | |
3875 | 0 | case PT_ALNUM: |
3876 | 0 | for (;;) |
3877 | 0 | { |
3878 | 0 | int category; |
3879 | 0 | RMATCH(Fecode, RM212); |
3880 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3881 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
3882 | 0 | if (Feptr >= mb->end_subject) |
3883 | 0 | { |
3884 | 0 | SCHECK_PARTIAL(); |
3885 | 0 | RRETURN(MATCH_NOMATCH); |
3886 | 0 | } |
3887 | 0 | GETCHARINCTEST(fc, Feptr); |
3888 | 0 | category = UCD_CATEGORY(fc); |
3889 | 0 | if ((category == ucp_L || category == ucp_N) == (Lctype == OP_NOTPROP)) |
3890 | 0 | RRETURN(MATCH_NOMATCH); |
3891 | 0 | } |
3892 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
3893 | | |
3894 | | /* Perl space used to exclude VT, but from Perl 5.18 it is included, |
3895 | | which means that Perl space and POSIX space are now identical. PCRE |
3896 | | was changed at release 8.34. */ |
3897 | | |
3898 | 0 | case PT_SPACE: /* Perl space */ |
3899 | 0 | case PT_PXSPACE: /* POSIX space */ |
3900 | 0 | for (;;) |
3901 | 0 | { |
3902 | 0 | RMATCH(Fecode, RM213); |
3903 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3904 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
3905 | 0 | if (Feptr >= mb->end_subject) |
3906 | 0 | { |
3907 | 0 | SCHECK_PARTIAL(); |
3908 | 0 | RRETURN(MATCH_NOMATCH); |
3909 | 0 | } |
3910 | 0 | GETCHARINCTEST(fc, Feptr); |
3911 | 0 | switch(fc) |
3912 | 0 | { |
3913 | 0 | HSPACE_CASES: |
3914 | 0 | VSPACE_CASES: |
3915 | 0 | if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH); |
3916 | 0 | break; |
3917 | | |
3918 | 0 | default: |
3919 | 0 | if ((UCD_CATEGORY(fc) == ucp_Z) == (Lctype == OP_NOTPROP)) |
3920 | 0 | RRETURN(MATCH_NOMATCH); |
3921 | 0 | break; |
3922 | 0 | } |
3923 | 0 | } |
3924 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
3925 | | |
3926 | 0 | case PT_WORD: |
3927 | 0 | for (;;) |
3928 | 0 | { |
3929 | 0 | int chartype, category; |
3930 | 0 | RMATCH(Fecode, RM214); |
3931 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3932 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
3933 | 0 | if (Feptr >= mb->end_subject) |
3934 | 0 | { |
3935 | 0 | SCHECK_PARTIAL(); |
3936 | 0 | RRETURN(MATCH_NOMATCH); |
3937 | 0 | } |
3938 | 0 | GETCHARINCTEST(fc, Feptr); |
3939 | 0 | chartype = UCD_CHARTYPE(fc); |
3940 | 0 | category = PRIV(ucp_gentype)[chartype]; |
3941 | 0 | if ((category == ucp_L || |
3942 | 0 | category == ucp_N || |
3943 | 0 | chartype == ucp_Mn || |
3944 | 0 | chartype == ucp_Pc) == (Lctype == OP_NOTPROP)) |
3945 | 0 | RRETURN(MATCH_NOMATCH); |
3946 | 0 | } |
3947 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
3948 | | |
3949 | 0 | case PT_CLIST: |
3950 | 0 | for (;;) |
3951 | 0 | { |
3952 | 0 | const uint32_t *cp; |
3953 | 0 | RMATCH(Fecode, RM215); |
3954 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3955 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
3956 | 0 | if (Feptr >= mb->end_subject) |
3957 | 0 | { |
3958 | 0 | SCHECK_PARTIAL(); |
3959 | 0 | RRETURN(MATCH_NOMATCH); |
3960 | 0 | } |
3961 | 0 | GETCHARINCTEST(fc, Feptr); |
3962 | | #if PCRE2_CODE_UNIT_WIDTH == 32 |
3963 | | if (fc > MAX_UTF_CODE_POINT) |
3964 | | { |
3965 | | if (Lctype == OP_NOTPROP) continue; |
3966 | | RRETURN(MATCH_NOMATCH); |
3967 | | } |
3968 | | #endif |
3969 | 0 | cp = PRIV(ucd_caseless_sets) + Lpropvalue; |
3970 | 0 | for (;;) |
3971 | 0 | { |
3972 | 0 | if (fc < *cp) |
3973 | 0 | { |
3974 | 0 | if (Lctype == OP_NOTPROP) break; |
3975 | 0 | RRETURN(MATCH_NOMATCH); |
3976 | 0 | } |
3977 | 0 | if (fc == *cp++) |
3978 | 0 | { |
3979 | 0 | if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH); |
3980 | 0 | break; |
3981 | 0 | } |
3982 | 0 | } |
3983 | 0 | } |
3984 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
3985 | | |
3986 | 0 | case PT_UCNC: |
3987 | 0 | for (;;) |
3988 | 0 | { |
3989 | 0 | RMATCH(Fecode, RM216); |
3990 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3991 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
3992 | 0 | if (Feptr >= mb->end_subject) |
3993 | 0 | { |
3994 | 0 | SCHECK_PARTIAL(); |
3995 | 0 | RRETURN(MATCH_NOMATCH); |
3996 | 0 | } |
3997 | 0 | GETCHARINCTEST(fc, Feptr); |
3998 | 0 | if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT || |
3999 | 0 | fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) || |
4000 | 0 | fc >= 0xe000) == (Lctype == OP_NOTPROP)) |
4001 | 0 | RRETURN(MATCH_NOMATCH); |
4002 | 0 | } |
4003 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
4004 | | |
4005 | 0 | case PT_BIDICL: |
4006 | 0 | for (;;) |
4007 | 0 | { |
4008 | 0 | RMATCH(Fecode, RM223); |
4009 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
4010 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
4011 | 0 | if (Feptr >= mb->end_subject) |
4012 | 0 | { |
4013 | 0 | SCHECK_PARTIAL(); |
4014 | 0 | RRETURN(MATCH_NOMATCH); |
4015 | 0 | } |
4016 | 0 | GETCHARINCTEST(fc, Feptr); |
4017 | 0 | if ((UCD_BIDICLASS(fc) == Lpropvalue) == (Lctype == OP_NOTPROP)) |
4018 | 0 | RRETURN(MATCH_NOMATCH); |
4019 | 0 | } |
4020 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
4021 | | |
4022 | 0 | case PT_BOOL: |
4023 | 0 | for (;;) |
4024 | 0 | { |
4025 | 0 | BOOL ok; |
4026 | 0 | const ucd_record *prop; |
4027 | 0 | RMATCH(Fecode, RM222); |
4028 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
4029 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
4030 | 0 | if (Feptr >= mb->end_subject) |
4031 | 0 | { |
4032 | 0 | SCHECK_PARTIAL(); |
4033 | 0 | RRETURN(MATCH_NOMATCH); |
4034 | 0 | } |
4035 | 0 | GETCHARINCTEST(fc, Feptr); |
4036 | 0 | prop = GET_UCD(fc); |
4037 | 0 | ok = MAPBIT(PRIV(ucd_boolprop_sets) + |
4038 | 0 | UCD_BPROPS_PROP(prop), Lpropvalue) != 0; |
4039 | 0 | if (ok == (Lctype == OP_NOTPROP)) |
4040 | 0 | RRETURN(MATCH_NOMATCH); |
4041 | 0 | } |
4042 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
4043 | | |
4044 | | /* This should never occur */ |
4045 | | |
4046 | | /* LCOV_EXCL_START */ |
4047 | 0 | default: |
4048 | 0 | PCRE2_DEBUG_UNREACHABLE(); |
4049 | 0 | return PCRE2_ERROR_INTERNAL; |
4050 | | /* LCOV_EXCL_STOP */ |
4051 | 0 | } |
4052 | 0 | } |
4053 | | |
4054 | | /* Match extended Unicode sequences. We will get here only if the |
4055 | | support is in the binary; otherwise a compile-time error occurs. */ |
4056 | | |
4057 | 0 | else if (Lctype == OP_EXTUNI) |
4058 | 0 | { |
4059 | 0 | for (;;) |
4060 | 0 | { |
4061 | 0 | RMATCH(Fecode, RM217); |
4062 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
4063 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
4064 | 0 | if (Feptr >= mb->end_subject) |
4065 | 0 | { |
4066 | 0 | SCHECK_PARTIAL(); |
4067 | 0 | RRETURN(MATCH_NOMATCH); |
4068 | 0 | } |
4069 | 0 | else |
4070 | 0 | { |
4071 | 0 | GETCHARINCTEST(fc, Feptr); |
4072 | 0 | Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject, |
4073 | 0 | utf, NULL); |
4074 | 0 | } |
4075 | 0 | CHECK_PARTIAL(); |
4076 | 0 | } |
4077 | 0 | } |
4078 | 0 | else |
4079 | 0 | #endif /* SUPPORT_UNICODE */ |
4080 | | |
4081 | | /* UTF mode for non-property testing character types. */ |
4082 | | |
4083 | 0 | #ifdef SUPPORT_UNICODE |
4084 | 0 | if (utf) |
4085 | 0 | { |
4086 | 0 | for (;;) |
4087 | 0 | { |
4088 | 0 | RMATCH(Fecode, RM218); |
4089 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
4090 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
4091 | 0 | if (Feptr >= mb->end_subject) |
4092 | 0 | { |
4093 | 0 | SCHECK_PARTIAL(); |
4094 | 0 | RRETURN(MATCH_NOMATCH); |
4095 | 0 | } |
4096 | 0 | if (Lctype == OP_ANY && IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH); |
4097 | 0 | GETCHARINC(fc, Feptr); |
4098 | 0 | switch(Lctype) |
4099 | 0 | { |
4100 | 0 | case OP_ANY: /* This is the non-NL case */ |
4101 | 0 | if (mb->partial != 0 && /* Take care with CRLF partial */ |
4102 | 0 | Feptr >= mb->end_subject && |
4103 | 0 | NLBLOCK->nltype == NLTYPE_FIXED && |
4104 | 0 | NLBLOCK->nllen == 2 && |
4105 | 0 | fc == NLBLOCK->nl[0]) |
4106 | 0 | { |
4107 | 0 | mb->hitend = TRUE; |
4108 | 0 | if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; |
4109 | 0 | } |
4110 | 0 | break; |
4111 | | |
4112 | 0 | case OP_ALLANY: |
4113 | 0 | case OP_ANYBYTE: |
4114 | 0 | break; |
4115 | | |
4116 | 0 | case OP_ANYNL: |
4117 | 0 | switch(fc) |
4118 | 0 | { |
4119 | 0 | default: RRETURN(MATCH_NOMATCH); |
4120 | | |
4121 | 0 | case CHAR_CR: |
4122 | 0 | if (Feptr < mb->end_subject && UCHAR21(Feptr) == CHAR_LF) Feptr++; |
4123 | 0 | break; |
4124 | | |
4125 | 0 | case CHAR_LF: |
4126 | 0 | break; |
4127 | | |
4128 | 0 | case CHAR_VT: |
4129 | 0 | case CHAR_FF: |
4130 | 0 | case CHAR_NEL: |
4131 | 0 | #ifndef EBCDIC |
4132 | 0 | case 0x2028: |
4133 | 0 | case 0x2029: |
4134 | 0 | #endif /* Not EBCDIC */ |
4135 | 0 | if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) |
4136 | 0 | RRETURN(MATCH_NOMATCH); |
4137 | 0 | break; |
4138 | 0 | } |
4139 | 0 | break; |
4140 | | |
4141 | 0 | case OP_NOT_HSPACE: |
4142 | 0 | switch(fc) |
4143 | 0 | { |
4144 | 0 | HSPACE_CASES: RRETURN(MATCH_NOMATCH); |
4145 | 0 | default: break; |
4146 | 0 | } |
4147 | 0 | break; |
4148 | | |
4149 | 0 | case OP_HSPACE: |
4150 | 0 | switch(fc) |
4151 | 0 | { |
4152 | 0 | HSPACE_CASES: break; |
4153 | 0 | default: RRETURN(MATCH_NOMATCH); |
4154 | 0 | } |
4155 | 0 | break; |
4156 | | |
4157 | 0 | case OP_NOT_VSPACE: |
4158 | 0 | switch(fc) |
4159 | 0 | { |
4160 | 0 | VSPACE_CASES: RRETURN(MATCH_NOMATCH); |
4161 | 0 | default: break; |
4162 | 0 | } |
4163 | 0 | break; |
4164 | | |
4165 | 0 | case OP_VSPACE: |
4166 | 0 | switch(fc) |
4167 | 0 | { |
4168 | 0 | VSPACE_CASES: break; |
4169 | 0 | default: RRETURN(MATCH_NOMATCH); |
4170 | 0 | } |
4171 | 0 | break; |
4172 | | |
4173 | 0 | case OP_NOT_DIGIT: |
4174 | 0 | if (fc < 256 && (mb->ctypes[fc] & ctype_digit) != 0) |
4175 | 0 | RRETURN(MATCH_NOMATCH); |
4176 | 0 | break; |
4177 | | |
4178 | 0 | case OP_DIGIT: |
4179 | 0 | if (fc >= 256 || (mb->ctypes[fc] & ctype_digit) == 0) |
4180 | 0 | RRETURN(MATCH_NOMATCH); |
4181 | 0 | break; |
4182 | | |
4183 | 0 | case OP_NOT_WHITESPACE: |
4184 | 0 | if (fc < 256 && (mb->ctypes[fc] & ctype_space) != 0) |
4185 | 0 | RRETURN(MATCH_NOMATCH); |
4186 | 0 | break; |
4187 | | |
4188 | 0 | case OP_WHITESPACE: |
4189 | 0 | if (fc >= 256 || (mb->ctypes[fc] & ctype_space) == 0) |
4190 | 0 | RRETURN(MATCH_NOMATCH); |
4191 | 0 | break; |
4192 | | |
4193 | 0 | case OP_NOT_WORDCHAR: |
4194 | 0 | if (fc < 256 && (mb->ctypes[fc] & ctype_word) != 0) |
4195 | 0 | RRETURN(MATCH_NOMATCH); |
4196 | 0 | break; |
4197 | | |
4198 | 0 | case OP_WORDCHAR: |
4199 | 0 | if (fc >= 256 || (mb->ctypes[fc] & ctype_word) == 0) |
4200 | 0 | RRETURN(MATCH_NOMATCH); |
4201 | 0 | break; |
4202 | | |
4203 | | /* LCOV_EXCL_START */ |
4204 | 0 | default: |
4205 | 0 | PCRE2_DEBUG_UNREACHABLE(); |
4206 | 0 | return PCRE2_ERROR_INTERNAL; |
4207 | | /* LCOV_EXCL_STOP */ |
4208 | 0 | } |
4209 | 0 | } |
4210 | 0 | } |
4211 | 0 | else |
4212 | 0 | #endif /* SUPPORT_UNICODE */ |
4213 | | |
4214 | | /* Not UTF mode */ |
4215 | 0 | { |
4216 | 0 | for (;;) |
4217 | 0 | { |
4218 | 0 | RMATCH(Fecode, RM33); |
4219 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
4220 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
4221 | 0 | if (Feptr >= mb->end_subject) |
4222 | 0 | { |
4223 | 0 | SCHECK_PARTIAL(); |
4224 | 0 | RRETURN(MATCH_NOMATCH); |
4225 | 0 | } |
4226 | 0 | if (Lctype == OP_ANY && IS_NEWLINE(Feptr)) |
4227 | 0 | RRETURN(MATCH_NOMATCH); |
4228 | 0 | fc = *Feptr++; |
4229 | 0 | switch(Lctype) |
4230 | 0 | { |
4231 | 0 | case OP_ANY: /* This is the non-NL case */ |
4232 | 0 | if (mb->partial != 0 && /* Take care with CRLF partial */ |
4233 | 0 | Feptr >= mb->end_subject && |
4234 | 0 | NLBLOCK->nltype == NLTYPE_FIXED && |
4235 | 0 | NLBLOCK->nllen == 2 && |
4236 | 0 | fc == NLBLOCK->nl[0]) |
4237 | 0 | { |
4238 | 0 | mb->hitend = TRUE; |
4239 | 0 | if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; |
4240 | 0 | } |
4241 | 0 | break; |
4242 | | |
4243 | 0 | case OP_ALLANY: |
4244 | 0 | case OP_ANYBYTE: |
4245 | 0 | break; |
4246 | | |
4247 | 0 | case OP_ANYNL: |
4248 | 0 | switch(fc) |
4249 | 0 | { |
4250 | 0 | default: RRETURN(MATCH_NOMATCH); |
4251 | | |
4252 | 0 | case CHAR_CR: |
4253 | 0 | if (Feptr < mb->end_subject && *Feptr == CHAR_LF) Feptr++; |
4254 | 0 | break; |
4255 | | |
4256 | 0 | case CHAR_LF: |
4257 | 0 | break; |
4258 | | |
4259 | 0 | case CHAR_VT: |
4260 | 0 | case CHAR_FF: |
4261 | 0 | case CHAR_NEL: |
4262 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
4263 | | case 0x2028: |
4264 | | case 0x2029: |
4265 | | #endif |
4266 | 0 | if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) |
4267 | 0 | RRETURN(MATCH_NOMATCH); |
4268 | 0 | break; |
4269 | 0 | } |
4270 | 0 | break; |
4271 | | |
4272 | 0 | case OP_NOT_HSPACE: |
4273 | 0 | switch(fc) |
4274 | 0 | { |
4275 | 0 | default: break; |
4276 | 0 | HSPACE_BYTE_CASES: |
4277 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
4278 | | HSPACE_MULTIBYTE_CASES: |
4279 | | #endif |
4280 | 0 | RRETURN(MATCH_NOMATCH); |
4281 | 0 | } |
4282 | 0 | break; |
4283 | | |
4284 | 0 | case OP_HSPACE: |
4285 | 0 | switch(fc) |
4286 | 0 | { |
4287 | 0 | default: RRETURN(MATCH_NOMATCH); |
4288 | 0 | HSPACE_BYTE_CASES: |
4289 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
4290 | | HSPACE_MULTIBYTE_CASES: |
4291 | | #endif |
4292 | 0 | break; |
4293 | 0 | } |
4294 | 0 | break; |
4295 | | |
4296 | 0 | case OP_NOT_VSPACE: |
4297 | 0 | switch(fc) |
4298 | 0 | { |
4299 | 0 | default: break; |
4300 | 0 | VSPACE_BYTE_CASES: |
4301 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
4302 | | VSPACE_MULTIBYTE_CASES: |
4303 | | #endif |
4304 | 0 | RRETURN(MATCH_NOMATCH); |
4305 | 0 | } |
4306 | 0 | break; |
4307 | | |
4308 | 0 | case OP_VSPACE: |
4309 | 0 | switch(fc) |
4310 | 0 | { |
4311 | 0 | default: RRETURN(MATCH_NOMATCH); |
4312 | 0 | VSPACE_BYTE_CASES: |
4313 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
4314 | | VSPACE_MULTIBYTE_CASES: |
4315 | | #endif |
4316 | 0 | break; |
4317 | 0 | } |
4318 | 0 | break; |
4319 | | |
4320 | 0 | case OP_NOT_DIGIT: |
4321 | 0 | if (MAX_255(fc) && (mb->ctypes[fc] & ctype_digit) != 0) |
4322 | 0 | RRETURN(MATCH_NOMATCH); |
4323 | 0 | break; |
4324 | | |
4325 | 0 | case OP_DIGIT: |
4326 | 0 | if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_digit) == 0) |
4327 | 0 | RRETURN(MATCH_NOMATCH); |
4328 | 0 | break; |
4329 | | |
4330 | 0 | case OP_NOT_WHITESPACE: |
4331 | 0 | if (MAX_255(fc) && (mb->ctypes[fc] & ctype_space) != 0) |
4332 | 0 | RRETURN(MATCH_NOMATCH); |
4333 | 0 | break; |
4334 | | |
4335 | 0 | case OP_WHITESPACE: |
4336 | 0 | if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_space) == 0) |
4337 | 0 | RRETURN(MATCH_NOMATCH); |
4338 | 0 | break; |
4339 | | |
4340 | 0 | case OP_NOT_WORDCHAR: |
4341 | 0 | if (MAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0) |
4342 | 0 | RRETURN(MATCH_NOMATCH); |
4343 | 0 | break; |
4344 | | |
4345 | 0 | case OP_WORDCHAR: |
4346 | 0 | if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_word) == 0) |
4347 | 0 | RRETURN(MATCH_NOMATCH); |
4348 | 0 | break; |
4349 | | |
4350 | | /* LCOV_EXCL_START */ |
4351 | 0 | default: |
4352 | 0 | PCRE2_DEBUG_UNREACHABLE(); |
4353 | 0 | return PCRE2_ERROR_INTERNAL; |
4354 | | /* LCOV_EXCL_STOP */ |
4355 | 0 | } |
4356 | 0 | } |
4357 | 0 | } |
4358 | | |
4359 | 0 | PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */ |
4360 | 0 | } |
4361 | | |
4362 | | /* If maximizing, it is worth using inline code for speed, doing the type |
4363 | | test once at the start (i.e. keep it out of the loops). Once again, |
4364 | | "notmatch" can be an ordinary local variable because the loops do not call |
4365 | | RMATCH. */ |
4366 | | |
4367 | 0 | else |
4368 | 0 | { |
4369 | 0 | Lstart_eptr = Feptr; /* Remember where we started */ |
4370 | |
|
4371 | 0 | #ifdef SUPPORT_UNICODE |
4372 | 0 | if (proptype >= 0) |
4373 | 0 | { |
4374 | 0 | BOOL notmatch = Lctype == OP_NOTPROP; |
4375 | 0 | switch(proptype) |
4376 | 0 | { |
4377 | 0 | case PT_LAMP: |
4378 | 0 | for (i = Lmin; i < Lmax; i++) |
4379 | 0 | { |
4380 | 0 | int chartype; |
4381 | 0 | int len = 1; |
4382 | 0 | if (Feptr >= mb->end_subject) |
4383 | 0 | { |
4384 | 0 | SCHECK_PARTIAL(); |
4385 | 0 | break; |
4386 | 0 | } |
4387 | 0 | GETCHARLENTEST(fc, Feptr, len); |
4388 | 0 | chartype = UCD_CHARTYPE(fc); |
4389 | 0 | if ((chartype == ucp_Lu || |
4390 | 0 | chartype == ucp_Ll || |
4391 | 0 | chartype == ucp_Lt) == notmatch) |
4392 | 0 | break; |
4393 | 0 | Feptr+= len; |
4394 | 0 | } |
4395 | 0 | break; |
4396 | | |
4397 | 0 | case PT_GC: |
4398 | 0 | for (i = Lmin; i < Lmax; i++) |
4399 | 0 | { |
4400 | 0 | int len = 1; |
4401 | 0 | if (Feptr >= mb->end_subject) |
4402 | 0 | { |
4403 | 0 | SCHECK_PARTIAL(); |
4404 | 0 | break; |
4405 | 0 | } |
4406 | 0 | GETCHARLENTEST(fc, Feptr, len); |
4407 | 0 | if ((UCD_CATEGORY(fc) == Lpropvalue) == notmatch) break; |
4408 | 0 | Feptr+= len; |
4409 | 0 | } |
4410 | 0 | break; |
4411 | | |
4412 | 0 | case PT_PC: |
4413 | 0 | for (i = Lmin; i < Lmax; i++) |
4414 | 0 | { |
4415 | 0 | int len = 1; |
4416 | 0 | if (Feptr >= mb->end_subject) |
4417 | 0 | { |
4418 | 0 | SCHECK_PARTIAL(); |
4419 | 0 | break; |
4420 | 0 | } |
4421 | 0 | GETCHARLENTEST(fc, Feptr, len); |
4422 | 0 | if ((UCD_CHARTYPE(fc) == Lpropvalue) == notmatch) break; |
4423 | 0 | Feptr+= len; |
4424 | 0 | } |
4425 | 0 | break; |
4426 | | |
4427 | 0 | case PT_SC: |
4428 | 0 | for (i = Lmin; i < Lmax; i++) |
4429 | 0 | { |
4430 | 0 | int len = 1; |
4431 | 0 | if (Feptr >= mb->end_subject) |
4432 | 0 | { |
4433 | 0 | SCHECK_PARTIAL(); |
4434 | 0 | break; |
4435 | 0 | } |
4436 | 0 | GETCHARLENTEST(fc, Feptr, len); |
4437 | 0 | if ((UCD_SCRIPT(fc) == Lpropvalue) == notmatch) break; |
4438 | 0 | Feptr+= len; |
4439 | 0 | } |
4440 | 0 | break; |
4441 | | |
4442 | 0 | case PT_SCX: |
4443 | 0 | for (i = Lmin; i < Lmax; i++) |
4444 | 0 | { |
4445 | 0 | BOOL ok; |
4446 | 0 | const ucd_record *prop; |
4447 | 0 | int len = 1; |
4448 | 0 | if (Feptr >= mb->end_subject) |
4449 | 0 | { |
4450 | 0 | SCHECK_PARTIAL(); |
4451 | 0 | break; |
4452 | 0 | } |
4453 | 0 | GETCHARLENTEST(fc, Feptr, len); |
4454 | 0 | prop = GET_UCD(fc); |
4455 | 0 | ok = (prop->script == Lpropvalue || |
4456 | 0 | MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), Lpropvalue) != 0); |
4457 | 0 | if (ok == notmatch) break; |
4458 | 0 | Feptr+= len; |
4459 | 0 | } |
4460 | 0 | break; |
4461 | | |
4462 | 0 | case PT_ALNUM: |
4463 | 0 | for (i = Lmin; i < Lmax; i++) |
4464 | 0 | { |
4465 | 0 | int category; |
4466 | 0 | int len = 1; |
4467 | 0 | if (Feptr >= mb->end_subject) |
4468 | 0 | { |
4469 | 0 | SCHECK_PARTIAL(); |
4470 | 0 | break; |
4471 | 0 | } |
4472 | 0 | GETCHARLENTEST(fc, Feptr, len); |
4473 | 0 | category = UCD_CATEGORY(fc); |
4474 | 0 | if ((category == ucp_L || category == ucp_N) == notmatch) |
4475 | 0 | break; |
4476 | 0 | Feptr+= len; |
4477 | 0 | } |
4478 | 0 | break; |
4479 | | |
4480 | | /* Perl space used to exclude VT, but from Perl 5.18 it is included, |
4481 | | which means that Perl space and POSIX space are now identical. PCRE |
4482 | | was changed at release 8.34. */ |
4483 | | |
4484 | 0 | case PT_SPACE: /* Perl space */ |
4485 | 0 | case PT_PXSPACE: /* POSIX space */ |
4486 | 0 | for (i = Lmin; i < Lmax; i++) |
4487 | 0 | { |
4488 | 0 | int len = 1; |
4489 | 0 | if (Feptr >= mb->end_subject) |
4490 | 0 | { |
4491 | 0 | SCHECK_PARTIAL(); |
4492 | 0 | break; |
4493 | 0 | } |
4494 | 0 | GETCHARLENTEST(fc, Feptr, len); |
4495 | 0 | switch(fc) |
4496 | 0 | { |
4497 | 0 | HSPACE_CASES: |
4498 | 0 | VSPACE_CASES: |
4499 | 0 | if (notmatch) goto ENDLOOP99; /* Break the loop */ |
4500 | 0 | break; |
4501 | | |
4502 | 0 | default: |
4503 | 0 | if ((UCD_CATEGORY(fc) == ucp_Z) == notmatch) |
4504 | 0 | goto ENDLOOP99; /* Break the loop */ |
4505 | 0 | break; |
4506 | 0 | } |
4507 | 0 | Feptr+= len; |
4508 | 0 | } |
4509 | 0 | ENDLOOP99: |
4510 | 0 | break; |
4511 | | |
4512 | 0 | case PT_WORD: |
4513 | 0 | for (i = Lmin; i < Lmax; i++) |
4514 | 0 | { |
4515 | 0 | int chartype, category; |
4516 | 0 | int len = 1; |
4517 | 0 | if (Feptr >= mb->end_subject) |
4518 | 0 | { |
4519 | 0 | SCHECK_PARTIAL(); |
4520 | 0 | break; |
4521 | 0 | } |
4522 | 0 | GETCHARLENTEST(fc, Feptr, len); |
4523 | 0 | chartype = UCD_CHARTYPE(fc); |
4524 | 0 | category = PRIV(ucp_gentype)[chartype]; |
4525 | 0 | if ((category == ucp_L || |
4526 | 0 | category == ucp_N || |
4527 | 0 | chartype == ucp_Mn || |
4528 | 0 | chartype == ucp_Pc) == notmatch) |
4529 | 0 | break; |
4530 | 0 | Feptr+= len; |
4531 | 0 | } |
4532 | 0 | break; |
4533 | | |
4534 | 0 | case PT_CLIST: |
4535 | 0 | for (i = Lmin; i < Lmax; i++) |
4536 | 0 | { |
4537 | 0 | const uint32_t *cp; |
4538 | 0 | int len = 1; |
4539 | 0 | if (Feptr >= mb->end_subject) |
4540 | 0 | { |
4541 | 0 | SCHECK_PARTIAL(); |
4542 | 0 | break; |
4543 | 0 | } |
4544 | 0 | GETCHARLENTEST(fc, Feptr, len); |
4545 | | #if PCRE2_CODE_UNIT_WIDTH == 32 |
4546 | | if (fc > MAX_UTF_CODE_POINT) |
4547 | | { |
4548 | | if (!notmatch) goto GOT_MAX; |
4549 | | } |
4550 | | else |
4551 | | #endif |
4552 | 0 | { |
4553 | 0 | cp = PRIV(ucd_caseless_sets) + Lpropvalue; |
4554 | 0 | for (;;) |
4555 | 0 | { |
4556 | 0 | if (fc < *cp) |
4557 | 0 | { if (notmatch) break; else goto GOT_MAX; } |
4558 | 0 | if (fc == *cp++) |
4559 | 0 | { if (notmatch) goto GOT_MAX; else break; } |
4560 | 0 | } |
4561 | 0 | } |
4562 | | |
4563 | 0 | Feptr += len; |
4564 | 0 | } |
4565 | 0 | GOT_MAX: |
4566 | 0 | break; |
4567 | | |
4568 | 0 | case PT_UCNC: |
4569 | 0 | for (i = Lmin; i < Lmax; i++) |
4570 | 0 | { |
4571 | 0 | int len = 1; |
4572 | 0 | if (Feptr >= mb->end_subject) |
4573 | 0 | { |
4574 | 0 | SCHECK_PARTIAL(); |
4575 | 0 | break; |
4576 | 0 | } |
4577 | 0 | GETCHARLENTEST(fc, Feptr, len); |
4578 | 0 | if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT || |
4579 | 0 | fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) || |
4580 | 0 | fc >= 0xe000) == notmatch) |
4581 | 0 | break; |
4582 | 0 | Feptr += len; |
4583 | 0 | } |
4584 | 0 | break; |
4585 | | |
4586 | 0 | case PT_BIDICL: |
4587 | 0 | for (i = Lmin; i < Lmax; i++) |
4588 | 0 | { |
4589 | 0 | int len = 1; |
4590 | 0 | if (Feptr >= mb->end_subject) |
4591 | 0 | { |
4592 | 0 | SCHECK_PARTIAL(); |
4593 | 0 | break; |
4594 | 0 | } |
4595 | 0 | GETCHARLENTEST(fc, Feptr, len); |
4596 | 0 | if ((UCD_BIDICLASS(fc) == Lpropvalue) == notmatch) break; |
4597 | 0 | Feptr+= len; |
4598 | 0 | } |
4599 | 0 | break; |
4600 | | |
4601 | 0 | case PT_BOOL: |
4602 | 0 | for (i = Lmin; i < Lmax; i++) |
4603 | 0 | { |
4604 | 0 | BOOL ok; |
4605 | 0 | const ucd_record *prop; |
4606 | 0 | int len = 1; |
4607 | 0 | if (Feptr >= mb->end_subject) |
4608 | 0 | { |
4609 | 0 | SCHECK_PARTIAL(); |
4610 | 0 | break; |
4611 | 0 | } |
4612 | 0 | GETCHARLENTEST(fc, Feptr, len); |
4613 | 0 | prop = GET_UCD(fc); |
4614 | 0 | ok = MAPBIT(PRIV(ucd_boolprop_sets) + |
4615 | 0 | UCD_BPROPS_PROP(prop), Lpropvalue) != 0; |
4616 | 0 | if (ok == notmatch) break; |
4617 | 0 | Feptr+= len; |
4618 | 0 | } |
4619 | 0 | break; |
4620 | | |
4621 | | /* LCOV_EXCL_START */ |
4622 | 0 | default: |
4623 | 0 | PCRE2_DEBUG_UNREACHABLE(); |
4624 | 0 | return PCRE2_ERROR_INTERNAL; |
4625 | | /* LCOV_EXCL_STOP */ |
4626 | 0 | } |
4627 | | |
4628 | | /* Feptr is now past the end of the maximum run */ |
4629 | | |
4630 | 0 | if (reptype == REPTYPE_POS) continue; /* No backtracking */ |
4631 | | |
4632 | | /* After \C in UTF mode, Lstart_eptr might be in the middle of a |
4633 | | Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't |
4634 | | go too far. */ |
4635 | | |
4636 | 0 | for(;;) |
4637 | 0 | { |
4638 | 0 | if (Feptr <= Lstart_eptr) break; |
4639 | 0 | RMATCH(Fecode, RM221); |
4640 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
4641 | 0 | Feptr--; |
4642 | 0 | if (utf) BACKCHAR(Feptr); |
4643 | 0 | } |
4644 | 0 | } |
4645 | | |
4646 | | /* Match extended Unicode grapheme clusters. We will get here only if the |
4647 | | support is in the binary; otherwise a compile-time error occurs. */ |
4648 | | |
4649 | 0 | else if (Lctype == OP_EXTUNI) |
4650 | 0 | { |
4651 | 0 | for (i = Lmin; i < Lmax; i++) |
4652 | 0 | { |
4653 | 0 | if (Feptr >= mb->end_subject) |
4654 | 0 | { |
4655 | 0 | SCHECK_PARTIAL(); |
4656 | 0 | break; |
4657 | 0 | } |
4658 | 0 | else |
4659 | 0 | { |
4660 | 0 | GETCHARINCTEST(fc, Feptr); |
4661 | 0 | Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject, |
4662 | 0 | utf, NULL); |
4663 | 0 | } |
4664 | 0 | CHECK_PARTIAL(); |
4665 | 0 | } |
4666 | | |
4667 | | /* Feptr is now past the end of the maximum run */ |
4668 | | |
4669 | 0 | if (reptype == REPTYPE_POS) continue; /* No backtracking */ |
4670 | | |
4671 | | /* We use <= Lstart_eptr rather than == Lstart_eptr to detect the start |
4672 | | of the run while backtracking because the use of \C in UTF mode can |
4673 | | cause BACKCHAR to move back past Lstart_eptr. This is just palliative; |
4674 | | the use of \C in UTF mode is fraught with danger. */ |
4675 | | |
4676 | 0 | for(;;) |
4677 | 0 | { |
4678 | 0 | int lgb, rgb; |
4679 | 0 | PCRE2_SPTR fptr; |
4680 | |
|
4681 | 0 | if (Feptr <= Lstart_eptr) break; /* At start of char run */ |
4682 | 0 | RMATCH(Fecode, RM219); |
4683 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
4684 | | |
4685 | | /* Backtracking over an extended grapheme cluster involves inspecting |
4686 | | the previous two characters (if present) to see if a break is |
4687 | | permitted between them. */ |
4688 | | |
4689 | 0 | Feptr--; |
4690 | 0 | if (!utf) fc = *Feptr; else |
4691 | 0 | { |
4692 | 0 | BACKCHAR(Feptr); |
4693 | 0 | GETCHAR(fc, Feptr); |
4694 | 0 | } |
4695 | 0 | rgb = UCD_GRAPHBREAK(fc); |
4696 | |
|
4697 | 0 | for (;;) |
4698 | 0 | { |
4699 | 0 | if (Feptr <= Lstart_eptr) break; /* At start of char run */ |
4700 | 0 | fptr = Feptr - 1; |
4701 | 0 | if (!utf) fc = *fptr; else |
4702 | 0 | { |
4703 | 0 | BACKCHAR(fptr); |
4704 | 0 | GETCHAR(fc, fptr); |
4705 | 0 | } |
4706 | 0 | lgb = UCD_GRAPHBREAK(fc); |
4707 | 0 | if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break; |
4708 | 0 | Feptr = fptr; |
4709 | 0 | rgb = lgb; |
4710 | 0 | } |
4711 | 0 | } |
4712 | 0 | } |
4713 | | |
4714 | 0 | else |
4715 | 0 | #endif /* SUPPORT_UNICODE */ |
4716 | | |
4717 | 0 | #ifdef SUPPORT_UNICODE |
4718 | 0 | if (utf) |
4719 | 0 | { |
4720 | 0 | switch(Lctype) |
4721 | 0 | { |
4722 | 0 | case OP_ANY: |
4723 | 0 | for (i = Lmin; i < Lmax; i++) |
4724 | 0 | { |
4725 | 0 | if (Feptr >= mb->end_subject) |
4726 | 0 | { |
4727 | 0 | SCHECK_PARTIAL(); |
4728 | 0 | break; |
4729 | 0 | } |
4730 | 0 | if (IS_NEWLINE(Feptr)) break; |
4731 | 0 | if (mb->partial != 0 && /* Take care with CRLF partial */ |
4732 | 0 | Feptr + 1 >= mb->end_subject && |
4733 | 0 | NLBLOCK->nltype == NLTYPE_FIXED && |
4734 | 0 | NLBLOCK->nllen == 2 && |
4735 | 0 | UCHAR21(Feptr) == NLBLOCK->nl[0]) |
4736 | 0 | { |
4737 | 0 | mb->hitend = TRUE; |
4738 | 0 | if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; |
4739 | 0 | } |
4740 | 0 | Feptr++; |
4741 | 0 | ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++); |
4742 | 0 | } |
4743 | 0 | break; |
4744 | | |
4745 | 0 | case OP_ALLANY: |
4746 | 0 | if (Lmax < UINT32_MAX) |
4747 | 0 | { |
4748 | 0 | for (i = Lmin; i < Lmax; i++) |
4749 | 0 | { |
4750 | 0 | if (Feptr >= mb->end_subject) |
4751 | 0 | { |
4752 | 0 | SCHECK_PARTIAL(); |
4753 | 0 | break; |
4754 | 0 | } |
4755 | 0 | Feptr++; |
4756 | 0 | ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++); |
4757 | 0 | } |
4758 | 0 | } |
4759 | 0 | else |
4760 | 0 | { |
4761 | 0 | Feptr = mb->end_subject; /* Unlimited UTF-8 repeat */ |
4762 | 0 | SCHECK_PARTIAL(); |
4763 | 0 | } |
4764 | 0 | break; |
4765 | | |
4766 | | /* The "byte" (i.e. "code unit") case is the same as non-UTF */ |
4767 | | |
4768 | 0 | case OP_ANYBYTE: |
4769 | 0 | fc = Lmax - Lmin; |
4770 | 0 | if (fc > (uint32_t)(mb->end_subject - Feptr)) |
4771 | 0 | { |
4772 | 0 | Feptr = mb->end_subject; |
4773 | 0 | SCHECK_PARTIAL(); |
4774 | 0 | } |
4775 | 0 | else Feptr += fc; |
4776 | 0 | break; |
4777 | | |
4778 | 0 | case OP_ANYNL: |
4779 | 0 | for (i = Lmin; i < Lmax; i++) |
4780 | 0 | { |
4781 | 0 | int len = 1; |
4782 | 0 | if (Feptr >= mb->end_subject) |
4783 | 0 | { |
4784 | 0 | SCHECK_PARTIAL(); |
4785 | 0 | break; |
4786 | 0 | } |
4787 | 0 | GETCHARLEN(fc, Feptr, len); |
4788 | 0 | if (fc == CHAR_CR) |
4789 | 0 | { |
4790 | 0 | if (++Feptr >= mb->end_subject) break; |
4791 | 0 | if (UCHAR21(Feptr) == CHAR_LF) Feptr++; |
4792 | 0 | } |
4793 | 0 | else |
4794 | 0 | { |
4795 | 0 | if (fc != CHAR_LF && |
4796 | 0 | (mb->bsr_convention == PCRE2_BSR_ANYCRLF || |
4797 | 0 | (fc != CHAR_VT && fc != CHAR_FF && fc != CHAR_NEL |
4798 | 0 | #ifndef EBCDIC |
4799 | 0 | && fc != 0x2028 && fc != 0x2029 |
4800 | 0 | #endif /* Not EBCDIC */ |
4801 | 0 | ))) |
4802 | 0 | break; |
4803 | 0 | Feptr += len; |
4804 | 0 | } |
4805 | 0 | } |
4806 | 0 | break; |
4807 | | |
4808 | 0 | case OP_NOT_HSPACE: |
4809 | 0 | case OP_HSPACE: |
4810 | 0 | for (i = Lmin; i < Lmax; i++) |
4811 | 0 | { |
4812 | 0 | BOOL gotspace; |
4813 | 0 | int len = 1; |
4814 | 0 | if (Feptr >= mb->end_subject) |
4815 | 0 | { |
4816 | 0 | SCHECK_PARTIAL(); |
4817 | 0 | break; |
4818 | 0 | } |
4819 | 0 | GETCHARLEN(fc, Feptr, len); |
4820 | 0 | switch(fc) |
4821 | 0 | { |
4822 | 0 | HSPACE_CASES: gotspace = TRUE; break; |
4823 | 0 | default: gotspace = FALSE; break; |
4824 | 0 | } |
4825 | 0 | if (gotspace == (Lctype == OP_NOT_HSPACE)) break; |
4826 | 0 | Feptr += len; |
4827 | 0 | } |
4828 | 0 | break; |
4829 | | |
4830 | 0 | case OP_NOT_VSPACE: |
4831 | 0 | case OP_VSPACE: |
4832 | 0 | for (i = Lmin; i < Lmax; i++) |
4833 | 0 | { |
4834 | 0 | BOOL gotspace; |
4835 | 0 | int len = 1; |
4836 | 0 | if (Feptr >= mb->end_subject) |
4837 | 0 | { |
4838 | 0 | SCHECK_PARTIAL(); |
4839 | 0 | break; |
4840 | 0 | } |
4841 | 0 | GETCHARLEN(fc, Feptr, len); |
4842 | 0 | switch(fc) |
4843 | 0 | { |
4844 | 0 | VSPACE_CASES: gotspace = TRUE; break; |
4845 | 0 | default: gotspace = FALSE; break; |
4846 | 0 | } |
4847 | 0 | if (gotspace == (Lctype == OP_NOT_VSPACE)) break; |
4848 | 0 | Feptr += len; |
4849 | 0 | } |
4850 | 0 | break; |
4851 | | |
4852 | 0 | case OP_NOT_DIGIT: |
4853 | 0 | for (i = Lmin; i < Lmax; i++) |
4854 | 0 | { |
4855 | 0 | int len = 1; |
4856 | 0 | if (Feptr >= mb->end_subject) |
4857 | 0 | { |
4858 | 0 | SCHECK_PARTIAL(); |
4859 | 0 | break; |
4860 | 0 | } |
4861 | 0 | GETCHARLEN(fc, Feptr, len); |
4862 | 0 | if (fc < 256 && (mb->ctypes[fc] & ctype_digit) != 0) break; |
4863 | 0 | Feptr+= len; |
4864 | 0 | } |
4865 | 0 | break; |
4866 | | |
4867 | 0 | case OP_DIGIT: |
4868 | 0 | for (i = Lmin; i < Lmax; i++) |
4869 | 0 | { |
4870 | 0 | int len = 1; |
4871 | 0 | if (Feptr >= mb->end_subject) |
4872 | 0 | { |
4873 | 0 | SCHECK_PARTIAL(); |
4874 | 0 | break; |
4875 | 0 | } |
4876 | 0 | GETCHARLEN(fc, Feptr, len); |
4877 | 0 | if (fc >= 256 ||(mb->ctypes[fc] & ctype_digit) == 0) break; |
4878 | 0 | Feptr+= len; |
4879 | 0 | } |
4880 | 0 | break; |
4881 | | |
4882 | 0 | case OP_NOT_WHITESPACE: |
4883 | 0 | for (i = Lmin; i < Lmax; i++) |
4884 | 0 | { |
4885 | 0 | int len = 1; |
4886 | 0 | if (Feptr >= mb->end_subject) |
4887 | 0 | { |
4888 | 0 | SCHECK_PARTIAL(); |
4889 | 0 | break; |
4890 | 0 | } |
4891 | 0 | GETCHARLEN(fc, Feptr, len); |
4892 | 0 | if (fc < 256 && (mb->ctypes[fc] & ctype_space) != 0) break; |
4893 | 0 | Feptr+= len; |
4894 | 0 | } |
4895 | 0 | break; |
4896 | | |
4897 | 0 | case OP_WHITESPACE: |
4898 | 0 | for (i = Lmin; i < Lmax; i++) |
4899 | 0 | { |
4900 | 0 | int len = 1; |
4901 | 0 | if (Feptr >= mb->end_subject) |
4902 | 0 | { |
4903 | 0 | SCHECK_PARTIAL(); |
4904 | 0 | break; |
4905 | 0 | } |
4906 | 0 | GETCHARLEN(fc, Feptr, len); |
4907 | 0 | if (fc >= 256 ||(mb->ctypes[fc] & ctype_space) == 0) break; |
4908 | 0 | Feptr+= len; |
4909 | 0 | } |
4910 | 0 | break; |
4911 | | |
4912 | 0 | case OP_NOT_WORDCHAR: |
4913 | 0 | for (i = Lmin; i < Lmax; i++) |
4914 | 0 | { |
4915 | 0 | int len = 1; |
4916 | 0 | if (Feptr >= mb->end_subject) |
4917 | 0 | { |
4918 | 0 | SCHECK_PARTIAL(); |
4919 | 0 | break; |
4920 | 0 | } |
4921 | 0 | GETCHARLEN(fc, Feptr, len); |
4922 | 0 | if (fc < 256 && (mb->ctypes[fc] & ctype_word) != 0) break; |
4923 | 0 | Feptr+= len; |
4924 | 0 | } |
4925 | 0 | break; |
4926 | | |
4927 | 0 | case OP_WORDCHAR: |
4928 | 0 | for (i = Lmin; i < Lmax; i++) |
4929 | 0 | { |
4930 | 0 | int len = 1; |
4931 | 0 | if (Feptr >= mb->end_subject) |
4932 | 0 | { |
4933 | 0 | SCHECK_PARTIAL(); |
4934 | 0 | break; |
4935 | 0 | } |
4936 | 0 | GETCHARLEN(fc, Feptr, len); |
4937 | 0 | if (fc >= 256 || (mb->ctypes[fc] & ctype_word) == 0) break; |
4938 | 0 | Feptr+= len; |
4939 | 0 | } |
4940 | 0 | break; |
4941 | | |
4942 | | /* LCOV_EXCL_START */ |
4943 | 0 | default: |
4944 | 0 | PCRE2_DEBUG_UNREACHABLE(); |
4945 | 0 | return PCRE2_ERROR_INTERNAL; |
4946 | | /* LCOV_EXCL_STOP */ |
4947 | 0 | } |
4948 | | |
4949 | 0 | if (reptype == REPTYPE_POS) continue; /* No backtracking */ |
4950 | | |
4951 | | /* After \C in UTF mode, Lstart_eptr might be in the middle of a |
4952 | | Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't go |
4953 | | too far. */ |
4954 | | |
4955 | 0 | for(;;) |
4956 | 0 | { |
4957 | 0 | if (Feptr <= Lstart_eptr) break; |
4958 | 0 | RMATCH(Fecode, RM220); |
4959 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
4960 | 0 | Feptr--; |
4961 | 0 | BACKCHAR(Feptr); |
4962 | 0 | if (Lctype == OP_ANYNL && Feptr > Lstart_eptr && |
4963 | 0 | UCHAR21(Feptr) == CHAR_NL && UCHAR21(Feptr - 1) == CHAR_CR) |
4964 | 0 | Feptr--; |
4965 | 0 | } |
4966 | 0 | } |
4967 | 0 | else |
4968 | 0 | #endif /* SUPPORT_UNICODE */ |
4969 | | |
4970 | | /* Not UTF mode */ |
4971 | 0 | { |
4972 | 0 | switch(Lctype) |
4973 | 0 | { |
4974 | 0 | case OP_ANY: |
4975 | 0 | for (i = Lmin; i < Lmax; i++) |
4976 | 0 | { |
4977 | 0 | if (Feptr >= mb->end_subject) |
4978 | 0 | { |
4979 | 0 | SCHECK_PARTIAL(); |
4980 | 0 | break; |
4981 | 0 | } |
4982 | 0 | if (IS_NEWLINE(Feptr)) break; |
4983 | 0 | if (mb->partial != 0 && /* Take care with CRLF partial */ |
4984 | 0 | Feptr + 1 >= mb->end_subject && |
4985 | 0 | NLBLOCK->nltype == NLTYPE_FIXED && |
4986 | 0 | NLBLOCK->nllen == 2 && |
4987 | 0 | *Feptr == NLBLOCK->nl[0]) |
4988 | 0 | { |
4989 | 0 | mb->hitend = TRUE; |
4990 | 0 | if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; |
4991 | 0 | } |
4992 | 0 | Feptr++; |
4993 | 0 | } |
4994 | 0 | break; |
4995 | | |
4996 | 0 | case OP_ALLANY: |
4997 | 0 | case OP_ANYBYTE: |
4998 | 0 | fc = Lmax - Lmin; |
4999 | 0 | if (fc > (uint32_t)(mb->end_subject - Feptr)) |
5000 | 0 | { |
5001 | 0 | Feptr = mb->end_subject; |
5002 | 0 | SCHECK_PARTIAL(); |
5003 | 0 | } |
5004 | 0 | else Feptr += fc; |
5005 | 0 | break; |
5006 | | |
5007 | 0 | case OP_ANYNL: |
5008 | 0 | for (i = Lmin; i < Lmax; i++) |
5009 | 0 | { |
5010 | 0 | if (Feptr >= mb->end_subject) |
5011 | 0 | { |
5012 | 0 | SCHECK_PARTIAL(); |
5013 | 0 | break; |
5014 | 0 | } |
5015 | 0 | fc = *Feptr; |
5016 | 0 | if (fc == CHAR_CR) |
5017 | 0 | { |
5018 | 0 | if (++Feptr >= mb->end_subject) break; |
5019 | 0 | if (*Feptr == CHAR_LF) Feptr++; |
5020 | 0 | } |
5021 | 0 | else |
5022 | 0 | { |
5023 | 0 | if (fc != CHAR_LF && (mb->bsr_convention == PCRE2_BSR_ANYCRLF || |
5024 | 0 | (fc != CHAR_VT && fc != CHAR_FF && fc != CHAR_NEL |
5025 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
5026 | | && fc != 0x2028 && fc != 0x2029 |
5027 | | #endif |
5028 | 0 | ))) break; |
5029 | 0 | Feptr++; |
5030 | 0 | } |
5031 | 0 | } |
5032 | 0 | break; |
5033 | | |
5034 | 0 | case OP_NOT_HSPACE: |
5035 | 0 | for (i = Lmin; i < Lmax; i++) |
5036 | 0 | { |
5037 | 0 | if (Feptr >= mb->end_subject) |
5038 | 0 | { |
5039 | 0 | SCHECK_PARTIAL(); |
5040 | 0 | break; |
5041 | 0 | } |
5042 | 0 | switch(*Feptr) |
5043 | 0 | { |
5044 | 0 | default: Feptr++; break; |
5045 | 0 | HSPACE_BYTE_CASES: |
5046 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
5047 | | HSPACE_MULTIBYTE_CASES: |
5048 | | #endif |
5049 | 0 | goto ENDLOOP00; |
5050 | 0 | } |
5051 | 0 | } |
5052 | 0 | ENDLOOP00: |
5053 | 0 | break; |
5054 | | |
5055 | 0 | case OP_HSPACE: |
5056 | 0 | for (i = Lmin; i < Lmax; i++) |
5057 | 0 | { |
5058 | 0 | if (Feptr >= mb->end_subject) |
5059 | 0 | { |
5060 | 0 | SCHECK_PARTIAL(); |
5061 | 0 | break; |
5062 | 0 | } |
5063 | 0 | switch(*Feptr) |
5064 | 0 | { |
5065 | 0 | default: goto ENDLOOP01; |
5066 | 0 | HSPACE_BYTE_CASES: |
5067 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
5068 | | HSPACE_MULTIBYTE_CASES: |
5069 | | #endif |
5070 | 0 | Feptr++; break; |
5071 | 0 | } |
5072 | 0 | } |
5073 | 0 | ENDLOOP01: |
5074 | 0 | break; |
5075 | | |
5076 | 0 | case OP_NOT_VSPACE: |
5077 | 0 | for (i = Lmin; i < Lmax; i++) |
5078 | 0 | { |
5079 | 0 | if (Feptr >= mb->end_subject) |
5080 | 0 | { |
5081 | 0 | SCHECK_PARTIAL(); |
5082 | 0 | break; |
5083 | 0 | } |
5084 | 0 | switch(*Feptr) |
5085 | 0 | { |
5086 | 0 | default: Feptr++; break; |
5087 | 0 | VSPACE_BYTE_CASES: |
5088 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
5089 | | VSPACE_MULTIBYTE_CASES: |
5090 | | #endif |
5091 | 0 | goto ENDLOOP02; |
5092 | 0 | } |
5093 | 0 | } |
5094 | 0 | ENDLOOP02: |
5095 | 0 | break; |
5096 | | |
5097 | 0 | case OP_VSPACE: |
5098 | 0 | for (i = Lmin; i < Lmax; i++) |
5099 | 0 | { |
5100 | 0 | if (Feptr >= mb->end_subject) |
5101 | 0 | { |
5102 | 0 | SCHECK_PARTIAL(); |
5103 | 0 | break; |
5104 | 0 | } |
5105 | 0 | switch(*Feptr) |
5106 | 0 | { |
5107 | 0 | default: goto ENDLOOP03; |
5108 | 0 | VSPACE_BYTE_CASES: |
5109 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
5110 | | VSPACE_MULTIBYTE_CASES: |
5111 | | #endif |
5112 | 0 | Feptr++; break; |
5113 | 0 | } |
5114 | 0 | } |
5115 | 0 | ENDLOOP03: |
5116 | 0 | break; |
5117 | | |
5118 | 0 | case OP_NOT_DIGIT: |
5119 | 0 | for (i = Lmin; i < Lmax; i++) |
5120 | 0 | { |
5121 | 0 | if (Feptr >= mb->end_subject) |
5122 | 0 | { |
5123 | 0 | SCHECK_PARTIAL(); |
5124 | 0 | break; |
5125 | 0 | } |
5126 | 0 | if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_digit) != 0) |
5127 | 0 | break; |
5128 | 0 | Feptr++; |
5129 | 0 | } |
5130 | 0 | break; |
5131 | | |
5132 | 0 | case OP_DIGIT: |
5133 | 0 | for (i = Lmin; i < Lmax; i++) |
5134 | 0 | { |
5135 | 0 | if (Feptr >= mb->end_subject) |
5136 | 0 | { |
5137 | 0 | SCHECK_PARTIAL(); |
5138 | 0 | break; |
5139 | 0 | } |
5140 | 0 | if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_digit) == 0) |
5141 | 0 | break; |
5142 | 0 | Feptr++; |
5143 | 0 | } |
5144 | 0 | break; |
5145 | | |
5146 | 0 | case OP_NOT_WHITESPACE: |
5147 | 0 | for (i = Lmin; i < Lmax; i++) |
5148 | 0 | { |
5149 | 0 | if (Feptr >= mb->end_subject) |
5150 | 0 | { |
5151 | 0 | SCHECK_PARTIAL(); |
5152 | 0 | break; |
5153 | 0 | } |
5154 | 0 | if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_space) != 0) |
5155 | 0 | break; |
5156 | 0 | Feptr++; |
5157 | 0 | } |
5158 | 0 | break; |
5159 | | |
5160 | 0 | case OP_WHITESPACE: |
5161 | 0 | for (i = Lmin; i < Lmax; i++) |
5162 | 0 | { |
5163 | 0 | if (Feptr >= mb->end_subject) |
5164 | 0 | { |
5165 | 0 | SCHECK_PARTIAL(); |
5166 | 0 | break; |
5167 | 0 | } |
5168 | 0 | if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_space) == 0) |
5169 | 0 | break; |
5170 | 0 | Feptr++; |
5171 | 0 | } |
5172 | 0 | break; |
5173 | | |
5174 | 0 | case OP_NOT_WORDCHAR: |
5175 | 0 | for (i = Lmin; i < Lmax; i++) |
5176 | 0 | { |
5177 | 0 | if (Feptr >= mb->end_subject) |
5178 | 0 | { |
5179 | 0 | SCHECK_PARTIAL(); |
5180 | 0 | break; |
5181 | 0 | } |
5182 | 0 | if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_word) != 0) |
5183 | 0 | break; |
5184 | 0 | Feptr++; |
5185 | 0 | } |
5186 | 0 | break; |
5187 | | |
5188 | 0 | case OP_WORDCHAR: |
5189 | 0 | for (i = Lmin; i < Lmax; i++) |
5190 | 0 | { |
5191 | 0 | if (Feptr >= mb->end_subject) |
5192 | 0 | { |
5193 | 0 | SCHECK_PARTIAL(); |
5194 | 0 | break; |
5195 | 0 | } |
5196 | 0 | if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_word) == 0) |
5197 | 0 | break; |
5198 | 0 | Feptr++; |
5199 | 0 | } |
5200 | 0 | break; |
5201 | | |
5202 | | /* LCOV_EXCL_START */ |
5203 | 0 | default: |
5204 | 0 | PCRE2_DEBUG_UNREACHABLE(); |
5205 | 0 | return PCRE2_ERROR_INTERNAL; |
5206 | | /* LCOV_EXCL_STOP */ |
5207 | 0 | } |
5208 | | |
5209 | 0 | if (reptype == REPTYPE_POS) continue; /* No backtracking */ |
5210 | | |
5211 | 0 | for (;;) |
5212 | 0 | { |
5213 | 0 | if (Feptr == Lstart_eptr) break; |
5214 | 0 | RMATCH(Fecode, RM34); |
5215 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
5216 | 0 | Feptr--; |
5217 | 0 | if (Lctype == OP_ANYNL && Feptr > Lstart_eptr && *Feptr == CHAR_LF && |
5218 | 0 | Feptr[-1] == CHAR_CR) Feptr--; |
5219 | 0 | } |
5220 | 0 | } |
5221 | 0 | } |
5222 | 0 | break; /* End of repeat character type processing */ |
5223 | | |
5224 | 0 | #undef Lstart_eptr |
5225 | 0 | #undef Lmin |
5226 | 0 | #undef Lmax |
5227 | 0 | #undef Lctype |
5228 | 0 | #undef Lpropvalue |
5229 | | |
5230 | | |
5231 | | /* ===================================================================== */ |
5232 | | /* Match a back reference, possibly repeatedly. Look past the end of the |
5233 | | item to see if there is repeat information following. The OP_REF and |
5234 | | OP_REFI opcodes are used for a reference to a numbered group or to a |
5235 | | non-duplicated named group. For a duplicated named group, OP_DNREF and |
5236 | | OP_DNREFI are used. In this case we must scan the list of groups to which |
5237 | | the name refers, and use the first one that is set. */ |
5238 | | |
5239 | 0 | #define Lmin F->temp_32[0] |
5240 | 0 | #define Lmax F->temp_32[1] |
5241 | 0 | #define Lcaseless F->temp_32[2] |
5242 | 0 | #define Lcaseopts F->temp_32[3] |
5243 | 0 | #define Lstart F->temp_sptr[0] |
5244 | 0 | #define Loffset F->temp_size |
5245 | | |
5246 | 0 | case OP_DNREF: |
5247 | 0 | case OP_DNREFI: |
5248 | 0 | Lcaseless = (Fop == OP_DNREFI); |
5249 | 0 | Lcaseopts = (Fop == OP_DNREFI)? Fecode[1 + 2*IMM2_SIZE] : 0; |
5250 | 0 | { |
5251 | 0 | int count = GET2(Fecode, 1+IMM2_SIZE); |
5252 | 0 | PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size; |
5253 | 0 | Fecode += 1 + 2*IMM2_SIZE + (Fop == OP_DNREFI? 1 : 0); |
5254 | |
|
5255 | 0 | while (count-- > 0) |
5256 | 0 | { |
5257 | 0 | Loffset = (GET2(slot, 0) << 1) - 2; |
5258 | 0 | if (Loffset < Foffset_top && Fovector[Loffset] != PCRE2_UNSET) break; |
5259 | 0 | slot += mb->name_entry_size; |
5260 | 0 | } |
5261 | 0 | } |
5262 | 0 | goto REF_REPEAT; |
5263 | | |
5264 | 0 | case OP_REF: |
5265 | 0 | case OP_REFI: |
5266 | 0 | Lcaseless = (Fop == OP_REFI); |
5267 | 0 | Lcaseopts = (Fop == OP_REFI)? Fecode[1 + IMM2_SIZE] : 0; |
5268 | 0 | Loffset = (GET2(Fecode, 1) << 1) - 2; |
5269 | 0 | Fecode += 1 + IMM2_SIZE + (Fop == OP_REFI? 1 : 0); |
5270 | | |
5271 | | /* Set up for repetition, or handle the non-repeated case. The maximum and |
5272 | | minimum must be in the heap frame, but as they are short-term values, we |
5273 | | use temporary fields. */ |
5274 | |
|
5275 | 0 | REF_REPEAT: |
5276 | 0 | switch (*Fecode) |
5277 | 0 | { |
5278 | 0 | case OP_CRSTAR: |
5279 | 0 | case OP_CRMINSTAR: |
5280 | 0 | case OP_CRPLUS: |
5281 | 0 | case OP_CRMINPLUS: |
5282 | 0 | case OP_CRQUERY: |
5283 | 0 | case OP_CRMINQUERY: |
5284 | 0 | fc = *Fecode++ - OP_CRSTAR; |
5285 | 0 | Lmin = rep_min[fc]; |
5286 | 0 | Lmax = rep_max[fc]; |
5287 | 0 | reptype = rep_typ[fc]; |
5288 | 0 | break; |
5289 | | |
5290 | 0 | case OP_CRRANGE: |
5291 | 0 | case OP_CRMINRANGE: |
5292 | 0 | Lmin = GET2(Fecode, 1); |
5293 | 0 | Lmax = GET2(Fecode, 1 + IMM2_SIZE); |
5294 | 0 | reptype = rep_typ[*Fecode - OP_CRSTAR]; |
5295 | 0 | if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */ |
5296 | 0 | Fecode += 1 + 2 * IMM2_SIZE; |
5297 | 0 | break; |
5298 | | |
5299 | 0 | default: /* No repeat follows */ |
5300 | 0 | { |
5301 | 0 | rrc = match_ref(Loffset, Lcaseless, Lcaseopts, F, mb, &length); |
5302 | 0 | if (rrc != 0) |
5303 | 0 | { |
5304 | 0 | if (rrc > 0) Feptr = mb->end_subject; /* Partial match */ |
5305 | 0 | CHECK_PARTIAL(); |
5306 | 0 | RRETURN(MATCH_NOMATCH); |
5307 | 0 | } |
5308 | 0 | } |
5309 | 0 | Feptr += length; |
5310 | 0 | continue; /* With the main loop */ |
5311 | 0 | } |
5312 | | |
5313 | | /* Handle repeated back references. If a set group has length zero, just |
5314 | | continue with the main loop, because it matches however many times. For an |
5315 | | unset reference, if the minimum is zero, we can also just continue. We can |
5316 | | also continue if PCRE2_MATCH_UNSET_BACKREF is set, because this makes an unset |
5317 | | group behave as a zero-length group. For any other unset cases, carrying |
5318 | | on will result in NOMATCH. */ |
5319 | | |
5320 | 0 | if (Loffset < Foffset_top && Fovector[Loffset] != PCRE2_UNSET) |
5321 | 0 | { |
5322 | 0 | if (Fovector[Loffset] == Fovector[Loffset + 1]) continue; |
5323 | 0 | } |
5324 | 0 | else /* Group is not set */ |
5325 | 0 | { |
5326 | 0 | if (Lmin == 0 || (mb->poptions & PCRE2_MATCH_UNSET_BACKREF) != 0) |
5327 | 0 | continue; |
5328 | 0 | } |
5329 | | |
5330 | | /* First, ensure the minimum number of matches are present. */ |
5331 | | |
5332 | 0 | for (i = 1; i <= Lmin; i++) |
5333 | 0 | { |
5334 | 0 | PCRE2_SIZE slength; |
5335 | 0 | rrc = match_ref(Loffset, Lcaseless, Lcaseopts, F, mb, &slength); |
5336 | 0 | if (rrc != 0) |
5337 | 0 | { |
5338 | 0 | if (rrc > 0) Feptr = mb->end_subject; /* Partial match */ |
5339 | 0 | CHECK_PARTIAL(); |
5340 | 0 | RRETURN(MATCH_NOMATCH); |
5341 | 0 | } |
5342 | 0 | Feptr += slength; |
5343 | 0 | } |
5344 | | |
5345 | | /* If min = max, we are done. They are not both allowed to be zero. */ |
5346 | | |
5347 | 0 | if (Lmin == Lmax) continue; |
5348 | | |
5349 | | /* If minimizing, keep trying and advancing the pointer. */ |
5350 | | |
5351 | 0 | if (reptype == REPTYPE_MIN) |
5352 | 0 | { |
5353 | 0 | for (;;) |
5354 | 0 | { |
5355 | 0 | PCRE2_SIZE slength; |
5356 | 0 | RMATCH(Fecode, RM20); |
5357 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
5358 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
5359 | 0 | rrc = match_ref(Loffset, Lcaseless, Lcaseopts, F, mb, &slength); |
5360 | 0 | if (rrc != 0) |
5361 | 0 | { |
5362 | 0 | if (rrc > 0) Feptr = mb->end_subject; /* Partial match */ |
5363 | 0 | CHECK_PARTIAL(); |
5364 | 0 | RRETURN(MATCH_NOMATCH); |
5365 | 0 | } |
5366 | 0 | Feptr += slength; |
5367 | 0 | } |
5368 | | |
5369 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
5370 | 0 | } |
5371 | | |
5372 | | /* If maximizing, find the longest string and work backwards, as long as |
5373 | | the matched lengths for each iteration are the same. */ |
5374 | | |
5375 | 0 | else |
5376 | 0 | { |
5377 | 0 | BOOL samelengths = TRUE; |
5378 | 0 | Lstart = Feptr; /* Starting position */ |
5379 | 0 | Flength = Fovector[Loffset+1] - Fovector[Loffset]; |
5380 | |
|
5381 | 0 | for (i = Lmin; i < Lmax; i++) |
5382 | 0 | { |
5383 | 0 | PCRE2_SIZE slength; |
5384 | 0 | rrc = match_ref(Loffset, Lcaseless, Lcaseopts, F, mb, &slength); |
5385 | 0 | if (rrc != 0) |
5386 | 0 | { |
5387 | | /* Can't use CHECK_PARTIAL because we don't want to update Feptr in |
5388 | | the soft partial matching case. */ |
5389 | |
|
5390 | 0 | if (rrc > 0 && mb->partial != 0 && |
5391 | 0 | mb->end_subject > mb->start_used_ptr) |
5392 | 0 | { |
5393 | 0 | mb->hitend = TRUE; |
5394 | 0 | if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; |
5395 | 0 | } |
5396 | 0 | break; |
5397 | 0 | } |
5398 | | |
5399 | 0 | if (slength != Flength) samelengths = FALSE; |
5400 | 0 | Feptr += slength; |
5401 | 0 | } |
5402 | | |
5403 | | /* If the length matched for each repetition is the same as the length of |
5404 | | the captured group, we can easily work backwards. This is the normal |
5405 | | case. However, in caseless UTF-8 mode there are pairs of case-equivalent |
5406 | | characters whose lengths (in terms of code units) differ. Since this |
5407 | | is very rare, we handle it by re-matching fewer and fewer times. */ |
5408 | | |
5409 | 0 | if (samelengths) |
5410 | 0 | { |
5411 | 0 | while (Feptr >= Lstart) |
5412 | 0 | { |
5413 | 0 | RMATCH(Fecode, RM21); |
5414 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
5415 | 0 | Feptr -= Flength; |
5416 | 0 | } |
5417 | 0 | } |
5418 | | |
5419 | | /* The rare case of non-matching lengths. Re-scan the repetition for each |
5420 | | iteration. We know that match_ref() will succeed every time. */ |
5421 | | |
5422 | 0 | else |
5423 | 0 | { |
5424 | 0 | Lmax = i; |
5425 | 0 | for (;;) |
5426 | 0 | { |
5427 | 0 | RMATCH(Fecode, RM22); |
5428 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
5429 | 0 | if (Feptr == Lstart) break; /* Failed after minimal repetition */ |
5430 | 0 | Feptr = Lstart; |
5431 | 0 | Lmax--; |
5432 | 0 | for (i = Lmin; i < Lmax; i++) |
5433 | 0 | { |
5434 | 0 | PCRE2_SIZE slength; |
5435 | 0 | (void)match_ref(Loffset, Lcaseless, Lcaseopts, F, mb, &slength); |
5436 | 0 | Feptr += slength; |
5437 | 0 | } |
5438 | 0 | } |
5439 | 0 | } |
5440 | | |
5441 | 0 | RRETURN(MATCH_NOMATCH); |
5442 | 0 | } |
5443 | | |
5444 | 0 | PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */ |
5445 | |
|
5446 | 0 | #undef Lcaseless |
5447 | 0 | #undef Lmin |
5448 | 0 | #undef Lmax |
5449 | 0 | #undef Lstart |
5450 | 0 | #undef Loffset |
5451 | | |
5452 | | |
5453 | | |
5454 | | /* ========================================================================= */ |
5455 | | /* Opcodes for the start of various parenthesized items */ |
5456 | | /* ========================================================================= */ |
5457 | | |
5458 | | /* In all cases, if the result of RMATCH() is MATCH_THEN, check whether the |
5459 | | (*THEN) is within the current branch by comparing the address of OP_THEN |
5460 | | that is passed back with the end of the branch. If (*THEN) is within the |
5461 | | current branch, and the branch is one of two or more alternatives (it |
5462 | | either starts or ends with OP_ALT), we have reached the limit of THEN's |
5463 | | action, so convert the return code to NOMATCH, which will cause normal |
5464 | | backtracking to happen from now on. Otherwise, THEN is passed back to an |
5465 | | outer alternative. This implements Perl's treatment of parenthesized |
5466 | | groups, where a group not containing | does not affect the current |
5467 | | alternative, that is, (X) is NOT the same as (X|(*F)). */ |
5468 | | |
5469 | | |
5470 | | /* ===================================================================== */ |
5471 | | /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a non-possessive |
5472 | | bracket group, indicating that it may occur zero times. It may repeat |
5473 | | infinitely, or not at all - i.e. it could be ()* or ()? or even (){0} in |
5474 | | the pattern. Brackets with fixed upper repeat limits are compiled as a |
5475 | | number of copies, with the optional ones preceded by BRAZERO or BRAMINZERO. |
5476 | | Possessive groups with possible zero repeats are preceded by BRAPOSZERO. */ |
5477 | |
|
5478 | 0 | #define Lnext_ecode F->temp_sptr[0] |
5479 | |
|
5480 | 0 | case OP_BRAZERO: |
5481 | 0 | Lnext_ecode = Fecode + 1; |
5482 | 0 | RMATCH(Lnext_ecode, RM9); |
5483 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
5484 | 0 | do Lnext_ecode += GET(Lnext_ecode, 1); while (*Lnext_ecode == OP_ALT); |
5485 | 0 | Fecode = Lnext_ecode + 1 + LINK_SIZE; |
5486 | 0 | break; |
5487 | | |
5488 | 0 | case OP_BRAMINZERO: |
5489 | 0 | Lnext_ecode = Fecode + 1; |
5490 | 0 | do Lnext_ecode += GET(Lnext_ecode, 1); while (*Lnext_ecode == OP_ALT); |
5491 | 0 | RMATCH(Lnext_ecode + 1 + LINK_SIZE, RM10); |
5492 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
5493 | 0 | Fecode++; |
5494 | 0 | break; |
5495 | | |
5496 | 0 | #undef Lnext_ecode |
5497 | | |
5498 | 0 | case OP_SKIPZERO: |
5499 | 0 | Fecode++; |
5500 | 0 | do Fecode += GET(Fecode,1); while (*Fecode == OP_ALT); |
5501 | 0 | Fecode += 1 + LINK_SIZE; |
5502 | 0 | break; |
5503 | | |
5504 | | |
5505 | | /* ===================================================================== */ |
5506 | | /* Handle possessive brackets with an unlimited repeat. The end of these |
5507 | | brackets will always be OP_KETRPOS, which returns MATCH_KETRPOS without |
5508 | | going further in the pattern. */ |
5509 | | |
5510 | 0 | #define Lframe_type F->temp_32[0] |
5511 | 0 | #define Lmatched_once F->temp_32[1] |
5512 | 0 | #define Lzero_allowed F->temp_32[2] |
5513 | 0 | #define Lstart_eptr F->temp_sptr[0] |
5514 | 0 | #define Lstart_group F->temp_sptr[1] |
5515 | | |
5516 | 0 | case OP_BRAPOSZERO: |
5517 | 0 | Lzero_allowed = TRUE; /* Zero repeat is allowed */ |
5518 | 0 | Fecode += 1; |
5519 | 0 | if (*Fecode == OP_CBRAPOS || *Fecode == OP_SCBRAPOS) |
5520 | 0 | goto POSSESSIVE_CAPTURE; |
5521 | 0 | goto POSSESSIVE_NON_CAPTURE; |
5522 | | |
5523 | 0 | case OP_BRAPOS: |
5524 | 0 | case OP_SBRAPOS: |
5525 | 0 | Lzero_allowed = FALSE; /* Zero repeat not allowed */ |
5526 | |
|
5527 | 0 | POSSESSIVE_NON_CAPTURE: |
5528 | 0 | Lframe_type = GF_NOCAPTURE; /* Remembered frame type */ |
5529 | 0 | goto POSSESSIVE_GROUP; |
5530 | | |
5531 | 0 | case OP_CBRAPOS: |
5532 | 0 | case OP_SCBRAPOS: |
5533 | 0 | Lzero_allowed = FALSE; /* Zero repeat not allowed */ |
5534 | |
|
5535 | 0 | POSSESSIVE_CAPTURE: |
5536 | 0 | number = GET2(Fecode, 1+LINK_SIZE); |
5537 | 0 | Lframe_type = GF_CAPTURE | number; /* Remembered frame type */ |
5538 | |
|
5539 | 0 | POSSESSIVE_GROUP: |
5540 | 0 | Lmatched_once = FALSE; /* Never matched */ |
5541 | 0 | Lstart_group = Fecode; /* Start of this group */ |
5542 | |
|
5543 | 0 | for (;;) |
5544 | 0 | { |
5545 | 0 | Lstart_eptr = Feptr; /* Position at group start */ |
5546 | 0 | group_frame_type = Lframe_type; |
5547 | 0 | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM8); |
5548 | 0 | if (rrc == MATCH_KETRPOS) |
5549 | 0 | { |
5550 | 0 | Lmatched_once = TRUE; /* Matched at least once */ |
5551 | 0 | if (Feptr == Lstart_eptr) /* Empty match; skip to end */ |
5552 | 0 | { |
5553 | 0 | do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT); |
5554 | 0 | break; |
5555 | 0 | } |
5556 | | |
5557 | 0 | Fecode = Lstart_group; |
5558 | 0 | continue; |
5559 | 0 | } |
5560 | | |
5561 | | /* See comment above about handling THEN. */ |
5562 | | |
5563 | 0 | if (rrc == MATCH_THEN) |
5564 | 0 | { |
5565 | 0 | PCRE2_SPTR next_ecode = Fecode + GET(Fecode,1); |
5566 | 0 | if (mb->verb_ecode_ptr < next_ecode && |
5567 | 0 | (*Fecode == OP_ALT || *next_ecode == OP_ALT)) |
5568 | 0 | rrc = MATCH_NOMATCH; |
5569 | 0 | } |
5570 | |
|
5571 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
5572 | 0 | Fecode += GET(Fecode, 1); |
5573 | 0 | if (*Fecode != OP_ALT) break; |
5574 | 0 | } |
5575 | | |
5576 | | /* Success if matched something or zero repeat allowed */ |
5577 | | |
5578 | 0 | if (Lmatched_once || Lzero_allowed) |
5579 | 0 | { |
5580 | 0 | Fecode += 1 + LINK_SIZE; |
5581 | 0 | break; |
5582 | 0 | } |
5583 | | |
5584 | 0 | RRETURN(MATCH_NOMATCH); |
5585 | | |
5586 | 0 | #undef Lmatched_once |
5587 | 0 | #undef Lzero_allowed |
5588 | 0 | #undef Lframe_type |
5589 | 0 | #undef Lstart_eptr |
5590 | 0 | #undef Lstart_group |
5591 | | |
5592 | | |
5593 | | /* ===================================================================== */ |
5594 | | /* Handle non-capturing brackets that cannot match an empty string. When we |
5595 | | get to the final alternative within the brackets, as long as there are no |
5596 | | THEN's in the pattern, we can optimize by not recording a new backtracking |
5597 | | point. (Ideally we should test for a THEN within this group, but we don't |
5598 | | have that information.) Don't do this if we are at the very top level, |
5599 | | however, because that would make handling assertions and once-only brackets |
5600 | | messier when there is nothing to go back to. */ |
5601 | | |
5602 | 0 | #define Lframe_type F->temp_32[0] /* Set for all that use GROUPLOOP */ |
5603 | 0 | #define Lnext_branch F->temp_sptr[0] /* Used only in OP_BRA handling */ |
5604 | | |
5605 | 0 | case OP_BRA: |
5606 | 0 | if (mb->hasthen || Frdepth == 0) |
5607 | 0 | { |
5608 | 0 | Lframe_type = 0; |
5609 | 0 | goto GROUPLOOP; |
5610 | 0 | } |
5611 | | |
5612 | 0 | for (;;) |
5613 | 0 | { |
5614 | 0 | Lnext_branch = Fecode + GET(Fecode, 1); |
5615 | 0 | if (*Lnext_branch != OP_ALT) break; |
5616 | | |
5617 | | /* This is never the final branch. We do not need to test for MATCH_THEN |
5618 | | here because this code is not used when there is a THEN in the pattern. */ |
5619 | | |
5620 | 0 | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM1); |
5621 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
5622 | 0 | Fecode = Lnext_branch; |
5623 | 0 | } |
5624 | | |
5625 | | /* Hit the start of the final branch. Continue at this level. */ |
5626 | | |
5627 | 0 | Fecode += PRIV(OP_lengths)[*Fecode]; |
5628 | 0 | break; |
5629 | | |
5630 | 0 | #undef Lnext_branch |
5631 | | |
5632 | | |
5633 | | /* ===================================================================== */ |
5634 | | /* Handle a capturing bracket, other than those that are possessive with an |
5635 | | unlimited repeat. */ |
5636 | | |
5637 | 0 | case OP_CBRA: |
5638 | 0 | case OP_SCBRA: |
5639 | 0 | Lframe_type = GF_CAPTURE | GET2(Fecode, 1+LINK_SIZE); |
5640 | 0 | goto GROUPLOOP; |
5641 | | |
5642 | | |
5643 | | /* ===================================================================== */ |
5644 | | /* Atomic groups and non-capturing brackets that can match an empty string |
5645 | | must record a backtracking point and also set up a chained frame. */ |
5646 | | |
5647 | 0 | case OP_ONCE: |
5648 | 0 | case OP_SCRIPT_RUN: |
5649 | 0 | case OP_SBRA: |
5650 | 0 | Lframe_type = GF_NOCAPTURE | Fop; |
5651 | |
|
5652 | 0 | GROUPLOOP: |
5653 | 0 | for (;;) |
5654 | 0 | { |
5655 | 0 | group_frame_type = Lframe_type; |
5656 | 0 | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM2); |
5657 | 0 | if (rrc == MATCH_THEN) |
5658 | 0 | { |
5659 | 0 | PCRE2_SPTR next_ecode = Fecode + GET(Fecode,1); |
5660 | 0 | if (mb->verb_ecode_ptr < next_ecode && |
5661 | 0 | (*Fecode == OP_ALT || *next_ecode == OP_ALT)) |
5662 | 0 | rrc = MATCH_NOMATCH; |
5663 | 0 | } |
5664 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
5665 | 0 | Fecode += GET(Fecode, 1); |
5666 | 0 | if (*Fecode != OP_ALT) RRETURN(MATCH_NOMATCH); |
5667 | 0 | } |
5668 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
5669 | | |
5670 | 0 | #undef Lframe_type |
5671 | | |
5672 | | |
5673 | | /* ===================================================================== */ |
5674 | | /* Pattern recursion either matches the current regex, or some |
5675 | | subexpression. The offset data is the offset to the starting bracket from |
5676 | | the start of the whole pattern. This is so that it works from duplicated |
5677 | | subpatterns. For a whole-pattern recursion, we have to infer the number |
5678 | | zero. */ |
5679 | | |
5680 | 0 | #define Lframe_type F->temp_32[0] |
5681 | 0 | #define Lstart_branch F->temp_sptr[0] |
5682 | | |
5683 | 0 | case OP_RECURSE: |
5684 | 0 | bracode = mb->start_code + GET(Fecode, 1); |
5685 | 0 | number = (bracode == mb->start_code)? 0 : GET2(bracode, 1 + LINK_SIZE); |
5686 | | |
5687 | | /* If we are already in a pattern recursion, check for repeating the same |
5688 | | one without changing the subject pointer or the last referenced character |
5689 | | in the subject. This should catch convoluted mutual recursions; some |
5690 | | simple cases are caught at compile time. However, there are rare cases when |
5691 | | this check needs to be turned off. In this case, actual recursion loops |
5692 | | will be caught by the match or heap limits. */ |
5693 | |
|
5694 | 0 | if (Fcurrent_recurse != RECURSE_UNSET) |
5695 | 0 | { |
5696 | 0 | offset = Flast_group_offset; |
5697 | 0 | while (offset != PCRE2_UNSET) |
5698 | 0 | { |
5699 | 0 | N = (heapframe *)((char *)match_data->heapframes + offset); |
5700 | 0 | P = (heapframe *)((char *)N - frame_size); |
5701 | 0 | if (N->group_frame_type == (GF_RECURSE | number)) |
5702 | 0 | { |
5703 | 0 | if (Feptr == P->eptr && mb->last_used_ptr == P->recurse_last_used && |
5704 | 0 | (mb->moptions & PCRE2_DISABLE_RECURSELOOP_CHECK) == 0) |
5705 | 0 | return PCRE2_ERROR_RECURSELOOP; |
5706 | 0 | break; |
5707 | 0 | } |
5708 | 0 | offset = P->last_group_offset; |
5709 | 0 | } |
5710 | 0 | } |
5711 | | |
5712 | | /* Remember the current last referenced character and then run the |
5713 | | recursion branch by branch. */ |
5714 | | |
5715 | 0 | F->recurse_last_used = mb->last_used_ptr; |
5716 | 0 | Lstart_branch = bracode; |
5717 | 0 | Lframe_type = GF_RECURSE | number; |
5718 | |
|
5719 | 0 | for (;;) |
5720 | 0 | { |
5721 | 0 | PCRE2_SPTR next_ecode; |
5722 | |
|
5723 | 0 | group_frame_type = Lframe_type; |
5724 | 0 | RMATCH(Lstart_branch + PRIV(OP_lengths)[*Lstart_branch], RM11); |
5725 | 0 | next_ecode = Lstart_branch + GET(Lstart_branch,1); |
5726 | | |
5727 | | /* Handle backtracking verbs, which are defined in a range that can |
5728 | | easily be tested for. PCRE does not allow THEN, SKIP, PRUNE or COMMIT to |
5729 | | escape beyond a recursion; they cause a NOMATCH for the entire recursion. |
5730 | | |
5731 | | When one of these verbs triggers, the current recursion group number is |
5732 | | recorded. If it matches the recursion we are processing, the verb |
5733 | | happened within the recursion and we must deal with it. Otherwise it must |
5734 | | have happened after the recursion completed, and so has to be passed |
5735 | | back. See comment above about handling THEN. */ |
5736 | |
|
5737 | 0 | if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX && |
5738 | 0 | mb->verb_current_recurse == (Lframe_type ^ GF_RECURSE)) |
5739 | 0 | { |
5740 | 0 | if (rrc == MATCH_THEN && mb->verb_ecode_ptr < next_ecode && |
5741 | 0 | (*Lstart_branch == OP_ALT || *next_ecode == OP_ALT)) |
5742 | 0 | rrc = MATCH_NOMATCH; |
5743 | 0 | else RRETURN(MATCH_NOMATCH); |
5744 | 0 | } |
5745 | | |
5746 | | /* Note that carrying on after (*ACCEPT) in a recursion is handled in the |
5747 | | OP_ACCEPT code. Nothing needs to be done here. */ |
5748 | | |
5749 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
5750 | 0 | Lstart_branch = next_ecode; |
5751 | 0 | if (*Lstart_branch != OP_ALT) RRETURN(MATCH_NOMATCH); |
5752 | 0 | } |
5753 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
5754 | | |
5755 | 0 | #undef Lframe_type |
5756 | 0 | #undef Lstart_branch |
5757 | | |
5758 | | |
5759 | | /* ===================================================================== */ |
5760 | | /* Positive assertions are like other groups except that PCRE doesn't allow |
5761 | | the effect of (*THEN) to escape beyond an assertion; it is therefore |
5762 | | treated as NOMATCH. (*ACCEPT) is treated as successful assertion, with its |
5763 | | captures and mark retained. Any other return is an error. */ |
5764 | | |
5765 | 0 | #define Lframe_type F->temp_32[0] |
5766 | | |
5767 | 0 | case OP_ASSERT: |
5768 | 0 | case OP_ASSERTBACK: |
5769 | 0 | case OP_ASSERT_NA: |
5770 | 0 | case OP_ASSERTBACK_NA: |
5771 | 0 | Lframe_type = GF_NOCAPTURE | Fop; |
5772 | 0 | for (;;) |
5773 | 0 | { |
5774 | 0 | group_frame_type = Lframe_type; |
5775 | 0 | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM3); |
5776 | 0 | if (rrc == MATCH_ACCEPT) |
5777 | 0 | { |
5778 | 0 | memcpy(Fovector, |
5779 | 0 | (char *)assert_accept_frame + offsetof(heapframe, ovector), |
5780 | 0 | assert_accept_frame->offset_top * sizeof(PCRE2_SIZE)); |
5781 | 0 | Foffset_top = assert_accept_frame->offset_top; |
5782 | 0 | Fmark = assert_accept_frame->mark; |
5783 | 0 | break; |
5784 | 0 | } |
5785 | 0 | if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); |
5786 | 0 | Fecode += GET(Fecode, 1); |
5787 | 0 | if (*Fecode != OP_ALT) RRETURN(MATCH_NOMATCH); |
5788 | 0 | } |
5789 | | |
5790 | 0 | do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT); |
5791 | 0 | Fecode += 1 + LINK_SIZE; |
5792 | 0 | break; |
5793 | | |
5794 | 0 | #undef Lframe_type |
5795 | | |
5796 | | |
5797 | | /* ===================================================================== */ |
5798 | | /* Handle negative assertions. Loop for each non-matching branch as for |
5799 | | positive assertions. */ |
5800 | | |
5801 | 0 | #define Lframe_type F->temp_32[0] |
5802 | | |
5803 | 0 | case OP_ASSERT_NOT: |
5804 | 0 | case OP_ASSERTBACK_NOT: |
5805 | 0 | Lframe_type = GF_NOCAPTURE | Fop; |
5806 | |
|
5807 | 0 | for (;;) |
5808 | 0 | { |
5809 | 0 | group_frame_type = Lframe_type; |
5810 | 0 | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM4); |
5811 | 0 | switch(rrc) |
5812 | 0 | { |
5813 | 0 | case MATCH_ACCEPT: /* Assertion matched, therefore it fails. */ |
5814 | 0 | case MATCH_MATCH: |
5815 | 0 | RRETURN (MATCH_NOMATCH); |
5816 | | |
5817 | 0 | case MATCH_NOMATCH: /* Branch failed, try next if present. */ |
5818 | 0 | case MATCH_THEN: |
5819 | 0 | Fecode += GET(Fecode, 1); |
5820 | 0 | if (*Fecode != OP_ALT) goto ASSERT_NOT_FAILED; |
5821 | 0 | break; |
5822 | | |
5823 | 0 | case MATCH_COMMIT: /* Assertion forced to fail, therefore continue. */ |
5824 | 0 | case MATCH_SKIP: |
5825 | 0 | case MATCH_PRUNE: |
5826 | 0 | do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT); |
5827 | 0 | goto ASSERT_NOT_FAILED; |
5828 | | |
5829 | 0 | default: /* Pass back any other return */ |
5830 | 0 | RRETURN(rrc); |
5831 | 0 | } |
5832 | 0 | } |
5833 | | |
5834 | | /* None of the branches have matched or there was a backtrack to (*COMMIT), |
5835 | | (*SKIP), (*PRUNE), or (*THEN) in the last branch. This is success for a |
5836 | | negative assertion, so carry on. */ |
5837 | | |
5838 | 0 | ASSERT_NOT_FAILED: |
5839 | 0 | Fecode += 1 + LINK_SIZE; |
5840 | 0 | break; |
5841 | | |
5842 | 0 | #undef Lframe_type |
5843 | | |
5844 | | /* ===================================================================== */ |
5845 | | /* Handle scan substring operation. */ |
5846 | | |
5847 | 0 | #define Lframe_type F->temp_32[0] |
5848 | 0 | #define Lextra_size F->temp_32[1] |
5849 | 0 | #define Lsaved_moptions F->temp_32[2] |
5850 | 0 | #define Lsaved_end_subject F->temp_sptr[0] |
5851 | 0 | #define Lsaved_eptr F->temp_sptr[1] |
5852 | 0 | #define Ltrue_end_extra F->temp_size |
5853 | | |
5854 | 0 | case OP_ASSERT_SCS: |
5855 | 0 | { |
5856 | 0 | PCRE2_SPTR ecode = Fecode + 1 + LINK_SIZE; |
5857 | 0 | uint32_t extra_size = 0; |
5858 | 0 | int count; |
5859 | 0 | PCRE2_SPTR slot; |
5860 | | |
5861 | | /* Disable compiler warning. */ |
5862 | 0 | offset = 0; |
5863 | 0 | (void)offset; |
5864 | |
|
5865 | 0 | for (;;) |
5866 | 0 | { |
5867 | 0 | if (*ecode == OP_CREF) |
5868 | 0 | { |
5869 | 0 | extra_size += 1+IMM2_SIZE; |
5870 | 0 | offset = (GET2(ecode, 1) << 1) - 2; |
5871 | 0 | ecode += 1+IMM2_SIZE; |
5872 | 0 | if (offset < Foffset_top && Fovector[offset] != PCRE2_UNSET) |
5873 | 0 | goto SCS_OFFSET_FOUND; |
5874 | 0 | continue; |
5875 | 0 | } |
5876 | | |
5877 | 0 | if (*ecode != OP_DNCREF) RRETURN(MATCH_NOMATCH); |
5878 | | |
5879 | 0 | count = GET2(ecode, 1 + IMM2_SIZE); |
5880 | 0 | slot = mb->name_table + GET2(ecode, 1) * mb->name_entry_size; |
5881 | 0 | extra_size += 1+2*IMM2_SIZE; |
5882 | 0 | ecode += 1+2*IMM2_SIZE; |
5883 | |
|
5884 | 0 | while (count > 0) |
5885 | 0 | { |
5886 | 0 | offset = (GET2(slot, 0) << 1) - 2; |
5887 | 0 | if (offset < Foffset_top && Fovector[offset] != PCRE2_UNSET) |
5888 | 0 | goto SCS_OFFSET_FOUND; |
5889 | 0 | slot += mb->name_entry_size; |
5890 | 0 | count--; |
5891 | 0 | } |
5892 | 0 | } |
5893 | | |
5894 | 0 | SCS_OFFSET_FOUND: |
5895 | | |
5896 | | /* Skip remaining options. */ |
5897 | 0 | for (;;) |
5898 | 0 | { |
5899 | 0 | if (*ecode == OP_CREF) |
5900 | 0 | { |
5901 | 0 | extra_size += 1+IMM2_SIZE; |
5902 | 0 | ecode += 1+IMM2_SIZE; |
5903 | 0 | } |
5904 | 0 | else if (*ecode == OP_DNCREF) |
5905 | 0 | { |
5906 | 0 | extra_size += 1+2*IMM2_SIZE; |
5907 | 0 | ecode += 1+2*IMM2_SIZE; |
5908 | 0 | } |
5909 | 0 | else break; |
5910 | 0 | } |
5911 | |
|
5912 | 0 | Lextra_size = extra_size; |
5913 | 0 | } |
5914 | | |
5915 | 0 | Lsaved_end_subject = mb->end_subject; |
5916 | 0 | Ltrue_end_extra = mb->true_end_subject - mb->end_subject; |
5917 | 0 | Lsaved_eptr = Feptr; |
5918 | 0 | Lsaved_moptions = mb->moptions; |
5919 | |
|
5920 | 0 | Feptr = mb->start_subject + Fovector[offset]; |
5921 | 0 | mb->true_end_subject = mb->end_subject = |
5922 | 0 | mb->start_subject + Fovector[offset + 1]; |
5923 | 0 | mb->moptions &= ~PCRE2_NOTEOL; |
5924 | |
|
5925 | 0 | Lframe_type = GF_NOCAPTURE | Fop; |
5926 | 0 | for (;;) |
5927 | 0 | { |
5928 | 0 | group_frame_type = Lframe_type; |
5929 | 0 | RMATCH(Fecode + 1 + LINK_SIZE + Lextra_size, RM38); |
5930 | 0 | if (rrc == MATCH_ACCEPT) |
5931 | 0 | { |
5932 | 0 | memcpy(Fovector, |
5933 | 0 | (char *)assert_accept_frame + offsetof(heapframe, ovector), |
5934 | 0 | assert_accept_frame->offset_top * sizeof(PCRE2_SIZE)); |
5935 | 0 | Foffset_top = assert_accept_frame->offset_top; |
5936 | 0 | Fmark = assert_accept_frame->mark; |
5937 | 0 | mb->end_subject = Lsaved_end_subject; |
5938 | 0 | mb->true_end_subject = mb->end_subject + Ltrue_end_extra; |
5939 | 0 | mb->moptions = Lsaved_moptions; |
5940 | 0 | break; |
5941 | 0 | } |
5942 | | |
5943 | 0 | if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) |
5944 | 0 | { |
5945 | 0 | mb->end_subject = Lsaved_end_subject; |
5946 | 0 | mb->true_end_subject = mb->end_subject + Ltrue_end_extra; |
5947 | 0 | mb->moptions = Lsaved_moptions; |
5948 | 0 | RRETURN(rrc); |
5949 | 0 | } |
5950 | | |
5951 | 0 | Fecode += GET(Fecode, 1); |
5952 | 0 | if (*Fecode != OP_ALT) |
5953 | 0 | { |
5954 | 0 | mb->end_subject = Lsaved_end_subject; |
5955 | 0 | mb->true_end_subject = mb->end_subject + Ltrue_end_extra; |
5956 | 0 | mb->moptions = Lsaved_moptions; |
5957 | 0 | RRETURN(MATCH_NOMATCH); |
5958 | 0 | } |
5959 | 0 | Lextra_size = 0; |
5960 | 0 | } |
5961 | | |
5962 | 0 | do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT); |
5963 | 0 | Fecode += 1 + LINK_SIZE; |
5964 | 0 | Feptr = Lsaved_eptr; |
5965 | 0 | break; |
5966 | | |
5967 | 0 | #undef Lframe_type |
5968 | 0 | #undef Lextra_size |
5969 | 0 | #undef Lsaved_end_subject |
5970 | 0 | #undef Lsaved_eptr |
5971 | 0 | #undef Ltrue_end_extra |
5972 | 0 | #undef Lsave_moptions |
5973 | | |
5974 | | /* ===================================================================== */ |
5975 | | /* The callout item calls an external function, if one is provided, passing |
5976 | | details of the match so far. This is mainly for debugging, though the |
5977 | | function is able to force a failure. */ |
5978 | | |
5979 | 0 | case OP_CALLOUT: |
5980 | 0 | case OP_CALLOUT_STR: |
5981 | 0 | rrc = do_callout(F, mb, &length); |
5982 | 0 | if (rrc > 0) RRETURN(MATCH_NOMATCH); |
5983 | 0 | if (rrc < 0) RRETURN(rrc); |
5984 | 0 | Fecode += length; |
5985 | 0 | break; |
5986 | | |
5987 | | |
5988 | | /* ===================================================================== */ |
5989 | | /* Conditional group: compilation checked that there are no more than two |
5990 | | branches. If the condition is false, skipping the first branch takes us |
5991 | | past the end of the item if there is only one branch, but that's exactly |
5992 | | what we want. */ |
5993 | | |
5994 | 0 | case OP_COND: |
5995 | 0 | case OP_SCOND: |
5996 | | |
5997 | | /* The variable Flength will be added to Fecode when the condition is |
5998 | | false, to get to the second branch. Setting it to the offset to the ALT or |
5999 | | KET, then incrementing Fecode achieves this effect. However, if the second |
6000 | | branch is non-existent, we must point to the KET so that the end of the |
6001 | | group is correctly processed. We now have Fecode pointing to the condition |
6002 | | or callout. */ |
6003 | |
|
6004 | 0 | Flength = GET(Fecode, 1); /* Offset to the second branch */ |
6005 | 0 | if (Fecode[Flength] != OP_ALT) Flength -= 1 + LINK_SIZE; |
6006 | 0 | Fecode += 1 + LINK_SIZE; /* From this opcode */ |
6007 | | |
6008 | | /* Because of the way auto-callout works during compile, a callout item is |
6009 | | inserted between OP_COND and an assertion condition. Such a callout can |
6010 | | also be inserted manually. */ |
6011 | |
|
6012 | 0 | if (*Fecode == OP_CALLOUT || *Fecode == OP_CALLOUT_STR) |
6013 | 0 | { |
6014 | 0 | rrc = do_callout(F, mb, &length); |
6015 | 0 | if (rrc > 0) RRETURN(MATCH_NOMATCH); |
6016 | 0 | if (rrc < 0) RRETURN(rrc); |
6017 | | |
6018 | | /* Advance Fecode past the callout, so it now points to the condition. We |
6019 | | must adjust Flength so that the value of Fecode+Flength is unchanged. */ |
6020 | | |
6021 | 0 | Fecode += length; |
6022 | 0 | Flength -= length; |
6023 | 0 | } |
6024 | | |
6025 | | /* Test the various possible conditions */ |
6026 | | |
6027 | 0 | condition = FALSE; |
6028 | 0 | switch(*Fecode) |
6029 | 0 | { |
6030 | 0 | case OP_RREF: /* Group recursion test */ |
6031 | 0 | if (Fcurrent_recurse != RECURSE_UNSET) |
6032 | 0 | { |
6033 | 0 | number = GET2(Fecode, 1); |
6034 | 0 | condition = (number == RREF_ANY || number == Fcurrent_recurse); |
6035 | 0 | } |
6036 | 0 | break; |
6037 | | |
6038 | 0 | case OP_DNRREF: /* Duplicate named group recursion test */ |
6039 | 0 | if (Fcurrent_recurse != RECURSE_UNSET) |
6040 | 0 | { |
6041 | 0 | int count = GET2(Fecode, 1 + IMM2_SIZE); |
6042 | 0 | PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size; |
6043 | 0 | while (count-- > 0) |
6044 | 0 | { |
6045 | 0 | number = GET2(slot, 0); |
6046 | 0 | condition = number == Fcurrent_recurse; |
6047 | 0 | if (condition) break; |
6048 | 0 | slot += mb->name_entry_size; |
6049 | 0 | } |
6050 | 0 | } |
6051 | 0 | break; |
6052 | | |
6053 | 0 | case OP_CREF: /* Numbered group used test */ |
6054 | 0 | offset = (GET2(Fecode, 1) << 1) - 2; /* Doubled ref number */ |
6055 | 0 | condition = offset < Foffset_top && Fovector[offset] != PCRE2_UNSET; |
6056 | 0 | break; |
6057 | | |
6058 | 0 | case OP_DNCREF: /* Duplicate named group used test */ |
6059 | 0 | { |
6060 | 0 | int count = GET2(Fecode, 1 + IMM2_SIZE); |
6061 | 0 | PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size; |
6062 | 0 | while (count-- > 0) |
6063 | 0 | { |
6064 | 0 | offset = (GET2(slot, 0) << 1) - 2; |
6065 | 0 | condition = offset < Foffset_top && Fovector[offset] != PCRE2_UNSET; |
6066 | 0 | if (condition) break; |
6067 | 0 | slot += mb->name_entry_size; |
6068 | 0 | } |
6069 | 0 | } |
6070 | 0 | break; |
6071 | | |
6072 | 0 | case OP_FALSE: |
6073 | 0 | case OP_FAIL: /* The assertion (?!) becomes OP_FAIL */ |
6074 | 0 | break; |
6075 | | |
6076 | 0 | case OP_TRUE: |
6077 | 0 | condition = TRUE; |
6078 | 0 | break; |
6079 | | |
6080 | | /* The condition is an assertion. Run code similar to the assertion code |
6081 | | above. */ |
6082 | | |
6083 | 0 | #define Lpositive F->temp_32[0] |
6084 | 0 | #define Lstart_branch F->temp_sptr[0] |
6085 | | |
6086 | 0 | default: |
6087 | 0 | Lpositive = (*Fecode == OP_ASSERT || *Fecode == OP_ASSERTBACK); |
6088 | 0 | Lstart_branch = Fecode; |
6089 | |
|
6090 | 0 | for (;;) |
6091 | 0 | { |
6092 | 0 | group_frame_type = GF_CONDASSERT | *Fecode; |
6093 | 0 | RMATCH(Lstart_branch + PRIV(OP_lengths)[*Lstart_branch], RM5); |
6094 | | |
6095 | 0 | switch(rrc) |
6096 | 0 | { |
6097 | 0 | case MATCH_ACCEPT: /* Save captures */ |
6098 | 0 | memcpy(Fovector, |
6099 | 0 | (char *)assert_accept_frame + offsetof(heapframe, ovector), |
6100 | 0 | assert_accept_frame->offset_top * sizeof(PCRE2_SIZE)); |
6101 | 0 | Foffset_top = assert_accept_frame->offset_top; |
6102 | |
|
6103 | 0 | PCRE2_FALLTHROUGH /* Fall through */ |
6104 | | /* In the case of a match, the captures have already been put into |
6105 | | the current frame. */ |
6106 | |
|
6107 | 0 | case MATCH_MATCH: |
6108 | 0 | condition = Lpositive; /* TRUE for positive assertion */ |
6109 | 0 | break; |
6110 | | |
6111 | | /* PCRE doesn't allow the effect of (*THEN) to escape beyond an |
6112 | | assertion; it is therefore always treated as NOMATCH. */ |
6113 | | |
6114 | 0 | case MATCH_NOMATCH: |
6115 | 0 | case MATCH_THEN: |
6116 | 0 | Lstart_branch += GET(Lstart_branch, 1); |
6117 | 0 | if (*Lstart_branch == OP_ALT) continue; /* Try next branch */ |
6118 | 0 | condition = !Lpositive; /* TRUE for negative assertion */ |
6119 | 0 | break; |
6120 | | |
6121 | | /* These force no match without checking other branches. */ |
6122 | | |
6123 | 0 | case MATCH_COMMIT: |
6124 | 0 | case MATCH_SKIP: |
6125 | 0 | case MATCH_PRUNE: |
6126 | 0 | condition = !Lpositive; |
6127 | 0 | break; |
6128 | | |
6129 | 0 | default: |
6130 | 0 | RRETURN(rrc); |
6131 | 0 | } |
6132 | 0 | break; /* Out of the branch loop */ |
6133 | 0 | } |
6134 | | |
6135 | | /* If the condition is true, find the end of the assertion so that |
6136 | | advancing past it gets us to the start of the first branch. */ |
6137 | | |
6138 | 0 | if (condition) |
6139 | 0 | { |
6140 | 0 | do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT); |
6141 | 0 | } |
6142 | 0 | break; /* End of assertion condition */ |
6143 | 0 | } |
6144 | | |
6145 | 0 | #undef Lpositive |
6146 | 0 | #undef Lstart_branch |
6147 | | |
6148 | | /* Choose branch according to the condition. */ |
6149 | | |
6150 | 0 | Fecode += condition? PRIV(OP_lengths)[*Fecode] : Flength; |
6151 | | |
6152 | | /* If the opcode is OP_SCOND it means we are at a repeated conditional |
6153 | | group that might match an empty string. We must therefore descend a level |
6154 | | so that the start is remembered for checking. For OP_COND we can just |
6155 | | continue at this level. */ |
6156 | |
|
6157 | 0 | if (Fop == OP_SCOND) |
6158 | 0 | { |
6159 | 0 | group_frame_type = GF_NOCAPTURE | Fop; |
6160 | 0 | RMATCH(Fecode, RM35); |
6161 | 0 | RRETURN(rrc); |
6162 | 0 | } |
6163 | 0 | break; |
6164 | | |
6165 | | |
6166 | | |
6167 | | /* ========================================================================= */ |
6168 | | /* End of start of parenthesis opcodes */ |
6169 | | /* ========================================================================= */ |
6170 | | |
6171 | | |
6172 | | /* ===================================================================== */ |
6173 | | /* Move the subject pointer back by one fixed amount. This occurs at the |
6174 | | start of each branch that has a fixed length in a lookbehind assertion. If |
6175 | | we are too close to the start to move back, fail. When working with UTF-8 |
6176 | | we move back a number of characters, not bytes. */ |
6177 | | |
6178 | 0 | case OP_REVERSE: |
6179 | 0 | number = GET2(Fecode, 1); |
6180 | 0 | #ifdef SUPPORT_UNICODE |
6181 | 0 | if (utf) |
6182 | 0 | { |
6183 | | /* We used to do a simpler `while (number-- > 0)` but that triggers |
6184 | | clang's unsigned integer overflow sanitizer. */ |
6185 | 0 | while (number > 0) |
6186 | 0 | { |
6187 | 0 | --number; |
6188 | 0 | if (Feptr <= mb->check_subject) RRETURN(MATCH_NOMATCH); |
6189 | 0 | Feptr--; |
6190 | 0 | BACKCHAR(Feptr); |
6191 | 0 | } |
6192 | 0 | } |
6193 | 0 | else |
6194 | 0 | #endif |
6195 | | |
6196 | | /* No UTF support, or not in UTF mode: count is code unit count */ |
6197 | | |
6198 | 0 | { |
6199 | 0 | if ((ptrdiff_t)number > Feptr - mb->start_subject) RRETURN(MATCH_NOMATCH); |
6200 | 0 | Feptr -= number; |
6201 | 0 | } |
6202 | | |
6203 | | /* Save the earliest consulted character, then skip to next opcode */ |
6204 | | |
6205 | 0 | if (Feptr < mb->start_used_ptr) mb->start_used_ptr = Feptr; |
6206 | 0 | Fecode += 1 + IMM2_SIZE; |
6207 | 0 | break; |
6208 | | |
6209 | | |
6210 | | /* ===================================================================== */ |
6211 | | /* Move the subject pointer back by a variable amount. This occurs at the |
6212 | | start of each branch of a lookbehind assertion when the branch has a |
6213 | | variable, but limited, length. A loop is needed to try matching the branch |
6214 | | after moving back different numbers of characters. If we are too close to |
6215 | | the start to move back even the minimum amount, fail. When working with |
6216 | | UTF-8 we move back a number of characters, not bytes. */ |
6217 | | |
6218 | 0 | #define Lmin F->temp_32[0] |
6219 | 0 | #define Lmax F->temp_32[1] |
6220 | 0 | #define Leptr F->temp_sptr[0] |
6221 | | |
6222 | 0 | case OP_VREVERSE: |
6223 | 0 | Lmin = GET2(Fecode, 1); |
6224 | 0 | Lmax = GET2(Fecode, 1 + IMM2_SIZE); |
6225 | 0 | Leptr = Feptr; |
6226 | | |
6227 | | /* Move back by the maximum branch length and then work forwards. This |
6228 | | ensures that items such as \d{3,5} get the maximum length, which is |
6229 | | relevant for captures, and makes for Perl compatibility. */ |
6230 | |
|
6231 | 0 | #ifdef SUPPORT_UNICODE |
6232 | 0 | if (utf) |
6233 | 0 | { |
6234 | 0 | for (i = 0; i < Lmax; i++) |
6235 | 0 | { |
6236 | 0 | if (Feptr == mb->start_subject) |
6237 | 0 | { |
6238 | 0 | if (i < Lmin) RRETURN(MATCH_NOMATCH); |
6239 | 0 | Lmax = i; |
6240 | 0 | break; |
6241 | 0 | } |
6242 | 0 | Feptr--; |
6243 | 0 | BACKCHAR(Feptr); |
6244 | 0 | } |
6245 | 0 | } |
6246 | 0 | else |
6247 | 0 | #endif |
6248 | | |
6249 | | /* No UTF support or not in UTF mode */ |
6250 | | |
6251 | 0 | { |
6252 | 0 | ptrdiff_t diff = Feptr - mb->start_subject; |
6253 | 0 | uint32_t available = (diff > 65535)? 65535 : ((diff > 0)? (int)diff : 0); |
6254 | 0 | if (Lmin > available) RRETURN(MATCH_NOMATCH); |
6255 | 0 | if (Lmax > available) Lmax = available; |
6256 | 0 | Feptr -= Lmax; |
6257 | 0 | } |
6258 | | |
6259 | | /* Now try matching, moving forward one character on failure, until we |
6260 | | reach the minimum back length. */ |
6261 | | |
6262 | 0 | for (;;) |
6263 | 0 | { |
6264 | 0 | RMATCH(Fecode + 1 + 2 * IMM2_SIZE, RM37); |
6265 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
6266 | 0 | if (Lmax-- <= Lmin) RRETURN(MATCH_NOMATCH); |
6267 | 0 | Feptr++; |
6268 | 0 | #ifdef SUPPORT_UNICODE |
6269 | 0 | if (utf) { FORWARDCHARTEST(Feptr, mb->end_subject); } |
6270 | 0 | #endif |
6271 | 0 | } |
6272 | 0 | PCRE2_UNREACHABLE(); /* Control never reaches here */ |
6273 | | |
6274 | 0 | #undef Lmin |
6275 | 0 | #undef Lmax |
6276 | 0 | #undef Leptr |
6277 | | |
6278 | | /* ===================================================================== */ |
6279 | | /* An alternation is the end of a branch; scan along to find the end of the |
6280 | | bracketed group. */ |
6281 | | |
6282 | 0 | case OP_ALT: |
6283 | 0 | branch_end = Fecode; |
6284 | 0 | do Fecode += GET(Fecode,1); while (*Fecode == OP_ALT); |
6285 | 0 | break; |
6286 | | |
6287 | | |
6288 | | /* ===================================================================== */ |
6289 | | /* The end of a parenthesized group. For all but OP_BRA and OP_COND, the |
6290 | | starting frame was added to the chained frames in order to remember the |
6291 | | starting subject position for the group. (Not true for OP_BRA when it's a |
6292 | | whole pattern recursion, but that is handled separately below.)*/ |
6293 | | |
6294 | 0 | case OP_KET: |
6295 | 0 | case OP_KETRMIN: |
6296 | 0 | case OP_KETRMAX: |
6297 | 0 | case OP_KETRPOS: |
6298 | |
|
6299 | 0 | bracode = Fecode - GET(Fecode, 1); |
6300 | |
|
6301 | 0 | if (branch_end == NULL) branch_end = Fecode; |
6302 | 0 | branch_start = bracode; |
6303 | 0 | while (branch_start + GET(branch_start, 1) != branch_end) |
6304 | 0 | branch_start += GET(branch_start, 1); |
6305 | 0 | branch_end = NULL; |
6306 | | |
6307 | | /* Point N to the frame at the start of the most recent group, and P to its |
6308 | | predecessor. Remember the subject pointer at the start of the group. */ |
6309 | |
|
6310 | 0 | if (*bracode != OP_BRA && *bracode != OP_COND) |
6311 | 0 | { |
6312 | 0 | N = (heapframe *)((char *)match_data->heapframes + Flast_group_offset); |
6313 | 0 | P = (heapframe *)((char *)N - frame_size); |
6314 | 0 | Flast_group_offset = P->last_group_offset; |
6315 | |
|
6316 | | #ifdef DEBUG_SHOW_RMATCH |
6317 | | fprintf(stderr, "++ KET for frame=%d type=%x prev char offset=%lu\n", |
6318 | | N->rdepth, N->group_frame_type, |
6319 | | (char *)P->eptr - (char *)mb->start_subject); |
6320 | | #endif |
6321 | | |
6322 | | /* If we are at the end of an assertion that is a condition, first check |
6323 | | to see if we are at the end of a variable-length branch in a lookbehind. |
6324 | | If this is the case and we have not landed on the current character, |
6325 | | return no match. Compare code below for non-condition lookbehinds. In |
6326 | | other cases, return a match, discarding any intermediate backtracking |
6327 | | points. Copy back the mark setting and the captures into the frame before |
6328 | | N so that they are set on return. Doing this for all assertions, both |
6329 | | positive and negative, seems to match what Perl does. */ |
6330 | |
|
6331 | 0 | if (GF_IDMASK(N->group_frame_type) == GF_CONDASSERT) |
6332 | 0 | { |
6333 | 0 | if ((*bracode == OP_ASSERTBACK || *bracode == OP_ASSERTBACK_NOT) && |
6334 | 0 | branch_start[1 + LINK_SIZE] == OP_VREVERSE && Feptr != P->eptr) |
6335 | 0 | RRETURN(MATCH_NOMATCH); |
6336 | 0 | memcpy((char *)P + offsetof(heapframe, ovector), Fovector, |
6337 | 0 | Foffset_top * sizeof(PCRE2_SIZE)); |
6338 | 0 | P->offset_top = Foffset_top; |
6339 | 0 | P->mark = Fmark; |
6340 | 0 | Fback_frame = (char *)F - (char *)P; |
6341 | 0 | RRETURN(MATCH_MATCH); |
6342 | 0 | } |
6343 | 0 | } |
6344 | 0 | else P = NULL; /* Indicates starting frame not recorded */ |
6345 | | |
6346 | | /* The group was not a conditional assertion. */ |
6347 | | |
6348 | 0 | switch (*bracode) |
6349 | 0 | { |
6350 | | /* Whole pattern recursion is handled as a recursion into group 0, but |
6351 | | the entire pattern is wrapped in OP_BRA/OP_KET rather than a capturing |
6352 | | group - a design mistake: it should perhaps have been capture group 0. |
6353 | | Anyway, that means the end of such recursion must be handled here. It is |
6354 | | detected by checking for an immediately following OP_END when we are |
6355 | | recursing in group 0. If this is not the end of a whole-pattern |
6356 | | recursion, there is nothing to be done. */ |
6357 | | |
6358 | 0 | case OP_BRA: |
6359 | 0 | if (Fcurrent_recurse != 0 || Fecode[1+LINK_SIZE] != OP_END) break; |
6360 | | |
6361 | | /* It is the end of whole-pattern recursion. */ |
6362 | | |
6363 | 0 | offset = Flast_group_offset; |
6364 | | |
6365 | | /* Corrupted heapframes? Trigger an assert and return an error. */ |
6366 | 0 | PCRE2_ASSERT(offset != PCRE2_UNSET); |
6367 | 0 | if (offset == PCRE2_UNSET) return PCRE2_ERROR_INTERNAL; |
6368 | | |
6369 | 0 | N = (heapframe *)((char *)match_data->heapframes + offset); |
6370 | 0 | P = (heapframe *)((char *)N - frame_size); |
6371 | 0 | Flast_group_offset = P->last_group_offset; |
6372 | | |
6373 | | /* Reinstate the previous set of captures and then carry on after the |
6374 | | recursion call. */ |
6375 | |
|
6376 | 0 | Fecode = P->ecode + 1 + LINK_SIZE; |
6377 | |
|
6378 | 0 | if (*Fecode != OP_CREF) |
6379 | 0 | { |
6380 | 0 | memcpy(F->ovector, P->ovector, Foffset_top * sizeof(PCRE2_SIZE)); |
6381 | 0 | Foffset_top = P->offset_top; |
6382 | 0 | } |
6383 | 0 | else |
6384 | 0 | recurse_update_offsets(F, P); |
6385 | |
|
6386 | 0 | Fcapture_last = P->capture_last; |
6387 | 0 | Fcurrent_recurse = P->current_recurse; |
6388 | 0 | continue; /* With next opcode */ |
6389 | | |
6390 | 0 | case OP_COND: /* No need to do anything for these */ |
6391 | 0 | case OP_SCOND: |
6392 | 0 | break; |
6393 | | |
6394 | | /* Non-atomic positive assertions are like OP_BRA, except that the |
6395 | | subject pointer must be put back to where it was at the start of the |
6396 | | assertion. For a variable lookbehind, check its end point. */ |
6397 | | |
6398 | 0 | case OP_ASSERTBACK_NA: |
6399 | 0 | if (branch_start[1 + LINK_SIZE] == OP_VREVERSE && Feptr != P->eptr) |
6400 | 0 | RRETURN(MATCH_NOMATCH); |
6401 | 0 | PCRE2_FALLTHROUGH /* Fall through */ |
6402 | 0 |
|
6403 | 0 | case OP_ASSERT_NA: |
6404 | 0 | if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr; |
6405 | 0 | Feptr = P->eptr; |
6406 | 0 | break; |
6407 | | |
6408 | | /* Atomic positive assertions are like OP_ONCE, except that in addition |
6409 | | the subject pointer must be put back to where it was at the start of the |
6410 | | assertion. For a variable lookbehind, check its end point. */ |
6411 | | |
6412 | 0 | case OP_ASSERTBACK: |
6413 | 0 | if (branch_start[1 + LINK_SIZE] == OP_VREVERSE && Feptr != P->eptr) |
6414 | 0 | RRETURN(MATCH_NOMATCH); |
6415 | 0 | PCRE2_FALLTHROUGH /* Fall through */ |
6416 | 0 |
|
6417 | 0 | case OP_ASSERT: |
6418 | 0 | if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr; |
6419 | 0 | Feptr = P->eptr; |
6420 | 0 | PCRE2_FALLTHROUGH /* Fall through */ |
6421 | | |
6422 | | /* For an atomic group, discard internal backtracking points. We must |
6423 | | also ensure that any remaining branches within the top-level of the group |
6424 | | are not tried. Do this by adjusting the code pointer within the backtrack |
6425 | | frame so that it points to the final branch. */ |
6426 | |
|
6427 | 0 | case OP_ONCE: |
6428 | 0 | Fback_frame = ((char *)F - (char *)P); |
6429 | 0 | for (;;) |
6430 | 0 | { |
6431 | 0 | uint32_t y = GET(P->ecode,1); |
6432 | 0 | if ((P->ecode)[y] != OP_ALT) break; |
6433 | 0 | P->ecode += y; |
6434 | 0 | } |
6435 | 0 | break; |
6436 | | |
6437 | | /* A matching negative assertion returns MATCH, which is turned into |
6438 | | NOMATCH at the assertion level. For a variable lookbehind, check its end |
6439 | | point. */ |
6440 | | |
6441 | 0 | case OP_ASSERTBACK_NOT: |
6442 | 0 | if (branch_start[1 + LINK_SIZE] == OP_VREVERSE && Feptr != P->eptr) |
6443 | 0 | RRETURN(MATCH_NOMATCH); |
6444 | 0 | PCRE2_FALLTHROUGH /* Fall through */ |
6445 | 0 |
|
6446 | 0 | case OP_ASSERT_NOT: |
6447 | 0 | RRETURN(MATCH_MATCH); |
6448 | | |
6449 | | /* A scan substring group must preserve the current end_subject, |
6450 | | and restore it before the backtracking is performed into its sub |
6451 | | pattern. */ |
6452 | | |
6453 | 0 | case OP_ASSERT_SCS: |
6454 | 0 | F->temp_sptr[0] = mb->end_subject; |
6455 | 0 | mb->end_subject = P->temp_sptr[0]; |
6456 | 0 | mb->true_end_subject = mb->end_subject + P->temp_size; |
6457 | 0 | Feptr = P->temp_sptr[1]; |
6458 | |
|
6459 | 0 | RMATCH(Fecode + 1 + LINK_SIZE, RM39); |
6460 | | |
6461 | 0 | mb->end_subject = F->temp_sptr[0]; |
6462 | 0 | mb->true_end_subject = mb->end_subject; |
6463 | 0 | RRETURN(rrc); |
6464 | 0 | break; |
6465 | | |
6466 | | /* At the end of a script run, apply the script-checking rules. This code |
6467 | | will never be exercised if Unicode support is not compiled, because in |
6468 | | that environment script runs cause an error at compile time. */ |
6469 | | |
6470 | 0 | case OP_SCRIPT_RUN: |
6471 | 0 | if (!PRIV(script_run)(P->eptr, Feptr, utf)) RRETURN(MATCH_NOMATCH); |
6472 | 0 | break; |
6473 | | |
6474 | | /* Whole-pattern recursion is coded as a recurse into group 0, and is |
6475 | | handled with OP_BRA above. Other recursion is handled here. */ |
6476 | | |
6477 | 0 | case OP_CBRA: |
6478 | 0 | case OP_CBRAPOS: |
6479 | 0 | case OP_SCBRA: |
6480 | 0 | case OP_SCBRAPOS: |
6481 | 0 | number = GET2(bracode, 1+LINK_SIZE); |
6482 | | |
6483 | | /* Handle a recursively called group. We reinstate the previous set of |
6484 | | captures and then carry on after the recursion call. */ |
6485 | |
|
6486 | 0 | if (Fcurrent_recurse == number) |
6487 | 0 | { |
6488 | 0 | P = (heapframe *)((char *)N - frame_size); |
6489 | 0 | Fecode = P->ecode + 1 + LINK_SIZE; |
6490 | |
|
6491 | 0 | if (*Fecode != OP_CREF) |
6492 | 0 | { |
6493 | 0 | memcpy(F->ovector, P->ovector, Foffset_top * sizeof(PCRE2_SIZE)); |
6494 | 0 | Foffset_top = P->offset_top; |
6495 | 0 | } |
6496 | 0 | else |
6497 | 0 | recurse_update_offsets(F, P); |
6498 | |
|
6499 | 0 | Fcapture_last = P->capture_last; |
6500 | 0 | Fcurrent_recurse = P->current_recurse; |
6501 | 0 | continue; /* With next opcode */ |
6502 | 0 | } |
6503 | | |
6504 | | /* Deal with actual capturing. */ |
6505 | | |
6506 | 0 | offset = (number << 1) - 2; |
6507 | 0 | Fcapture_last = number; |
6508 | 0 | Fovector[offset] = P->eptr - mb->start_subject; |
6509 | 0 | Fovector[offset+1] = Feptr - mb->start_subject; |
6510 | 0 | if (offset >= Foffset_top) Foffset_top = offset + 2; |
6511 | 0 | break; |
6512 | 0 | } /* End actions relating to the starting opcode */ |
6513 | | |
6514 | | /* OP_KETRPOS is a possessive repeating ket. Remember the current position, |
6515 | | and return the MATCH_KETRPOS. This makes it possible to do the repeats one |
6516 | | at a time from the outer level. This must precede the empty string test - |
6517 | | in this case that test is done at the outer level. */ |
6518 | | |
6519 | 0 | if (*Fecode == OP_KETRPOS) |
6520 | 0 | { |
6521 | 0 | memcpy((char *)P + offsetof(heapframe, eptr), |
6522 | 0 | (char *)F + offsetof(heapframe, eptr), |
6523 | 0 | frame_copy_size); |
6524 | 0 | RRETURN(MATCH_KETRPOS); |
6525 | 0 | } |
6526 | | |
6527 | | /* Handle the different kinds of closing brackets. A non-repeating ket |
6528 | | needs no special action, just continuing at this level. This also happens |
6529 | | for the repeating kets if the group matched no characters, in order to |
6530 | | forcibly break infinite loops. Otherwise, the repeating kets try the rest |
6531 | | of the pattern or restart from the preceding bracket, in the appropriate |
6532 | | order. */ |
6533 | | |
6534 | 0 | if (Fop != OP_KET && (P == NULL || Feptr != P->eptr)) |
6535 | 0 | { |
6536 | 0 | if (Fop == OP_KETRMIN) |
6537 | 0 | { |
6538 | 0 | RMATCH(Fecode + 1 + LINK_SIZE, RM6); |
6539 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
6540 | 0 | Fecode -= GET(Fecode, 1); |
6541 | 0 | break; /* End of ket processing */ |
6542 | 0 | } |
6543 | | |
6544 | | /* Repeat the maximum number of times (KETRMAX) */ |
6545 | | |
6546 | 0 | RMATCH(bracode, RM7); |
6547 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
6548 | 0 | } |
6549 | | |
6550 | | /* Carry on at this level for a non-repeating ket, or after matching an |
6551 | | empty string, or after repeating for a maximum number of times. */ |
6552 | | |
6553 | 0 | Fecode += 1 + LINK_SIZE; |
6554 | 0 | break; |
6555 | | |
6556 | | |
6557 | | /* ===================================================================== */ |
6558 | | /* Start and end of line assertions, not multiline mode. */ |
6559 | | |
6560 | 0 | case OP_CIRC: /* Start of line, unless PCRE2_NOTBOL is set. */ |
6561 | 0 | if (Feptr != mb->start_subject || (mb->moptions & PCRE2_NOTBOL) != 0) |
6562 | 0 | RRETURN(MATCH_NOMATCH); |
6563 | 0 | Fecode++; |
6564 | 0 | break; |
6565 | | |
6566 | 0 | case OP_SOD: /* Unconditional start of subject */ |
6567 | 0 | if (Feptr != mb->start_subject) RRETURN(MATCH_NOMATCH); |
6568 | 0 | Fecode++; |
6569 | 0 | break; |
6570 | | |
6571 | | /* When PCRE2_NOTEOL is unset, assert before the subject end, or a |
6572 | | terminating newline unless PCRE2_DOLLAR_ENDONLY is set. */ |
6573 | | |
6574 | 0 | case OP_DOLL: |
6575 | 0 | if ((mb->moptions & PCRE2_NOTEOL) != 0) RRETURN(MATCH_NOMATCH); |
6576 | 0 | if ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0) goto ASSERT_NL_OR_EOS; |
6577 | | |
6578 | 0 | PCRE2_FALLTHROUGH /* Fall through */ |
6579 | 0 | /* Unconditional end of subject assertion (\z). */ |
6580 | 0 |
|
6581 | 0 | case OP_EOD: |
6582 | 0 | if (Feptr < mb->true_end_subject) RRETURN(MATCH_NOMATCH); |
6583 | 0 | if (mb->partial != 0) |
6584 | 0 | { |
6585 | 0 | mb->hitend = TRUE; |
6586 | 0 | if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; |
6587 | 0 | } |
6588 | 0 | Fecode++; |
6589 | 0 | break; |
6590 | | |
6591 | | /* End of subject or ending \n assertion (\Z) */ |
6592 | | |
6593 | 0 | case OP_EODN: |
6594 | 0 | ASSERT_NL_OR_EOS: |
6595 | 0 | if (Feptr < mb->true_end_subject && |
6596 | 0 | (!IS_NEWLINE(Feptr) || Feptr != mb->true_end_subject - mb->nllen)) |
6597 | 0 | { |
6598 | 0 | if (mb->partial != 0 && |
6599 | 0 | Feptr + 1 >= mb->end_subject && |
6600 | 0 | NLBLOCK->nltype == NLTYPE_FIXED && |
6601 | 0 | NLBLOCK->nllen == 2 && |
6602 | 0 | UCHAR21TEST(Feptr) == NLBLOCK->nl[0]) |
6603 | 0 | { |
6604 | 0 | mb->hitend = TRUE; |
6605 | 0 | if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; |
6606 | 0 | } |
6607 | 0 | RRETURN(MATCH_NOMATCH); |
6608 | 0 | } |
6609 | | |
6610 | | /* Either at end of string or \n before end. */ |
6611 | | |
6612 | 0 | if (mb->partial != 0) |
6613 | 0 | { |
6614 | 0 | mb->hitend = TRUE; |
6615 | 0 | if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; |
6616 | 0 | } |
6617 | 0 | Fecode++; |
6618 | 0 | break; |
6619 | | |
6620 | | |
6621 | | /* ===================================================================== */ |
6622 | | /* Start and end of line assertions, multiline mode. */ |
6623 | | |
6624 | | /* Start of subject unless notbol, or after any newline except for one at |
6625 | | the very end, unless PCRE2_ALT_CIRCUMFLEX is set. */ |
6626 | | |
6627 | 0 | case OP_CIRCM: |
6628 | 0 | if ((mb->moptions & PCRE2_NOTBOL) != 0 && Feptr == mb->start_subject) |
6629 | 0 | RRETURN(MATCH_NOMATCH); |
6630 | 0 | if (Feptr != mb->start_subject && |
6631 | 0 | ((Feptr == mb->end_subject && |
6632 | 0 | (mb->poptions & PCRE2_ALT_CIRCUMFLEX) == 0) || |
6633 | 0 | !WAS_NEWLINE(Feptr))) |
6634 | 0 | RRETURN(MATCH_NOMATCH); |
6635 | 0 | Fecode++; |
6636 | 0 | break; |
6637 | | |
6638 | | /* Assert before any newline, or before end of subject unless noteol is |
6639 | | set. */ |
6640 | | |
6641 | 0 | case OP_DOLLM: |
6642 | 0 | if (Feptr < mb->end_subject) |
6643 | 0 | { |
6644 | 0 | if (!IS_NEWLINE(Feptr)) |
6645 | 0 | { |
6646 | 0 | if (mb->partial != 0 && |
6647 | 0 | Feptr + 1 >= mb->end_subject && |
6648 | 0 | NLBLOCK->nltype == NLTYPE_FIXED && |
6649 | 0 | NLBLOCK->nllen == 2 && |
6650 | 0 | UCHAR21TEST(Feptr) == NLBLOCK->nl[0]) |
6651 | 0 | { |
6652 | 0 | mb->hitend = TRUE; |
6653 | 0 | if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; |
6654 | 0 | } |
6655 | 0 | RRETURN(MATCH_NOMATCH); |
6656 | 0 | } |
6657 | 0 | } |
6658 | 0 | else |
6659 | 0 | { |
6660 | 0 | if ((mb->moptions & PCRE2_NOTEOL) != 0) RRETURN(MATCH_NOMATCH); |
6661 | 0 | SCHECK_PARTIAL(); |
6662 | 0 | } |
6663 | 0 | Fecode++; |
6664 | 0 | break; |
6665 | | |
6666 | | |
6667 | | /* ===================================================================== */ |
6668 | | /* Start of match assertion */ |
6669 | | |
6670 | 0 | case OP_SOM: |
6671 | 0 | if (Feptr != mb->start_subject + mb->start_offset) RRETURN(MATCH_NOMATCH); |
6672 | 0 | Fecode++; |
6673 | 0 | break; |
6674 | | |
6675 | | |
6676 | | /* ===================================================================== */ |
6677 | | /* Reset the start of match point */ |
6678 | | |
6679 | 0 | case OP_SET_SOM: |
6680 | 0 | Fstart_match = Feptr; |
6681 | 0 | Fecode++; |
6682 | 0 | break; |
6683 | | |
6684 | | |
6685 | | /* ===================================================================== */ |
6686 | | /* Word boundary assertions. Find out if the previous and current |
6687 | | characters are "word" characters. It takes a bit more work in UTF mode. |
6688 | | Characters > 255 are assumed to be "non-word" characters when PCRE2_UCP is |
6689 | | not set. When it is set, use Unicode properties if available, even when not |
6690 | | in UTF mode. Remember the earliest and latest consulted characters. */ |
6691 | | |
6692 | 0 | case OP_NOT_WORD_BOUNDARY: |
6693 | 0 | case OP_WORD_BOUNDARY: |
6694 | 0 | case OP_NOT_UCP_WORD_BOUNDARY: |
6695 | 0 | case OP_UCP_WORD_BOUNDARY: |
6696 | 0 | if (Feptr == mb->check_subject) prev_is_word = FALSE; else |
6697 | 0 | { |
6698 | 0 | PCRE2_SPTR lastptr = Feptr - 1; |
6699 | 0 | #ifdef SUPPORT_UNICODE |
6700 | 0 | if (utf) |
6701 | 0 | { |
6702 | 0 | BACKCHAR(lastptr); |
6703 | 0 | GETCHAR(fc, lastptr); |
6704 | 0 | } |
6705 | 0 | else |
6706 | 0 | #endif /* SUPPORT_UNICODE */ |
6707 | 0 | fc = *lastptr; |
6708 | 0 | if (lastptr < mb->start_used_ptr) mb->start_used_ptr = lastptr; |
6709 | 0 | #ifdef SUPPORT_UNICODE |
6710 | 0 | if (Fop == OP_UCP_WORD_BOUNDARY || Fop == OP_NOT_UCP_WORD_BOUNDARY) |
6711 | 0 | { |
6712 | 0 | int chartype = UCD_CHARTYPE(fc); |
6713 | 0 | int category = PRIV(ucp_gentype)[chartype]; |
6714 | 0 | prev_is_word = (category == ucp_L || category == ucp_N || |
6715 | 0 | chartype == ucp_Mn || chartype == ucp_Pc); |
6716 | 0 | } |
6717 | 0 | else |
6718 | 0 | #endif /* SUPPORT_UNICODE */ |
6719 | 0 | prev_is_word = CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0; |
6720 | 0 | } |
6721 | | |
6722 | | /* Get status of next character */ |
6723 | |
|
6724 | 0 | if (Feptr >= mb->end_subject) |
6725 | 0 | { |
6726 | 0 | SCHECK_PARTIAL(); |
6727 | 0 | cur_is_word = FALSE; |
6728 | 0 | } |
6729 | 0 | else |
6730 | 0 | { |
6731 | 0 | PCRE2_SPTR nextptr = Feptr + 1; |
6732 | 0 | #ifdef SUPPORT_UNICODE |
6733 | 0 | if (utf) |
6734 | 0 | { |
6735 | 0 | FORWARDCHARTEST(nextptr, mb->end_subject); |
6736 | 0 | GETCHAR(fc, Feptr); |
6737 | 0 | } |
6738 | 0 | else |
6739 | 0 | #endif /* SUPPORT_UNICODE */ |
6740 | 0 | fc = *Feptr; |
6741 | 0 | if (nextptr > mb->last_used_ptr) mb->last_used_ptr = nextptr; |
6742 | 0 | #ifdef SUPPORT_UNICODE |
6743 | 0 | if (Fop == OP_UCP_WORD_BOUNDARY || Fop == OP_NOT_UCP_WORD_BOUNDARY) |
6744 | 0 | { |
6745 | 0 | int chartype = UCD_CHARTYPE(fc); |
6746 | 0 | int category = PRIV(ucp_gentype)[chartype]; |
6747 | 0 | cur_is_word = (category == ucp_L || category == ucp_N || |
6748 | 0 | chartype == ucp_Mn || chartype == ucp_Pc); |
6749 | 0 | } |
6750 | 0 | else |
6751 | 0 | #endif /* SUPPORT_UNICODE */ |
6752 | 0 | cur_is_word = CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0; |
6753 | 0 | } |
6754 | | |
6755 | | /* Now see if the situation is what we want */ |
6756 | | |
6757 | 0 | if ((*Fecode++ == OP_WORD_BOUNDARY || Fop == OP_UCP_WORD_BOUNDARY)? |
6758 | 0 | cur_is_word == prev_is_word : cur_is_word != prev_is_word) |
6759 | 0 | RRETURN(MATCH_NOMATCH); |
6760 | 0 | break; |
6761 | | |
6762 | | |
6763 | | /* ===================================================================== */ |
6764 | | /* Backtracking (*VERB)s, with and without arguments. Note that if the |
6765 | | pattern is successfully matched, we do not come back from RMATCH. */ |
6766 | | |
6767 | 0 | case OP_MARK: |
6768 | 0 | Fmark = mb->nomatch_mark = Fecode + 2; |
6769 | 0 | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM12); |
6770 | | |
6771 | | /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an |
6772 | | argument, and we must check whether that argument matches this MARK's |
6773 | | argument. It is passed back in mb->verb_skip_ptr. If it does match, we |
6774 | | return MATCH_SKIP with mb->verb_skip_ptr now pointing to the subject |
6775 | | position that corresponds to this mark. Otherwise, pass back the return |
6776 | | code unaltered. */ |
6777 | | |
6778 | 0 | if (rrc == MATCH_SKIP_ARG && |
6779 | 0 | PRIV(strcmp)(Fecode + 2, mb->verb_skip_ptr) == 0) |
6780 | 0 | { |
6781 | 0 | mb->verb_skip_ptr = Feptr; /* Pass back current position */ |
6782 | 0 | RRETURN(MATCH_SKIP); |
6783 | 0 | } |
6784 | 0 | RRETURN(rrc); |
6785 | | |
6786 | 0 | case OP_FAIL: |
6787 | 0 | RRETURN(MATCH_NOMATCH); |
6788 | | |
6789 | | /* Record the current recursing group number in mb->verb_current_recurse |
6790 | | when a backtracking return such as MATCH_COMMIT is given. This enables the |
6791 | | recurse processing to catch verbs from within the recursion. */ |
6792 | | |
6793 | 0 | case OP_COMMIT: |
6794 | 0 | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM13); |
6795 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
6796 | 0 | mb->verb_current_recurse = Fcurrent_recurse; |
6797 | 0 | RRETURN(MATCH_COMMIT); |
6798 | | |
6799 | 0 | case OP_COMMIT_ARG: |
6800 | 0 | Fmark = mb->nomatch_mark = Fecode + 2; |
6801 | 0 | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM36); |
6802 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
6803 | 0 | mb->verb_current_recurse = Fcurrent_recurse; |
6804 | 0 | RRETURN(MATCH_COMMIT); |
6805 | | |
6806 | 0 | case OP_PRUNE: |
6807 | 0 | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM14); |
6808 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
6809 | 0 | mb->verb_current_recurse = Fcurrent_recurse; |
6810 | 0 | RRETURN(MATCH_PRUNE); |
6811 | | |
6812 | 0 | case OP_PRUNE_ARG: |
6813 | 0 | Fmark = mb->nomatch_mark = Fecode + 2; |
6814 | 0 | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM15); |
6815 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
6816 | 0 | mb->verb_current_recurse = Fcurrent_recurse; |
6817 | 0 | RRETURN(MATCH_PRUNE); |
6818 | | |
6819 | 0 | case OP_SKIP: |
6820 | 0 | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM16); |
6821 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
6822 | 0 | mb->verb_skip_ptr = Feptr; /* Pass back current position */ |
6823 | 0 | mb->verb_current_recurse = Fcurrent_recurse; |
6824 | 0 | RRETURN(MATCH_SKIP); |
6825 | | |
6826 | | /* Note that, for Perl compatibility, SKIP with an argument does NOT set |
6827 | | nomatch_mark. When a pattern match ends with a SKIP_ARG for which there was |
6828 | | not a matching mark, we have to re-run the match, ignoring the SKIP_ARG |
6829 | | that failed and any that precede it (either they also failed, or were not |
6830 | | triggered). To do this, we maintain a count of executed SKIP_ARGs. If a |
6831 | | SKIP_ARG gets to top level, the match is re-run with mb->ignore_skip_arg |
6832 | | set to the count of the one that failed. */ |
6833 | | |
6834 | 0 | case OP_SKIP_ARG: |
6835 | 0 | mb->skip_arg_count++; |
6836 | 0 | if (mb->skip_arg_count <= mb->ignore_skip_arg) |
6837 | 0 | { |
6838 | 0 | Fecode += PRIV(OP_lengths)[*Fecode] + Fecode[1]; |
6839 | 0 | break; |
6840 | 0 | } |
6841 | 0 | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM17); |
6842 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
6843 | | |
6844 | | /* Pass back the current skip name and return the special MATCH_SKIP_ARG |
6845 | | return code. This will either be caught by a matching MARK, or get to the |
6846 | | top, where it causes a rematch with mb->ignore_skip_arg set to the value of |
6847 | | mb->skip_arg_count. */ |
6848 | | |
6849 | 0 | mb->verb_skip_ptr = Fecode + 2; |
6850 | 0 | mb->verb_current_recurse = Fcurrent_recurse; |
6851 | 0 | RRETURN(MATCH_SKIP_ARG); |
6852 | | |
6853 | | /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that |
6854 | | the branch in which it occurs can be determined. */ |
6855 | | |
6856 | 0 | case OP_THEN: |
6857 | 0 | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM18); |
6858 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
6859 | 0 | mb->verb_ecode_ptr = Fecode; |
6860 | 0 | mb->verb_current_recurse = Fcurrent_recurse; |
6861 | 0 | RRETURN(MATCH_THEN); |
6862 | | |
6863 | 0 | case OP_THEN_ARG: |
6864 | 0 | Fmark = mb->nomatch_mark = Fecode + 2; |
6865 | 0 | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM19); |
6866 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
6867 | 0 | mb->verb_ecode_ptr = Fecode; |
6868 | 0 | mb->verb_current_recurse = Fcurrent_recurse; |
6869 | 0 | RRETURN(MATCH_THEN); |
6870 | | |
6871 | | |
6872 | | /* ===================================================================== */ |
6873 | | /* There's been some horrible disaster. Arrival here can only mean there is |
6874 | | something seriously wrong in the code above or the OP_xxx definitions. */ |
6875 | | |
6876 | | /* LCOV_EXCL_START */ |
6877 | 0 | default: |
6878 | 0 | PCRE2_DEBUG_UNREACHABLE(); |
6879 | 0 | return PCRE2_ERROR_INTERNAL; |
6880 | | /* LCOV_EXCL_STOP */ |
6881 | 0 | } |
6882 | | |
6883 | | /* Do not insert any code in here without much thought; it is assumed |
6884 | | that "continue" in the code above comes out to here to repeat the main |
6885 | | loop. */ |
6886 | |
|
6887 | 0 | } /* End of main loop */ |
6888 | | |
6889 | 0 | PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */ |
6890 | | |
6891 | | /* ========================================================================= */ |
6892 | | /* The RRETURN() macro jumps here. The number that is saved in Freturn_id |
6893 | | indicates which label we actually want to return to. The value in Frdepth is |
6894 | | the index number of the frame in the vector. The return value has been placed |
6895 | | in rrc. */ |
6896 | |
|
6897 | 0 | #define LBL(val) case val: goto L_RM##val; |
6898 | |
|
6899 | 0 | RETURN_SWITCH: |
6900 | 0 | if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr; |
6901 | 0 | if (Frdepth == 0) return rrc; /* Exit from the top level */ |
6902 | 0 | F = (heapframe *)((char *)F - Fback_frame); /* Backtrack */ |
6903 | 0 | mb->cb->callout_flags |= PCRE2_CALLOUT_BACKTRACK; /* Note for callouts */ |
6904 | |
|
6905 | | #ifdef DEBUG_SHOW_RMATCH |
6906 | | fprintf(stderr, "++ RETURN %d to RM%d\n", rrc, Freturn_id); |
6907 | | #endif |
6908 | |
|
6909 | 0 | switch (Freturn_id) |
6910 | 0 | { |
6911 | 0 | LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8) |
6912 | 0 | LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(16) |
6913 | 0 | LBL(17) LBL(18) LBL(19) LBL(20) LBL(21) LBL(22) LBL(23) LBL(24) |
6914 | 0 | LBL(25) LBL(26) LBL(27) LBL(28) LBL(29) LBL(30) LBL(31) LBL(32) |
6915 | 0 | LBL(33) LBL(34) LBL(35) LBL(36) LBL(37) LBL(38) LBL(39) |
6916 | | |
6917 | 0 | #ifdef SUPPORT_WIDE_CHARS |
6918 | 0 | LBL(100) LBL(101) LBL(102) LBL(103) |
6919 | 0 | #endif |
6920 | | |
6921 | 0 | #ifdef SUPPORT_UNICODE |
6922 | 0 | LBL(200) LBL(201) LBL(202) LBL(203) LBL(204) LBL(205) LBL(206) |
6923 | 0 | LBL(207) LBL(208) LBL(209) LBL(210) LBL(211) LBL(212) LBL(213) |
6924 | 0 | LBL(214) LBL(215) LBL(216) LBL(217) LBL(218) LBL(219) LBL(220) |
6925 | 0 | LBL(221) LBL(222) LBL(223) LBL(224) |
6926 | 0 | #endif |
6927 | | |
6928 | | /* LCOV_EXCL_START */ |
6929 | 0 | default: |
6930 | 0 | PCRE2_DEBUG_UNREACHABLE(); |
6931 | 0 | return PCRE2_ERROR_INTERNAL; |
6932 | | /* LCOV_EXCL_STOP */ |
6933 | 0 | } |
6934 | 0 | #undef LBL |
6935 | 0 | } |
6936 | | |
6937 | | |
6938 | | /************************************************* |
6939 | | * Match a Regular Expression * |
6940 | | *************************************************/ |
6941 | | |
6942 | | /* This function applies a compiled pattern to a subject string and picks out |
6943 | | portions of the string if it matches. Two elements in the vector are set for |
6944 | | each substring: the offsets to the start and end of the substring. |
6945 | | |
6946 | | Arguments: |
6947 | | code points to the compiled expression |
6948 | | subject points to the subject string |
6949 | | length length of subject string (may contain binary zeros) |
6950 | | start_offset where to start in the subject string |
6951 | | options option bits |
6952 | | match_data points to a match_data block |
6953 | | mcontext points to a PCRE2 match context |
6954 | | |
6955 | | Returns: > 0 => success; value is the number of ovector pairs filled |
6956 | | = 0 => success, but ovector is not big enough |
6957 | | = -1 => failed to match (PCRE2_ERROR_NOMATCH) |
6958 | | = -2 => partial match (PCRE2_ERROR_PARTIAL) |
6959 | | < -2 => some kind of unexpected problem |
6960 | | */ |
6961 | | |
6962 | | PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION |
6963 | | pcre2_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, |
6964 | | PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data, |
6965 | | pcre2_match_context *mcontext) |
6966 | 0 | { |
6967 | 0 | int rc; |
6968 | 0 | const uint8_t *start_bits = NULL; |
6969 | 0 | const pcre2_real_code *re = (const pcre2_real_code *)code; |
6970 | 0 | uint32_t original_options = options; |
6971 | |
|
6972 | 0 | BOOL anchored; |
6973 | 0 | BOOL firstline; |
6974 | 0 | BOOL has_first_cu = FALSE; |
6975 | 0 | BOOL has_req_cu = FALSE; |
6976 | 0 | BOOL startline; |
6977 | |
|
6978 | 0 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
6979 | 0 | PCRE2_SPTR memchr_found_first_cu; |
6980 | 0 | PCRE2_SPTR memchr_found_first_cu2; |
6981 | 0 | #endif |
6982 | |
|
6983 | 0 | PCRE2_UCHAR first_cu = 0; |
6984 | 0 | PCRE2_UCHAR first_cu2 = 0; |
6985 | 0 | PCRE2_UCHAR req_cu = 0; |
6986 | 0 | PCRE2_UCHAR req_cu2 = 0; |
6987 | |
|
6988 | 0 | PCRE2_UCHAR null_str[1] = { 0xcd }; |
6989 | 0 | PCRE2_SPTR original_subject = subject; |
6990 | 0 | PCRE2_SPTR bumpalong_limit; |
6991 | 0 | PCRE2_SPTR end_subject; |
6992 | 0 | PCRE2_SPTR true_end_subject; |
6993 | 0 | PCRE2_SPTR start_match; |
6994 | 0 | PCRE2_SPTR req_cu_ptr; |
6995 | 0 | PCRE2_SPTR start_partial; |
6996 | 0 | PCRE2_SPTR match_partial; |
6997 | |
|
6998 | | #ifdef SUPPORT_JIT |
6999 | | BOOL use_jit; |
7000 | | #endif |
7001 | | |
7002 | | /* This flag is needed even when Unicode is not supported for convenience |
7003 | | (it is used by the IS_NEWLINE macro). */ |
7004 | |
|
7005 | 0 | BOOL utf = FALSE; |
7006 | |
|
7007 | 0 | #ifdef SUPPORT_UNICODE |
7008 | 0 | BOOL ucp = FALSE; |
7009 | 0 | BOOL allow_invalid; |
7010 | 0 | uint32_t fragment_options = 0; |
7011 | | #ifdef SUPPORT_JIT |
7012 | | BOOL jit_checked_utf = FALSE; |
7013 | | #endif |
7014 | 0 | #endif /* SUPPORT_UNICODE */ |
7015 | |
|
7016 | 0 | PCRE2_SIZE frame_size; |
7017 | 0 | PCRE2_SIZE heapframes_size; |
7018 | | |
7019 | | /* We need to have mb as a pointer to a match block, because the IS_NEWLINE |
7020 | | macro is used below, and it expects NLBLOCK to be defined as a pointer. */ |
7021 | |
|
7022 | 0 | pcre2_callout_block cb; |
7023 | 0 | match_block actual_match_block; |
7024 | 0 | match_block *mb = &actual_match_block; |
7025 | | |
7026 | | /* Recognize NULL, length 0 as an empty string. */ |
7027 | |
|
7028 | 0 | if (subject == NULL && length == 0) subject = null_str; |
7029 | | |
7030 | | /* Plausibility checks */ |
7031 | |
|
7032 | 0 | if (match_data == NULL) return PCRE2_ERROR_NULL; |
7033 | 0 | if (code == NULL || subject == NULL) |
7034 | 0 | return match_data->rc = PCRE2_ERROR_NULL; |
7035 | 0 | if ((options & ~PUBLIC_MATCH_OPTIONS) != 0) |
7036 | 0 | return match_data->rc = PCRE2_ERROR_BADOPTION; |
7037 | | |
7038 | 0 | start_match = subject + start_offset; |
7039 | 0 | req_cu_ptr = start_match - 1; |
7040 | 0 | if (length == PCRE2_ZERO_TERMINATED) |
7041 | 0 | { |
7042 | 0 | length = PRIV(strlen)(subject); |
7043 | 0 | } |
7044 | 0 | true_end_subject = end_subject = subject + length; |
7045 | |
|
7046 | 0 | if (start_offset > length) return match_data->rc = PCRE2_ERROR_BADOFFSET; |
7047 | | |
7048 | | /* Check that the first field in the block is the magic number. */ |
7049 | | |
7050 | 0 | if (re->magic_number != MAGIC_NUMBER) |
7051 | 0 | return match_data->rc = PCRE2_ERROR_BADMAGIC; |
7052 | | |
7053 | | /* Check the code unit width. */ |
7054 | | |
7055 | 0 | if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8) |
7056 | 0 | return match_data->rc = PCRE2_ERROR_BADMODE; |
7057 | | |
7058 | | /* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the |
7059 | | options variable for this function. Users of PCRE2 who are not calling the |
7060 | | function directly would like to have a way of setting these flags, in the same |
7061 | | way that they can set pcre2_compile() flags like PCRE2_NO_AUTO_POSSESS with |
7062 | | constructions like (*NO_AUTOPOSSESS). To enable this, (*NOTEMPTY) and |
7063 | | (*NOTEMPTY_ATSTART) set bits in the pattern's "flags" field, which we now |
7064 | | transfer to the options for this function. The bits are guaranteed to be |
7065 | | adjacent, but do not have the same values. This bit of Boolean trickery assumes |
7066 | | that the match-time bits are not more significant than the flag bits. If by |
7067 | | accident this is not the case, a compile-time division by zero error will |
7068 | | occur. */ |
7069 | | |
7070 | 0 | #define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET) |
7071 | 0 | #define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART) |
7072 | 0 | options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1))); |
7073 | 0 | #undef FF |
7074 | 0 | #undef OO |
7075 | | |
7076 | | /* If the pattern was successfully studied with JIT support, we will run the |
7077 | | JIT executable instead of the rest of this function. Most options must be set |
7078 | | at compile time for the JIT code to be usable. */ |
7079 | |
|
7080 | | #ifdef SUPPORT_JIT |
7081 | | use_jit = (re->executable_jit != NULL && |
7082 | | (options & ~PUBLIC_JIT_MATCH_OPTIONS) == 0); |
7083 | | #endif |
7084 | | |
7085 | | /* Initialize UTF/UCP parameters. */ |
7086 | |
|
7087 | 0 | #ifdef SUPPORT_UNICODE |
7088 | 0 | utf = (re->overall_options & PCRE2_UTF) != 0; |
7089 | 0 | allow_invalid = (re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0; |
7090 | 0 | ucp = (re->overall_options & PCRE2_UCP) != 0; |
7091 | 0 | #endif /* SUPPORT_UNICODE */ |
7092 | | |
7093 | | /* Convert the partial matching flags into an integer. */ |
7094 | |
|
7095 | 0 | mb->partial = ((options & PCRE2_PARTIAL_HARD) != 0)? 2 : |
7096 | 0 | ((options & PCRE2_PARTIAL_SOFT) != 0)? 1 : 0; |
7097 | | |
7098 | | /* Partial matching and PCRE2_ENDANCHORED are currently not allowed at the same |
7099 | | time. */ |
7100 | |
|
7101 | 0 | if (mb->partial != 0 && |
7102 | 0 | ((re->overall_options | options) & PCRE2_ENDANCHORED) != 0) |
7103 | 0 | return match_data->rc = PCRE2_ERROR_BADOPTION; |
7104 | | |
7105 | | /* It is an error to set an offset limit without setting the flag at compile |
7106 | | time. */ |
7107 | | |
7108 | 0 | if (mcontext != NULL && mcontext->offset_limit != PCRE2_UNSET && |
7109 | 0 | (re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0) |
7110 | 0 | return match_data->rc = PCRE2_ERROR_BADOFFSETLIMIT; |
7111 | | |
7112 | | /* If the match data block was previously used with PCRE2_COPY_MATCHED_SUBJECT, |
7113 | | free the memory that was obtained. Set the field to NULL for match error |
7114 | | cases. */ |
7115 | | |
7116 | 0 | if ((match_data->flags & PCRE2_MD_COPIED_SUBJECT) != 0) |
7117 | 0 | { |
7118 | 0 | match_data->memctl.free((void *)match_data->subject, |
7119 | 0 | match_data->memctl.memory_data); |
7120 | 0 | match_data->flags &= ~PCRE2_MD_COPIED_SUBJECT; |
7121 | 0 | } |
7122 | 0 | match_data->subject = NULL; |
7123 | | |
7124 | | /* Zero the error offset in case the first code unit is invalid UTF. */ |
7125 | |
|
7126 | 0 | match_data->startchar = 0; |
7127 | | |
7128 | | |
7129 | | /* ============================= JIT matching ============================== */ |
7130 | | |
7131 | | /* Prepare for JIT matching. Check a UTF string for validity unless no check is |
7132 | | requested or invalid UTF can be handled. We check only the portion of the |
7133 | | subject that might be inspected during matching - from the offset minus the |
7134 | | maximum lookbehind to the given length. This saves time when a small part of a |
7135 | | large subject is being matched by the use of a starting offset. Note that the |
7136 | | maximum lookbehind is a number of characters, not code units. */ |
7137 | |
|
7138 | | #ifdef SUPPORT_JIT |
7139 | | if (use_jit) |
7140 | | { |
7141 | | #ifdef SUPPORT_UNICODE |
7142 | | if (utf && (options & PCRE2_NO_UTF_CHECK) == 0 && !allow_invalid) |
7143 | | { |
7144 | | |
7145 | | /* For 8-bit and 16-bit UTF, check that the first code unit is a valid |
7146 | | character start. */ |
7147 | | |
7148 | | #if PCRE2_CODE_UNIT_WIDTH != 32 |
7149 | | if (start_match < end_subject && NOT_FIRSTCU(*start_match)) |
7150 | | { |
7151 | | if (start_offset > 0) return match_data->rc = PCRE2_ERROR_BADUTFOFFSET; |
7152 | | #if PCRE2_CODE_UNIT_WIDTH == 8 |
7153 | | return match_data->rc = PCRE2_ERROR_UTF8_ERR20; /* Isolated 0x80 byte */ |
7154 | | #else |
7155 | | return match_data->rc = PCRE2_ERROR_UTF16_ERR3; /* Isolated low surrogate */ |
7156 | | #endif |
7157 | | } |
7158 | | #endif /* WIDTH != 32 */ |
7159 | | |
7160 | | /* Move back by the maximum lookbehind, just in case it happens at the very |
7161 | | start of matching. */ |
7162 | | |
7163 | | #if PCRE2_CODE_UNIT_WIDTH != 32 |
7164 | | for (unsigned int i = re->max_lookbehind; i > 0 && start_match > subject; i--) |
7165 | | { |
7166 | | start_match--; |
7167 | | while (start_match > subject && |
7168 | | #if PCRE2_CODE_UNIT_WIDTH == 8 |
7169 | | (*start_match & 0xc0) == 0x80) |
7170 | | #else /* 16-bit */ |
7171 | | (*start_match & 0xfc00) == 0xdc00) |
7172 | | #endif |
7173 | | start_match--; |
7174 | | } |
7175 | | #else /* PCRE2_CODE_UNIT_WIDTH != 32 */ |
7176 | | |
7177 | | /* In the 32-bit library, one code unit equals one character. However, |
7178 | | we cannot just subtract the lookbehind and then compare pointers, because |
7179 | | a very large lookbehind could create an invalid pointer. */ |
7180 | | |
7181 | | if (start_offset >= re->max_lookbehind) |
7182 | | start_match -= re->max_lookbehind; |
7183 | | else |
7184 | | start_match = subject; |
7185 | | #endif /* PCRE2_CODE_UNIT_WIDTH != 32 */ |
7186 | | |
7187 | | /* Validate the relevant portion of the subject. Adjust the offset of an |
7188 | | invalid code point to be an absolute offset in the whole string. */ |
7189 | | |
7190 | | rc = PRIV(valid_utf)(start_match, |
7191 | | length - (start_match - subject), &(match_data->startchar)); |
7192 | | if (rc != 0) |
7193 | | { |
7194 | | match_data->startchar += start_match - subject; |
7195 | | return match_data->rc = rc; |
7196 | | } |
7197 | | jit_checked_utf = TRUE; |
7198 | | } |
7199 | | #endif /* SUPPORT_UNICODE */ |
7200 | | |
7201 | | /* If JIT returns BADOPTION, which means that the selected complete or |
7202 | | partial matching mode was not compiled, fall through to the interpreter. */ |
7203 | | |
7204 | | rc = pcre2_jit_match(code, subject, length, start_offset, options, |
7205 | | match_data, mcontext); |
7206 | | if (rc != PCRE2_ERROR_JIT_BADOPTION) |
7207 | | { |
7208 | | match_data->options = original_options; |
7209 | | if (rc >= 0 && (options & PCRE2_COPY_MATCHED_SUBJECT) != 0) |
7210 | | { |
7211 | | if (length != 0) |
7212 | | { |
7213 | | match_data->subject = match_data->memctl.malloc(CU2BYTES(length), |
7214 | | match_data->memctl.memory_data); |
7215 | | if (match_data->subject == NULL) |
7216 | | return match_data->rc = PCRE2_ERROR_NOMEMORY; |
7217 | | memcpy((void *)match_data->subject, subject, CU2BYTES(length)); |
7218 | | } |
7219 | | else |
7220 | | match_data->subject = NULL; |
7221 | | match_data->flags |= PCRE2_MD_COPIED_SUBJECT; |
7222 | | } |
7223 | | else |
7224 | | { |
7225 | | /* When pcre2_jit_match sets the subject, it doesn't know what the |
7226 | | original passed-in pointer was. */ |
7227 | | if (match_data->subject != NULL) match_data->subject = original_subject; |
7228 | | } |
7229 | | return rc; |
7230 | | } |
7231 | | } |
7232 | | #endif /* SUPPORT_JIT */ |
7233 | | |
7234 | | /* ========================= End of JIT matching ========================== */ |
7235 | | |
7236 | | |
7237 | | /* Proceed with non-JIT matching. The default is to allow lookbehinds to the |
7238 | | start of the subject. A UTF check when there is a non-zero offset may change |
7239 | | this. */ |
7240 | |
|
7241 | 0 | mb->check_subject = subject; |
7242 | | |
7243 | | /* If a UTF subject string was not checked for validity in the JIT code above, |
7244 | | check it here, and handle support for invalid UTF strings. The check above |
7245 | | happens only when invalid UTF is not supported and PCRE2_NO_CHECK_UTF is unset. |
7246 | | If we get here in those circumstances, it means the subject string is valid, |
7247 | | but for some reason JIT matching was not successful. There is no need to check |
7248 | | the subject again. |
7249 | | |
7250 | | We check only the portion of the subject that might be inspected during |
7251 | | matching - from the offset minus the maximum lookbehind to the given length. |
7252 | | This saves time when a small part of a large subject is being matched by the |
7253 | | use of a starting offset. Note that the maximum lookbehind is a number of |
7254 | | characters, not code units. |
7255 | | |
7256 | | Note also that support for invalid UTF forces a check, overriding the setting |
7257 | | of PCRE2_NO_CHECK_UTF. */ |
7258 | |
|
7259 | 0 | #ifdef SUPPORT_UNICODE |
7260 | 0 | if (utf && |
7261 | | #ifdef SUPPORT_JIT |
7262 | | !jit_checked_utf && |
7263 | | #endif |
7264 | 0 | ((options & PCRE2_NO_UTF_CHECK) == 0 || allow_invalid)) |
7265 | 0 | { |
7266 | 0 | #if PCRE2_CODE_UNIT_WIDTH != 32 |
7267 | 0 | BOOL skipped_bad_start = FALSE; |
7268 | 0 | #endif |
7269 | | |
7270 | | /* For 8-bit and 16-bit UTF, check that the first code unit is a valid |
7271 | | character start. If we are handling invalid UTF, just skip over such code |
7272 | | units. Otherwise, give an appropriate error. */ |
7273 | |
|
7274 | 0 | #if PCRE2_CODE_UNIT_WIDTH != 32 |
7275 | 0 | if (allow_invalid) |
7276 | 0 | { |
7277 | 0 | while (start_match < end_subject && NOT_FIRSTCU(*start_match)) |
7278 | 0 | { |
7279 | 0 | start_match++; |
7280 | 0 | skipped_bad_start = TRUE; |
7281 | 0 | } |
7282 | 0 | } |
7283 | 0 | else if (start_match < end_subject && NOT_FIRSTCU(*start_match)) |
7284 | 0 | { |
7285 | 0 | if (start_offset > 0) return match_data->rc = PCRE2_ERROR_BADUTFOFFSET; |
7286 | 0 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
7287 | 0 | return match_data->rc = PCRE2_ERROR_UTF8_ERR20; /* Isolated 0x80 byte */ |
7288 | | #else |
7289 | | return match_data->rc = PCRE2_ERROR_UTF16_ERR3; /* Isolated low surrogate */ |
7290 | | #endif |
7291 | 0 | } |
7292 | 0 | #endif /* WIDTH != 32 */ |
7293 | | |
7294 | | /* The mb->check_subject field points to the start of UTF checking; |
7295 | | lookbehinds can go back no further than this. */ |
7296 | | |
7297 | 0 | mb->check_subject = start_match; |
7298 | | |
7299 | | /* Move back by the maximum lookbehind, just in case it happens at the very |
7300 | | start of matching, but don't do this if we skipped bad 8-bit or 16-bit code |
7301 | | units above. */ |
7302 | |
|
7303 | 0 | #if PCRE2_CODE_UNIT_WIDTH != 32 |
7304 | 0 | if (!skipped_bad_start) |
7305 | 0 | { |
7306 | 0 | unsigned int i; |
7307 | 0 | for (i = re->max_lookbehind; i > 0 && mb->check_subject > subject; i--) |
7308 | 0 | { |
7309 | 0 | mb->check_subject--; |
7310 | 0 | while (mb->check_subject > subject && |
7311 | 0 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
7312 | 0 | (*mb->check_subject & 0xc0) == 0x80) |
7313 | | #else /* 16-bit */ |
7314 | | (*mb->check_subject & 0xfc00) == 0xdc00) |
7315 | | #endif |
7316 | 0 | mb->check_subject--; |
7317 | 0 | } |
7318 | 0 | } |
7319 | | #else /* PCRE2_CODE_UNIT_WIDTH != 32 */ |
7320 | | |
7321 | | /* In the 32-bit library, one code unit equals one character. However, |
7322 | | we cannot just subtract the lookbehind and then compare pointers, because |
7323 | | a very large lookbehind could create an invalid pointer. */ |
7324 | | |
7325 | | if (start_offset >= re->max_lookbehind) |
7326 | | mb->check_subject -= re->max_lookbehind; |
7327 | | else |
7328 | | mb->check_subject = subject; |
7329 | | #endif /* PCRE2_CODE_UNIT_WIDTH != 32 */ |
7330 | | |
7331 | | /* Validate the relevant portion of the subject. There's a loop in case we |
7332 | | encounter bad UTF in the characters preceding start_match which we are |
7333 | | scanning because of a lookbehind. */ |
7334 | |
|
7335 | 0 | for (;;) |
7336 | 0 | { |
7337 | 0 | rc = PRIV(valid_utf)(mb->check_subject, |
7338 | 0 | length - (mb->check_subject - subject), &(match_data->startchar)); |
7339 | |
|
7340 | 0 | if (rc == 0) break; /* Valid UTF string */ |
7341 | | |
7342 | | /* Invalid UTF string. Adjust the offset to be an absolute offset in the |
7343 | | whole string. If we are handling invalid UTF strings, set end_subject to |
7344 | | stop before the bad code unit, and set the options to "not end of line". |
7345 | | Otherwise return the error. */ |
7346 | | |
7347 | 0 | match_data->startchar += mb->check_subject - subject; |
7348 | 0 | if (!allow_invalid || rc > 0) return match_data->rc = rc; |
7349 | 0 | end_subject = subject + match_data->startchar; |
7350 | | |
7351 | | /* If the end precedes start_match, it means there is invalid UTF in the |
7352 | | extra code units we reversed over because of a lookbehind. Advance past the |
7353 | | first bad code unit, and then skip invalid character starting code units in |
7354 | | 8-bit and 16-bit modes, and try again with the original end point. */ |
7355 | |
|
7356 | 0 | if (end_subject < start_match) |
7357 | 0 | { |
7358 | 0 | mb->check_subject = end_subject + 1; |
7359 | 0 | #if PCRE2_CODE_UNIT_WIDTH != 32 |
7360 | 0 | while (mb->check_subject < start_match && NOT_FIRSTCU(*mb->check_subject)) |
7361 | 0 | mb->check_subject++; |
7362 | 0 | #endif |
7363 | 0 | end_subject = true_end_subject; |
7364 | 0 | } |
7365 | | |
7366 | | /* Otherwise, set the not end of line option, and do the match. */ |
7367 | | |
7368 | 0 | else |
7369 | 0 | { |
7370 | 0 | fragment_options = PCRE2_NOTEOL; |
7371 | 0 | break; |
7372 | 0 | } |
7373 | 0 | } |
7374 | 0 | } |
7375 | 0 | #endif /* SUPPORT_UNICODE */ |
7376 | | |
7377 | | /* A NULL match context means "use a default context", but we take the memory |
7378 | | control functions from the pattern. */ |
7379 | | |
7380 | 0 | if (mcontext == NULL) |
7381 | 0 | { |
7382 | 0 | mcontext = (pcre2_match_context *)(&PRIV(default_match_context)); |
7383 | 0 | mb->memctl = re->memctl; |
7384 | 0 | } |
7385 | 0 | else mb->memctl = mcontext->memctl; |
7386 | |
|
7387 | 0 | anchored = ((re->overall_options | options) & PCRE2_ANCHORED) != 0; |
7388 | 0 | firstline = !anchored && (re->overall_options & PCRE2_FIRSTLINE) != 0; |
7389 | 0 | startline = (re->flags & PCRE2_STARTLINE) != 0; |
7390 | 0 | bumpalong_limit = (mcontext->offset_limit == PCRE2_UNSET)? |
7391 | 0 | true_end_subject : subject + mcontext->offset_limit; |
7392 | | |
7393 | | /* Initialize and set up the fixed fields in the callout block, with a pointer |
7394 | | in the match block. */ |
7395 | |
|
7396 | 0 | mb->cb = &cb; |
7397 | 0 | cb.version = 2; |
7398 | 0 | cb.subject = subject; |
7399 | 0 | cb.subject_length = (PCRE2_SIZE)(end_subject - subject); |
7400 | 0 | cb.callout_flags = 0; |
7401 | | |
7402 | | /* Fill in the remaining fields in the match block, except for moptions, which |
7403 | | gets set later. */ |
7404 | |
|
7405 | 0 | mb->callout = mcontext->callout; |
7406 | 0 | mb->callout_data = mcontext->callout_data; |
7407 | |
|
7408 | 0 | mb->start_subject = subject; |
7409 | 0 | mb->start_offset = start_offset; |
7410 | 0 | mb->end_subject = end_subject; |
7411 | 0 | mb->true_end_subject = true_end_subject; |
7412 | 0 | mb->hasthen = (re->flags & PCRE2_HASTHEN) != 0; |
7413 | 0 | mb->hasbsk = (re->flags & PCRE2_HASBSK) != 0; |
7414 | 0 | mb->allowemptypartial = (re->max_lookbehind > 0) || |
7415 | 0 | (re->flags & PCRE2_MATCH_EMPTY) != 0; |
7416 | 0 | mb->allowlookaroundbsk = |
7417 | 0 | (re->extra_options & PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK) != 0; |
7418 | 0 | mb->poptions = re->overall_options; /* Pattern options */ |
7419 | 0 | mb->ignore_skip_arg = 0; |
7420 | 0 | mb->mark = mb->nomatch_mark = NULL; /* In case never set */ |
7421 | | |
7422 | | /* The name table is needed for finding all the numbers associated with a |
7423 | | given name, for condition testing. The code follows the name table. */ |
7424 | |
|
7425 | 0 | mb->name_table = (PCRE2_SPTR)((const uint8_t *)re + sizeof(pcre2_real_code)); |
7426 | 0 | mb->name_count = re->name_count; |
7427 | 0 | mb->name_entry_size = re->name_entry_size; |
7428 | 0 | mb->start_code = (PCRE2_SPTR)((const uint8_t *)re + re->code_start); |
7429 | | |
7430 | | /* Process the \R and newline settings. */ |
7431 | |
|
7432 | 0 | mb->bsr_convention = re->bsr_convention; |
7433 | 0 | mb->nltype = NLTYPE_FIXED; |
7434 | 0 | switch(re->newline_convention) |
7435 | 0 | { |
7436 | 0 | case PCRE2_NEWLINE_CR: |
7437 | 0 | mb->nllen = 1; |
7438 | 0 | mb->nl[0] = CHAR_CR; |
7439 | 0 | break; |
7440 | | |
7441 | 0 | case PCRE2_NEWLINE_LF: |
7442 | 0 | mb->nllen = 1; |
7443 | 0 | mb->nl[0] = CHAR_NL; |
7444 | 0 | break; |
7445 | | |
7446 | 0 | case PCRE2_NEWLINE_NUL: |
7447 | 0 | mb->nllen = 1; |
7448 | 0 | mb->nl[0] = CHAR_NUL; |
7449 | 0 | break; |
7450 | | |
7451 | 0 | case PCRE2_NEWLINE_CRLF: |
7452 | 0 | mb->nllen = 2; |
7453 | 0 | mb->nl[0] = CHAR_CR; |
7454 | 0 | mb->nl[1] = CHAR_NL; |
7455 | 0 | break; |
7456 | | |
7457 | 0 | case PCRE2_NEWLINE_ANY: |
7458 | 0 | mb->nltype = NLTYPE_ANY; |
7459 | 0 | break; |
7460 | | |
7461 | 0 | case PCRE2_NEWLINE_ANYCRLF: |
7462 | 0 | mb->nltype = NLTYPE_ANYCRLF; |
7463 | 0 | break; |
7464 | | |
7465 | | /* LCOV_EXCL_START */ |
7466 | 0 | default: |
7467 | 0 | PCRE2_DEBUG_UNREACHABLE(); |
7468 | 0 | return match_data->rc = PCRE2_ERROR_INTERNAL; |
7469 | | /* LCOV_EXCL_STOP */ |
7470 | 0 | } |
7471 | | |
7472 | | /* The backtracking frames have fixed data at the front, and a PCRE2_SIZE |
7473 | | vector at the end, whose size depends on the number of capturing parentheses in |
7474 | | the pattern. It is not used at all if there are no capturing parentheses. |
7475 | | |
7476 | | frame_size is the total size of each frame |
7477 | | match_data->heapframes is the pointer to the frames vector |
7478 | | match_data->heapframes_size is the allocated size of the vector |
7479 | | |
7480 | | We must pad the frame_size for alignment to ensure subsequent frames are as |
7481 | | aligned as heapframe. Whilst ovector is word-aligned due to being a PCRE2_SIZE |
7482 | | array, that does not guarantee it is suitably aligned for pointers, as some |
7483 | | architectures have pointers that are larger than a size_t. */ |
7484 | | |
7485 | 0 | frame_size = (offsetof(heapframe, ovector) + |
7486 | 0 | re->top_bracket * 2 * sizeof(PCRE2_SIZE) + HEAPFRAME_ALIGNMENT - 1) & |
7487 | 0 | ~(HEAPFRAME_ALIGNMENT - 1); |
7488 | | |
7489 | | /* Limits set in the pattern override the match context only if they are |
7490 | | smaller. */ |
7491 | |
|
7492 | 0 | mb->heap_limit = ((mcontext->heap_limit < re->limit_heap)? |
7493 | 0 | mcontext->heap_limit : re->limit_heap); |
7494 | |
|
7495 | 0 | mb->match_limit = (mcontext->match_limit < re->limit_match)? |
7496 | 0 | mcontext->match_limit : re->limit_match; |
7497 | |
|
7498 | 0 | mb->match_limit_depth = (mcontext->depth_limit < re->limit_depth)? |
7499 | 0 | mcontext->depth_limit : re->limit_depth; |
7500 | | |
7501 | | /* If a pattern has very many capturing parentheses, the frame size may be very |
7502 | | large. Set the initial frame vector size to ensure that there are at least 10 |
7503 | | available frames, but enforce a minimum of START_FRAMES_SIZE. If this is |
7504 | | greater than the heap limit, get as large a vector as possible. */ |
7505 | |
|
7506 | 0 | heapframes_size = frame_size * 10; |
7507 | 0 | if (heapframes_size < START_FRAMES_SIZE) heapframes_size = START_FRAMES_SIZE; |
7508 | 0 | if (heapframes_size / 1024 > mb->heap_limit) |
7509 | 0 | { |
7510 | 0 | PCRE2_SIZE max_size = 1024 * mb->heap_limit; |
7511 | 0 | if (max_size < frame_size) return match_data->rc = PCRE2_ERROR_HEAPLIMIT; |
7512 | 0 | heapframes_size = max_size; |
7513 | 0 | } |
7514 | | |
7515 | | /* If an existing frame vector in the match_data block is large enough, we can |
7516 | | use it. Otherwise, free any pre-existing vector and get a new one. */ |
7517 | | |
7518 | 0 | if (match_data->heapframes_size < heapframes_size) |
7519 | 0 | { |
7520 | 0 | match_data->memctl.free(match_data->heapframes, |
7521 | 0 | match_data->memctl.memory_data); |
7522 | 0 | match_data->heapframes = match_data->memctl.malloc(heapframes_size, |
7523 | 0 | match_data->memctl.memory_data); |
7524 | 0 | if (match_data->heapframes == NULL) |
7525 | 0 | { |
7526 | 0 | match_data->heapframes_size = 0; |
7527 | 0 | return match_data->rc = PCRE2_ERROR_NOMEMORY; |
7528 | 0 | } |
7529 | 0 | match_data->heapframes_size = heapframes_size; |
7530 | 0 | } |
7531 | | |
7532 | | /* Write to the ovector within the first frame to mark every capture unset and |
7533 | | to avoid uninitialized memory read errors when it is copied to a new frame. */ |
7534 | | |
7535 | 0 | memset((char *)(match_data->heapframes) + offsetof(heapframe, ovector), 0xff, |
7536 | 0 | frame_size - offsetof(heapframe, ovector)); |
7537 | | |
7538 | | /* Pointers to the individual character tables */ |
7539 | |
|
7540 | 0 | mb->lcc = re->tables + lcc_offset; |
7541 | 0 | mb->fcc = re->tables + fcc_offset; |
7542 | 0 | mb->ctypes = re->tables + ctypes_offset; |
7543 | | |
7544 | | /* Set up the first code unit to match, if available. If there's no first code |
7545 | | unit there may be a bitmap of possible first characters. */ |
7546 | |
|
7547 | 0 | if ((re->flags & PCRE2_FIRSTSET) != 0) |
7548 | 0 | { |
7549 | 0 | has_first_cu = TRUE; |
7550 | 0 | first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit); |
7551 | 0 | if ((re->flags & PCRE2_FIRSTCASELESS) != 0) |
7552 | 0 | { |
7553 | 0 | first_cu2 = TABLE_GET(first_cu, mb->fcc, first_cu); |
7554 | 0 | #ifdef SUPPORT_UNICODE |
7555 | 0 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
7556 | 0 | if (first_cu > 127 && ucp && !utf) first_cu2 = UCD_OTHERCASE(first_cu); |
7557 | | #else |
7558 | | if (first_cu > 127 && (utf || ucp)) first_cu2 = UCD_OTHERCASE(first_cu); |
7559 | | #endif |
7560 | 0 | #endif /* SUPPORT_UNICODE */ |
7561 | 0 | } |
7562 | 0 | } |
7563 | 0 | else |
7564 | 0 | if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0) |
7565 | 0 | start_bits = re->start_bitmap; |
7566 | | |
7567 | | /* There may also be a "last known required character" set. */ |
7568 | |
|
7569 | 0 | if ((re->flags & PCRE2_LASTSET) != 0) |
7570 | 0 | { |
7571 | 0 | has_req_cu = TRUE; |
7572 | 0 | req_cu = req_cu2 = (PCRE2_UCHAR)(re->last_codeunit); |
7573 | 0 | if ((re->flags & PCRE2_LASTCASELESS) != 0) |
7574 | 0 | { |
7575 | 0 | req_cu2 = TABLE_GET(req_cu, mb->fcc, req_cu); |
7576 | 0 | #ifdef SUPPORT_UNICODE |
7577 | 0 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
7578 | 0 | if (req_cu > 127 && ucp && !utf) req_cu2 = UCD_OTHERCASE(req_cu); |
7579 | | #else |
7580 | | if (req_cu > 127 && (utf || ucp)) req_cu2 = UCD_OTHERCASE(req_cu); |
7581 | | #endif |
7582 | 0 | #endif /* SUPPORT_UNICODE */ |
7583 | 0 | } |
7584 | 0 | } |
7585 | | |
7586 | | |
7587 | | /* ==========================================================================*/ |
7588 | | |
7589 | | /* Loop for handling unanchored repeated matching attempts; for anchored regexs |
7590 | | the loop runs just once. */ |
7591 | |
|
7592 | 0 | #ifdef SUPPORT_UNICODE |
7593 | 0 | FRAGMENT_RESTART: |
7594 | 0 | #endif |
7595 | |
|
7596 | 0 | start_partial = match_partial = NULL; |
7597 | 0 | mb->hitend = FALSE; |
7598 | |
|
7599 | 0 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
7600 | 0 | memchr_found_first_cu = NULL; |
7601 | 0 | memchr_found_first_cu2 = NULL; |
7602 | 0 | #endif |
7603 | |
|
7604 | 0 | for(;;) |
7605 | 0 | { |
7606 | 0 | PCRE2_SPTR new_start_match; |
7607 | | |
7608 | | /* ----------------- Start of match optimizations ---------------- */ |
7609 | | |
7610 | | /* There are some optimizations that avoid running the match if a known |
7611 | | starting point is not found, or if a known later code unit is not present. |
7612 | | However, there is an option (settable at compile time) that disables these, |
7613 | | for testing and for ensuring that all callouts do actually occur. */ |
7614 | |
|
7615 | 0 | if ((re->optimization_flags & PCRE2_OPTIM_START_OPTIMIZE) != 0) |
7616 | 0 | { |
7617 | | /* If firstline is TRUE, the start of the match is constrained to the first |
7618 | | line of a multiline string. That is, the match must be before or at the |
7619 | | first newline following the start of matching. Temporarily adjust |
7620 | | end_subject so that we stop the scans for a first code unit at a newline. |
7621 | | If the match fails at the newline, later code breaks the loop. */ |
7622 | |
|
7623 | 0 | if (firstline) |
7624 | 0 | { |
7625 | 0 | PCRE2_SPTR t = start_match; |
7626 | 0 | #ifdef SUPPORT_UNICODE |
7627 | 0 | if (utf) |
7628 | 0 | { |
7629 | 0 | while (t < end_subject && !IS_NEWLINE(t)) |
7630 | 0 | { |
7631 | 0 | t++; |
7632 | 0 | ACROSSCHAR(t < end_subject, t, t++); |
7633 | 0 | } |
7634 | 0 | } |
7635 | 0 | else |
7636 | 0 | #endif |
7637 | 0 | while (t < end_subject && !IS_NEWLINE(t)) t++; |
7638 | 0 | end_subject = t; |
7639 | 0 | } |
7640 | | |
7641 | | /* Anchored: check the first code unit if one is recorded. This may seem |
7642 | | pointless but it can help in detecting a no match case without scanning for |
7643 | | the required code unit. */ |
7644 | |
|
7645 | 0 | if (anchored) |
7646 | 0 | { |
7647 | 0 | if (has_first_cu || start_bits != NULL) |
7648 | 0 | { |
7649 | 0 | BOOL ok = start_match < end_subject; |
7650 | 0 | if (ok) |
7651 | 0 | { |
7652 | 0 | PCRE2_UCHAR c = UCHAR21TEST(start_match); |
7653 | 0 | ok = has_first_cu && (c == first_cu || c == first_cu2); |
7654 | 0 | if (!ok && start_bits != NULL) |
7655 | 0 | { |
7656 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
7657 | | if (c > 255) c = 255; |
7658 | | #endif |
7659 | 0 | ok = (start_bits[c/8] & (1u << (c&7))) != 0; |
7660 | 0 | } |
7661 | 0 | } |
7662 | 0 | if (!ok) |
7663 | 0 | { |
7664 | 0 | rc = MATCH_NOMATCH; |
7665 | 0 | break; |
7666 | 0 | } |
7667 | 0 | } |
7668 | 0 | } |
7669 | | |
7670 | | /* Not anchored. Advance to a unique first code unit if there is one. */ |
7671 | | |
7672 | 0 | else |
7673 | 0 | { |
7674 | 0 | if (has_first_cu) |
7675 | 0 | { |
7676 | 0 | if (first_cu != first_cu2) /* Caseless */ |
7677 | 0 | { |
7678 | | /* In 16-bit and 32-bit modes we have to do our own search, so can |
7679 | | look for both cases at once. */ |
7680 | |
|
7681 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
7682 | | PCRE2_UCHAR smc; |
7683 | | while (start_match < end_subject && |
7684 | | (smc = UCHAR21TEST(start_match)) != first_cu && |
7685 | | smc != first_cu2) |
7686 | | start_match++; |
7687 | | #else |
7688 | | /* In 8-bit mode, the use of memchr() gives a big speed up, even |
7689 | | though we have to call it twice in order to find the earliest |
7690 | | occurrence of the code unit in either of its cases. Caching is used |
7691 | | to remember the positions of previously found code units. This can |
7692 | | make a huge difference when the strings are very long and only one |
7693 | | case is actually present. */ |
7694 | |
|
7695 | 0 | PCRE2_SPTR pp1 = NULL; |
7696 | 0 | PCRE2_SPTR pp2 = NULL; |
7697 | 0 | PCRE2_SIZE searchlength = end_subject - start_match; |
7698 | | |
7699 | | /* If we haven't got a previously found position for first_cu, or if |
7700 | | the current starting position is later, we need to do a search. If |
7701 | | the code unit is not found, set it to the end. */ |
7702 | |
|
7703 | 0 | if (memchr_found_first_cu == NULL || |
7704 | 0 | start_match > memchr_found_first_cu) |
7705 | 0 | { |
7706 | 0 | pp1 = memchr(start_match, first_cu, searchlength); |
7707 | 0 | memchr_found_first_cu = (pp1 == NULL)? end_subject : pp1; |
7708 | 0 | } |
7709 | | |
7710 | | /* If the start is before a previously found position, use the |
7711 | | previous position, or NULL if a previous search failed. */ |
7712 | | |
7713 | 0 | else pp1 = (memchr_found_first_cu == end_subject)? NULL : |
7714 | 0 | memchr_found_first_cu; |
7715 | | |
7716 | | /* Do the same thing for the other case. */ |
7717 | |
|
7718 | 0 | if (memchr_found_first_cu2 == NULL || |
7719 | 0 | start_match > memchr_found_first_cu2) |
7720 | 0 | { |
7721 | 0 | pp2 = memchr(start_match, first_cu2, searchlength); |
7722 | 0 | memchr_found_first_cu2 = (pp2 == NULL)? end_subject : pp2; |
7723 | 0 | } |
7724 | | |
7725 | 0 | else pp2 = (memchr_found_first_cu2 == end_subject)? NULL : |
7726 | 0 | memchr_found_first_cu2; |
7727 | | |
7728 | | /* Set the start to the end of the subject if neither case was found. |
7729 | | Otherwise, use the earlier found point. */ |
7730 | |
|
7731 | 0 | if (pp1 == NULL) |
7732 | 0 | start_match = (pp2 == NULL)? end_subject : pp2; |
7733 | 0 | else |
7734 | 0 | start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2; |
7735 | |
|
7736 | 0 | #endif /* 8-bit handling */ |
7737 | 0 | } |
7738 | | |
7739 | | /* The caseful case is much simpler. */ |
7740 | | |
7741 | 0 | else |
7742 | 0 | { |
7743 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
7744 | | while (start_match < end_subject && UCHAR21TEST(start_match) != |
7745 | | first_cu) |
7746 | | start_match++; |
7747 | | #else |
7748 | 0 | start_match = memchr(start_match, first_cu, end_subject - start_match); |
7749 | 0 | if (start_match == NULL) start_match = end_subject; |
7750 | 0 | #endif |
7751 | 0 | } |
7752 | | |
7753 | | /* If we can't find the required first code unit, having reached the |
7754 | | true end of the subject, break the bumpalong loop, to force a match |
7755 | | failure, except when doing partial matching, when we let the next cycle |
7756 | | run at the end of the subject. To see why, consider the pattern |
7757 | | /(?<=abc)def/, which partially matches "abc", even though the string |
7758 | | does not contain the starting character "d". If we have not reached the |
7759 | | true end of the subject (PCRE2_FIRSTLINE caused end_subject to be |
7760 | | temporarily modified) we also let the cycle run, because the matching |
7761 | | string is legitimately allowed to start with the first code unit of a |
7762 | | newline. */ |
7763 | |
|
7764 | 0 | if (mb->partial == 0 && start_match >= mb->end_subject) |
7765 | 0 | { |
7766 | 0 | rc = MATCH_NOMATCH; |
7767 | 0 | break; |
7768 | 0 | } |
7769 | 0 | } |
7770 | | |
7771 | | /* If there's no first code unit, advance to just after a linebreak for a |
7772 | | multiline match if required. */ |
7773 | | |
7774 | 0 | else if (startline) |
7775 | 0 | { |
7776 | 0 | if (start_match > mb->start_subject + start_offset) |
7777 | 0 | { |
7778 | 0 | #ifdef SUPPORT_UNICODE |
7779 | 0 | if (utf) |
7780 | 0 | { |
7781 | 0 | while (start_match < end_subject && !WAS_NEWLINE(start_match)) |
7782 | 0 | { |
7783 | 0 | start_match++; |
7784 | 0 | ACROSSCHAR(start_match < end_subject, start_match, start_match++); |
7785 | 0 | } |
7786 | 0 | } |
7787 | 0 | else |
7788 | 0 | #endif |
7789 | 0 | while (start_match < end_subject && !WAS_NEWLINE(start_match)) |
7790 | 0 | start_match++; |
7791 | | |
7792 | | /* If we have just passed a CR and the newline option is ANY or |
7793 | | ANYCRLF, and we are now at a LF, advance the match position by one |
7794 | | more code unit. */ |
7795 | |
|
7796 | 0 | if (start_match[-1] == CHAR_CR && |
7797 | 0 | (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) && |
7798 | 0 | start_match < end_subject && |
7799 | 0 | UCHAR21TEST(start_match) == CHAR_NL) |
7800 | 0 | start_match++; |
7801 | 0 | } |
7802 | 0 | } |
7803 | | |
7804 | | /* If there's no first code unit or a requirement for a multiline line |
7805 | | start, advance to a non-unique first code unit if any have been |
7806 | | identified. The bitmap contains only 256 bits. When code units are 16 or |
7807 | | 32 bits wide, all code units greater than 254 set the 255 bit. */ |
7808 | | |
7809 | 0 | else if (start_bits != NULL) |
7810 | 0 | { |
7811 | 0 | while (start_match < end_subject) |
7812 | 0 | { |
7813 | 0 | uint32_t c = UCHAR21TEST(start_match); |
7814 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
7815 | | if (c > 255) c = 255; |
7816 | | #endif |
7817 | 0 | if ((start_bits[c/8] & (1u << (c&7))) != 0) break; |
7818 | 0 | start_match++; |
7819 | 0 | } |
7820 | | |
7821 | | /* See comment above in first_cu checking about the next few lines. */ |
7822 | |
|
7823 | 0 | if (mb->partial == 0 && start_match >= mb->end_subject) |
7824 | 0 | { |
7825 | 0 | rc = MATCH_NOMATCH; |
7826 | 0 | break; |
7827 | 0 | } |
7828 | 0 | } |
7829 | 0 | } /* End first code unit handling */ |
7830 | | |
7831 | | /* Restore fudged end_subject */ |
7832 | | |
7833 | 0 | end_subject = mb->end_subject; |
7834 | | |
7835 | | /* The following two optimizations must be disabled for partial matching. */ |
7836 | |
|
7837 | 0 | if (mb->partial == 0) |
7838 | 0 | { |
7839 | 0 | PCRE2_SPTR p; |
7840 | | |
7841 | | /* The minimum matching length is a lower bound; no string of that length |
7842 | | may actually match the pattern. Although the value is, strictly, in |
7843 | | characters, we treat it as code units to avoid spending too much time in |
7844 | | this optimization. */ |
7845 | |
|
7846 | 0 | if (end_subject - start_match < re->minlength) |
7847 | 0 | { |
7848 | 0 | rc = MATCH_NOMATCH; |
7849 | 0 | break; |
7850 | 0 | } |
7851 | | |
7852 | | /* If req_cu is set, we know that that code unit must appear in the |
7853 | | subject for the (non-partial) match to succeed. If the first code unit is |
7854 | | set, req_cu must be later in the subject; otherwise the test starts at |
7855 | | the match point. This optimization can save a huge amount of backtracking |
7856 | | in patterns with nested unlimited repeats that aren't going to match. |
7857 | | Writing separate code for caseful/caseless versions makes it go faster, |
7858 | | as does using an autoincrement and backing off on a match. As in the case |
7859 | | of the first code unit, using memchr() in the 8-bit library gives a big |
7860 | | speed up. Unlike the first_cu check above, we do not need to call |
7861 | | memchr() twice in the caseless case because we only need to check for the |
7862 | | presence of the character in either case, not find the first occurrence. |
7863 | | |
7864 | | The search can be skipped if the code unit was found later than the |
7865 | | current starting point in a previous iteration of the bumpalong loop. |
7866 | | |
7867 | | HOWEVER: when the subject string is very, very long, searching to its end |
7868 | | can take a long time, and give bad performance on quite ordinary |
7869 | | anchored patterns. This showed up when somebody was matching something |
7870 | | like /^\d+C/ on a 32-megabyte string... so we don't do this when the |
7871 | | string is sufficiently long, but it's worth searching a lot more for |
7872 | | unanchored patterns. */ |
7873 | | |
7874 | 0 | p = start_match + (has_first_cu? 1:0); |
7875 | 0 | if (has_req_cu && p > req_cu_ptr) |
7876 | 0 | { |
7877 | 0 | PCRE2_SIZE check_length = end_subject - start_match; |
7878 | |
|
7879 | 0 | if (check_length < REQ_CU_MAX || |
7880 | 0 | (!anchored && check_length < REQ_CU_MAX * 1000)) |
7881 | 0 | { |
7882 | 0 | if (req_cu != req_cu2) /* Caseless */ |
7883 | 0 | { |
7884 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
7885 | | while (p < end_subject) |
7886 | | { |
7887 | | uint32_t pp = UCHAR21INCTEST(p); |
7888 | | if (pp == req_cu || pp == req_cu2) { p--; break; } |
7889 | | } |
7890 | | #else /* 8-bit code units */ |
7891 | 0 | PCRE2_SPTR pp = p; |
7892 | 0 | p = memchr(pp, req_cu, end_subject - pp); |
7893 | 0 | if (p == NULL) |
7894 | 0 | { |
7895 | 0 | p = memchr(pp, req_cu2, end_subject - pp); |
7896 | 0 | if (p == NULL) p = end_subject; |
7897 | 0 | } |
7898 | 0 | #endif /* PCRE2_CODE_UNIT_WIDTH != 8 */ |
7899 | 0 | } |
7900 | | |
7901 | | /* The caseful case */ |
7902 | | |
7903 | 0 | else |
7904 | 0 | { |
7905 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
7906 | | while (p < end_subject) |
7907 | | { |
7908 | | if (UCHAR21INCTEST(p) == req_cu) { p--; break; } |
7909 | | } |
7910 | | |
7911 | | #else /* 8-bit code units */ |
7912 | 0 | p = memchr(p, req_cu, end_subject - p); |
7913 | 0 | if (p == NULL) p = end_subject; |
7914 | 0 | #endif |
7915 | 0 | } |
7916 | | |
7917 | | /* If we can't find the required code unit, break the bumpalong loop, |
7918 | | forcing a match failure. */ |
7919 | |
|
7920 | 0 | if (p >= end_subject) |
7921 | 0 | { |
7922 | 0 | rc = MATCH_NOMATCH; |
7923 | 0 | break; |
7924 | 0 | } |
7925 | | |
7926 | | /* If we have found the required code unit, save the point where we |
7927 | | found it, so that we don't search again next time round the bumpalong |
7928 | | loop if the start hasn't yet passed this code unit. */ |
7929 | | |
7930 | 0 | req_cu_ptr = p; |
7931 | 0 | } |
7932 | 0 | } |
7933 | 0 | } |
7934 | 0 | } |
7935 | | |
7936 | | /* ------------ End of start of match optimizations ------------ */ |
7937 | | |
7938 | | /* Give no match if we have passed the bumpalong limit. */ |
7939 | | |
7940 | 0 | if (start_match > bumpalong_limit) |
7941 | 0 | { |
7942 | 0 | rc = MATCH_NOMATCH; |
7943 | 0 | break; |
7944 | 0 | } |
7945 | | |
7946 | | /* OK, we can now run the match. If "hitend" is set afterwards, remember the |
7947 | | first starting point for which a partial match was found. */ |
7948 | | |
7949 | 0 | cb.start_match = (PCRE2_SIZE)(start_match - subject); |
7950 | 0 | cb.callout_flags |= PCRE2_CALLOUT_STARTMATCH; |
7951 | |
|
7952 | 0 | mb->start_used_ptr = start_match; |
7953 | 0 | mb->last_used_ptr = start_match; |
7954 | 0 | #ifdef SUPPORT_UNICODE |
7955 | 0 | mb->moptions = options | fragment_options; |
7956 | | #else |
7957 | | mb->moptions = options; |
7958 | | #endif |
7959 | 0 | mb->match_call_count = 0; |
7960 | 0 | mb->end_offset_top = 0; |
7961 | 0 | mb->skip_arg_count = 0; |
7962 | |
|
7963 | | #ifdef DEBUG_SHOW_OPS |
7964 | | fprintf(stderr, "++ Calling match()\n"); |
7965 | | #endif |
7966 | |
|
7967 | 0 | rc = match(start_match, mb->start_code, re->top_bracket, frame_size, |
7968 | 0 | match_data, mb); |
7969 | |
|
7970 | | #ifdef DEBUG_SHOW_OPS |
7971 | | fprintf(stderr, "++ match() returned %d\n\n", rc); |
7972 | | #endif |
7973 | |
|
7974 | 0 | if (mb->hitend && start_partial == NULL) |
7975 | 0 | { |
7976 | 0 | start_partial = mb->start_used_ptr; |
7977 | 0 | match_partial = start_match; |
7978 | 0 | } |
7979 | |
|
7980 | 0 | switch(rc) |
7981 | 0 | { |
7982 | | /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched |
7983 | | the SKIP's arg was not found. In this circumstance, Perl ignores the SKIP |
7984 | | entirely. The only way we can do that is to re-do the match at the same |
7985 | | point, with a flag to force SKIP with an argument to be ignored. Just |
7986 | | treating this case as NOMATCH does not work because it does not check other |
7987 | | alternatives in patterns such as A(*SKIP:A)B|AC when the subject is AC. */ |
7988 | | |
7989 | 0 | case MATCH_SKIP_ARG: |
7990 | 0 | new_start_match = start_match; |
7991 | 0 | mb->ignore_skip_arg = mb->skip_arg_count; |
7992 | 0 | break; |
7993 | | |
7994 | | /* SKIP passes back the next starting point explicitly, but if it is no |
7995 | | greater than the match we have just done, treat it as NOMATCH. */ |
7996 | | |
7997 | 0 | case MATCH_SKIP: |
7998 | 0 | if (mb->verb_skip_ptr > start_match) |
7999 | 0 | { |
8000 | 0 | new_start_match = mb->verb_skip_ptr; |
8001 | 0 | break; |
8002 | 0 | } |
8003 | 0 | PCRE2_FALLTHROUGH /* Fall through */ |
8004 | 0 |
|
8005 | 0 | /* NOMATCH and PRUNE advance by one character. THEN at this level acts |
8006 | 0 | exactly like PRUNE. Unset ignore SKIP-with-argument. */ |
8007 | 0 |
|
8008 | 0 | case MATCH_NOMATCH: |
8009 | 0 | case MATCH_PRUNE: |
8010 | 0 | case MATCH_THEN: |
8011 | 0 | mb->ignore_skip_arg = 0; |
8012 | 0 | new_start_match = start_match + 1; |
8013 | 0 | #ifdef SUPPORT_UNICODE |
8014 | 0 | if (utf) |
8015 | 0 | ACROSSCHAR(new_start_match < end_subject, new_start_match, |
8016 | 0 | new_start_match++); |
8017 | 0 | #endif |
8018 | 0 | break; |
8019 | | |
8020 | | /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */ |
8021 | | |
8022 | 0 | case MATCH_COMMIT: |
8023 | 0 | rc = MATCH_NOMATCH; |
8024 | 0 | goto ENDLOOP; |
8025 | | |
8026 | | /* Any other return is either a match, or some kind of error. */ |
8027 | | |
8028 | 0 | default: |
8029 | 0 | goto ENDLOOP; |
8030 | 0 | } |
8031 | | |
8032 | | /* Control reaches here for the various types of "no match at this point" |
8033 | | result. Reset the code to MATCH_NOMATCH for subsequent checking. */ |
8034 | | |
8035 | 0 | rc = MATCH_NOMATCH; |
8036 | | |
8037 | | /* If PCRE2_FIRSTLINE is set, the match must happen before or at the first |
8038 | | newline in the subject (though it may continue over the newline). Therefore, |
8039 | | if we have just failed to match, starting at a newline, do not continue. */ |
8040 | |
|
8041 | 0 | if (firstline && IS_NEWLINE(start_match)) break; |
8042 | | |
8043 | | /* Advance to new matching position */ |
8044 | | |
8045 | 0 | start_match = new_start_match; |
8046 | | |
8047 | | /* Break the loop if the pattern is anchored or if we have passed the end of |
8048 | | the subject. */ |
8049 | |
|
8050 | 0 | if (anchored || start_match > end_subject) break; |
8051 | | |
8052 | | /* If we have just passed a CR and we are now at a LF, and the pattern does |
8053 | | not contain any explicit matches for \r or \n, and the newline option is CRLF |
8054 | | or ANY or ANYCRLF, advance the match position by one more code unit. In |
8055 | | normal matching start_match will always be greater than the first position at |
8056 | | this stage, but a failed *SKIP can cause a return at the same point, which is |
8057 | | why the first test exists. */ |
8058 | | |
8059 | 0 | if (start_match > subject + start_offset && |
8060 | 0 | start_match[-1] == CHAR_CR && |
8061 | 0 | start_match < end_subject && |
8062 | 0 | *start_match == CHAR_NL && |
8063 | 0 | (re->flags & PCRE2_HASCRORLF) == 0 && |
8064 | 0 | (mb->nltype == NLTYPE_ANY || |
8065 | 0 | mb->nltype == NLTYPE_ANYCRLF || |
8066 | 0 | mb->nllen == 2)) |
8067 | 0 | start_match++; |
8068 | |
|
8069 | 0 | mb->mark = NULL; /* Reset for start of next match attempt */ |
8070 | 0 | } /* End of for(;;) "bumpalong" loop */ |
8071 | | |
8072 | | /* ==========================================================================*/ |
8073 | | |
8074 | | /* When we reach here, one of the following stopping conditions is true: |
8075 | | |
8076 | | (1) The match succeeded, either completely, or partially; |
8077 | | |
8078 | | (2) The pattern is anchored or the match was failed after (*COMMIT); |
8079 | | |
8080 | | (3) We are past the end of the subject or the bumpalong limit; |
8081 | | |
8082 | | (4) PCRE2_FIRSTLINE is set and we have failed to match at a newline, because |
8083 | | this option requests that a match occur at or before the first newline in |
8084 | | the subject. |
8085 | | |
8086 | | (5) Some kind of error occurred. |
8087 | | |
8088 | | */ |
8089 | | |
8090 | 0 | ENDLOOP: |
8091 | | |
8092 | | /* If end_subject != true_end_subject, it means we are handling invalid UTF, |
8093 | | and have just processed a non-terminal fragment. If this resulted in no match |
8094 | | or a partial match we must carry on to the next fragment (a partial match is |
8095 | | returned to the caller only at the very end of the subject). A loop is used to |
8096 | | avoid trying to match against empty fragments; if the pattern can match an |
8097 | | empty string it would have done so already. */ |
8098 | |
|
8099 | 0 | #ifdef SUPPORT_UNICODE |
8100 | 0 | if (utf && end_subject != true_end_subject && |
8101 | 0 | (rc == MATCH_NOMATCH || rc == PCRE2_ERROR_PARTIAL)) |
8102 | 0 | { |
8103 | 0 | for (;;) |
8104 | 0 | { |
8105 | | /* Advance past the first bad code unit, and then skip invalid character |
8106 | | starting code units in 8-bit and 16-bit modes. */ |
8107 | |
|
8108 | 0 | start_match = end_subject + 1; |
8109 | |
|
8110 | 0 | #if PCRE2_CODE_UNIT_WIDTH != 32 |
8111 | 0 | while (start_match < true_end_subject && NOT_FIRSTCU(*start_match)) |
8112 | 0 | start_match++; |
8113 | 0 | #endif |
8114 | | |
8115 | | /* If we have hit the end of the subject, there isn't another non-empty |
8116 | | fragment, so give up. */ |
8117 | |
|
8118 | 0 | if (start_match >= true_end_subject) |
8119 | 0 | { |
8120 | 0 | rc = MATCH_NOMATCH; /* In case it was partial */ |
8121 | 0 | match_partial = NULL; |
8122 | 0 | break; |
8123 | 0 | } |
8124 | | |
8125 | | /* Check the rest of the subject */ |
8126 | | |
8127 | 0 | mb->check_subject = start_match; |
8128 | 0 | rc = PRIV(valid_utf)(start_match, length - (start_match - subject), |
8129 | 0 | &(match_data->startchar)); |
8130 | | |
8131 | | /* The rest of the subject is valid UTF. */ |
8132 | |
|
8133 | 0 | if (rc == 0) |
8134 | 0 | { |
8135 | 0 | mb->end_subject = end_subject = true_end_subject; |
8136 | 0 | fragment_options = PCRE2_NOTBOL; |
8137 | 0 | goto FRAGMENT_RESTART; |
8138 | 0 | } |
8139 | | |
8140 | | /* A subsequent UTF error has been found; if the next fragment is |
8141 | | non-empty, set up to process it. Otherwise, let the loop advance. */ |
8142 | | |
8143 | 0 | else if (rc < 0) |
8144 | 0 | { |
8145 | 0 | mb->end_subject = end_subject = start_match + match_data->startchar; |
8146 | 0 | if (end_subject > start_match) |
8147 | 0 | { |
8148 | 0 | fragment_options = PCRE2_NOTBOL|PCRE2_NOTEOL; |
8149 | 0 | goto FRAGMENT_RESTART; |
8150 | 0 | } |
8151 | 0 | } |
8152 | 0 | } |
8153 | 0 | } |
8154 | 0 | #endif /* SUPPORT_UNICODE */ |
8155 | | |
8156 | | /* Fill in fields that are always returned in the match data. */ |
8157 | | |
8158 | 0 | match_data->code = re; |
8159 | 0 | match_data->mark = mb->mark; |
8160 | 0 | match_data->matchedby = PCRE2_MATCHEDBY_INTERPRETER; |
8161 | 0 | match_data->options = original_options; |
8162 | | |
8163 | | /* Handle a fully successful match. Set the return code to the number of |
8164 | | captured strings, or 0 if there were too many to fit into the ovector, and then |
8165 | | set the remaining returned values before returning. Make a copy of the subject |
8166 | | string if requested. */ |
8167 | |
|
8168 | 0 | if (rc == MATCH_MATCH) |
8169 | 0 | { |
8170 | 0 | match_data->rc = ((int)mb->end_offset_top >= 2 * match_data->oveccount)? |
8171 | 0 | 0 : (int)mb->end_offset_top/2 + 1; |
8172 | 0 | match_data->subject_length = length; |
8173 | 0 | match_data->start_offset = start_offset; |
8174 | 0 | match_data->startchar = start_match - subject; |
8175 | 0 | match_data->leftchar = mb->start_used_ptr - subject; |
8176 | 0 | match_data->rightchar = ((mb->last_used_ptr > mb->end_match_ptr)? |
8177 | 0 | mb->last_used_ptr : mb->end_match_ptr) - subject; |
8178 | 0 | if ((options & PCRE2_COPY_MATCHED_SUBJECT) != 0) |
8179 | 0 | { |
8180 | 0 | if (length != 0) |
8181 | 0 | { |
8182 | 0 | match_data->subject = match_data->memctl.malloc(CU2BYTES(length), |
8183 | 0 | match_data->memctl.memory_data); |
8184 | 0 | if (match_data->subject == NULL) |
8185 | 0 | return match_data->rc = PCRE2_ERROR_NOMEMORY; |
8186 | 0 | memcpy((void *)match_data->subject, subject, CU2BYTES(length)); |
8187 | 0 | } |
8188 | 0 | else |
8189 | 0 | match_data->subject = NULL; |
8190 | 0 | match_data->flags |= PCRE2_MD_COPIED_SUBJECT; |
8191 | 0 | } |
8192 | 0 | else match_data->subject = original_subject; |
8193 | | |
8194 | 0 | return match_data->rc; |
8195 | 0 | } |
8196 | | |
8197 | | /* Control gets here if there has been a partial match, an error, or if the |
8198 | | overall match attempt has failed at all permitted starting positions. Any mark |
8199 | | data is in the nomatch_mark field. */ |
8200 | | |
8201 | 0 | match_data->mark = mb->nomatch_mark; |
8202 | | |
8203 | | /* For anything other than nomatch or partial match, just return the code. */ |
8204 | |
|
8205 | 0 | if (rc != MATCH_NOMATCH && rc != PCRE2_ERROR_PARTIAL) match_data->rc = rc; |
8206 | | |
8207 | | /* Handle a partial match. If a "soft" partial match was requested, searching |
8208 | | for a complete match will have continued, and the value of rc at this point |
8209 | | will be MATCH_NOMATCH. For a "hard" partial match, it will already be |
8210 | | PCRE2_ERROR_PARTIAL. */ |
8211 | | |
8212 | 0 | else if (match_partial != NULL) |
8213 | 0 | { |
8214 | 0 | match_data->subject = original_subject; |
8215 | 0 | match_data->subject_length = length; |
8216 | 0 | match_data->start_offset = start_offset; |
8217 | 0 | match_data->ovector[0] = match_partial - subject; |
8218 | 0 | match_data->ovector[1] = end_subject - subject; |
8219 | 0 | match_data->startchar = match_partial - subject; |
8220 | 0 | match_data->leftchar = start_partial - subject; |
8221 | 0 | match_data->rightchar = end_subject - subject; |
8222 | 0 | match_data->rc = PCRE2_ERROR_PARTIAL; |
8223 | 0 | } |
8224 | | |
8225 | | /* Else this is the classic nomatch case. */ |
8226 | | |
8227 | 0 | else |
8228 | 0 | { |
8229 | 0 | match_data->subject = original_subject; |
8230 | 0 | match_data->subject_length = length; |
8231 | 0 | match_data->start_offset = start_offset; |
8232 | 0 | match_data->rc = PCRE2_ERROR_NOMATCH; |
8233 | 0 | } |
8234 | |
|
8235 | 0 | return match_data->rc; |
8236 | 0 | } |
8237 | | |
8238 | | /* These #undefs are here to enable unity builds with CMake. */ |
8239 | | |
8240 | | #undef NLBLOCK /* Block containing newline information */ |
8241 | | #undef PSSTART /* Field containing processed string start */ |
8242 | | #undef PSEND /* Field containing processed string end */ |
8243 | | |
8244 | | /* End of pcre2_match.c */ |