/src/pcre2-10.39/src/pcre2_match.c
Line | Count | Source (jump to first uncovered line) |
1 | | /************************************************* |
2 | | * Perl-Compatible Regular Expressions * |
3 | | *************************************************/ |
4 | | |
5 | | /* PCRE is a library of functions to support regular expressions whose syntax |
6 | | and semantics are as close as possible to those of the Perl 5 language. |
7 | | |
8 | | Written by Philip Hazel |
9 | | Original API code Copyright (c) 1997-2012 University of Cambridge |
10 | | New API code Copyright (c) 2015-2021 University of Cambridge |
11 | | |
12 | | ----------------------------------------------------------------------------- |
13 | | Redistribution and use in source and binary forms, with or without |
14 | | modification, are permitted provided that the following conditions are met: |
15 | | |
16 | | * Redistributions of source code must retain the above copyright notice, |
17 | | this list of conditions and the following disclaimer. |
18 | | |
19 | | * Redistributions in binary form must reproduce the above copyright |
20 | | notice, this list of conditions and the following disclaimer in the |
21 | | documentation and/or other materials provided with the distribution. |
22 | | |
23 | | * Neither the name of the University of Cambridge nor the names of its |
24 | | contributors may be used to endorse or promote products derived from |
25 | | this software without specific prior written permission. |
26 | | |
27 | | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
28 | | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
29 | | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
30 | | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
31 | | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
32 | | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
33 | | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
34 | | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
35 | | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
36 | | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
37 | | POSSIBILITY OF SUCH DAMAGE. |
38 | | ----------------------------------------------------------------------------- |
39 | | */ |
40 | | |
41 | | |
42 | | #ifdef HAVE_CONFIG_H |
43 | | #include "config.h" |
44 | | #endif |
45 | | |
46 | | /* These defines enable debugging code */ |
47 | | |
48 | | /* #define DEBUG_FRAMES_DISPLAY */ |
49 | | /* #define DEBUG_SHOW_OPS */ |
50 | | /* #define DEBUG_SHOW_RMATCH */ |
51 | | |
/* <stdarg.h> is needed only by the variadic display_frames() debugging
function, so it is included under the same DEBUG_FRAMES_DISPLAY switch that
enables that function. (The guard previously tested DEBUG_FRAME_DISPLAY, a
name nothing defines, so turning the debug code on left va_list undeclared.) */

#ifdef DEBUG_FRAMES_DISPLAY
#include <stdarg.h>
#endif
55 | | |
/* Names used by shared code to reach the "static" match information: the
block itself, and the two fields inside it that bound the subject. */

#define NLBLOCK  mb               /* Block containing newline information */
#define PSSTART  start_subject    /* Field containing processed string start */
#define PSEND    end_subject      /* Field containing processed string end */
62 | | |
63 | | #include "pcre2_internal.h" |
64 | | |
/* Marker stored in a frame's current_recurse field when no pattern recursion
is active. It is bigger than the maximum group number. */

#define RECURSE_UNSET 0xffffffffu

/* Masks for identifying the public options that are permitted at match time:
one for the interpretive matcher and one for JIT matching. */

#define PUBLIC_MATCH_OPTIONS \
  (PCRE2_ANCHORED | PCRE2_ENDANCHORED | PCRE2_NOTBOL | PCRE2_NOTEOL | \
   PCRE2_NOTEMPTY | PCRE2_NOTEMPTY_ATSTART | PCRE2_NO_UTF_CHECK | \
   PCRE2_PARTIAL_HARD | PCRE2_PARTIAL_SOFT | PCRE2_NO_JIT | \
   PCRE2_COPY_MATCHED_SUBJECT)

#define PUBLIC_JIT_MATCH_OPTIONS \
  (PCRE2_NO_UTF_CHECK | PCRE2_NOTBOL | PCRE2_NOTEOL | PCRE2_NOTEMPTY | \
   PCRE2_NOTEMPTY_ATSTART | PCRE2_PARTIAL_SOFT | PCRE2_PARTIAL_HARD | \
   PCRE2_COPY_MATCHED_SUBJECT)
78 | | |
/* Return values used by and within match().

The two ordinary outcomes are non-negative. Genuine errors are reported with
the externally defined PCRE2_ERROR_xxx codes, all of which are negative. */

#define MATCH_MATCH          1
#define MATCH_NOMATCH        0

/* Internal-only returns. They are made sufficiently negative to stay clear of
the external error codes. */

#define MATCH_ACCEPT         (-999)
#define MATCH_KETRPOS        (-998)

/* The five backtracking-verb returns below must remain adjacent and in this
order, because code that tests for "any one of them" does so with a range
check between MATCH_BACKTRACK_MIN and MATCH_BACKTRACK_MAX. */

#define MATCH_COMMIT         (-997)
#define MATCH_PRUNE          (-996)
#define MATCH_SKIP           (-995)
#define MATCH_SKIP_ARG       (-994)
#define MATCH_THEN           (-993)
#define MATCH_BACKTRACK_MAX  MATCH_THEN
#define MATCH_BACKTRACK_MIN  MATCH_COMMIT
99 | | |
/* Group frame types. A value of zero means "not a group frame". The type
identity lives in the upper 16 bits; the lower 16 bits carry data such as the
capture number. Most groups get a group frame so that, on reaching the end of
the group, details of its start can be found directly instead of by scanning
back through the intervening backtrack-point frames. */

#define GF_CAPTURE      0x00010000u
#define GF_NOCAPTURE    0x00020000u
#define GF_CONDASSERT   0x00030000u
#define GF_RECURSE      0x00040000u

/* Extract the identity part and the data part of a group frame type. */

#define GF_IDMASK(a)    ((a) & 0xffff0000u)
#define GF_DATAMASK(a)  ((a) & 0x0000ffffu)
115 | | |
/* The three kinds of repetition: minimizing, maximizing, and possessive. */

enum { REPTYPE_MIN, REPTYPE_MAX, REPTYPE_POS };

/* Lower and upper repeat counts for the common quantifiers, one entry per
quantifier in the order given by the row comments. An upper count of
UINT32_MAX stands for "no limit". The two range entries are placeholders
only. */

static const uint32_t rep_min[] = {
  0,                      /* *  */
  0,                      /* *? */
  1,                      /* +  */
  1,                      /* +? */
  0,                      /* ?  */
  0,                      /* ?? */
  0,                      /* dummy placefiller for OP_CRRANGE */
  0,                      /* dummy placefiller for OP_CRMINRANGE */
  0,                      /* OP_CRPOSSTAR */
  1,                      /* OP_CRPOSPLUS */
  0                       /* OP_CRPOSQUERY */
};

static const uint32_t rep_max[] = {
  UINT32_MAX,             /* *  */
  UINT32_MAX,             /* *? */
  UINT32_MAX,             /* +  */
  UINT32_MAX,             /* +? */
  1,                      /* ?  */
  1,                      /* ?? */
  0,                      /* dummy placefiller for OP_CRRANGE */
  0,                      /* dummy placefiller for OP_CRMINRANGE */
  UINT32_MAX,             /* OP_CRPOSSTAR */
  UINT32_MAX,             /* OP_CRPOSPLUS */
  1                       /* OP_CRPOSQUERY */
};

/* Repetition type for each quantifier. Unlike the two tables above, this one
must also have an entry for OP_CRPOSRANGE. */

static const uint32_t rep_typ[] = {
  REPTYPE_MAX,            /* *  */
  REPTYPE_MIN,            /* *? */
  REPTYPE_MAX,            /* +  */
  REPTYPE_MIN,            /* +? */
  REPTYPE_MAX,            /* ?  */
  REPTYPE_MIN,            /* ?? */
  REPTYPE_MAX,            /* OP_CRRANGE */
  REPTYPE_MIN,            /* OP_CRMINRANGE */
  REPTYPE_POS,            /* OP_CRPOSSTAR */
  REPTYPE_POS,            /* OP_CRPOSPLUS */
  REPTYPE_POS,            /* OP_CRPOSQUERY */
  REPTYPE_POS             /* OP_CRPOSRANGE */
};
146 | | |
/* Identifiers for the RMATCH() calls made at backtracking points. Whenever
one of these lists changes, the code at RETURN_SWITCH must be changed to
match. */

enum {
  RM1 = 1,
  RM2,  RM3,  RM4,  RM5,  RM6,  RM7,
  RM8,  RM9,  RM10, RM11, RM12, RM13,
  RM14, RM15, RM16, RM17, RM18, RM19,
  RM20, RM21, RM22, RM23, RM24, RM25,
  RM26, RM27, RM28, RM29, RM30, RM31,
  RM32, RM33, RM34, RM35, RM36
};

#ifdef SUPPORT_WIDE_CHARS
enum {
  RM100 = 100,
  RM101
};
#endif

#ifdef SUPPORT_UNICODE
enum {
  RM200 = 200,
  RM201, RM202, RM203, RM204, RM205, RM206,
  RM207, RM208, RM209, RM210, RM211, RM212,
  RM213, RM214, RM215, RM216, RM217, RM218,
  RM219, RM220, RM221, RM222
};
#endif
164 | | |
/* Shorthand for the general fields of the current backtrack frame. The
current frame is always addressed by the variable F, so each name below simply
expands to F->field. The few references to fields of other frames are spelled
out in full where they occur. Frames also hold some "temp" fields that serve
as short-term, localised backtracking memory; those get Lxxx names that are
#defined at the point of use and #undefined straight afterwards. */

#define Fback_frame         F->back_frame
#define Fcapture_last       F->capture_last
#define Fcurrent_recurse    F->current_recurse
#define Fecode              F->ecode
#define Feptr               F->eptr
#define Fgroup_frame_type   F->group_frame_type
#define Flast_group_offset  F->last_group_offset
#define Flength             F->length
#define Fmark               F->mark
#define Frdepth             F->rdepth
#define Fstart_match        F->start_match
#define Foffset_top         F->offset_top
#define Foccu               F->occu
#define Fop                 F->op
#define Fovector            F->ovector
#define Freturn_id          F->return_id
188 | | |
189 | | |
190 | | #ifdef DEBUG_FRAMES_DISPLAY |
191 | | /************************************************* |
192 | | * Display current frames and contents * |
193 | | *************************************************/ |
194 | | |
195 | | /* This debugging function displays the current set of frames and their |
196 | | contents. It is not called automatically from anywhere, the intention being |
197 | | that calls can be inserted where necessary when debugging frame-related |
198 | | problems. |
199 | | |
200 | | Arguments: |
201 | | f the file to write to |
202 | | F the current top frame |
203 | | P a previous frame of interest |
204 | | frame_size the frame size |
205 | | mb points to the match block |
206 | | s identification text |
207 | | |
208 | | Returns: nothing |
209 | | */ |
210 | | |
211 | | static void |
212 | | display_frames(FILE *f, heapframe *F, heapframe *P, PCRE2_SIZE frame_size, |
213 | | match_block *mb, const char *s, ...) |
214 | | { |
215 | | uint32_t i; |
216 | | heapframe *Q; |
217 | | va_list ap; |
218 | | va_start(ap, s); |
219 | | |
220 | | fprintf(f, "FRAMES "); |
221 | | vfprintf(f, s, ap); |
222 | | va_end(ap); |
223 | | |
224 | | if (P != NULL) fprintf(f, " P=%lu", |
225 | | ((char *)P - (char *)(mb->match_frames))/frame_size); |
226 | | fprintf(f, "\n"); |
227 | | |
228 | | for (i = 0, Q = mb->match_frames; |
229 | | Q <= F; |
230 | | i++, Q = (heapframe *)((char *)Q + frame_size)) |
231 | | { |
232 | | fprintf(f, "Frame %d type=%x subj=%lu code=%d back=%lu id=%d", |
233 | | i, Q->group_frame_type, Q->eptr - mb->start_subject, *(Q->ecode), |
234 | | Q->back_frame, Q->return_id); |
235 | | |
236 | | if (Q->last_group_offset == PCRE2_UNSET) |
237 | | fprintf(f, " lgoffset=unset\n"); |
238 | | else |
239 | | fprintf(f, " lgoffset=%lu\n", Q->last_group_offset/frame_size); |
240 | | } |
241 | | } |
242 | | |
243 | | #endif |
244 | | |
245 | | |
246 | | |
247 | | /************************************************* |
248 | | * Process a callout * |
249 | | *************************************************/ |
250 | | |
251 | | /* This function is called for all callouts, whether "standalone" or at the |
252 | | start of a conditional group. Feptr will be pointing to either OP_CALLOUT or |
253 | | OP_CALLOUT_STR. A callout block is allocated in pcre2_match() and initialized |
254 | | with fixed values. |
255 | | |
256 | | Arguments: |
257 | | F points to the current backtracking frame |
258 | | mb points to the match block |
259 | | lengthptr where to return the length of the callout item |
260 | | |
261 | | Returns: the return from the callout |
262 | | or 0 if no callout function exists |
263 | | */ |
264 | | |
265 | | static int |
266 | | do_callout(heapframe *F, match_block *mb, PCRE2_SIZE *lengthptr) |
267 | 0 | { |
268 | 0 | int rc; |
269 | 0 | PCRE2_SIZE save0, save1; |
270 | 0 | PCRE2_SIZE *callout_ovector; |
271 | 0 | pcre2_callout_block *cb; |
272 | |
|
273 | 0 | *lengthptr = (*Fecode == OP_CALLOUT)? |
274 | 0 | PRIV(OP_lengths)[OP_CALLOUT] : GET(Fecode, 1 + 2*LINK_SIZE); |
275 | |
|
276 | 0 | if (mb->callout == NULL) return 0; /* No callout function provided */ |
277 | | |
278 | | /* The original matching code (pre 10.30) worked directly with the ovector |
279 | | passed by the user, and this was passed to callouts. Now that the working |
280 | | ovector is in the backtracking frame, it no longer needs to reserve space for |
281 | | the overall match offsets (which would waste space in the frame). For backward |
282 | | compatibility, however, we pass capture_top and offset_vector to the callout as |
283 | | if for the extended ovector, and we ensure that the first two slots are unset |
284 | | by preserving and restoring their current contents. Picky compilers complain if |
285 | | references such as Fovector[-2] are use directly, so we set up a separate |
286 | | pointer. */ |
287 | | |
288 | 0 | callout_ovector = (PCRE2_SIZE *)(Fovector) - 2; |
289 | | |
290 | | /* The cb->version, cb->subject, cb->subject_length, and cb->start_match fields |
291 | | are set externally. The first 3 never change; the last is updated for each |
292 | | bumpalong. */ |
293 | |
|
294 | 0 | cb = mb->cb; |
295 | 0 | cb->capture_top = (uint32_t)Foffset_top/2 + 1; |
296 | 0 | cb->capture_last = Fcapture_last; |
297 | 0 | cb->offset_vector = callout_ovector; |
298 | 0 | cb->mark = mb->nomatch_mark; |
299 | 0 | cb->current_position = (PCRE2_SIZE)(Feptr - mb->start_subject); |
300 | 0 | cb->pattern_position = GET(Fecode, 1); |
301 | 0 | cb->next_item_length = GET(Fecode, 1 + LINK_SIZE); |
302 | |
|
303 | 0 | if (*Fecode == OP_CALLOUT) /* Numerical callout */ |
304 | 0 | { |
305 | 0 | cb->callout_number = Fecode[1 + 2*LINK_SIZE]; |
306 | 0 | cb->callout_string_offset = 0; |
307 | 0 | cb->callout_string = NULL; |
308 | 0 | cb->callout_string_length = 0; |
309 | 0 | } |
310 | 0 | else /* String callout */ |
311 | 0 | { |
312 | 0 | cb->callout_number = 0; |
313 | 0 | cb->callout_string_offset = GET(Fecode, 1 + 3*LINK_SIZE); |
314 | 0 | cb->callout_string = Fecode + (1 + 4*LINK_SIZE) + 1; |
315 | 0 | cb->callout_string_length = |
316 | 0 | *lengthptr - (1 + 4*LINK_SIZE) - 2; |
317 | 0 | } |
318 | |
|
319 | 0 | save0 = callout_ovector[0]; |
320 | 0 | save1 = callout_ovector[1]; |
321 | 0 | callout_ovector[0] = callout_ovector[1] = PCRE2_UNSET; |
322 | 0 | rc = mb->callout(cb, mb->callout_data); |
323 | 0 | callout_ovector[0] = save0; |
324 | 0 | callout_ovector[1] = save1; |
325 | 0 | cb->callout_flags = 0; |
326 | 0 | return rc; |
327 | 0 | } |
328 | | |
329 | | |
330 | | |
331 | | /************************************************* |
332 | | * Match a back-reference * |
333 | | *************************************************/ |
334 | | |
335 | | /* This function is called only when it is known that the offset lies within |
336 | | the offsets that have so far been used in the match. Note that in caseless |
337 | | UTF-8 mode, the number of subject bytes matched may be different to the number |
338 | | of reference bytes. (In theory this could also happen in UTF-16 mode, but it |
339 | | seems unlikely.) |
340 | | |
341 | | Arguments: |
342 | | offset index into the offset vector |
343 | | caseless TRUE if caseless |
344 | | F the current backtracking frame pointer |
345 | | mb points to match block |
346 | | lengthptr pointer for returning the length matched |
347 | | |
348 | | Returns: = 0 sucessful match; number of code units matched is set |
349 | | < 0 no match |
350 | | > 0 partial match |
351 | | */ |
352 | | |
353 | | static int |
354 | | match_ref(PCRE2_SIZE offset, BOOL caseless, heapframe *F, match_block *mb, |
355 | | PCRE2_SIZE *lengthptr) |
356 | 7.77M | { |
357 | 7.77M | PCRE2_SPTR p; |
358 | 7.77M | PCRE2_SIZE length; |
359 | 7.77M | PCRE2_SPTR eptr; |
360 | 7.77M | PCRE2_SPTR eptr_start; |
361 | | |
362 | | /* Deal with an unset group. The default is no match, but there is an option to |
363 | | match an empty string. */ |
364 | | |
365 | 7.77M | if (offset >= Foffset_top || Fovector[offset] == PCRE2_UNSET) |
366 | 5.99M | { |
367 | 5.99M | if ((mb->poptions & PCRE2_MATCH_UNSET_BACKREF) != 0) |
368 | 0 | { |
369 | 0 | *lengthptr = 0; |
370 | 0 | return 0; /* Match */ |
371 | 0 | } |
372 | 5.99M | else return -1; /* No match */ |
373 | 5.99M | } |
374 | | |
375 | | /* Separate the caseless and UTF cases for speed. */ |
376 | | |
377 | 1.78M | eptr = eptr_start = Feptr; |
378 | 1.78M | p = mb->start_subject + Fovector[offset]; |
379 | 1.78M | length = Fovector[offset+1] - Fovector[offset]; |
380 | | |
381 | 1.78M | if (caseless) |
382 | 0 | { |
383 | 0 | #if defined SUPPORT_UNICODE |
384 | 0 | BOOL utf = (mb->poptions & PCRE2_UTF) != 0; |
385 | |
|
386 | 0 | if (utf || (mb->poptions & PCRE2_UCP) != 0) |
387 | 0 | { |
388 | 0 | PCRE2_SPTR endptr = p + length; |
389 | | |
390 | | /* Match characters up to the end of the reference. NOTE: the number of |
391 | | code units matched may differ, because in UTF-8 there are some characters |
392 | | whose upper and lower case codes have different numbers of bytes. For |
393 | | example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65 (3 |
394 | | bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a |
395 | | sequence of two of the latter. It is important, therefore, to check the |
396 | | length along the reference, not along the subject (earlier code did this |
397 | | wrong). UCP without uses Unicode properties but without UTF encoding. */ |
398 | |
|
399 | 0 | while (p < endptr) |
400 | 0 | { |
401 | 0 | uint32_t c, d; |
402 | 0 | const ucd_record *ur; |
403 | 0 | if (eptr >= mb->end_subject) return 1; /* Partial match */ |
404 | | |
405 | 0 | if (utf) |
406 | 0 | { |
407 | 0 | GETCHARINC(c, eptr); |
408 | 0 | GETCHARINC(d, p); |
409 | 0 | } |
410 | 0 | else |
411 | 0 | { |
412 | 0 | c = *eptr++; |
413 | 0 | d = *p++; |
414 | 0 | } |
415 | |
|
416 | 0 | ur = GET_UCD(d); |
417 | 0 | if (c != d && c != (uint32_t)((int)d + ur->other_case)) |
418 | 0 | { |
419 | 0 | const uint32_t *pp = PRIV(ucd_caseless_sets) + ur->caseset; |
420 | 0 | for (;;) |
421 | 0 | { |
422 | 0 | if (c < *pp) return -1; /* No match */ |
423 | 0 | if (c == *pp++) break; |
424 | 0 | } |
425 | 0 | } |
426 | 0 | } |
427 | 0 | } |
428 | 0 | else |
429 | 0 | #endif |
430 | | |
431 | | /* Not in UTF or UCP mode */ |
432 | 0 | { |
433 | 0 | for (; length > 0; length--) |
434 | 0 | { |
435 | 0 | uint32_t cc, cp; |
436 | 0 | if (eptr >= mb->end_subject) return 1; /* Partial match */ |
437 | 0 | cc = UCHAR21TEST(eptr); |
438 | 0 | cp = UCHAR21TEST(p); |
439 | 0 | if (TABLE_GET(cp, mb->lcc, cp) != TABLE_GET(cc, mb->lcc, cc)) |
440 | 0 | return -1; /* No match */ |
441 | 0 | p++; |
442 | 0 | eptr++; |
443 | 0 | } |
444 | 0 | } |
445 | 0 | } |
446 | | |
447 | | /* In the caseful case, we can just compare the code units, whether or not we |
448 | | are in UTF and/or UCP mode. When partial matching, we have to do this unit by |
449 | | unit. */ |
450 | | |
451 | 1.78M | else |
452 | 1.78M | { |
453 | 1.78M | if (mb->partial != 0) |
454 | 0 | { |
455 | 0 | for (; length > 0; length--) |
456 | 0 | { |
457 | 0 | if (eptr >= mb->end_subject) return 1; /* Partial match */ |
458 | 0 | if (UCHAR21INCTEST(p) != UCHAR21INCTEST(eptr)) return -1; /* No match */ |
459 | 0 | } |
460 | 0 | } |
461 | | |
462 | | /* Not partial matching */ |
463 | | |
464 | 1.78M | else |
465 | 1.78M | { |
466 | 1.78M | if ((PCRE2_SIZE)(mb->end_subject - eptr) < length) return 1; /* Partial */ |
467 | 1.78M | if (memcmp(p, eptr, CU2BYTES(length)) != 0) return -1; /* No match */ |
468 | 1.78M | eptr += length; |
469 | 1.78M | } |
470 | 1.78M | } |
471 | | |
472 | 1.78M | *lengthptr = eptr - eptr_start; |
473 | 1.78M | return 0; /* Match */ |
474 | 1.78M | } |
475 | | |
476 | | |
477 | | |
478 | | /****************************************************************************** |
479 | | ******************************************************************************* |
480 | | "Recursion" in the match() function |
481 | | |
482 | | The original match() function was highly recursive, but this proved to be the |
483 | | source of a number of problems over the years, mostly because of the relatively |
484 | | small system stacks that are commonly found. As new features were added to |
485 | | patterns, various kludges were invented to reduce the amount of stack used, |
486 | | making the code hard to understand in places. |
487 | | |
488 | | A version did exist that used individual frames on the heap instead of calling |
489 | | match() recursively, but this ran substantially slower. The current version is |
490 | | a refactoring that uses a vector of frames to remember backtracking points. |
491 | | This runs no slower, and possibly even a bit faster than the original recursive |
492 | | implementation. An initial vector of size START_FRAMES_SIZE (enough for maybe |
493 | | 50 frames) is allocated on the system stack. If this is not big enough, the |
494 | | heap is used for a larger vector. |
495 | | |
496 | | ******************************************************************************* |
497 | | ******************************************************************************/ |
498 | | |
499 | | |
500 | | |
501 | | |
/*************************************************
*          Macros for the match() function       *
*************************************************/

/* Partial-matching tests that are needed in many places in match().
CHECK_PARTIAL() first checks whether the subject pointer has reached the end
of the subject; SCHECK_PARTIAL() is for places where that is already known.

The "hit end" flag is set when the pointer is at the end of the subject and
either (a) it is beyond the earliest inspected character (so something has
been matched, even if it is not part of the actual matched string), or (b) the
pattern contains a lookbehind. Those are the situations in which more subject
characters could allow the current match to continue.

With hard partial matching a partial match is returned at once. Otherwise
matching carries on in search of a complete match on the current subject, and
a partial match is returned only if no complete match can be found. */

#define CHECK_PARTIAL() \
  if (Feptr >= mb->end_subject) \
    { \
    SCHECK_PARTIAL(); \
    }

#define SCHECK_PARTIAL() \
  if (mb->partial != 0 && \
      (Feptr > mb->start_used_ptr || mb->allowemptypartial)) \
    { \
    mb->hitend = TRUE; \
    if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; \
    }


/* Backtracking support. A local vector of frames records the backtracking
points, and these two macros stand in for a recursive call to match() and for
the return from one. RMATCH() notes where to resume (the return id rb and its
label L_rb) and jumps to the code that sets up a new frame; RRETURN() sets the
result and jumps to the switch that dispatches on the return id. */

#define RMATCH(ra,rb) \
  { \
  start_ecode = ra; \
  Freturn_id = rb; \
  goto MATCH_RECURSE; \
  L_##rb:; \
  }

#define RRETURN(ra) \
  { \
  rrc = ra; \
  goto RETURN_SWITCH; \
  }
550 | | |
551 | | |
552 | | |
553 | | /************************************************* |
554 | | * Match from current position * |
555 | | *************************************************/ |
556 | | |
557 | | /* This function is called to run one match attempt at a single starting point |
558 | | in the subject. |
559 | | |
560 | | Performance note: It might be tempting to extract commonly used fields from the |
561 | | mb structure (e.g. end_subject) into individual variables to improve |
562 | | performance. Tests using gcc on a SPARC disproved this; in the first case, it |
563 | | made performance worse. |
564 | | |
565 | | Arguments: |
566 | | start_eptr starting character in subject |
567 | | start_ecode starting position in compiled code |
568 | | ovector pointer to the final output vector |
569 | | oveccount number of pairs in ovector |
570 | | top_bracket number of capturing parentheses in the pattern |
571 | | frame_size size of each backtracking frame |
572 | | mb pointer to "static" variables block |
573 | | |
574 | | Returns: MATCH_MATCH if matched ) these values are >= 0 |
575 | | MATCH_NOMATCH if failed to match ) |
576 | | negative MATCH_xxx value for PRUNE, SKIP, etc |
577 | | negative PCRE2_ERROR_xxx value if aborted by an error condition |
578 | | (e.g. stopped by repeated call or depth limit) |
579 | | */ |
580 | | |
581 | | static int |
582 | | match(PCRE2_SPTR start_eptr, PCRE2_SPTR start_ecode, PCRE2_SIZE *ovector, |
583 | | uint16_t oveccount, uint16_t top_bracket, PCRE2_SIZE frame_size, |
584 | | match_block *mb) |
585 | 51.8M | { |
586 | | /* Frame-handling variables */ |
587 | | |
588 | 51.8M | heapframe *F; /* Current frame pointer */ |
589 | 51.8M | heapframe *N = NULL; /* Temporary frame pointers */ |
590 | 51.8M | heapframe *P = NULL; |
591 | 51.8M | heapframe *assert_accept_frame = NULL; /* For passing back a frame with captures */ |
592 | 51.8M | PCRE2_SIZE frame_copy_size; /* Amount to copy when creating a new frame */ |
593 | | |
594 | | /* Local variables that do not need to be preserved over calls to RMATCH(). */ |
595 | | |
596 | 51.8M | PCRE2_SPTR bracode; /* Temp pointer to start of group */ |
597 | 51.8M | PCRE2_SIZE offset; /* Used for group offsets */ |
598 | 51.8M | PCRE2_SIZE length; /* Used for various length calculations */ |
599 | | |
600 | 51.8M | int rrc; /* Return from functions & backtracking "recursions" */ |
601 | 51.8M | #ifdef SUPPORT_UNICODE |
602 | 51.8M | int proptype; /* Type of character property */ |
603 | 51.8M | #endif |
604 | | |
605 | 51.8M | uint32_t i; /* Used for local loops */ |
606 | 51.8M | uint32_t fc; /* Character values */ |
607 | 51.8M | uint32_t number; /* Used for group and other numbers */ |
608 | 51.8M | uint32_t reptype = 0; /* Type of repetition (0 to avoid compiler warning) */ |
609 | 51.8M | uint32_t group_frame_type; /* Specifies type for new group frames */ |
610 | | |
611 | 51.8M | BOOL condition; /* Used in conditional groups */ |
612 | 51.8M | BOOL cur_is_word; /* Used in "word" tests */ |
613 | 51.8M | BOOL prev_is_word; /* Used in "word" tests */ |
614 | | |
615 | | /* UTF and UCP flags */ |
616 | | |
617 | 51.8M | #ifdef SUPPORT_UNICODE |
618 | 51.8M | BOOL utf = (mb->poptions & PCRE2_UTF) != 0; |
619 | 51.8M | BOOL ucp = (mb->poptions & PCRE2_UCP) != 0; |
620 | | #else |
621 | | BOOL utf = FALSE; /* Required for convenience even when no Unicode support */ |
622 | | #endif |
623 | | |
624 | | /* This is the length of the last part of a backtracking frame that must be |
625 | | copied when a new frame is created. */ |
626 | | |
627 | 51.8M | frame_copy_size = frame_size - offsetof(heapframe, eptr); |
628 | | |
629 | | /* Set up the first current frame at the start of the vector, and initialize |
630 | | fields that are not reset for new frames. */ |
631 | | |
632 | 51.8M | F = mb->match_frames; |
633 | 51.8M | Frdepth = 0; /* "Recursion" depth */ |
634 | 51.8M | Fcapture_last = 0; /* Number of most recent capture */ |
635 | 51.8M | Fcurrent_recurse = RECURSE_UNSET; /* Not pattern recursing. */ |
636 | 51.8M | Fstart_match = Feptr = start_eptr; /* Current data pointer and start match */ |
637 | 51.8M | Fmark = NULL; /* Most recent mark */ |
638 | 51.8M | Foffset_top = 0; /* End of captures within the frame */ |
639 | 51.8M | Flast_group_offset = PCRE2_UNSET; /* Saved frame of most recent group */ |
640 | 51.8M | group_frame_type = 0; /* Not a start of group frame */ |
641 | 51.8M | goto NEW_FRAME; /* Start processing with this frame */ |
642 | | |
643 | | /* Come back here when we want to create a new frame for remembering a |
644 | | backtracking point. */ |
645 | | |
646 | 1.09G | MATCH_RECURSE: |
647 | | |
648 | | /* Set up a new backtracking frame. If the vector is full, get a new one |
649 | | on the heap, doubling the size, but constrained by the heap limit. */ |
650 | | |
651 | 1.09G | N = (heapframe *)((char *)F + frame_size); |
652 | 1.09G | if (N >= mb->match_frames_top) |
653 | 14 | { |
654 | 14 | PCRE2_SIZE newsize = mb->frame_vector_size * 2; |
655 | 14 | heapframe *new; |
656 | | |
657 | 14 | if ((newsize / 1024) > mb->heap_limit) |
658 | 0 | { |
659 | 0 | PCRE2_SIZE maxsize = ((mb->heap_limit * 1024)/frame_size) * frame_size; |
660 | 0 | if (mb->frame_vector_size >= maxsize) return PCRE2_ERROR_HEAPLIMIT; |
661 | 0 | newsize = maxsize; |
662 | 0 | } |
663 | | |
664 | 14 | new = mb->memctl.malloc(newsize, mb->memctl.memory_data); |
665 | 14 | if (new == NULL) return PCRE2_ERROR_NOMEMORY; |
666 | 14 | memcpy(new, mb->match_frames, mb->frame_vector_size); |
667 | | |
668 | 14 | F = (heapframe *)((char *)new + ((char *)F - (char *)mb->match_frames)); |
669 | 14 | N = (heapframe *)((char *)F + frame_size); |
670 | | |
671 | 14 | if (mb->match_frames != mb->stack_frames) |
672 | 8 | mb->memctl.free(mb->match_frames, mb->memctl.memory_data); |
673 | 14 | mb->match_frames = new; |
674 | 14 | mb->match_frames_top = (heapframe *)((char *)mb->match_frames + newsize); |
675 | 14 | mb->frame_vector_size = newsize; |
676 | 14 | } |
677 | | |
678 | | #ifdef DEBUG_SHOW_RMATCH |
679 | | fprintf(stderr, "++ RMATCH %2d frame=%d", Freturn_id, Frdepth + 1); |
680 | | if (group_frame_type != 0) |
681 | | { |
682 | | fprintf(stderr, " type=%x ", group_frame_type); |
683 | | switch (GF_IDMASK(group_frame_type)) |
684 | | { |
685 | | case GF_CAPTURE: |
686 | | fprintf(stderr, "capture=%d", GF_DATAMASK(group_frame_type)); |
687 | | break; |
688 | | |
689 | | case GF_NOCAPTURE: |
690 | | fprintf(stderr, "nocapture op=%d", GF_DATAMASK(group_frame_type)); |
691 | | break; |
692 | | |
693 | | case GF_CONDASSERT: |
694 | | fprintf(stderr, "condassert op=%d", GF_DATAMASK(group_frame_type)); |
695 | | break; |
696 | | |
697 | | case GF_RECURSE: |
698 | | fprintf(stderr, "recurse=%d", GF_DATAMASK(group_frame_type)); |
699 | | break; |
700 | | |
701 | | default: |
702 | | fprintf(stderr, "*** unknown ***"); |
703 | | break; |
704 | | } |
705 | | } |
706 | | fprintf(stderr, "\n"); |
707 | | #endif |
708 | | |
709 | | /* Copy those fields that must be copied into the new frame, increase the |
710 | | "recursion" depth (i.e. the new frame's index) and then make the new frame |
711 | | current. */ |
712 | | |
713 | 1.09G | memcpy((char *)N + offsetof(heapframe, eptr), |
714 | 1.09G | (char *)F + offsetof(heapframe, eptr), |
715 | 1.09G | frame_copy_size); |
716 | | |
717 | 1.09G | N->rdepth = Frdepth + 1; |
718 | 1.09G | F = N; |
719 | | |
720 | | /* Carry on processing with a new frame. */ |
721 | | |
722 | 1.14G | NEW_FRAME: |
723 | 1.14G | Fgroup_frame_type = group_frame_type; |
724 | 1.14G | Fecode = start_ecode; /* Starting code pointer */ |
725 | 1.14G | Fback_frame = frame_size; /* Default is go back one frame */ |
726 | | |
727 | | /* If this is a special type of group frame, remember its offset for quick |
728 | | access at the end of the group. If this is a recursion, set a new current |
729 | | recursion value. */ |
730 | | |
731 | 1.14G | if (group_frame_type != 0) |
732 | 70.3M | { |
733 | 70.3M | Flast_group_offset = (char *)F - (char *)mb->match_frames; |
734 | 70.3M | if (GF_IDMASK(group_frame_type) == GF_RECURSE) |
735 | 0 | Fcurrent_recurse = GF_DATAMASK(group_frame_type); |
736 | 70.3M | group_frame_type = 0; |
737 | 70.3M | } |
738 | | |
739 | | |
740 | | /* ========================================================================= */ |
741 | | /* This is the main processing loop. First check that we haven't recorded too |
742 | | many backtracks (search tree is too large), or that we haven't exceeded the |
743 | | recursive depth limit (used too many backtracking frames). If not, process the |
744 | | opcodes. */ |
745 | | |
746 | 1.14G | if (mb->match_call_count++ >= mb->match_limit) return PCRE2_ERROR_MATCHLIMIT; |
747 | 1.14G | if (Frdepth >= mb->match_limit_depth) return PCRE2_ERROR_DEPTHLIMIT; |
748 | | |
749 | 1.14G | for (;;) |
750 | 1.77G | { |
751 | | #ifdef DEBUG_SHOW_OPS |
752 | | fprintf(stderr, "++ op=%d\n", *Fecode); |
753 | | #endif |
754 | | |
755 | 1.77G | Fop = (uint8_t)(*Fecode); /* Cast needed for 16-bit and 32-bit modes */ |
756 | 1.77G | switch(Fop) |
757 | 1.77G | { |
758 | | /* ===================================================================== */ |
759 | | /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes, to close |
760 | | any currently open capturing brackets. Unlike reaching the end of a group, |
761 | | where we know the starting frame is at the top of the chained frames, in |
762 | | this case we have to search back for the relevant frame in case other types |
763 | | of group that use chained frames have intervened. Multiple OP_CLOSEs always |
764 | | come innermost first, which matches the chain order. We can ignore this in |
765 | | a recursion, because captures are not passed out of recursions. */ |
766 | | |
767 | 0 | case OP_CLOSE: |
768 | 0 | if (Fcurrent_recurse == RECURSE_UNSET) |
769 | 0 | { |
770 | 0 | number = GET2(Fecode, 1); |
771 | 0 | offset = Flast_group_offset; |
772 | 0 | for(;;) |
773 | 0 | { |
774 | 0 | if (offset == PCRE2_UNSET) return PCRE2_ERROR_INTERNAL; |
775 | 0 | N = (heapframe *)((char *)mb->match_frames + offset); |
776 | 0 | P = (heapframe *)((char *)N - frame_size); |
777 | 0 | if (N->group_frame_type == (GF_CAPTURE | number)) break; |
778 | 0 | offset = P->last_group_offset; |
779 | 0 | } |
780 | 0 | offset = (number << 1) - 2; |
781 | 0 | Fcapture_last = number; |
782 | 0 | Fovector[offset] = P->eptr - mb->start_subject; |
783 | 0 | Fovector[offset+1] = Feptr - mb->start_subject; |
784 | 0 | if (offset >= Foffset_top) Foffset_top = offset + 2; |
785 | 0 | } |
786 | 0 | Fecode += PRIV(OP_lengths)[*Fecode]; |
787 | 0 | break; |
788 | | |
789 | | |
790 | | /* ===================================================================== */ |
791 | | /* Real or forced end of the pattern, assertion, or recursion. In an |
792 | | assertion ACCEPT, update the last used pointer and remember the current |
793 | | frame so that the captures and mark can be fished out of it. */ |
794 | | |
795 | 0 | case OP_ASSERT_ACCEPT: |
796 | 0 | if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr; |
797 | 0 | assert_accept_frame = F; |
798 | 0 | RRETURN(MATCH_ACCEPT); |
799 | | |
800 | | /* If recursing, we have to find the most recent recursion. */ |
801 | |
|
802 | 0 | case OP_ACCEPT: |
803 | 2.56M | case OP_END: |
804 | | |
805 | | /* Handle end of a recursion. */ |
806 | | |
807 | 2.56M | if (Fcurrent_recurse != RECURSE_UNSET) |
808 | 0 | { |
809 | 0 | offset = Flast_group_offset; |
810 | 0 | for(;;) |
811 | 0 | { |
812 | 0 | if (offset == PCRE2_UNSET) return PCRE2_ERROR_INTERNAL; |
813 | 0 | N = (heapframe *)((char *)mb->match_frames + offset); |
814 | 0 | P = (heapframe *)((char *)N - frame_size); |
815 | 0 | if (GF_IDMASK(N->group_frame_type) == GF_RECURSE) break; |
816 | 0 | offset = P->last_group_offset; |
817 | 0 | } |
818 | | |
819 | | /* N is now the frame of the recursion; the previous frame is at the |
820 | | OP_RECURSE position. Go back there, copying the current subject position |
821 | | and mark, and the start_match position (\K might have changed it), and |
822 | | then move on past the OP_RECURSE. */ |
823 | | |
824 | 0 | P->eptr = Feptr; |
825 | 0 | P->mark = Fmark; |
826 | 0 | P->start_match = Fstart_match; |
827 | 0 | F = P; |
828 | 0 | Fecode += 1 + LINK_SIZE; |
829 | 0 | continue; |
830 | 0 | } |
831 | | |
832 | | /* Not a recursion. Fail for an empty string match if either PCRE2_NOTEMPTY |
833 | | is set, or if PCRE2_NOTEMPTY_ATSTART is set and we have matched at the |
834 | | start of the subject. In both cases, backtracking will then try other |
835 | | alternatives, if any. */ |
836 | | |
837 | 2.56M | if (Feptr == Fstart_match && |
838 | 2.56M | ((mb->moptions & PCRE2_NOTEMPTY) != 0 || |
839 | 32.0k | ((mb->moptions & PCRE2_NOTEMPTY_ATSTART) != 0 && |
840 | 32.0k | Fstart_match == mb->start_subject + mb->start_offset))) |
841 | 2.56M | RRETURN(MATCH_NOMATCH); |
842 | | |
843 | | /* Also fail if PCRE2_ENDANCHORED is set and the end of the match is not |
844 | | the end of the subject. After (*ACCEPT) we fail the entire match (at this |
845 | | position) but backtrack on reaching the end of the pattern. */ |
846 | | |
847 | 2.56M | if (Feptr < mb->end_subject && |
848 | 2.56M | ((mb->moptions | mb->poptions) & PCRE2_ENDANCHORED) != 0) |
849 | 0 | { |
850 | 0 | if (Fop == OP_END) RRETURN(MATCH_NOMATCH); |
851 | 0 | return MATCH_NOMATCH; |
852 | 0 | } |
853 | | |
854 | | /* We have a successful match of the whole pattern. Record the result and |
855 | | then do a direct return from the function. If there is space in the offset |
856 | | vector, set any pairs that follow the highest-numbered captured string but |
857 | | are less than the number of capturing groups in the pattern to PCRE2_UNSET. |
858 | | It is documented that this happens. "Gaps" are set to PCRE2_UNSET |
859 | | dynamically. It is only those at the end that need setting here. */ |
860 | | |
861 | 2.56M | mb->end_match_ptr = Feptr; /* Record where we ended */ |
862 | 2.56M | mb->end_offset_top = Foffset_top; /* and how many extracts were taken */ |
863 | 2.56M | mb->mark = Fmark; /* and the last success mark */ |
864 | 2.56M | if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr; |
865 | | |
866 | 2.56M | ovector[0] = Fstart_match - mb->start_subject; |
867 | 2.56M | ovector[1] = Feptr - mb->start_subject; |
868 | | |
869 | | /* Set i to the smaller of the sizes of the external and frame ovectors. */ |
870 | | |
871 | 2.56M | i = 2 * ((top_bracket + 1 > oveccount)? oveccount : top_bracket + 1); |
872 | 2.56M | memcpy(ovector + 2, Fovector, (i - 2) * sizeof(PCRE2_SIZE)); |
873 | 5.40M | while (--i >= Foffset_top + 2) ovector[i] = PCRE2_UNSET; |
874 | 2.56M | return MATCH_MATCH; /* Note: NOT RRETURN */ |
875 | | |
876 | | |
877 | | /*===================================================================== */ |
878 | | /* Match any single character type except newline; have to take care with |
879 | | CRLF newlines and partial matching. */ |
880 | | |
881 | 217M | case OP_ANY: |
882 | 217M | if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH); |
883 | 214M | if (mb->partial != 0 && |
884 | 214M | Feptr == mb->end_subject - 1 && |
885 | 214M | NLBLOCK->nltype == NLTYPE_FIXED && |
886 | 214M | NLBLOCK->nllen == 2 && |
887 | 214M | UCHAR21TEST(Feptr) == NLBLOCK->nl[0]) |
888 | 0 | { |
889 | 0 | mb->hitend = TRUE; |
890 | 0 | if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; |
891 | 0 | } |
892 | | /* Fall through */ |
893 | | |
894 | | /* Match any single character whatsoever. */ |
895 | | |
896 | 216M | case OP_ALLANY: |
897 | 216M | if (Feptr >= mb->end_subject) /* DO NOT merge the Feptr++ here; it must */ |
898 | 577k | { /* not be updated before SCHECK_PARTIAL. */ |
899 | 577k | SCHECK_PARTIAL(); |
900 | 577k | RRETURN(MATCH_NOMATCH); |
901 | 0 | } |
902 | 215M | Feptr++; |
903 | 215M | #ifdef SUPPORT_UNICODE |
904 | 215M | if (utf) ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++); |
905 | 215M | #endif |
906 | 215M | Fecode++; |
907 | 215M | break; |
908 | | |
909 | | |
910 | | /* ===================================================================== */ |
911 | | /* Match a single code unit, even in UTF mode. This opcode really does |
912 | | match any code unit, even newline. (It really should be called ANYCODEUNIT, |
913 | | of course - the byte name is from pre-16 bit days.) */ |
914 | | |
915 | 0 | case OP_ANYBYTE: |
916 | 0 | if (Feptr >= mb->end_subject) /* DO NOT merge the Feptr++ here; it must */ |
917 | 0 | { /* not be updated before SCHECK_PARTIAL. */ |
918 | 0 | SCHECK_PARTIAL(); |
919 | 0 | RRETURN(MATCH_NOMATCH); |
920 | 0 | } |
921 | 0 | Feptr++; |
922 | 0 | Fecode++; |
923 | 0 | break; |
924 | | |
925 | | |
926 | | /* ===================================================================== */ |
927 | | /* Match a single character, casefully */ |
928 | | |
929 | 455M | case OP_CHAR: |
930 | 455M | #ifdef SUPPORT_UNICODE |
931 | 455M | if (utf) |
932 | 0 | { |
933 | 0 | Flength = 1; |
934 | 0 | Fecode++; |
935 | 0 | GETCHARLEN(fc, Fecode, Flength); |
936 | 0 | if (Flength > (PCRE2_SIZE)(mb->end_subject - Feptr)) |
937 | 0 | { |
938 | 0 | CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */ |
939 | 0 | RRETURN(MATCH_NOMATCH); |
940 | 0 | } |
941 | 0 | for (; Flength > 0; Flength--) |
942 | 0 | { |
943 | 0 | if (*Fecode++ != UCHAR21INC(Feptr)) RRETURN(MATCH_NOMATCH); |
944 | 0 | } |
945 | 0 | } |
946 | 455M | else |
947 | 455M | #endif |
948 | | |
949 | | /* Not UTF mode */ |
950 | 455M | { |
951 | 455M | if (mb->end_subject - Feptr < 1) |
952 | 3.06M | { |
953 | 3.06M | SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */ |
954 | 3.06M | RRETURN(MATCH_NOMATCH); |
955 | 0 | } |
956 | 452M | if (Fecode[1] != *Feptr++) RRETURN(MATCH_NOMATCH); |
957 | 11.4M | Fecode += 2; |
958 | 11.4M | } |
959 | 11.4M | break; |
960 | | |
961 | | |
962 | | /* ===================================================================== */ |
963 | | /* Match a single character, caselessly. If we are at the end of the |
964 | | subject, give up immediately. We get here only when the pattern character |
965 | | has at most one other case. Characters with more than two cases are coded |
966 | | as OP_PROP with the pseudo-property PT_CLIST. */ |
967 | | |
968 | 160M | case OP_CHARI: |
969 | 160M | if (Feptr >= mb->end_subject) |
970 | 121k | { |
971 | 121k | SCHECK_PARTIAL(); |
972 | 121k | RRETURN(MATCH_NOMATCH); |
973 | 0 | } |
974 | | |
975 | 160M | #ifdef SUPPORT_UNICODE |
976 | 160M | if (utf) |
977 | 0 | { |
978 | 0 | Flength = 1; |
979 | 0 | Fecode++; |
980 | 0 | GETCHARLEN(fc, Fecode, Flength); |
981 | | |
982 | | /* If the pattern character's value is < 128, we know that its other case |
983 | | (if any) is also < 128 (and therefore only one code unit long in all |
984 | | code-unit widths), so we can use the fast lookup table. We checked above |
985 | | that there is at least one character left in the subject. */ |
986 | |
|
987 | 0 | if (fc < 128) |
988 | 0 | { |
989 | 0 | uint32_t cc = UCHAR21(Feptr); |
990 | 0 | if (mb->lcc[fc] != TABLE_GET(cc, mb->lcc, cc)) RRETURN(MATCH_NOMATCH); |
991 | 0 | Fecode++; |
992 | 0 | Feptr++; |
993 | 0 | } |
994 | | |
995 | | /* Otherwise we must pick up the subject character and use Unicode |
996 | | property support to test its other case. Note that we cannot use the |
997 | | value of "Flength" to check for sufficient bytes left, because the other |
998 | | case of the character may have more or fewer code units. */ |
999 | | |
1000 | 0 | else |
1001 | 0 | { |
1002 | 0 | uint32_t dc; |
1003 | 0 | GETCHARINC(dc, Feptr); |
1004 | 0 | Fecode += Flength; |
1005 | 0 | if (dc != fc && dc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH); |
1006 | 0 | } |
1007 | 0 | } |
1008 | | |
1009 | | /* If UCP is set without UTF we must do the same as above, but with one |
1010 | | character per code unit. */ |
1011 | | |
1012 | 160M | else if (ucp) |
1013 | 0 | { |
1014 | 0 | uint32_t cc = UCHAR21(Feptr); |
1015 | 0 | fc = Fecode[1]; |
1016 | 0 | if (fc < 128) |
1017 | 0 | { |
1018 | 0 | if (mb->lcc[fc] != TABLE_GET(cc, mb->lcc, cc)) RRETURN(MATCH_NOMATCH); |
1019 | 0 | } |
1020 | 0 | else |
1021 | 0 | { |
1022 | 0 | if (cc != fc && cc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH); |
1023 | 0 | } |
1024 | 0 | Feptr++; |
1025 | 0 | Fecode += 2; |
1026 | 0 | } |
1027 | | |
1028 | 160M | else |
1029 | 160M | #endif /* SUPPORT_UNICODE */ |
1030 | | |
1031 | | /* Not UTF or UCP mode; use the table for characters < 256. */ |
1032 | 160M | { |
1033 | 160M | if (TABLE_GET(Fecode[1], mb->lcc, Fecode[1]) |
1034 | 160M | != TABLE_GET(*Feptr, mb->lcc, *Feptr)) RRETURN(MATCH_NOMATCH); |
1035 | 3.93M | Feptr++; |
1036 | 3.93M | Fecode += 2; |
1037 | 3.93M | } |
1038 | 3.93M | break; |
1039 | | |
1040 | | |
1041 | | /* ===================================================================== */ |
1042 | | /* Match a single character that is not the given character. */ |
1043 | | |
1044 | 3.93M | case OP_NOT: |
1045 | 117 | case OP_NOTI: |
1046 | 117 | if (Feptr >= mb->end_subject) |
1047 | 0 | { |
1048 | 0 | SCHECK_PARTIAL(); |
1049 | 0 | RRETURN(MATCH_NOMATCH); |
1050 | 0 | } |
1051 | | |
1052 | 117 | #ifdef SUPPORT_UNICODE |
1053 | 117 | if (utf) |
1054 | 0 | { |
1055 | 0 | uint32_t ch; |
1056 | 0 | Fecode++; |
1057 | 0 | GETCHARINC(ch, Fecode); |
1058 | 0 | GETCHARINC(fc, Feptr); |
1059 | 0 | if (ch == fc) |
1060 | 0 | { |
1061 | 0 | RRETURN(MATCH_NOMATCH); /* Caseful match */ |
1062 | 0 | } |
1063 | 0 | else if (Fop == OP_NOTI) /* If caseless */ |
1064 | 0 | { |
1065 | 0 | if (ch > 127) |
1066 | 0 | ch = UCD_OTHERCASE(ch); |
1067 | 0 | else |
1068 | 0 | ch = (mb->fcc)[ch]; |
1069 | 0 | if (ch == fc) RRETURN(MATCH_NOMATCH); |
1070 | 0 | } |
1071 | 0 | } |
1072 | | |
1073 | | /* UCP without UTF is as above, but with one character per code unit. */ |
1074 | | |
1075 | 117 | else if (ucp) |
1076 | 0 | { |
1077 | 0 | uint32_t ch; |
1078 | 0 | fc = UCHAR21INC(Feptr); |
1079 | 0 | ch = Fecode[1]; |
1080 | 0 | Fecode += 2; |
1081 | |
|
1082 | 0 | if (ch == fc) |
1083 | 0 | { |
1084 | 0 | RRETURN(MATCH_NOMATCH); /* Caseful match */ |
1085 | 0 | } |
1086 | 0 | else if (Fop == OP_NOTI) /* If caseless */ |
1087 | 0 | { |
1088 | 0 | if (ch > 127) |
1089 | 0 | ch = UCD_OTHERCASE(ch); |
1090 | 0 | else |
1091 | 0 | ch = (mb->fcc)[ch]; |
1092 | 0 | if (ch == fc) RRETURN(MATCH_NOMATCH); |
1093 | 0 | } |
1094 | 0 | } |
1095 | | |
1096 | 117 | else |
1097 | 117 | #endif /* SUPPORT_UNICODE */ |
1098 | | |
1099 | | /* Neither UTF nor UCP is set */ |
1100 | | |
1101 | 117 | { |
1102 | 117 | uint32_t ch = Fecode[1]; |
1103 | 117 | fc = UCHAR21INC(Feptr); |
1104 | 117 | if (ch == fc || (Fop == OP_NOTI && TABLE_GET(ch, mb->fcc, ch) == fc)) |
1105 | 117 | RRETURN(MATCH_NOMATCH); |
1106 | 117 | Fecode += 2; |
1107 | 117 | } |
1108 | 117 | break; |
1109 | | |
1110 | | |
1111 | | /* ===================================================================== */ |
1112 | | /* Match a single character repeatedly. */ |
1113 | | |
1114 | 117 | #define Loclength F->temp_size |
1115 | 126M | #define Lstart_eptr F->temp_sptr[0] |
1116 | 117 | #define Lcharptr F->temp_sptr[1] |
1117 | 471M | #define Lmin F->temp_32[0] |
1118 | 342M | #define Lmax F->temp_32[1] |
1119 | 274M | #define Lc F->temp_32[2] |
1120 | 28.3M | #define Loc F->temp_32[3] |
1121 | | |
1122 | 20.3M | case OP_EXACT: |
1123 | 20.5M | case OP_EXACTI: |
1124 | 20.5M | Lmin = Lmax = GET2(Fecode, 1); |
1125 | 20.5M | Fecode += 1 + IMM2_SIZE; |
1126 | 20.5M | goto REPEATCHAR; |
1127 | | |
1128 | 0 | case OP_POSUPTO: |
1129 | 0 | case OP_POSUPTOI: |
1130 | 0 | reptype = REPTYPE_POS; |
1131 | 0 | Lmin = 0; |
1132 | 0 | Lmax = GET2(Fecode, 1); |
1133 | 0 | Fecode += 1 + IMM2_SIZE; |
1134 | 0 | goto REPEATCHAR; |
1135 | | |
1136 | 0 | case OP_UPTO: |
1137 | 0 | case OP_UPTOI: |
1138 | 0 | reptype = REPTYPE_MAX; |
1139 | 0 | Lmin = 0; |
1140 | 0 | Lmax = GET2(Fecode, 1); |
1141 | 0 | Fecode += 1 + IMM2_SIZE; |
1142 | 0 | goto REPEATCHAR; |
1143 | | |
1144 | 0 | case OP_MINUPTO: |
1145 | 0 | case OP_MINUPTOI: |
1146 | 0 | reptype = REPTYPE_MIN; |
1147 | 0 | Lmin = 0; |
1148 | 0 | Lmax = GET2(Fecode, 1); |
1149 | 0 | Fecode += 1 + IMM2_SIZE; |
1150 | 0 | goto REPEATCHAR; |
1151 | | |
1152 | 7.43M | case OP_POSSTAR: |
1153 | 10.7M | case OP_POSSTARI: |
1154 | 10.7M | reptype = REPTYPE_POS; |
1155 | 10.7M | Lmin = 0; |
1156 | 10.7M | Lmax = UINT32_MAX; |
1157 | 10.7M | Fecode++; |
1158 | 10.7M | goto REPEATCHAR; |
1159 | | |
1160 | 2.19M | case OP_POSPLUS: |
1161 | 2.38M | case OP_POSPLUSI: |
1162 | 2.38M | reptype = REPTYPE_POS; |
1163 | 2.38M | Lmin = 1; |
1164 | 2.38M | Lmax = UINT32_MAX; |
1165 | 2.38M | Fecode++; |
1166 | 2.38M | goto REPEATCHAR; |
1167 | | |
1168 | 70.7M | case OP_POSQUERY: |
1169 | 74.2M | case OP_POSQUERYI: |
1170 | 74.2M | reptype = REPTYPE_POS; |
1171 | 74.2M | Lmin = 0; |
1172 | 74.2M | Lmax = 1; |
1173 | 74.2M | Fecode++; |
1174 | 74.2M | goto REPEATCHAR; |
1175 | | |
1176 | 13.3M | case OP_STAR: |
1177 | 19.2M | case OP_STARI: |
1178 | 19.2M | case OP_MINSTAR: |
1179 | 19.3M | case OP_MINSTARI: |
1180 | 19.3M | case OP_PLUS: |
1181 | 19.8M | case OP_PLUSI: |
1182 | 19.8M | case OP_MINPLUS: |
1183 | 20.2M | case OP_MINPLUSI: |
1184 | 21.3M | case OP_QUERY: |
1185 | 21.5M | case OP_QUERYI: |
1186 | 21.5M | case OP_MINQUERY: |
1187 | 21.5M | case OP_MINQUERYI: |
1188 | 21.5M | fc = *Fecode++ - ((Fop < OP_STARI)? OP_STAR : OP_STARI); |
1189 | 21.5M | Lmin = rep_min[fc]; |
1190 | 21.5M | Lmax = rep_max[fc]; |
1191 | 21.5M | reptype = rep_typ[fc]; |
1192 | | |
1193 | | /* Common code for all repeated single-character matches. We first check |
1194 | | for the minimum number of characters. If the minimum equals the maximum, we |
1195 | | are done. Otherwise, if minimizing, check the rest of the pattern for a |
1196 | | match; if there isn't one, advance up to the maximum, one character at a |
1197 | | time. |
1198 | | |
1199 | | If maximizing, advance up to the maximum number of matching characters, |
1200 | | until Feptr is past the end of the maximum run. If possessive, we are |
1201 | | then done (no backing up). Otherwise, match at this position; anything |
1202 | | other than no match is immediately returned. For nomatch, back up one |
1203 | | character, unless we are matching \R and the last thing matched was |
1204 | | \r\n, in which case, back up two code units until we reach the first |
1205 | | optional character position. |
1206 | | |
1207 | | The various UTF/non-UTF and caseful/caseless cases are handled separately, |
1208 | | for speed. */ |
1209 | | |
1210 | 129M | REPEATCHAR: |
1211 | 129M | #ifdef SUPPORT_UNICODE |
1212 | 129M | if (utf) |
1213 | 0 | { |
1214 | 0 | Flength = 1; |
1215 | 0 | Lcharptr = Fecode; |
1216 | 0 | GETCHARLEN(fc, Fecode, Flength); |
1217 | 0 | Fecode += Flength; |
1218 | | |
1219 | | /* Handle multi-code-unit character matching, caseful and caseless. */ |
1220 | |
|
1221 | 0 | if (Flength > 1) |
1222 | 0 | { |
1223 | 0 | uint32_t othercase; |
1224 | |
|
1225 | 0 | if (Fop >= OP_STARI && /* Caseless */ |
1226 | 0 | (othercase = UCD_OTHERCASE(fc)) != fc) |
1227 | 0 | Loclength = PRIV(ord2utf)(othercase, Foccu); |
1228 | 0 | else Loclength = 0; |
1229 | |
|
1230 | 0 | for (i = 1; i <= Lmin; i++) |
1231 | 0 | { |
1232 | 0 | if (Feptr <= mb->end_subject - Flength && |
1233 | 0 | memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0) Feptr += Flength; |
1234 | 0 | else if (Loclength > 0 && |
1235 | 0 | Feptr <= mb->end_subject - Loclength && |
1236 | 0 | memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0) |
1237 | 0 | Feptr += Loclength; |
1238 | 0 | else |
1239 | 0 | { |
1240 | 0 | CHECK_PARTIAL(); |
1241 | 0 | RRETURN(MATCH_NOMATCH); |
1242 | 0 | } |
1243 | 0 | } |
1244 | | |
1245 | 0 | if (Lmin == Lmax) continue; |
1246 | | |
1247 | 0 | if (reptype == REPTYPE_MIN) |
1248 | 0 | { |
1249 | 0 | for (;;) |
1250 | 0 | { |
1251 | 0 | RMATCH(Fecode, RM202); |
1252 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1253 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
1254 | 0 | if (Feptr <= mb->end_subject - Flength && |
1255 | 0 | memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0) Feptr += Flength; |
1256 | 0 | else if (Loclength > 0 && |
1257 | 0 | Feptr <= mb->end_subject - Loclength && |
1258 | 0 | memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0) |
1259 | 0 | Feptr += Loclength; |
1260 | 0 | else |
1261 | 0 | { |
1262 | 0 | CHECK_PARTIAL(); |
1263 | 0 | RRETURN(MATCH_NOMATCH); |
1264 | 0 | } |
1265 | 0 | } |
1266 | | /* Control never gets here */ |
1267 | 0 | } |
1268 | | |
1269 | 0 | else /* Maximize */ |
1270 | 0 | { |
1271 | 0 | Lstart_eptr = Feptr; |
1272 | 0 | for (i = Lmin; i < Lmax; i++) |
1273 | 0 | { |
1274 | 0 | if (Feptr <= mb->end_subject - Flength && |
1275 | 0 | memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0) |
1276 | 0 | Feptr += Flength; |
1277 | 0 | else if (Loclength > 0 && |
1278 | 0 | Feptr <= mb->end_subject - Loclength && |
1279 | 0 | memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0) |
1280 | 0 | Feptr += Loclength; |
1281 | 0 | else |
1282 | 0 | { |
1283 | 0 | CHECK_PARTIAL(); |
1284 | 0 | break; |
1285 | 0 | } |
1286 | 0 | } |
1287 | | |
1288 | | /* After \C in UTF mode, Lstart_eptr might be in the middle of a |
1289 | | Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't |
1290 | | go too far. */ |
1291 | | |
1292 | 0 | if (reptype != REPTYPE_POS) for(;;) |
1293 | 0 | { |
1294 | 0 | if (Feptr <= Lstart_eptr) break; |
1295 | 0 | RMATCH(Fecode, RM203); |
1296 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1297 | 0 | Feptr--; |
1298 | 0 | BACKCHAR(Feptr); |
1299 | 0 | } |
1300 | 0 | } |
1301 | 0 | break; /* End of repeated wide character handling */ |
1302 | 0 | } |
1303 | | |
1304 | | /* Length of UTF character is 1. Put it into the preserved variable and |
1305 | | fall through to the non-UTF code. */ |
1306 | | |
1307 | 0 | Lc = fc; |
1308 | 0 | } |
1309 | 129M | else |
1310 | 129M | #endif /* SUPPORT_UNICODE */ |
1311 | | |
1312 | | /* When not in UTF mode, load a single-code-unit character. Then proceed as |
1313 | | above, using Unicode casing if either UTF or UCP is set. */ |
1314 | | |
1315 | 129M | Lc = *Fecode++; |
1316 | | |
1317 | | /* Caseless comparison */ |
1318 | | |
1319 | 129M | if (Fop >= OP_STARI) |
1320 | 14.1M | { |
1321 | 14.1M | #if PCRE2_CODE_UNIT_WIDTH == 8 |
1322 | 14.1M | #ifdef SUPPORT_UNICODE |
1323 | 14.1M | if (ucp && !utf && Lc > 127) Loc = UCD_OTHERCASE(Lc); |
1324 | 14.1M | else |
1325 | 14.1M | #endif /* SUPPORT_UNICODE */ |
1326 | | /* Lc will be < 128 in UTF-8 mode. */ |
1327 | 14.1M | Loc = mb->fcc[Lc]; |
1328 | | #else /* 16-bit & 32-bit */ |
1329 | | #ifdef SUPPORT_UNICODE |
1330 | | if ((utf || ucp) && Lc > 127) Loc = UCD_OTHERCASE(Lc); |
1331 | | else |
1332 | | #endif /* SUPPORT_UNICODE */ |
1333 | | Loc = TABLE_GET(Lc, mb->fcc, Lc); |
1334 | | #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */ |
1335 | | |
1336 | 14.2M | for (i = 1; i <= Lmin; i++) |
1337 | 1.30M | { |
1338 | 1.30M | uint32_t cc; /* Faster than PCRE2_UCHAR */ |
1339 | 1.30M | if (Feptr >= mb->end_subject) |
1340 | 2.87k | { |
1341 | 2.87k | SCHECK_PARTIAL(); |
1342 | 2.87k | RRETURN(MATCH_NOMATCH); |
1343 | 0 | } |
1344 | 1.29M | cc = UCHAR21TEST(Feptr); |
1345 | 1.29M | if (Lc != cc && Loc != cc) RRETURN(MATCH_NOMATCH); |
1346 | 27.0k | Feptr++; |
1347 | 27.0k | } |
1348 | 12.9M | if (Lmin == Lmax) continue; |
1349 | | |
1350 | 12.9M | if (reptype == REPTYPE_MIN) |
1351 | 48.1k | { |
1352 | 48.1k | for (;;) |
1353 | 56.4k | { |
1354 | 56.4k | uint32_t cc; /* Faster than PCRE2_UCHAR */ |
1355 | 56.4k | RMATCH(Fecode, RM25); |
1356 | 56.4k | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1357 | 56.4k | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
1358 | 56.4k | if (Feptr >= mb->end_subject) |
1359 | 74 | { |
1360 | 74 | SCHECK_PARTIAL(); |
1361 | 74 | RRETURN(MATCH_NOMATCH); |
1362 | 0 | } |
1363 | 56.3k | cc = UCHAR21TEST(Feptr); |
1364 | 56.3k | if (Lc != cc && Loc != cc) RRETURN(MATCH_NOMATCH); |
1365 | 8.28k | Feptr++; |
1366 | 8.28k | } |
1367 | | /* Control never gets here */ |
1368 | 48.1k | } |
1369 | | |
1370 | 12.8M | else /* Maximize */ |
1371 | 12.8M | { |
1372 | 12.8M | Lstart_eptr = Feptr; |
1373 | 12.9M | for (i = Lmin; i < Lmax; i++) |
1374 | 12.9M | { |
1375 | 12.9M | uint32_t cc; /* Faster than PCRE2_UCHAR */ |
1376 | 12.9M | if (Feptr >= mb->end_subject) |
1377 | 43.5k | { |
1378 | 43.5k | SCHECK_PARTIAL(); |
1379 | 43.5k | break; |
1380 | 43.5k | } |
1381 | 12.9M | cc = UCHAR21TEST(Feptr); |
1382 | 12.9M | if (Lc != cc && Loc != cc) break; |
1383 | 107k | Feptr++; |
1384 | 107k | } |
1385 | 12.8M | if (reptype != REPTYPE_POS) for (;;) |
1386 | 6.07M | { |
1387 | 6.07M | if (Feptr == Lstart_eptr) break; |
1388 | 43.0k | RMATCH(Fecode, RM26); |
1389 | 42.9k | Feptr--; |
1390 | 42.9k | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1391 | 42.9k | } |
1392 | 12.8M | } |
1393 | 12.9M | } |
1394 | | |
1395 | | /* Caseful comparisons (includes all multi-byte characters) */ |
1396 | | |
1397 | 115M | else |
1398 | 115M | { |
1399 | 115M | for (i = 1; i <= Lmin; i++) |
1400 | 23.0M | { |
1401 | 23.0M | if (Feptr >= mb->end_subject) |
1402 | 3.87k | { |
1403 | 3.87k | SCHECK_PARTIAL(); |
1404 | 3.87k | RRETURN(MATCH_NOMATCH); |
1405 | 0 | } |
1406 | 23.0M | if (Lc != UCHAR21INCTEST(Feptr)) RRETURN(MATCH_NOMATCH); |
1407 | 573k | } |
1408 | | |
1409 | 92.8M | if (Lmin == Lmax) continue; |
1410 | | |
1411 | 92.8M | if (reptype == REPTYPE_MIN) |
1412 | 39.4k | { |
1413 | 39.4k | for (;;) |
1414 | 46.1k | { |
1415 | 46.1k | RMATCH(Fecode, RM27); |
1416 | 46.0k | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1417 | 46.0k | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
1418 | 45.7k | if (Feptr >= mb->end_subject) |
1419 | 7 | { |
1420 | 7 | SCHECK_PARTIAL(); |
1421 | 7 | RRETURN(MATCH_NOMATCH); |
1422 | 0 | } |
1423 | 45.7k | if (Lc != UCHAR21INCTEST(Feptr)) RRETURN(MATCH_NOMATCH); |
1424 | 6.65k | } |
1425 | | /* Control never gets here */ |
1426 | 39.4k | } |
1427 | 92.7M | else /* Maximize */ |
1428 | 92.7M | { |
1429 | 92.7M | Lstart_eptr = Feptr; |
1430 | 94.2M | for (i = Lmin; i < Lmax; i++) |
1431 | 93.7M | { |
1432 | 93.7M | if (Feptr >= mb->end_subject) |
1433 | 541k | { |
1434 | 541k | SCHECK_PARTIAL(); |
1435 | 541k | break; |
1436 | 541k | } |
1437 | | |
1438 | 93.2M | if (Lc != UCHAR21TEST(Feptr)) break; |
1439 | 1.50M | Feptr++; |
1440 | 1.50M | } |
1441 | | |
1442 | 92.7M | if (reptype != REPTYPE_POS) for (;;) |
1443 | 14.6M | { |
1444 | 14.6M | if (Feptr <= Lstart_eptr) break; |
1445 | 187k | RMATCH(Fecode, RM28); |
1446 | 143k | Feptr--; |
1447 | 143k | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1448 | 143k | } |
1449 | 92.7M | } |
1450 | 92.8M | } |
1451 | 105M | break; |
1452 | | |
1453 | 105M | #undef Loclength |
1454 | 105M | #undef Lstart_eptr |
1455 | 105M | #undef Lcharptr |
1456 | 105M | #undef Lmin |
1457 | 105M | #undef Lmax |
1458 | 105M | #undef Lc |
1459 | 105M | #undef Loc |
1460 | | |
1461 | | |
1462 | | /* ===================================================================== */ |
1463 | | /* Match a negated single character repeatedly. This is almost a |
1464 | | repeat of the code for a repeated single character, but I haven't found a |
1465 | | nice way of commoning these up that doesn't require a test of the |
1466 | | positive/negative option for each character match. Maybe that wouldn't add |
1467 | | very much to the time taken, but character matching *is* what this is all |
1468 | | about... */ |
1469 | | |
1470 | 105M | #define Lstart_eptr F->temp_sptr[0] |
1471 | 105M | #define Lmin F->temp_32[0] |
1472 | 105M | #define Lmax F->temp_32[1] |
1473 | 105M | #define Lc F->temp_32[2] |
1474 | 105M | #define Loc F->temp_32[3] |
1475 | | |
1476 | 105M | case OP_NOTEXACT: |
1477 | 0 | case OP_NOTEXACTI: |
1478 | 0 | Lmin = Lmax = GET2(Fecode, 1); |
1479 | 0 | Fecode += 1 + IMM2_SIZE; |
1480 | 0 | goto REPEATNOTCHAR; |
1481 | | |
1482 | 0 | case OP_NOTUPTO: |
1483 | 0 | case OP_NOTUPTOI: |
1484 | 0 | Lmin = 0; |
1485 | 0 | Lmax = GET2(Fecode, 1); |
1486 | 0 | reptype = REPTYPE_MAX; |
1487 | 0 | Fecode += 1 + IMM2_SIZE; |
1488 | 0 | goto REPEATNOTCHAR; |
1489 | | |
1490 | 0 | case OP_NOTMINUPTO: |
1491 | 0 | case OP_NOTMINUPTOI: |
1492 | 0 | Lmin = 0; |
1493 | 0 | Lmax = GET2(Fecode, 1); |
1494 | 0 | reptype = REPTYPE_MIN; |
1495 | 0 | Fecode += 1 + IMM2_SIZE; |
1496 | 0 | goto REPEATNOTCHAR; |
1497 | | |
1498 | 768k | case OP_NOTPOSSTAR: |
1499 | 768k | case OP_NOTPOSSTARI: |
1500 | 768k | reptype = REPTYPE_POS; |
1501 | 768k | Lmin = 0; |
1502 | 768k | Lmax = UINT32_MAX; |
1503 | 768k | Fecode++; |
1504 | 768k | goto REPEATNOTCHAR; |
1505 | | |
1506 | 19.7k | case OP_NOTPOSPLUS: |
1507 | 19.7k | case OP_NOTPOSPLUSI: |
1508 | 19.7k | reptype = REPTYPE_POS; |
1509 | 19.7k | Lmin = 1; |
1510 | 19.7k | Lmax = UINT32_MAX; |
1511 | 19.7k | Fecode++; |
1512 | 19.7k | goto REPEATNOTCHAR; |
1513 | | |
1514 | 0 | case OP_NOTPOSQUERY: |
1515 | 0 | case OP_NOTPOSQUERYI: |
1516 | 0 | reptype = REPTYPE_POS; |
1517 | 0 | Lmin = 0; |
1518 | 0 | Lmax = 1; |
1519 | 0 | Fecode++; |
1520 | 0 | goto REPEATNOTCHAR; |
1521 | | |
1522 | 0 | case OP_NOTPOSUPTO: |
1523 | 0 | case OP_NOTPOSUPTOI: |
1524 | 0 | reptype = REPTYPE_POS; |
1525 | 0 | Lmin = 0; |
1526 | 0 | Lmax = GET2(Fecode, 1); |
1527 | 0 | Fecode += 1 + IMM2_SIZE; |
1528 | 0 | goto REPEATNOTCHAR; |
1529 | | |
1530 | 2.71k | case OP_NOTSTAR: |
1531 | 2.71k | case OP_NOTSTARI: |
1532 | 2.71k | case OP_NOTMINSTAR: |
1533 | 2.71k | case OP_NOTMINSTARI: |
1534 | 2.71k | case OP_NOTPLUS: |
1535 | 2.71k | case OP_NOTPLUSI: |
1536 | 2.71k | case OP_NOTMINPLUS: |
1537 | 2.71k | case OP_NOTMINPLUSI: |
1538 | 2.71k | case OP_NOTQUERY: |
1539 | 2.71k | case OP_NOTQUERYI: |
1540 | 2.71k | case OP_NOTMINQUERY: |
1541 | 2.71k | case OP_NOTMINQUERYI: |
1542 | 2.71k | fc = *Fecode++ - ((Fop >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR); |
1543 | 2.71k | Lmin = rep_min[fc]; |
1544 | 2.71k | Lmax = rep_max[fc]; |
1545 | 2.71k | reptype = rep_typ[fc]; |
1546 | | |
1547 | | /* Common code for all repeated single-character non-matches. */ |
1548 | | |
1549 | 791k | REPEATNOTCHAR: |
1550 | 791k | GETCHARINCTEST(Lc, Fecode); |
1551 | | |
1552 | | /* The code is duplicated for the caseless and caseful cases, for speed, |
1553 | | since matching characters is likely to be quite common. First, ensure the |
1554 | | minimum number of matches are present. If Lmin = Lmax, we are done. |
1555 | | Otherwise, if minimizing, keep trying the rest of the expression and |
1556 | | advancing one matching character if failing, up to the maximum. |
1557 | | Alternatively, if maximizing, find the maximum number of characters and |
1558 | | work backwards. */ |
1559 | | |
1560 | 791k | if (Fop >= OP_NOTSTARI) /* Caseless */ |
1561 | 0 | { |
1562 | 0 | #ifdef SUPPORT_UNICODE |
1563 | 0 | if ((utf || ucp) && Lc > 127) |
1564 | 0 | Loc = UCD_OTHERCASE(Lc); |
1565 | 0 | else |
1566 | 0 | #endif /* SUPPORT_UNICODE */ |
1567 | | |
1568 | 0 | Loc = TABLE_GET(Lc, mb->fcc, Lc); /* Other case from table */ |
1569 | |
|
1570 | 0 | #ifdef SUPPORT_UNICODE |
1571 | 0 | if (utf) |
1572 | 0 | { |
1573 | 0 | uint32_t d; |
1574 | 0 | for (i = 1; i <= Lmin; i++) |
1575 | 0 | { |
1576 | 0 | if (Feptr >= mb->end_subject) |
1577 | 0 | { |
1578 | 0 | SCHECK_PARTIAL(); |
1579 | 0 | RRETURN(MATCH_NOMATCH); |
1580 | 0 | } |
1581 | 0 | GETCHARINC(d, Feptr); |
1582 | 0 | if (Lc == d || Loc == d) RRETURN(MATCH_NOMATCH); |
1583 | 0 | } |
1584 | 0 | } |
1585 | 0 | else |
1586 | 0 | #endif /* SUPPORT_UNICODE */ |
1587 | | |
1588 | | /* Not UTF mode */ |
1589 | 0 | { |
1590 | 0 | for (i = 1; i <= Lmin; i++) |
1591 | 0 | { |
1592 | 0 | if (Feptr >= mb->end_subject) |
1593 | 0 | { |
1594 | 0 | SCHECK_PARTIAL(); |
1595 | 0 | RRETURN(MATCH_NOMATCH); |
1596 | 0 | } |
1597 | 0 | if (Lc == *Feptr || Loc == *Feptr) RRETURN(MATCH_NOMATCH); |
1598 | 0 | Feptr++; |
1599 | 0 | } |
1600 | 0 | } |
1601 | | |
1602 | 0 | if (Lmin == Lmax) continue; /* Finished for exact count */ |
1603 | | |
1604 | 0 | if (reptype == REPTYPE_MIN) |
1605 | 0 | { |
1606 | 0 | #ifdef SUPPORT_UNICODE |
1607 | 0 | if (utf) |
1608 | 0 | { |
1609 | 0 | uint32_t d; |
1610 | 0 | for (;;) |
1611 | 0 | { |
1612 | 0 | RMATCH(Fecode, RM204); |
1613 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1614 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
1615 | 0 | if (Feptr >= mb->end_subject) |
1616 | 0 | { |
1617 | 0 | SCHECK_PARTIAL(); |
1618 | 0 | RRETURN(MATCH_NOMATCH); |
1619 | 0 | } |
1620 | 0 | GETCHARINC(d, Feptr); |
1621 | 0 | if (Lc == d || Loc == d) RRETURN(MATCH_NOMATCH); |
1622 | 0 | } |
1623 | 0 | } |
1624 | 0 | else |
1625 | 0 | #endif /*SUPPORT_UNICODE */ |
1626 | | |
1627 | | /* Not UTF mode */ |
1628 | 0 | { |
1629 | 0 | for (;;) |
1630 | 0 | { |
1631 | 0 | RMATCH(Fecode, RM29); |
1632 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1633 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
1634 | 0 | if (Feptr >= mb->end_subject) |
1635 | 0 | { |
1636 | 0 | SCHECK_PARTIAL(); |
1637 | 0 | RRETURN(MATCH_NOMATCH); |
1638 | 0 | } |
1639 | 0 | if (Lc == *Feptr || Loc == *Feptr) RRETURN(MATCH_NOMATCH); |
1640 | 0 | Feptr++; |
1641 | 0 | } |
1642 | 0 | } |
1643 | | /* Control never gets here */ |
1644 | 0 | } |
1645 | | |
1646 | | /* Maximize case */ |
1647 | | |
1648 | 0 | else |
1649 | 0 | { |
1650 | 0 | Lstart_eptr = Feptr; |
1651 | |
|
1652 | 0 | #ifdef SUPPORT_UNICODE |
1653 | 0 | if (utf) |
1654 | 0 | { |
1655 | 0 | uint32_t d; |
1656 | 0 | for (i = Lmin; i < Lmax; i++) |
1657 | 0 | { |
1658 | 0 | int len = 1; |
1659 | 0 | if (Feptr >= mb->end_subject) |
1660 | 0 | { |
1661 | 0 | SCHECK_PARTIAL(); |
1662 | 0 | break; |
1663 | 0 | } |
1664 | 0 | GETCHARLEN(d, Feptr, len); |
1665 | 0 | if (Lc == d || Loc == d) break; |
1666 | 0 | Feptr += len; |
1667 | 0 | } |
1668 | | |
1669 | | /* After \C in UTF mode, Lstart_eptr might be in the middle of a |
1670 | | Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't |
1671 | | go too far. */ |
1672 | | |
1673 | 0 | if (reptype != REPTYPE_POS) for(;;) |
1674 | 0 | { |
1675 | 0 | if (Feptr <= Lstart_eptr) break; |
1676 | 0 | RMATCH(Fecode, RM205); |
1677 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1678 | 0 | Feptr--; |
1679 | 0 | BACKCHAR(Feptr); |
1680 | 0 | } |
1681 | 0 | } |
1682 | 0 | else |
1683 | 0 | #endif /* SUPPORT_UNICODE */ |
1684 | | |
1685 | | /* Not UTF mode */ |
1686 | 0 | { |
1687 | 0 | for (i = Lmin; i < Lmax; i++) |
1688 | 0 | { |
1689 | 0 | if (Feptr >= mb->end_subject) |
1690 | 0 | { |
1691 | 0 | SCHECK_PARTIAL(); |
1692 | 0 | break; |
1693 | 0 | } |
1694 | 0 | if (Lc == *Feptr || Loc == *Feptr) break; |
1695 | 0 | Feptr++; |
1696 | 0 | } |
1697 | 0 | if (reptype != REPTYPE_POS) for (;;) |
1698 | 0 | { |
1699 | 0 | if (Feptr == Lstart_eptr) break; |
1700 | 0 | RMATCH(Fecode, RM30); |
1701 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1702 | 0 | Feptr--; |
1703 | 0 | } |
1704 | 0 | } |
1705 | 0 | } |
1706 | 0 | } |
1707 | | |
1708 | | /* Caseful comparisons */ |
1709 | | |
1710 | 791k | else |
1711 | 791k | { |
1712 | 791k | #ifdef SUPPORT_UNICODE |
1713 | 791k | if (utf) |
1714 | 0 | { |
1715 | 0 | uint32_t d; |
1716 | 0 | for (i = 1; i <= Lmin; i++) |
1717 | 0 | { |
1718 | 0 | if (Feptr >= mb->end_subject) |
1719 | 0 | { |
1720 | 0 | SCHECK_PARTIAL(); |
1721 | 0 | RRETURN(MATCH_NOMATCH); |
1722 | 0 | } |
1723 | 0 | GETCHARINC(d, Feptr); |
1724 | 0 | if (Lc == d) RRETURN(MATCH_NOMATCH); |
1725 | 0 | } |
1726 | 0 | } |
1727 | 791k | else |
1728 | 791k | #endif |
1729 | | /* Not UTF mode */ |
1730 | 791k | { |
1731 | 808k | for (i = 1; i <= Lmin; i++) |
1732 | 19.7k | { |
1733 | 19.7k | if (Feptr >= mb->end_subject) |
1734 | 508 | { |
1735 | 508 | SCHECK_PARTIAL(); |
1736 | 508 | RRETURN(MATCH_NOMATCH); |
1737 | 0 | } |
1738 | 19.2k | if (Lc == *Feptr++) RRETURN(MATCH_NOMATCH); |
1739 | 17.4k | } |
1740 | 791k | } |
1741 | | |
1742 | 789k | if (Lmin == Lmax) continue; |
1743 | | |
1744 | 789k | if (reptype == REPTYPE_MIN) |
1745 | 0 | { |
1746 | 0 | #ifdef SUPPORT_UNICODE |
1747 | 0 | if (utf) |
1748 | 0 | { |
1749 | 0 | uint32_t d; |
1750 | 0 | for (;;) |
1751 | 0 | { |
1752 | 0 | RMATCH(Fecode, RM206); |
1753 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1754 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
1755 | 0 | if (Feptr >= mb->end_subject) |
1756 | 0 | { |
1757 | 0 | SCHECK_PARTIAL(); |
1758 | 0 | RRETURN(MATCH_NOMATCH); |
1759 | 0 | } |
1760 | 0 | GETCHARINC(d, Feptr); |
1761 | 0 | if (Lc == d) RRETURN(MATCH_NOMATCH); |
1762 | 0 | } |
1763 | 0 | } |
1764 | 0 | else |
1765 | 0 | #endif |
1766 | | /* Not UTF mode */ |
1767 | 0 | { |
1768 | 0 | for (;;) |
1769 | 0 | { |
1770 | 0 | RMATCH(Fecode, RM31); |
1771 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1772 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
1773 | 0 | if (Feptr >= mb->end_subject) |
1774 | 0 | { |
1775 | 0 | SCHECK_PARTIAL(); |
1776 | 0 | RRETURN(MATCH_NOMATCH); |
1777 | 0 | } |
1778 | 0 | if (Lc == *Feptr++) RRETURN(MATCH_NOMATCH); |
1779 | 0 | } |
1780 | 0 | } |
1781 | | /* Control never gets here */ |
1782 | 0 | } |
1783 | | |
1784 | | /* Maximize case */ |
1785 | | |
1786 | 789k | else |
1787 | 789k | { |
1788 | 789k | Lstart_eptr = Feptr; |
1789 | | |
1790 | 789k | #ifdef SUPPORT_UNICODE |
1791 | 789k | if (utf) |
1792 | 0 | { |
1793 | 0 | uint32_t d; |
1794 | 0 | for (i = Lmin; i < Lmax; i++) |
1795 | 0 | { |
1796 | 0 | int len = 1; |
1797 | 0 | if (Feptr >= mb->end_subject) |
1798 | 0 | { |
1799 | 0 | SCHECK_PARTIAL(); |
1800 | 0 | break; |
1801 | 0 | } |
1802 | 0 | GETCHARLEN(d, Feptr, len); |
1803 | 0 | if (Lc == d) break; |
1804 | 0 | Feptr += len; |
1805 | 0 | } |
1806 | | |
1807 | | /* After \C in UTF mode, Lstart_eptr might be in the middle of a |
1808 | | Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't |
1809 | | go too far. */ |
1810 | | |
1811 | 0 | if (reptype != REPTYPE_POS) for(;;) |
1812 | 0 | { |
1813 | 0 | if (Feptr <= Lstart_eptr) break; |
1814 | 0 | RMATCH(Fecode, RM207); |
1815 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1816 | 0 | Feptr--; |
1817 | 0 | BACKCHAR(Feptr); |
1818 | 0 | } |
1819 | 0 | } |
1820 | 789k | else |
1821 | 789k | #endif |
1822 | | /* Not UTF mode */ |
1823 | 789k | { |
1824 | 3.61M | for (i = Lmin; i < Lmax; i++) |
1825 | 3.61M | { |
1826 | 3.61M | if (Feptr >= mb->end_subject) |
1827 | 408k | { |
1828 | 408k | SCHECK_PARTIAL(); |
1829 | 408k | break; |
1830 | 408k | } |
1831 | 3.20M | if (Lc == *Feptr) break; |
1832 | 2.82M | Feptr++; |
1833 | 2.82M | } |
1834 | 789k | if (reptype != REPTYPE_POS) for (;;) |
1835 | 55.8k | { |
1836 | 55.8k | if (Feptr == Lstart_eptr) break; |
1837 | 53.4k | RMATCH(Fecode, RM32); |
1838 | 53.1k | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1839 | 53.1k | Feptr--; |
1840 | 53.1k | } |
1841 | 789k | } |
1842 | 789k | } |
1843 | 789k | } |
1844 | 788k | break; |
1845 | | |
1846 | 788k | #undef Lstart_eptr |
1847 | 788k | #undef Lmin |
1848 | 788k | #undef Lmax |
1849 | 788k | #undef Lc |
1850 | 788k | #undef Loc |
1851 | | |
1852 | | |
1853 | | /* ===================================================================== */ |
1854 | | /* Match a bit-mapped character class, possibly repeatedly. These opcodes |
1855 | | are used when all the characters in the class have values in the range |
1856 | | 0-255, and either the matching is caseful, or the characters are in the |
1857 | | range 0-127 when UTF processing is enabled. The only difference between |
1858 | | OP_CLASS and OP_NCLASS occurs when a data character outside the range is |
1859 | | encountered. */ |
1860 | | |
1861 | 51.9M | #define Lmin F->temp_32[0] |
1862 | 57.5M | #define Lmax F->temp_32[1] |
1863 | 7.82M | #define Lstart_eptr F->temp_sptr[0] |
1864 | 64.7M | #define Lbyte_map_address F->temp_sptr[1] |
1865 | 47.9M | #define Lbyte_map ((unsigned char *)Lbyte_map_address) |
1866 | | |
1867 | 788k | case OP_NCLASS: |
1868 | 16.8M | case OP_CLASS: |
1869 | 16.8M | { |
1870 | 16.8M | Lbyte_map_address = Fecode + 1; /* Save for matching */ |
1871 | 16.8M | Fecode += 1 + (32 / sizeof(PCRE2_UCHAR)); /* Advance past the item */ |
1872 | | |
1873 | | /* Look past the end of the item to see if there is repeat information |
1874 | | following. Then obey similar code to character type repeats. */ |
1875 | | |
1876 | 16.8M | switch (*Fecode) |
1877 | 16.8M | { |
1878 | 275k | case OP_CRSTAR: |
1879 | 275k | case OP_CRMINSTAR: |
1880 | 1.49M | case OP_CRPLUS: |
1881 | 1.50M | case OP_CRMINPLUS: |
1882 | 1.52M | case OP_CRQUERY: |
1883 | 1.52M | case OP_CRMINQUERY: |
1884 | 1.63M | case OP_CRPOSSTAR: |
1885 | 3.60M | case OP_CRPOSPLUS: |
1886 | 3.60M | case OP_CRPOSQUERY: |
1887 | 3.60M | fc = *Fecode++ - OP_CRSTAR; |
1888 | 3.60M | Lmin = rep_min[fc]; |
1889 | 3.60M | Lmax = rep_max[fc]; |
1890 | 3.60M | reptype = rep_typ[fc]; |
1891 | 3.60M | break; |
1892 | | |
1893 | 424k | case OP_CRRANGE: |
1894 | 424k | case OP_CRMINRANGE: |
1895 | 437k | case OP_CRPOSRANGE: |
1896 | 437k | Lmin = GET2(Fecode, 1); |
1897 | 437k | Lmax = GET2(Fecode, 1 + IMM2_SIZE); |
1898 | 437k | if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */ |
1899 | 437k | reptype = rep_typ[*Fecode - OP_CRSTAR]; |
1900 | 437k | Fecode += 1 + 2 * IMM2_SIZE; |
1901 | 437k | break; |
1902 | | |
1903 | 12.7M | default: /* No repeat follows */ |
1904 | 12.7M | Lmin = Lmax = 1; |
1905 | 12.7M | break; |
1906 | 16.8M | } |
1907 | | |
1908 | | /* First, ensure the minimum number of matches are present. */ |
1909 | | |
1910 | 16.8M | #ifdef SUPPORT_UNICODE |
1911 | 16.8M | if (utf) |
1912 | 0 | { |
1913 | 0 | for (i = 1; i <= Lmin; i++) |
1914 | 0 | { |
1915 | 0 | if (Feptr >= mb->end_subject) |
1916 | 0 | { |
1917 | 0 | SCHECK_PARTIAL(); |
1918 | 0 | RRETURN(MATCH_NOMATCH); |
1919 | 0 | } |
1920 | 0 | GETCHARINC(fc, Feptr); |
1921 | 0 | if (fc > 255) |
1922 | 0 | { |
1923 | 0 | if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH); |
1924 | 0 | } |
1925 | 0 | else |
1926 | 0 | if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH); |
1927 | 0 | } |
1928 | 0 | } |
1929 | 16.8M | else |
1930 | 16.8M | #endif |
1931 | | /* Not UTF mode */ |
1932 | 16.8M | { |
1933 | 23.9M | for (i = 1; i <= Lmin; i++) |
1934 | 16.3M | { |
1935 | 16.3M | if (Feptr >= mb->end_subject) |
1936 | 24.8k | { |
1937 | 24.8k | SCHECK_PARTIAL(); |
1938 | 24.8k | RRETURN(MATCH_NOMATCH); |
1939 | 0 | } |
1940 | 16.3M | fc = *Feptr++; |
1941 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
1942 | | if (fc > 255) |
1943 | | { |
1944 | | if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH); |
1945 | | } |
1946 | | else |
1947 | | #endif |
1948 | 16.3M | if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH); |
1949 | 7.10M | } |
1950 | 16.8M | } |
1951 | | |
1952 | | /* If Lmax == Lmin we are done. Continue with main loop. */ |
1953 | | |
1954 | 7.50M | if (Lmin == Lmax) continue; |
1955 | | |
1956 | | /* If minimizing, keep testing the rest of the expression and advancing |
1957 | | the pointer while it matches the class. */ |
1958 | | |
1959 | 3.66M | if (reptype == REPTYPE_MIN) |
1960 | 12.1k | { |
1961 | 12.1k | #ifdef SUPPORT_UNICODE |
1962 | 12.1k | if (utf) |
1963 | 0 | { |
1964 | 0 | for (;;) |
1965 | 0 | { |
1966 | 0 | RMATCH(Fecode, RM200); |
1967 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1968 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
1969 | 0 | if (Feptr >= mb->end_subject) |
1970 | 0 | { |
1971 | 0 | SCHECK_PARTIAL(); |
1972 | 0 | RRETURN(MATCH_NOMATCH); |
1973 | 0 | } |
1974 | 0 | GETCHARINC(fc, Feptr); |
1975 | 0 | if (fc > 255) |
1976 | 0 | { |
1977 | 0 | if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH); |
1978 | 0 | } |
1979 | 0 | else |
1980 | 0 | if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH); |
1981 | 0 | } |
1982 | 0 | } |
1983 | 12.1k | else |
1984 | 12.1k | #endif |
1985 | | /* Not UTF mode */ |
1986 | 12.1k | { |
1987 | 12.1k | for (;;) |
1988 | 58.1k | { |
1989 | 58.1k | RMATCH(Fecode, RM23); |
1990 | 49.1k | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1991 | 49.1k | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
1992 | 49.1k | if (Feptr >= mb->end_subject) |
1993 | 0 | { |
1994 | 0 | SCHECK_PARTIAL(); |
1995 | 0 | RRETURN(MATCH_NOMATCH); |
1996 | 0 | } |
1997 | 49.1k | fc = *Feptr++; |
1998 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
1999 | | if (fc > 255) |
2000 | | { |
2001 | | if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH); |
2002 | | } |
2003 | | else |
2004 | | #endif |
2005 | 49.1k | if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH); |
2006 | 46.0k | } |
2007 | 12.1k | } |
2008 | | /* Control never gets here */ |
2009 | 12.1k | } |
2010 | | |
2011 | | /* If maximizing, find the longest possible run, then work backwards. */ |
2012 | | |
2013 | 3.65M | else |
2014 | 3.65M | { |
2015 | 3.65M | Lstart_eptr = Feptr; |
2016 | | |
2017 | 3.65M | #ifdef SUPPORT_UNICODE |
2018 | 3.65M | if (utf) |
2019 | 0 | { |
2020 | 0 | for (i = Lmin; i < Lmax; i++) |
2021 | 0 | { |
2022 | 0 | int len = 1; |
2023 | 0 | if (Feptr >= mb->end_subject) |
2024 | 0 | { |
2025 | 0 | SCHECK_PARTIAL(); |
2026 | 0 | break; |
2027 | 0 | } |
2028 | 0 | GETCHARLEN(fc, Feptr, len); |
2029 | 0 | if (fc > 255) |
2030 | 0 | { |
2031 | 0 | if (Fop == OP_CLASS) break; |
2032 | 0 | } |
2033 | 0 | else |
2034 | 0 | if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) break; |
2035 | 0 | Feptr += len; |
2036 | 0 | } |
2037 | | |
2038 | 0 | if (reptype == REPTYPE_POS) continue; /* No backtracking */ |
2039 | | |
2040 | | /* After \C in UTF mode, Lstart_eptr might be in the middle of a |
2041 | | Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't |
2042 | | go too far. */ |
2043 | | |
2044 | 0 | for (;;) |
2045 | 0 | { |
2046 | 0 | RMATCH(Fecode, RM201); |
2047 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
2048 | 0 | if (Feptr-- <= Lstart_eptr) break; /* Tried at original position */ |
2049 | 0 | BACKCHAR(Feptr); |
2050 | 0 | } |
2051 | 0 | } |
2052 | 3.65M | else |
2053 | 3.65M | #endif |
2054 | | /* Not UTF mode */ |
2055 | 3.65M | { |
2056 | 32.8M | for (i = Lmin; i < Lmax; i++) |
2057 | 32.7M | { |
2058 | 32.7M | if (Feptr >= mb->end_subject) |
2059 | 1.21M | { |
2060 | 1.21M | SCHECK_PARTIAL(); |
2061 | 1.21M | break; |
2062 | 1.21M | } |
2063 | 31.5M | fc = *Feptr; |
2064 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
2065 | | if (fc > 255) |
2066 | | { |
2067 | | if (Fop == OP_CLASS) break; |
2068 | | } |
2069 | | else |
2070 | | #endif |
2071 | 31.5M | if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) break; |
2072 | 29.1M | Feptr++; |
2073 | 29.1M | } |
2074 | | |
2075 | 3.65M | if (reptype == REPTYPE_POS) continue; /* No backtracking */ |
2076 | | |
2077 | 4.16M | while (Feptr >= Lstart_eptr) |
2078 | 3.97M | { |
2079 | 3.97M | RMATCH(Fecode, RM24); |
2080 | 2.57M | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
2081 | 2.57M | Feptr--; |
2082 | 2.57M | } |
2083 | 1.58M | } |
2084 | | |
2085 | 3.65M | RRETURN(MATCH_NOMATCH); |
2086 | 0 | } |
2087 | 3.66M | } |
2088 | | /* Control never gets here */ |
2089 | | |
2090 | 0 | #undef Lbyte_map_address |
2091 | 0 | #undef Lbyte_map |
2092 | 0 | #undef Lstart_eptr |
2093 | 0 | #undef Lmin |
2094 | 0 | #undef Lmax |
2095 | | |
2096 | | |
2097 | | /* ===================================================================== */ |
2098 | | /* Match an extended character class. In the 8-bit library, this opcode is |
2099 | | encountered only when UTF-8 mode is supported. In the 16-bit and |
2100 | | 32-bit libraries, codepoints greater than 255 may be encountered even when |
2101 | | UTF is not supported. */ |
2102 | | |
2103 | 713 | #define Lstart_eptr F->temp_sptr[0] |
2104 | 2.25k | #define Lxclass_data F->temp_sptr[1] |
2105 | 3.40k | #define Lmin F->temp_32[0] |
2106 | 2.32k | #define Lmax F->temp_32[1] |
2107 | | |
2108 | 0 | #ifdef SUPPORT_WIDE_CHARS |
2109 | 865 | case OP_XCLASS: |
2110 | 865 | { |
2111 | 865 | Lxclass_data = Fecode + 1 + LINK_SIZE; /* Save for matching */ |
2112 | 865 | Fecode += GET(Fecode, 1); /* Advance past the item */ |
2113 | | |
2114 | 865 | switch (*Fecode) |
2115 | 865 | { |
2116 | 98 | case OP_CRSTAR: |
2117 | 98 | case OP_CRMINSTAR: |
2118 | 98 | case OP_CRPLUS: |
2119 | 98 | case OP_CRMINPLUS: |
2120 | 98 | case OP_CRQUERY: |
2121 | 98 | case OP_CRMINQUERY: |
2122 | 98 | case OP_CRPOSSTAR: |
2123 | 98 | case OP_CRPOSPLUS: |
2124 | 98 | case OP_CRPOSQUERY: |
2125 | 98 | fc = *Fecode++ - OP_CRSTAR; |
2126 | 98 | Lmin = rep_min[fc]; |
2127 | 98 | Lmax = rep_max[fc]; |
2128 | 98 | reptype = rep_typ[fc]; |
2129 | 98 | break; |
2130 | | |
2131 | 0 | case OP_CRRANGE: |
2132 | 0 | case OP_CRMINRANGE: |
2133 | 0 | case OP_CRPOSRANGE: |
2134 | 0 | Lmin = GET2(Fecode, 1); |
2135 | 0 | Lmax = GET2(Fecode, 1 + IMM2_SIZE); |
2136 | 0 | if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */ |
2137 | 0 | reptype = rep_typ[*Fecode - OP_CRSTAR]; |
2138 | 0 | Fecode += 1 + 2 * IMM2_SIZE; |
2139 | 0 | break; |
2140 | | |
2141 | 767 | default: /* No repeat follows */ |
2142 | 767 | Lmin = Lmax = 1; |
2143 | 767 | break; |
2144 | 865 | } |
2145 | | |
2146 | | /* First, ensure the minimum number of matches are present. */ |
2147 | | |
2148 | 1.60k | for (i = 1; i <= Lmin; i++) |
2149 | 767 | { |
2150 | 767 | if (Feptr >= mb->end_subject) |
2151 | 0 | { |
2152 | 0 | SCHECK_PARTIAL(); |
2153 | 0 | RRETURN(MATCH_NOMATCH); |
2154 | 0 | } |
2155 | 767 | GETCHARINCTEST(fc, Feptr); |
2156 | 767 | if (!PRIV(xclass)(fc, Lxclass_data, utf)) RRETURN(MATCH_NOMATCH); |
2157 | 738 | } |
2158 | | |
2159 | | /* If Lmax == Lmin we can just continue with the main loop. */ |
2160 | | |
2161 | 836 | if (Lmin == Lmax) continue; |
2162 | | |
2163 | | /* If minimizing, keep testing the rest of the expression and advancing |
2164 | | the pointer while it matches the class. */ |
2165 | | |
2166 | 98 | if (reptype == REPTYPE_MIN) |
2167 | 0 | { |
2168 | 0 | for (;;) |
2169 | 0 | { |
2170 | 0 | RMATCH(Fecode, RM100); |
2171 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
2172 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
2173 | 0 | if (Feptr >= mb->end_subject) |
2174 | 0 | { |
2175 | 0 | SCHECK_PARTIAL(); |
2176 | 0 | RRETURN(MATCH_NOMATCH); |
2177 | 0 | } |
2178 | 0 | GETCHARINCTEST(fc, Feptr); |
2179 | 0 | if (!PRIV(xclass)(fc, Lxclass_data, utf)) RRETURN(MATCH_NOMATCH); |
2180 | 0 | } |
2181 | | /* Control never gets here */ |
2182 | 0 | } |
2183 | | |
2184 | | /* If maximizing, find the longest possible run, then work backwards. */ |
2185 | | |
2186 | 98 | else |
2187 | 98 | { |
2188 | 98 | Lstart_eptr = Feptr; |
2189 | 619 | for (i = Lmin; i < Lmax; i++) |
2190 | 619 | { |
2191 | 619 | int len = 1; |
2192 | 619 | if (Feptr >= mb->end_subject) |
2193 | 1 | { |
2194 | 1 | SCHECK_PARTIAL(); |
2195 | 1 | break; |
2196 | 1 | } |
2197 | 618 | #ifdef SUPPORT_UNICODE |
2198 | 618 | GETCHARLENTEST(fc, Feptr, len); |
2199 | | #else |
2200 | | fc = *Feptr; |
2201 | | #endif |
2202 | 618 | if (!PRIV(xclass)(fc, Lxclass_data, utf)) break; |
2203 | 521 | Feptr += len; |
2204 | 521 | } |
2205 | | |
2206 | 98 | if (reptype == REPTYPE_POS) continue; /* No backtracking */ |
2207 | | |
2208 | | /* After \C in UTF mode, Lstart_eptr might be in the middle of a |
2209 | | Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't |
2210 | | go too far. */ |
2211 | | |
2212 | 98 | for(;;) |
2213 | 616 | { |
2214 | 616 | RMATCH(Fecode, RM101); |
2215 | 615 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
2216 | 615 | if (Feptr-- <= Lstart_eptr) break; /* Tried at original position */ |
2217 | 518 | #ifdef SUPPORT_UNICODE |
2218 | 518 | if (utf) BACKCHAR(Feptr); |
2219 | 518 | #endif |
2220 | 518 | } |
2221 | 98 | RRETURN(MATCH_NOMATCH); |
2222 | 0 | } |
2223 | | |
2224 | | /* Control never gets here */ |
2225 | 98 | } |
2226 | 0 | #endif /* SUPPORT_WIDE_CHARS: end of XCLASS */ |
2227 | | |
2228 | 0 | #undef Lstart_eptr |
2229 | 0 | #undef Lxclass_data |
2230 | 0 | #undef Lmin |
2231 | 0 | #undef Lmax |
2232 | | |
2233 | | |
2234 | | /* ===================================================================== */ |
2235 | | /* Match various character types when PCRE2_UCP is not set. These opcodes |
2236 | | are not generated when PCRE2_UCP is set - instead appropriate property |
2237 | | tests are compiled. */ |
2238 | | |
2239 | 6.85M | case OP_NOT_DIGIT: |
2240 | 6.85M | if (Feptr >= mb->end_subject) |
2241 | 25.2k | { |
2242 | 25.2k | SCHECK_PARTIAL(); |
2243 | 25.2k | RRETURN(MATCH_NOMATCH); |
2244 | 0 | } |
2245 | 6.82M | GETCHARINCTEST(fc, Feptr); |
2246 | 6.82M | if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_digit) != 0) |
2247 | 6.36M | RRETURN(MATCH_NOMATCH); |
2248 | 6.36M | Fecode++; |
2249 | 6.36M | break; |
2250 | | |
2251 | 4.58M | case OP_DIGIT: |
2252 | 4.58M | if (Feptr >= mb->end_subject) |
2253 | 4.13k | { |
2254 | 4.13k | SCHECK_PARTIAL(); |
2255 | 4.13k | RRETURN(MATCH_NOMATCH); |
2256 | 0 | } |
2257 | 4.57M | GETCHARINCTEST(fc, Feptr); |
2258 | 4.57M | if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_digit) == 0) |
2259 | 2.36M | RRETURN(MATCH_NOMATCH); |
2260 | 2.36M | Fecode++; |
2261 | 2.36M | break; |
2262 | | |
2263 | 916k | case OP_NOT_WHITESPACE: |
2264 | 916k | if (Feptr >= mb->end_subject) |
2265 | 738 | { |
2266 | 738 | SCHECK_PARTIAL(); |
2267 | 738 | RRETURN(MATCH_NOMATCH); |
2268 | 0 | } |
2269 | 915k | GETCHARINCTEST(fc, Feptr); |
2270 | 915k | if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_space) != 0) |
2271 | 861k | RRETURN(MATCH_NOMATCH); |
2272 | 861k | Fecode++; |
2273 | 861k | break; |
2274 | | |
2275 | 292k | case OP_WHITESPACE: |
2276 | 292k | if (Feptr >= mb->end_subject) |
2277 | 1.25k | { |
2278 | 1.25k | SCHECK_PARTIAL(); |
2279 | 1.25k | RRETURN(MATCH_NOMATCH); |
2280 | 0 | } |
2281 | 291k | GETCHARINCTEST(fc, Feptr); |
2282 | 291k | if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_space) == 0) |
2283 | 279k | RRETURN(MATCH_NOMATCH); |
2284 | 11.8k | Fecode++; |
2285 | 11.8k | break; |
2286 | | |
2287 | 272k | case OP_NOT_WORDCHAR: |
2288 | 272k | if (Feptr >= mb->end_subject) |
2289 | 3.70k | { |
2290 | 3.70k | SCHECK_PARTIAL(); |
2291 | 3.70k | RRETURN(MATCH_NOMATCH); |
2292 | 0 | } |
2293 | 269k | GETCHARINCTEST(fc, Feptr); |
2294 | 269k | if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0) |
2295 | 162k | RRETURN(MATCH_NOMATCH); |
2296 | 162k | Fecode++; |
2297 | 162k | break; |
2298 | | |
2299 | 2.57M | case OP_WORDCHAR: |
2300 | 2.57M | if (Feptr >= mb->end_subject) |
2301 | 22.7k | { |
2302 | 22.7k | SCHECK_PARTIAL(); |
2303 | 22.7k | RRETURN(MATCH_NOMATCH); |
2304 | 0 | } |
2305 | 2.54M | GETCHARINCTEST(fc, Feptr); |
2306 | 2.54M | if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_word) == 0) |
2307 | 1.80M | RRETURN(MATCH_NOMATCH); |
2308 | 741k | Fecode++; |
2309 | 741k | break; |
2310 | | |
2311 | 141M | case OP_ANYNL: |
2312 | 141M | if (Feptr >= mb->end_subject) |
2313 | 516k | { |
2314 | 516k | SCHECK_PARTIAL(); |
2315 | 516k | RRETURN(MATCH_NOMATCH); |
2316 | 0 | } |
2317 | 140M | GETCHARINCTEST(fc, Feptr); |
2318 | 140M | switch(fc) |
2319 | 140M | { |
2320 | 136M | default: RRETURN(MATCH_NOMATCH); |
2321 | |
|
2322 | 1.28M | case CHAR_CR: |
2323 | 1.28M | if (Feptr >= mb->end_subject) |
2324 | 883 | { |
2325 | 883 | SCHECK_PARTIAL(); |
2326 | 883 | } |
2327 | 1.28M | else if (UCHAR21TEST(Feptr) == CHAR_LF) Feptr++; |
2328 | 1.28M | break; |
2329 | | |
2330 | 1.48M | case CHAR_LF: |
2331 | 1.48M | break; |
2332 | | |
2333 | 421k | case CHAR_VT: |
2334 | 513k | case CHAR_FF: |
2335 | 986k | case CHAR_NEL: |
2336 | 986k | #ifndef EBCDIC |
2337 | 986k | case 0x2028: |
2338 | 986k | case 0x2029: |
2339 | 986k | #endif /* Not EBCDIC */ |
2340 | 986k | if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH); |
2341 | 986k | break; |
2342 | 140M | } |
2343 | 3.75M | Fecode++; |
2344 | 3.75M | break; |
2345 | | |
2346 | 10.7M | case OP_NOT_HSPACE: |
2347 | 10.7M | if (Feptr >= mb->end_subject) |
2348 | 3.52k | { |
2349 | 3.52k | SCHECK_PARTIAL(); |
2350 | 3.52k | RRETURN(MATCH_NOMATCH); |
2351 | 0 | } |
2352 | 10.7M | GETCHARINCTEST(fc, Feptr); |
2353 | 10.7M | switch(fc) |
2354 | 10.7M | { |
2355 | 241k | HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */ |
2356 | 10.4M | default: break; |
2357 | 10.7M | } |
2358 | 10.4M | Fecode++; |
2359 | 10.4M | break; |
2360 | | |
2361 | 31.0M | case OP_HSPACE: |
2362 | 31.0M | if (Feptr >= mb->end_subject) |
2363 | 41.2k | { |
2364 | 41.2k | SCHECK_PARTIAL(); |
2365 | 41.2k | RRETURN(MATCH_NOMATCH); |
2366 | 0 | } |
2367 | 31.0M | GETCHARINCTEST(fc, Feptr); |
2368 | 31.0M | switch(fc) |
2369 | 31.0M | { |
2370 | 469k | HSPACE_CASES: break; /* Byte and multibyte cases */ |
2371 | 30.5M | default: RRETURN(MATCH_NOMATCH); |
2372 | 31.0M | } |
2373 | 469k | Fecode++; |
2374 | 469k | break; |
2375 | | |
2376 | 8.20M | case OP_NOT_VSPACE: |
2377 | 8.20M | if (Feptr >= mb->end_subject) |
2378 | 6.96k | { |
2379 | 6.96k | SCHECK_PARTIAL(); |
2380 | 6.96k | RRETURN(MATCH_NOMATCH); |
2381 | 0 | } |
2382 | 8.19M | GETCHARINCTEST(fc, Feptr); |
2383 | 8.19M | switch(fc) |
2384 | 8.19M | { |
2385 | 175k | VSPACE_CASES: RRETURN(MATCH_NOMATCH); |
2386 | 8.01M | default: break; |
2387 | 8.19M | } |
2388 | 8.01M | Fecode++; |
2389 | 8.01M | break; |
2390 | | |
2391 | 807k | case OP_VSPACE: |
2392 | 807k | if (Feptr >= mb->end_subject) |
2393 | 10.6k | { |
2394 | 10.6k | SCHECK_PARTIAL(); |
2395 | 10.6k | RRETURN(MATCH_NOMATCH); |
2396 | 0 | } |
2397 | 797k | GETCHARINCTEST(fc, Feptr); |
2398 | 797k | switch(fc) |
2399 | 797k | { |
2400 | 79.7k | VSPACE_CASES: break; |
2401 | 717k | default: RRETURN(MATCH_NOMATCH); |
2402 | 797k | } |
2403 | 79.7k | Fecode++; |
2404 | 79.7k | break; |
2405 | | |
2406 | | |
2407 | 0 | #ifdef SUPPORT_UNICODE |
2408 | | |
2409 | | /* ===================================================================== */ |
2410 | | /* Check the next character by Unicode property. We will get here only |
2411 | | if the support is in the binary; otherwise a compile-time error occurs. */ |
2412 | | |
2413 | 93.5k | case OP_PROP: |
2414 | 338k | case OP_NOTPROP: |
2415 | 338k | if (Feptr >= mb->end_subject) |
2416 | 368 | { |
2417 | 368 | SCHECK_PARTIAL(); |
2418 | 368 | RRETURN(MATCH_NOMATCH); |
2419 | 0 | } |
2420 | 338k | GETCHARINCTEST(fc, Feptr); |
2421 | 338k | { |
2422 | 338k | const uint32_t *cp; |
2423 | 338k | const ucd_record *prop = GET_UCD(fc); |
2424 | | |
2425 | 338k | switch(Fecode[1]) |
2426 | 338k | { |
2427 | 0 | case PT_ANY: |
2428 | 0 | if (Fop == OP_NOTPROP) RRETURN(MATCH_NOMATCH); |
2429 | 0 | break; |
2430 | | |
2431 | 0 | case PT_LAMP: |
2432 | 0 | if ((prop->chartype == ucp_Lu || |
2433 | 0 | prop->chartype == ucp_Ll || |
2434 | 0 | prop->chartype == ucp_Lt) == (Fop == OP_NOTPROP)) |
2435 | 0 | RRETURN(MATCH_NOMATCH); |
2436 | 0 | break; |
2437 | | |
2438 | 338k | case PT_GC: |
2439 | 338k | if ((Fecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (Fop == OP_PROP)) |
2440 | 204k | RRETURN(MATCH_NOMATCH); |
2441 | 204k | break; |
2442 | | |
2443 | 0 | case PT_PC: |
2444 | 0 | if ((Fecode[2] != prop->chartype) == (Fop == OP_PROP)) |
2445 | 0 | RRETURN(MATCH_NOMATCH); |
2446 | 0 | break; |
2447 | | |
2448 | 0 | case PT_SC: |
2449 | 0 | if ((Fecode[2] != prop->script) == (Fop == OP_PROP)) |
2450 | 0 | RRETURN(MATCH_NOMATCH); |
2451 | 0 | break; |
2452 | | |
2453 | | /* These are specials */ |
2454 | | |
2455 | 0 | case PT_ALNUM: |
2456 | 0 | if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L || |
2457 | 0 | PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (Fop == OP_NOTPROP)) |
2458 | 0 | RRETURN(MATCH_NOMATCH); |
2459 | 0 | break; |
2460 | | |
2461 | | /* Perl space used to exclude VT, but from Perl 5.18 it is included, |
2462 | | which means that Perl space and POSIX space are now identical. PCRE |
2463 | | was changed at release 8.34. */ |
2464 | | |
2465 | 0 | case PT_SPACE: /* Perl space */ |
2466 | 0 | case PT_PXSPACE: /* POSIX space */ |
2467 | 0 | switch(fc) |
2468 | 0 | { |
2469 | 0 | HSPACE_CASES: |
2470 | 0 | VSPACE_CASES: |
2471 | 0 | if (Fop == OP_NOTPROP) RRETURN(MATCH_NOMATCH); |
2472 | 0 | break; |
2473 | | |
2474 | 0 | default: |
2475 | 0 | if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == |
2476 | 0 | (Fop == OP_NOTPROP)) RRETURN(MATCH_NOMATCH); |
2477 | 0 | break; |
2478 | 0 | } |
2479 | 0 | break; |
2480 | | |
2481 | 0 | case PT_WORD: |
2482 | 0 | if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L || |
2483 | 0 | PRIV(ucp_gentype)[prop->chartype] == ucp_N || |
2484 | 0 | fc == CHAR_UNDERSCORE) == (Fop == OP_NOTPROP)) |
2485 | 0 | RRETURN(MATCH_NOMATCH); |
2486 | 0 | break; |
2487 | | |
2488 | 0 | case PT_CLIST: |
2489 | 0 | cp = PRIV(ucd_caseless_sets) + Fecode[2]; |
2490 | 0 | for (;;) |
2491 | 0 | { |
2492 | 0 | if (fc < *cp) |
2493 | 0 | { if (Fop == OP_PROP) { RRETURN(MATCH_NOMATCH); } else break; } |
2494 | 0 | if (fc == *cp++) |
2495 | 0 | { if (Fop == OP_PROP) break; else { RRETURN(MATCH_NOMATCH); } } |
2496 | 0 | } |
2497 | 0 | break; |
2498 | | |
2499 | 0 | case PT_UCNC: |
2500 | 0 | if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT || |
2501 | 0 | fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) || |
2502 | 0 | fc >= 0xe000) == (Fop == OP_NOTPROP)) |
2503 | 0 | RRETURN(MATCH_NOMATCH); |
2504 | 0 | break; |
2505 | | |
2506 | | /* This should never occur */ |
2507 | | |
2508 | 0 | default: |
2509 | 0 | return PCRE2_ERROR_INTERNAL; |
2510 | 338k | } |
2511 | | |
2512 | 204k | Fecode += 3; |
2513 | 204k | } |
2514 | 0 | break; |
2515 | | |
2516 | | |
2517 | | /* ===================================================================== */ |
2518 | | /* Match an extended Unicode sequence. We will get here only if the support |
2519 | | is in the binary; otherwise a compile-time error occurs. */ |
2520 | | |
2521 | 869k | case OP_EXTUNI: |
2522 | 869k | if (Feptr >= mb->end_subject) |
2523 | 1.80k | { |
2524 | 1.80k | SCHECK_PARTIAL(); |
2525 | 1.80k | RRETURN(MATCH_NOMATCH); |
2526 | 0 | } |
2527 | 867k | else |
2528 | 867k | { |
2529 | 867k | GETCHARINCTEST(fc, Feptr); |
2530 | 867k | Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject, utf, |
2531 | 867k | NULL); |
2532 | 867k | } |
2533 | 867k | CHECK_PARTIAL(); |
2534 | 867k | Fecode++; |
2535 | 867k | break; |
2536 | | |
2537 | 0 | #endif /* SUPPORT_UNICODE */ |
2538 | | |
2539 | | |
2540 | | /* ===================================================================== */ |
2541 | | /* Match a single character type repeatedly. Note that the property type |
2542 | | does not need to be in a stack frame as it is not used within an RMATCH() |
2543 | | loop. */ |
2544 | | |
2545 | 1.52G | #define Lstart_eptr F->temp_sptr[0] |
2546 | 666M | #define Lmin F->temp_32[0] |
2547 | 3.63G | #define Lmax F->temp_32[1] |
2548 | 1.55G | #define Lctype F->temp_32[2] |
2549 | 3.47M | #define Lpropvalue F->temp_32[3] |
2550 | | |
2551 | 8.30M | case OP_TYPEEXACT: |
2552 | 8.30M | Lmin = Lmax = GET2(Fecode, 1); |
2553 | 8.30M | Fecode += 1 + IMM2_SIZE; |
2554 | 8.30M | goto REPEATTYPE; |
2555 | | |
2556 | 0 | case OP_TYPEUPTO: |
2557 | 0 | case OP_TYPEMINUPTO: |
2558 | 0 | Lmin = 0; |
2559 | 0 | Lmax = GET2(Fecode, 1); |
2560 | 0 | reptype = (*Fecode == OP_TYPEMINUPTO)? REPTYPE_MIN : REPTYPE_MAX; |
2561 | 0 | Fecode += 1 + IMM2_SIZE; |
2562 | 0 | goto REPEATTYPE; |
2563 | | |
2564 | 11.4M | case OP_TYPEPOSSTAR: |
2565 | 11.4M | reptype = REPTYPE_POS; |
2566 | 11.4M | Lmin = 0; |
2567 | 11.4M | Lmax = UINT32_MAX; |
2568 | 11.4M | Fecode++; |
2569 | 11.4M | goto REPEATTYPE; |
2570 | | |
2571 | 51.5M | case OP_TYPEPOSPLUS: |
2572 | 51.5M | reptype = REPTYPE_POS; |
2573 | 51.5M | Lmin = 1; |
2574 | 51.5M | Lmax = UINT32_MAX; |
2575 | 51.5M | Fecode++; |
2576 | 51.5M | goto REPEATTYPE; |
2577 | | |
2578 | 3.93M | case OP_TYPEPOSQUERY: |
2579 | 3.93M | reptype = REPTYPE_POS; |
2580 | 3.93M | Lmin = 0; |
2581 | 3.93M | Lmax = 1; |
2582 | 3.93M | Fecode++; |
2583 | 3.93M | goto REPEATTYPE; |
2584 | | |
2585 | 0 | case OP_TYPEPOSUPTO: |
2586 | 0 | reptype = REPTYPE_POS; |
2587 | 0 | Lmin = 0; |
2588 | 0 | Lmax = GET2(Fecode, 1); |
2589 | 0 | Fecode += 1 + IMM2_SIZE; |
2590 | 0 | goto REPEATTYPE; |
2591 | | |
2592 | 9.91M | case OP_TYPESTAR: |
2593 | 11.1M | case OP_TYPEMINSTAR: |
2594 | 21.4M | case OP_TYPEPLUS: |
2595 | 22.7M | case OP_TYPEMINPLUS: |
2596 | 56.2M | case OP_TYPEQUERY: |
2597 | 56.9M | case OP_TYPEMINQUERY: |
2598 | 56.9M | fc = *Fecode++ - OP_TYPESTAR; |
2599 | 56.9M | Lmin = rep_min[fc]; |
2600 | 56.9M | Lmax = rep_max[fc]; |
2601 | 56.9M | reptype = rep_typ[fc]; |
2602 | | |
2603 | | /* Common code for all repeated character type matches. */ |
2604 | | |
2605 | 132M | REPEATTYPE: |
2606 | 132M | Lctype = *Fecode++; /* Code for the character type */ |
2607 | | |
2608 | 132M | #ifdef SUPPORT_UNICODE |
2609 | 132M | if (Lctype == OP_PROP || Lctype == OP_NOTPROP) |
2610 | 215k | { |
2611 | 215k | proptype = *Fecode++; |
2612 | 215k | Lpropvalue = *Fecode++; |
2613 | 215k | } |
2614 | 131M | else proptype = -1; |
2615 | 132M | #endif |
2616 | | |
2617 | | /* First, ensure the minimum number of matches are present. Use inline |
2618 | | code for maximizing the speed, and do the type test once at the start |
2619 | | (i.e. keep it out of the loop). The code for UTF mode is separated out for |
2620 | | tidiness, except for Unicode property tests. */ |
2621 | | |
2622 | 132M | if (Lmin > 0) |
2623 | 71.3M | { |
2624 | 71.3M | #ifdef SUPPORT_UNICODE |
2625 | 71.3M | if (proptype >= 0) /* Property tests in all modes */ |
2626 | 202k | { |
2627 | 202k | switch(proptype) |
2628 | 202k | { |
2629 | 0 | case PT_ANY: |
2630 | 0 | if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH); |
2631 | 0 | for (i = 1; i <= Lmin; i++) |
2632 | 0 | { |
2633 | 0 | if (Feptr >= mb->end_subject) |
2634 | 0 | { |
2635 | 0 | SCHECK_PARTIAL(); |
2636 | 0 | RRETURN(MATCH_NOMATCH); |
2637 | 0 | } |
2638 | 0 | GETCHARINCTEST(fc, Feptr); |
2639 | 0 | } |
2640 | 0 | break; |
2641 | | |
2642 | 0 | case PT_LAMP: |
2643 | 0 | for (i = 1; i <= Lmin; i++) |
2644 | 0 | { |
2645 | 0 | int chartype; |
2646 | 0 | if (Feptr >= mb->end_subject) |
2647 | 0 | { |
2648 | 0 | SCHECK_PARTIAL(); |
2649 | 0 | RRETURN(MATCH_NOMATCH); |
2650 | 0 | } |
2651 | 0 | GETCHARINCTEST(fc, Feptr); |
2652 | 0 | chartype = UCD_CHARTYPE(fc); |
2653 | 0 | if ((chartype == ucp_Lu || |
2654 | 0 | chartype == ucp_Ll || |
2655 | 0 | chartype == ucp_Lt) == (Lctype == OP_NOTPROP)) |
2656 | 0 | RRETURN(MATCH_NOMATCH); |
2657 | 0 | } |
2658 | 0 | break; |
2659 | | |
2660 | 202k | case PT_GC: |
2661 | 348k | for (i = 1; i <= Lmin; i++) |
2662 | 202k | { |
2663 | 202k | if (Feptr >= mb->end_subject) |
2664 | 0 | { |
2665 | 0 | SCHECK_PARTIAL(); |
2666 | 0 | RRETURN(MATCH_NOMATCH); |
2667 | 0 | } |
2668 | 202k | GETCHARINCTEST(fc, Feptr); |
2669 | 202k | if ((UCD_CATEGORY(fc) == Lpropvalue) == (Lctype == OP_NOTPROP)) |
2670 | 145k | RRETURN(MATCH_NOMATCH); |
2671 | 145k | } |
2672 | 145k | break; |
2673 | | |
2674 | 145k | case PT_PC: |
2675 | 0 | for (i = 1; i <= Lmin; i++) |
2676 | 0 | { |
2677 | 0 | if (Feptr >= mb->end_subject) |
2678 | 0 | { |
2679 | 0 | SCHECK_PARTIAL(); |
2680 | 0 | RRETURN(MATCH_NOMATCH); |
2681 | 0 | } |
2682 | 0 | GETCHARINCTEST(fc, Feptr); |
2683 | 0 | if ((UCD_CHARTYPE(fc) == Lpropvalue) == (Lctype == OP_NOTPROP)) |
2684 | 0 | RRETURN(MATCH_NOMATCH); |
2685 | 0 | } |
2686 | 0 | break; |
2687 | | |
2688 | 0 | case PT_SC: |
2689 | 0 | for (i = 1; i <= Lmin; i++) |
2690 | 0 | { |
2691 | 0 | if (Feptr >= mb->end_subject) |
2692 | 0 | { |
2693 | 0 | SCHECK_PARTIAL(); |
2694 | 0 | RRETURN(MATCH_NOMATCH); |
2695 | 0 | } |
2696 | 0 | GETCHARINCTEST(fc, Feptr); |
2697 | 0 | if ((UCD_SCRIPT(fc) == Lpropvalue) == (Lctype == OP_NOTPROP)) |
2698 | 0 | RRETURN(MATCH_NOMATCH); |
2699 | 0 | } |
2700 | 0 | break; |
2701 | | |
2702 | 0 | case PT_ALNUM: |
2703 | 0 | for (i = 1; i <= Lmin; i++) |
2704 | 0 | { |
2705 | 0 | int category; |
2706 | 0 | if (Feptr >= mb->end_subject) |
2707 | 0 | { |
2708 | 0 | SCHECK_PARTIAL(); |
2709 | 0 | RRETURN(MATCH_NOMATCH); |
2710 | 0 | } |
2711 | 0 | GETCHARINCTEST(fc, Feptr); |
2712 | 0 | category = UCD_CATEGORY(fc); |
2713 | 0 | if ((category == ucp_L || category == ucp_N) == (Lctype == OP_NOTPROP)) |
2714 | 0 | RRETURN(MATCH_NOMATCH); |
2715 | 0 | } |
2716 | 0 | break; |
2717 | | |
2718 | | /* Perl space used to exclude VT, but from Perl 5.18 it is included, |
2719 | | which means that Perl space and POSIX space are now identical. PCRE |
2720 | | was changed at release 8.34. */ |
2721 | | |
2722 | 0 | case PT_SPACE: /* Perl space */ |
2723 | 0 | case PT_PXSPACE: /* POSIX space */ |
2724 | 0 | for (i = 1; i <= Lmin; i++) |
2725 | 0 | { |
2726 | 0 | if (Feptr >= mb->end_subject) |
2727 | 0 | { |
2728 | 0 | SCHECK_PARTIAL(); |
2729 | 0 | RRETURN(MATCH_NOMATCH); |
2730 | 0 | } |
2731 | 0 | GETCHARINCTEST(fc, Feptr); |
2732 | 0 | switch(fc) |
2733 | 0 | { |
2734 | 0 | HSPACE_CASES: |
2735 | 0 | VSPACE_CASES: |
2736 | 0 | if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH); |
2737 | 0 | break; |
2738 | | |
2739 | 0 | default: |
2740 | 0 | if ((UCD_CATEGORY(fc) == ucp_Z) == (Lctype == OP_NOTPROP)) |
2741 | 0 | RRETURN(MATCH_NOMATCH); |
2742 | 0 | break; |
2743 | 0 | } |
2744 | 0 | } |
2745 | 0 | break; |
2746 | | |
2747 | 0 | case PT_WORD: |
2748 | 0 | for (i = 1; i <= Lmin; i++) |
2749 | 0 | { |
2750 | 0 | int category; |
2751 | 0 | if (Feptr >= mb->end_subject) |
2752 | 0 | { |
2753 | 0 | SCHECK_PARTIAL(); |
2754 | 0 | RRETURN(MATCH_NOMATCH); |
2755 | 0 | } |
2756 | 0 | GETCHARINCTEST(fc, Feptr); |
2757 | 0 | category = UCD_CATEGORY(fc); |
2758 | 0 | if ((category == ucp_L || category == ucp_N || |
2759 | 0 | fc == CHAR_UNDERSCORE) == (Lctype == OP_NOTPROP)) |
2760 | 0 | RRETURN(MATCH_NOMATCH); |
2761 | 0 | } |
2762 | 0 | break; |
2763 | | |
2764 | 0 | case PT_CLIST: |
2765 | 0 | for (i = 1; i <= Lmin; i++) |
2766 | 0 | { |
2767 | 0 | const uint32_t *cp; |
2768 | 0 | if (Feptr >= mb->end_subject) |
2769 | 0 | { |
2770 | 0 | SCHECK_PARTIAL(); |
2771 | 0 | RRETURN(MATCH_NOMATCH); |
2772 | 0 | } |
2773 | 0 | GETCHARINCTEST(fc, Feptr); |
2774 | 0 | cp = PRIV(ucd_caseless_sets) + Lpropvalue; |
2775 | 0 | for (;;) |
2776 | 0 | { |
2777 | 0 | if (fc < *cp) |
2778 | 0 | { |
2779 | 0 | if (Lctype == OP_NOTPROP) break; |
2780 | 0 | RRETURN(MATCH_NOMATCH); |
2781 | 0 | } |
2782 | 0 | if (fc == *cp++) |
2783 | 0 | { |
2784 | 0 | if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH); |
2785 | 0 | break; |
2786 | 0 | } |
2787 | 0 | } |
2788 | 0 | } |
2789 | 0 | break; |
2790 | | |
2791 | 0 | case PT_UCNC: |
2792 | 0 | for (i = 1; i <= Lmin; i++) |
2793 | 0 | { |
2794 | 0 | if (Feptr >= mb->end_subject) |
2795 | 0 | { |
2796 | 0 | SCHECK_PARTIAL(); |
2797 | 0 | RRETURN(MATCH_NOMATCH); |
2798 | 0 | } |
2799 | 0 | GETCHARINCTEST(fc, Feptr); |
2800 | 0 | if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT || |
2801 | 0 | fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) || |
2802 | 0 | fc >= 0xe000) == (Lctype == OP_NOTPROP)) |
2803 | 0 | RRETURN(MATCH_NOMATCH); |
2804 | 0 | } |
2805 | 0 | break; |
2806 | | |
2807 | | /* This should not occur */ |
2808 | | |
2809 | 0 | default: |
2810 | 0 | return PCRE2_ERROR_INTERNAL; |
2811 | 202k | } |
2812 | 202k | } |
2813 | | |
2814 | | /* Match extended Unicode sequences. We will get here only if the |
2815 | | support is in the binary; otherwise a compile-time error occurs. */ |
2816 | | |
2817 | 71.1M | else if (Lctype == OP_EXTUNI) |
2818 | 520k | { |
2819 | 1.04M | for (i = 1; i <= Lmin; i++) |
2820 | 520k | { |
2821 | 520k | if (Feptr >= mb->end_subject) |
2822 | 259 | { |
2823 | 259 | SCHECK_PARTIAL(); |
2824 | 259 | RRETURN(MATCH_NOMATCH); |
2825 | 0 | } |
2826 | 520k | else |
2827 | 520k | { |
2828 | 520k | GETCHARINCTEST(fc, Feptr); |
2829 | 520k | Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, |
2830 | 520k | mb->end_subject, utf, NULL); |
2831 | 520k | } |
2832 | 520k | CHECK_PARTIAL(); |
2833 | 520k | } |
2834 | 520k | } |
2835 | 70.6M | else |
2836 | 70.6M | #endif /* SUPPORT_UNICODE */ |
2837 | | |
2838 | | /* Handle all other cases in UTF mode */ |
2839 | | |
2840 | 70.6M | #ifdef SUPPORT_UNICODE |
2841 | 70.6M | if (utf) switch(Lctype) |
2842 | 0 | { |
2843 | 0 | case OP_ANY: |
2844 | 0 | for (i = 1; i <= Lmin; i++) |
2845 | 0 | { |
2846 | 0 | if (Feptr >= mb->end_subject) |
2847 | 0 | { |
2848 | 0 | SCHECK_PARTIAL(); |
2849 | 0 | RRETURN(MATCH_NOMATCH); |
2850 | 0 | } |
2851 | 0 | if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH); |
2852 | 0 | if (mb->partial != 0 && |
2853 | 0 | Feptr + 1 >= mb->end_subject && |
2854 | 0 | NLBLOCK->nltype == NLTYPE_FIXED && |
2855 | 0 | NLBLOCK->nllen == 2 && |
2856 | 0 | UCHAR21(Feptr) == NLBLOCK->nl[0]) |
2857 | 0 | { |
2858 | 0 | mb->hitend = TRUE; |
2859 | 0 | if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; |
2860 | 0 | } |
2861 | 0 | Feptr++; |
2862 | 0 | ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++); |
2863 | 0 | } |
2864 | 0 | break; |
2865 | | |
2866 | 0 | case OP_ALLANY: |
2867 | 0 | for (i = 1; i <= Lmin; i++) |
2868 | 0 | { |
2869 | 0 | if (Feptr >= mb->end_subject) |
2870 | 0 | { |
2871 | 0 | SCHECK_PARTIAL(); |
2872 | 0 | RRETURN(MATCH_NOMATCH); |
2873 | 0 | } |
2874 | 0 | Feptr++; |
2875 | 0 | ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++); |
2876 | 0 | } |
2877 | 0 | break; |
2878 | | |
2879 | 0 | case OP_ANYBYTE: |
2880 | 0 | if (Feptr > mb->end_subject - Lmin) RRETURN(MATCH_NOMATCH); |
2881 | 0 | Feptr += Lmin; |
2882 | 0 | break; |
2883 | | |
2884 | 0 | case OP_ANYNL: |
2885 | 0 | for (i = 1; i <= Lmin; i++) |
2886 | 0 | { |
2887 | 0 | if (Feptr >= mb->end_subject) |
2888 | 0 | { |
2889 | 0 | SCHECK_PARTIAL(); |
2890 | 0 | RRETURN(MATCH_NOMATCH); |
2891 | 0 | } |
2892 | 0 | GETCHARINC(fc, Feptr); |
2893 | 0 | switch(fc) |
2894 | 0 | { |
2895 | 0 | default: RRETURN(MATCH_NOMATCH); |
2896 | |
|
2897 | 0 | case CHAR_CR: |
2898 | 0 | if (Feptr < mb->end_subject && UCHAR21(Feptr) == CHAR_LF) Feptr++; |
2899 | 0 | break; |
2900 | | |
2901 | 0 | case CHAR_LF: |
2902 | 0 | break; |
2903 | | |
2904 | 0 | case CHAR_VT: |
2905 | 0 | case CHAR_FF: |
2906 | 0 | case CHAR_NEL: |
2907 | 0 | #ifndef EBCDIC |
2908 | 0 | case 0x2028: |
2909 | 0 | case 0x2029: |
2910 | 0 | #endif /* Not EBCDIC */ |
2911 | 0 | if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH); |
2912 | 0 | break; |
2913 | 0 | } |
2914 | 0 | } |
2915 | 0 | break; |
2916 | | |
2917 | 0 | case OP_NOT_HSPACE: |
2918 | 0 | for (i = 1; i <= Lmin; i++) |
2919 | 0 | { |
2920 | 0 | if (Feptr >= mb->end_subject) |
2921 | 0 | { |
2922 | 0 | SCHECK_PARTIAL(); |
2923 | 0 | RRETURN(MATCH_NOMATCH); |
2924 | 0 | } |
2925 | 0 | GETCHARINC(fc, Feptr); |
2926 | 0 | switch(fc) |
2927 | 0 | { |
2928 | 0 | HSPACE_CASES: RRETURN(MATCH_NOMATCH); |
2929 | 0 | default: break; |
2930 | 0 | } |
2931 | 0 | } |
2932 | 0 | break; |
2933 | | |
2934 | 0 | case OP_HSPACE: |
2935 | 0 | for (i = 1; i <= Lmin; i++) |
2936 | 0 | { |
2937 | 0 | if (Feptr >= mb->end_subject) |
2938 | 0 | { |
2939 | 0 | SCHECK_PARTIAL(); |
2940 | 0 | RRETURN(MATCH_NOMATCH); |
2941 | 0 | } |
2942 | 0 | GETCHARINC(fc, Feptr); |
2943 | 0 | switch(fc) |
2944 | 0 | { |
2945 | 0 | HSPACE_CASES: break; |
2946 | 0 | default: RRETURN(MATCH_NOMATCH); |
2947 | 0 | } |
2948 | 0 | } |
2949 | 0 | break; |
2950 | | |
2951 | 0 | case OP_NOT_VSPACE: |
2952 | 0 | for (i = 1; i <= Lmin; i++) |
2953 | 0 | { |
2954 | 0 | if (Feptr >= mb->end_subject) |
2955 | 0 | { |
2956 | 0 | SCHECK_PARTIAL(); |
2957 | 0 | RRETURN(MATCH_NOMATCH); |
2958 | 0 | } |
2959 | 0 | GETCHARINC(fc, Feptr); |
2960 | 0 | switch(fc) |
2961 | 0 | { |
2962 | 0 | VSPACE_CASES: RRETURN(MATCH_NOMATCH); |
2963 | 0 | default: break; |
2964 | 0 | } |
2965 | 0 | } |
2966 | 0 | break; |
2967 | | |
2968 | 0 | case OP_VSPACE: |
2969 | 0 | for (i = 1; i <= Lmin; i++) |
2970 | 0 | { |
2971 | 0 | if (Feptr >= mb->end_subject) |
2972 | 0 | { |
2973 | 0 | SCHECK_PARTIAL(); |
2974 | 0 | RRETURN(MATCH_NOMATCH); |
2975 | 0 | } |
2976 | 0 | GETCHARINC(fc, Feptr); |
2977 | 0 | switch(fc) |
2978 | 0 | { |
2979 | 0 | VSPACE_CASES: break; |
2980 | 0 | default: RRETURN(MATCH_NOMATCH); |
2981 | 0 | } |
2982 | 0 | } |
2983 | 0 | break; |
2984 | | |
2985 | 0 | case OP_NOT_DIGIT: |
2986 | 0 | for (i = 1; i <= Lmin; i++) |
2987 | 0 | { |
2988 | 0 | if (Feptr >= mb->end_subject) |
2989 | 0 | { |
2990 | 0 | SCHECK_PARTIAL(); |
2991 | 0 | RRETURN(MATCH_NOMATCH); |
2992 | 0 | } |
2993 | 0 | GETCHARINC(fc, Feptr); |
2994 | 0 | if (fc < 128 && (mb->ctypes[fc] & ctype_digit) != 0) |
2995 | 0 | RRETURN(MATCH_NOMATCH); |
2996 | 0 | } |
2997 | 0 | break; |
2998 | | |
2999 | 0 | case OP_DIGIT: |
3000 | 0 | for (i = 1; i <= Lmin; i++) |
3001 | 0 | { |
3002 | 0 | uint32_t cc; |
3003 | 0 | if (Feptr >= mb->end_subject) |
3004 | 0 | { |
3005 | 0 | SCHECK_PARTIAL(); |
3006 | 0 | RRETURN(MATCH_NOMATCH); |
3007 | 0 | } |
3008 | 0 | cc = UCHAR21(Feptr); |
3009 | 0 | if (cc >= 128 || (mb->ctypes[cc] & ctype_digit) == 0) |
3010 | 0 | RRETURN(MATCH_NOMATCH); |
3011 | 0 | Feptr++; |
3012 | | /* No need to skip more code units - we know it has only one. */ |
3013 | 0 | } |
3014 | 0 | break; |
3015 | | |
3016 | 0 | case OP_NOT_WHITESPACE: |
3017 | 0 | for (i = 1; i <= Lmin; i++) |
3018 | 0 | { |
3019 | 0 | uint32_t cc; |
3020 | 0 | if (Feptr >= mb->end_subject) |
3021 | 0 | { |
3022 | 0 | SCHECK_PARTIAL(); |
3023 | 0 | RRETURN(MATCH_NOMATCH); |
3024 | 0 | } |
3025 | 0 | cc = UCHAR21(Feptr); |
3026 | 0 | if (cc < 128 && (mb->ctypes[cc] & ctype_space) != 0) |
3027 | 0 | RRETURN(MATCH_NOMATCH); |
3028 | 0 | Feptr++; |
3029 | 0 | ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++); |
3030 | 0 | } |
3031 | 0 | break; |
3032 | | |
3033 | 0 | case OP_WHITESPACE: |
3034 | 0 | for (i = 1; i <= Lmin; i++) |
3035 | 0 | { |
3036 | 0 | uint32_t cc; |
3037 | 0 | if (Feptr >= mb->end_subject) |
3038 | 0 | { |
3039 | 0 | SCHECK_PARTIAL(); |
3040 | 0 | RRETURN(MATCH_NOMATCH); |
3041 | 0 | } |
3042 | 0 | cc = UCHAR21(Feptr); |
3043 | 0 | if (cc >= 128 || (mb->ctypes[cc] & ctype_space) == 0) |
3044 | 0 | RRETURN(MATCH_NOMATCH); |
3045 | 0 | Feptr++; |
3046 | | /* No need to skip more code units - we know it has only one. */ |
3047 | 0 | } |
3048 | 0 | break; |
3049 | | |
3050 | 0 | case OP_NOT_WORDCHAR: |
3051 | 0 | for (i = 1; i <= Lmin; i++) |
3052 | 0 | { |
3053 | 0 | uint32_t cc; |
3054 | 0 | if (Feptr >= mb->end_subject) |
3055 | 0 | { |
3056 | 0 | SCHECK_PARTIAL(); |
3057 | 0 | RRETURN(MATCH_NOMATCH); |
3058 | 0 | } |
3059 | 0 | cc = UCHAR21(Feptr); |
3060 | 0 | if (cc < 128 && (mb->ctypes[cc] & ctype_word) != 0) |
3061 | 0 | RRETURN(MATCH_NOMATCH); |
3062 | 0 | Feptr++; |
3063 | 0 | ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++); |
3064 | 0 | } |
3065 | 0 | break; |
3066 | | |
3067 | 0 | case OP_WORDCHAR: |
3068 | 0 | for (i = 1; i <= Lmin; i++) |
3069 | 0 | { |
3070 | 0 | uint32_t cc; |
3071 | 0 | if (Feptr >= mb->end_subject) |
3072 | 0 | { |
3073 | 0 | SCHECK_PARTIAL(); |
3074 | 0 | RRETURN(MATCH_NOMATCH); |
3075 | 0 | } |
3076 | 0 | cc = UCHAR21(Feptr); |
3077 | 0 | if (cc >= 128 || (mb->ctypes[cc] & ctype_word) == 0) |
3078 | 0 | RRETURN(MATCH_NOMATCH); |
3079 | 0 | Feptr++; |
3080 | | /* No need to skip more code units - we know it has only one. */ |
3081 | 0 | } |
3082 | 0 | break; |
3083 | | |
3084 | 0 | default: |
3085 | 0 | return PCRE2_ERROR_INTERNAL; |
3086 | 0 | } /* End switch(Lctype) */ |
3087 | | |
3088 | 70.6M | else |
3089 | 70.6M | #endif /* SUPPORT_UNICODE */ |
3090 | | |
3091 | | /* Code for the non-UTF case for minimum matching of operators other |
3092 | | than OP_PROP and OP_NOTPROP. */ |
3093 | | |
3094 | 70.6M | switch(Lctype) |
3095 | 70.6M | { |
3096 | 232k | case OP_ANY: |
3097 | 464k | for (i = 1; i <= Lmin; i++) |
3098 | 244k | { |
3099 | 244k | if (Feptr >= mb->end_subject) |
3100 | 5.24k | { |
3101 | 5.24k | SCHECK_PARTIAL(); |
3102 | 5.24k | RRETURN(MATCH_NOMATCH); |
3103 | 0 | } |
3104 | 239k | if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH); |
3105 | 232k | if (mb->partial != 0 && |
3106 | 232k | Feptr + 1 >= mb->end_subject && |
3107 | 232k | NLBLOCK->nltype == NLTYPE_FIXED && |
3108 | 232k | NLBLOCK->nllen == 2 && |
3109 | 232k | *Feptr == NLBLOCK->nl[0]) |
3110 | 0 | { |
3111 | 0 | mb->hitend = TRUE; |
3112 | 0 | if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; |
3113 | 0 | } |
3114 | 232k | Feptr++; |
3115 | 232k | } |
3116 | 219k | break; |
3117 | | |
3118 | 219k | case OP_ALLANY: |
3119 | 24.2k | if (Feptr > mb->end_subject - Lmin) |
3120 | 188 | { |
3121 | 188 | SCHECK_PARTIAL(); |
3122 | 188 | RRETURN(MATCH_NOMATCH); |
3123 | 0 | } |
3124 | 24.0k | Feptr += Lmin; |
3125 | 24.0k | break; |
3126 | | |
3127 | | /* This OP_ANYBYTE case will never be reached because \C gets turned |
3128 | | into OP_ALLANY in non-UTF mode. Cut out the code so that coverage |
3129 | | reports don't complain about its never being used. */ |
3130 | | |
3131 | | /* case OP_ANYBYTE: |
3132 | | * if (Feptr > mb->end_subject - Lmin) |
3133 | | * { |
3134 | | * SCHECK_PARTIAL(); |
3135 | | * RRETURN(MATCH_NOMATCH); |
3136 | | * } |
3137 | | * Feptr += Lmin; |
3138 | | * break; |
3139 | | */ |
3140 | 48.8M | case OP_ANYNL: |
3141 | 51.8M | for (i = 1; i <= Lmin; i++) |
3142 | 48.8M | { |
3143 | 48.8M | if (Feptr >= mb->end_subject) |
3144 | 27.1k | { |
3145 | 27.1k | SCHECK_PARTIAL(); |
3146 | 27.1k | RRETURN(MATCH_NOMATCH); |
3147 | 0 | } |
3148 | 48.8M | switch(*Feptr++) |
3149 | 48.8M | { |
3150 | 45.9M | default: RRETURN(MATCH_NOMATCH); |
3151 | |
|
3152 | 693k | case CHAR_CR: |
3153 | 693k | if (Feptr < mb->end_subject && *Feptr == CHAR_LF) Feptr++; |
3154 | 693k | break; |
3155 | | |
3156 | 1.41M | case CHAR_LF: |
3157 | 1.41M | break; |
3158 | | |
3159 | 265k | case CHAR_VT: |
3160 | 585k | case CHAR_FF: |
3161 | 823k | case CHAR_NEL: |
3162 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
3163 | | case 0x2028: |
3164 | | case 0x2029: |
3165 | | #endif |
3166 | 823k | if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH); |
3167 | 823k | break; |
3168 | 48.8M | } |
3169 | 48.8M | } |
3170 | 2.93M | break; |
3171 | | |
3172 | 2.93M | case OP_NOT_HSPACE: |
3173 | 4.66M | for (i = 1; i <= Lmin; i++) |
3174 | 2.44M | { |
3175 | 2.44M | if (Feptr >= mb->end_subject) |
3176 | 0 | { |
3177 | 0 | SCHECK_PARTIAL(); |
3178 | 0 | RRETURN(MATCH_NOMATCH); |
3179 | 0 | } |
3180 | 2.44M | switch(*Feptr++) |
3181 | 2.44M | { |
3182 | 2.34M | default: break; |
3183 | 2.34M | HSPACE_BYTE_CASES: |
3184 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
3185 | | HSPACE_MULTIBYTE_CASES: |
3186 | | #endif |
3187 | 92.5k | RRETURN(MATCH_NOMATCH); |
3188 | 2.44M | } |
3189 | 2.44M | } |
3190 | 2.22M | break; |
3191 | | |
3192 | 2.22M | case OP_HSPACE: |
3193 | 1.08M | for (i = 1; i <= Lmin; i++) |
3194 | 1.05M | { |
3195 | 1.05M | if (Feptr >= mb->end_subject) |
3196 | 300 | { |
3197 | 300 | SCHECK_PARTIAL(); |
3198 | 300 | RRETURN(MATCH_NOMATCH); |
3199 | 0 | } |
3200 | 1.05M | switch(*Feptr++) |
3201 | 1.05M | { |
3202 | 1.02M | default: RRETURN(MATCH_NOMATCH); |
3203 | 54.4k | HSPACE_BYTE_CASES: |
3204 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
3205 | | HSPACE_MULTIBYTE_CASES: |
3206 | | #endif |
3207 | 54.4k | break; |
3208 | 1.05M | } |
3209 | 1.05M | } |
3210 | 26.7k | break; |
3211 | | |
3212 | 895k | case OP_NOT_VSPACE: |
3213 | 1.72M | for (i = 1; i <= Lmin; i++) |
3214 | 895k | { |
3215 | 895k | if (Feptr >= mb->end_subject) |
3216 | 1.48k | { |
3217 | 1.48k | SCHECK_PARTIAL(); |
3218 | 1.48k | RRETURN(MATCH_NOMATCH); |
3219 | 0 | } |
3220 | 894k | switch(*Feptr++) |
3221 | 894k | { |
3222 | 60.9k | VSPACE_BYTE_CASES: |
3223 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
3224 | | VSPACE_MULTIBYTE_CASES: |
3225 | | #endif |
3226 | 60.9k | RRETURN(MATCH_NOMATCH); |
3227 | 833k | default: break; |
3228 | 894k | } |
3229 | 894k | } |
3230 | 833k | break; |
3231 | | |
3232 | 2.65M | case OP_VSPACE: |
3233 | 2.70M | for (i = 1; i <= Lmin; i++) |
3234 | 2.65M | { |
3235 | 2.65M | if (Feptr >= mb->end_subject) |
3236 | 16.9k | { |
3237 | 16.9k | SCHECK_PARTIAL(); |
3238 | 16.9k | RRETURN(MATCH_NOMATCH); |
3239 | 0 | } |
3240 | 2.63M | switch(*Feptr++) |
3241 | 2.63M | { |
3242 | 2.57M | default: RRETURN(MATCH_NOMATCH); |
3243 | 207k | VSPACE_BYTE_CASES: |
3244 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
3245 | | VSPACE_MULTIBYTE_CASES: |
3246 | | #endif |
3247 | 207k | break; |
3248 | 2.63M | } |
3249 | 2.63M | } |
3250 | 58.3k | break; |
3251 | | |
3252 | 10.9M | case OP_NOT_DIGIT: |
3253 | 27.9M | for (i = 1; i <= Lmin; i++) |
3254 | 18.2M | { |
3255 | 18.2M | if (Feptr >= mb->end_subject) |
3256 | 1.27k | { |
3257 | 1.27k | SCHECK_PARTIAL(); |
3258 | 1.27k | RRETURN(MATCH_NOMATCH); |
3259 | 0 | } |
3260 | 18.2M | if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_digit) != 0) |
3261 | 17.0M | RRETURN(MATCH_NOMATCH); |
3262 | 17.0M | Feptr++; |
3263 | 17.0M | } |
3264 | 9.75M | break; |
3265 | | |
3266 | 9.75M | case OP_DIGIT: |
3267 | 571k | for (i = 1; i <= Lmin; i++) |
3268 | 319k | { |
3269 | 319k | if (Feptr >= mb->end_subject) |
3270 | 31.0k | { |
3271 | 31.0k | SCHECK_PARTIAL(); |
3272 | 31.0k | RRETURN(MATCH_NOMATCH); |
3273 | 0 | } |
3274 | 288k | if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_digit) == 0) |
3275 | 253k | RRETURN(MATCH_NOMATCH); |
3276 | 253k | Feptr++; |
3277 | 253k | } |
3278 | 251k | break; |
3279 | | |
3280 | 251k | case OP_NOT_WHITESPACE: |
3281 | 270k | for (i = 1; i <= Lmin; i++) |
3282 | 137k | { |
3283 | 137k | if (Feptr >= mb->end_subject) |
3284 | 0 | { |
3285 | 0 | SCHECK_PARTIAL(); |
3286 | 0 | RRETURN(MATCH_NOMATCH); |
3287 | 0 | } |
3288 | 137k | if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_space) != 0) |
3289 | 132k | RRETURN(MATCH_NOMATCH); |
3290 | 132k | Feptr++; |
3291 | 132k | } |
3292 | 132k | break; |
3293 | | |
3294 | 2.19M | case OP_WHITESPACE: |
3295 | 2.34M | for (i = 1; i <= Lmin; i++) |
3296 | 2.19M | { |
3297 | 2.19M | if (Feptr >= mb->end_subject) |
3298 | 45.4k | { |
3299 | 45.4k | SCHECK_PARTIAL(); |
3300 | 45.4k | RRETURN(MATCH_NOMATCH); |
3301 | 0 | } |
3302 | 2.14M | if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_space) == 0) |
3303 | 1.99M | RRETURN(MATCH_NOMATCH); |
3304 | 154k | Feptr++; |
3305 | 154k | } |
3306 | 154k | break; |
3307 | | |
3308 | 909k | case OP_NOT_WORDCHAR: |
3309 | 1.56M | for (i = 1; i <= Lmin; i++) |
3310 | 909k | { |
3311 | 909k | if (Feptr >= mb->end_subject) |
3312 | 109 | { |
3313 | 109 | SCHECK_PARTIAL(); |
3314 | 109 | RRETURN(MATCH_NOMATCH); |
3315 | 0 | } |
3316 | 908k | if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_word) != 0) |
3317 | 658k | RRETURN(MATCH_NOMATCH); |
3318 | 658k | Feptr++; |
3319 | 658k | } |
3320 | 658k | break; |
3321 | | |
3322 | 658k | case OP_WORDCHAR: |
3323 | 213k | for (i = 1; i <= Lmin; i++) |
3324 | 144k | { |
3325 | 144k | if (Feptr >= mb->end_subject) |
3326 | 2.79k | { |
3327 | 2.79k | SCHECK_PARTIAL(); |
3328 | 2.79k | RRETURN(MATCH_NOMATCH); |
3329 | 0 | } |
3330 | 141k | if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_word) == 0) |
3331 | 72.9k | RRETURN(MATCH_NOMATCH); |
3332 | 68.6k | Feptr++; |
3333 | 68.6k | } |
3334 | 68.6k | break; |
3335 | | |
3336 | 68.6k | default: |
3337 | 0 | return PCRE2_ERROR_INTERNAL; |
3338 | 70.6M | } |
3339 | 71.3M | } |
3340 | | |
3341 | | /* If Lmin = Lmax we are done. Continue with the main loop. */ |
3342 | | |
3343 | 78.7M | if (Lmin == Lmax) continue; |
3344 | | |
3345 | | /* If minimizing, we have to test the rest of the pattern before each |
3346 | | subsequent match. */ |
3347 | | |
3348 | 71.5M | if (reptype == REPTYPE_MIN) |
3349 | 3.05M | { |
3350 | 3.05M | #ifdef SUPPORT_UNICODE |
3351 | 3.05M | if (proptype >= 0) |
3352 | 2.02k | { |
3353 | 2.02k | switch(proptype) |
3354 | 2.02k | { |
3355 | 0 | case PT_ANY: |
3356 | 0 | for (;;) |
3357 | 0 | { |
3358 | 0 | RMATCH(Fecode, RM208); |
3359 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3360 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
3361 | 0 | if (Feptr >= mb->end_subject) |
3362 | 0 | { |
3363 | 0 | SCHECK_PARTIAL(); |
3364 | 0 | RRETURN(MATCH_NOMATCH); |
3365 | 0 | } |
3366 | 0 | GETCHARINCTEST(fc, Feptr); |
3367 | 0 | if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH); |
3368 | 0 | } |
3369 | | /* Control never gets here */ |
3370 | | |
3371 | 0 | case PT_LAMP: |
3372 | 0 | for (;;) |
3373 | 0 | { |
3374 | 0 | int chartype; |
3375 | 0 | RMATCH(Fecode, RM209); |
3376 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3377 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
3378 | 0 | if (Feptr >= mb->end_subject) |
3379 | 0 | { |
3380 | 0 | SCHECK_PARTIAL(); |
3381 | 0 | RRETURN(MATCH_NOMATCH); |
3382 | 0 | } |
3383 | 0 | GETCHARINCTEST(fc, Feptr); |
3384 | 0 | chartype = UCD_CHARTYPE(fc); |
3385 | 0 | if ((chartype == ucp_Lu || |
3386 | 0 | chartype == ucp_Ll || |
3387 | 0 | chartype == ucp_Lt) == (Lctype == OP_NOTPROP)) |
3388 | 0 | RRETURN(MATCH_NOMATCH); |
3389 | 0 | } |
3390 | | /* Control never gets here */ |
3391 | | |
3392 | 2.02k | case PT_GC: |
3393 | 2.02k | for (;;) |
3394 | 134k | { |
3395 | 134k | RMATCH(Fecode, RM210); |
3396 | 134k | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3397 | 134k | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
3398 | 134k | if (Feptr >= mb->end_subject) |
3399 | 480 | { |
3400 | 480 | SCHECK_PARTIAL(); |
3401 | 480 | RRETURN(MATCH_NOMATCH); |
3402 | 0 | } |
3403 | 133k | GETCHARINCTEST(fc, Feptr); |
3404 | 133k | if ((UCD_CATEGORY(fc) == Lpropvalue) == (Lctype == OP_NOTPROP)) |
3405 | 132k | RRETURN(MATCH_NOMATCH); |
3406 | 132k | } |
3407 | | /* Control never gets here */ |
3408 | | |
3409 | 0 | case PT_PC: |
3410 | 0 | for (;;) |
3411 | 0 | { |
3412 | 0 | RMATCH(Fecode, RM211); |
3413 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3414 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
3415 | 0 | if (Feptr >= mb->end_subject) |
3416 | 0 | { |
3417 | 0 | SCHECK_PARTIAL(); |
3418 | 0 | RRETURN(MATCH_NOMATCH); |
3419 | 0 | } |
3420 | 0 | GETCHARINCTEST(fc, Feptr); |
3421 | 0 | if ((UCD_CHARTYPE(fc) == Lpropvalue) == (Lctype == OP_NOTPROP)) |
3422 | 0 | RRETURN(MATCH_NOMATCH); |
3423 | 0 | } |
3424 | | /* Control never gets here */ |
3425 | | |
3426 | 0 | case PT_SC: |
3427 | 0 | for (;;) |
3428 | 0 | { |
3429 | 0 | RMATCH(Fecode, RM212); |
3430 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3431 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
3432 | 0 | if (Feptr >= mb->end_subject) |
3433 | 0 | { |
3434 | 0 | SCHECK_PARTIAL(); |
3435 | 0 | RRETURN(MATCH_NOMATCH); |
3436 | 0 | } |
3437 | 0 | GETCHARINCTEST(fc, Feptr); |
3438 | 0 | if ((UCD_SCRIPT(fc) == Lpropvalue) == (Lctype == OP_NOTPROP)) |
3439 | 0 | RRETURN(MATCH_NOMATCH); |
3440 | 0 | } |
3441 | | /* Control never gets here */ |
3442 | | |
3443 | 0 | case PT_ALNUM: |
3444 | 0 | for (;;) |
3445 | 0 | { |
3446 | 0 | int category; |
3447 | 0 | RMATCH(Fecode, RM213); |
3448 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3449 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
3450 | 0 | if (Feptr >= mb->end_subject) |
3451 | 0 | { |
3452 | 0 | SCHECK_PARTIAL(); |
3453 | 0 | RRETURN(MATCH_NOMATCH); |
3454 | 0 | } |
3455 | 0 | GETCHARINCTEST(fc, Feptr); |
3456 | 0 | category = UCD_CATEGORY(fc); |
3457 | 0 | if ((category == ucp_L || category == ucp_N) == |
3458 | 0 | (Lctype == OP_NOTPROP)) |
3459 | 0 | RRETURN(MATCH_NOMATCH); |
3460 | 0 | } |
3461 | | /* Control never gets here */ |
3462 | | |
3463 | | /* Perl space used to exclude VT, but from Perl 5.18 it is included, |
3464 | | which means that Perl space and POSIX space are now identical. PCRE |
3465 | | was changed at release 8.34. */ |
3466 | | |
3467 | 0 | case PT_SPACE: /* Perl space */ |
3468 | 0 | case PT_PXSPACE: /* POSIX space */ |
3469 | 0 | for (;;) |
3470 | 0 | { |
3471 | 0 | RMATCH(Fecode, RM214); |
3472 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3473 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
3474 | 0 | if (Feptr >= mb->end_subject) |
3475 | 0 | { |
3476 | 0 | SCHECK_PARTIAL(); |
3477 | 0 | RRETURN(MATCH_NOMATCH); |
3478 | 0 | } |
3479 | 0 | GETCHARINCTEST(fc, Feptr); |
3480 | 0 | switch(fc) |
3481 | 0 | { |
3482 | 0 | HSPACE_CASES: |
3483 | 0 | VSPACE_CASES: |
3484 | 0 | if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH); |
3485 | 0 | break; |
3486 | | |
3487 | 0 | default: |
3488 | 0 | if ((UCD_CATEGORY(fc) == ucp_Z) == (Lctype == OP_NOTPROP)) |
3489 | 0 | RRETURN(MATCH_NOMATCH); |
3490 | 0 | break; |
3491 | 0 | } |
3492 | 0 | } |
3493 | | /* Control never gets here */ |
3494 | | |
3495 | 0 | case PT_WORD: |
3496 | 0 | for (;;) |
3497 | 0 | { |
3498 | 0 | int category; |
3499 | 0 | RMATCH(Fecode, RM215); |
3500 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3501 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
3502 | 0 | if (Feptr >= mb->end_subject) |
3503 | 0 | { |
3504 | 0 | SCHECK_PARTIAL(); |
3505 | 0 | RRETURN(MATCH_NOMATCH); |
3506 | 0 | } |
3507 | 0 | GETCHARINCTEST(fc, Feptr); |
3508 | 0 | category = UCD_CATEGORY(fc); |
3509 | 0 | if ((category == ucp_L || |
3510 | 0 | category == ucp_N || |
3511 | 0 | fc == CHAR_UNDERSCORE) == (Lctype == OP_NOTPROP)) |
3512 | 0 | RRETURN(MATCH_NOMATCH); |
3513 | 0 | } |
3514 | | /* Control never gets here */ |
3515 | | |
3516 | 0 | case PT_CLIST: |
3517 | 0 | for (;;) |
3518 | 0 | { |
3519 | 0 | const uint32_t *cp; |
3520 | 0 | RMATCH(Fecode, RM216); |
3521 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3522 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
3523 | 0 | if (Feptr >= mb->end_subject) |
3524 | 0 | { |
3525 | 0 | SCHECK_PARTIAL(); |
3526 | 0 | RRETURN(MATCH_NOMATCH); |
3527 | 0 | } |
3528 | 0 | GETCHARINCTEST(fc, Feptr); |
3529 | 0 | cp = PRIV(ucd_caseless_sets) + Lpropvalue; |
3530 | 0 | for (;;) |
3531 | 0 | { |
3532 | 0 | if (fc < *cp) |
3533 | 0 | { |
3534 | 0 | if (Lctype == OP_NOTPROP) break; |
3535 | 0 | RRETURN(MATCH_NOMATCH); |
3536 | 0 | } |
3537 | 0 | if (fc == *cp++) |
3538 | 0 | { |
3539 | 0 | if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH); |
3540 | 0 | break; |
3541 | 0 | } |
3542 | 0 | } |
3543 | 0 | } |
3544 | | /* Control never gets here */ |
3545 | | |
3546 | 0 | case PT_UCNC: |
3547 | 0 | for (;;) |
3548 | 0 | { |
3549 | 0 | RMATCH(Fecode, RM217); |
3550 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3551 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
3552 | 0 | if (Feptr >= mb->end_subject) |
3553 | 0 | { |
3554 | 0 | SCHECK_PARTIAL(); |
3555 | 0 | RRETURN(MATCH_NOMATCH); |
3556 | 0 | } |
3557 | 0 | GETCHARINCTEST(fc, Feptr); |
3558 | 0 | if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT || |
3559 | 0 | fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) || |
3560 | 0 | fc >= 0xe000) == (Lctype == OP_NOTPROP)) |
3561 | 0 | RRETURN(MATCH_NOMATCH); |
3562 | 0 | } |
3563 | | /* Control never gets here */ |
3564 | | |
3565 | | /* This should never occur */ |
3566 | 0 | default: |
3567 | 0 | return PCRE2_ERROR_INTERNAL; |
3568 | 2.02k | } |
3569 | 2.02k | } |
3570 | | |
3571 | | /* Match extended Unicode sequences. We will get here only if the |
3572 | | support is in the binary; otherwise a compile-time error occurs. */ |
3573 | | |
3574 | 3.05M | else if (Lctype == OP_EXTUNI) |
3575 | 406k | { |
3576 | 406k | for (;;) |
3577 | 32.5M | { |
3578 | 32.5M | RMATCH(Fecode, RM218); |
3579 | 32.5M | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3580 | 32.5M | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
3581 | 32.5M | if (Feptr >= mb->end_subject) |
3582 | 406k | { |
3583 | 406k | SCHECK_PARTIAL(); |
3584 | 406k | RRETURN(MATCH_NOMATCH); |
3585 | 0 | } |
3586 | 32.1M | else |
3587 | 32.1M | { |
3588 | 32.1M | GETCHARINCTEST(fc, Feptr); |
3589 | 32.1M | Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject, |
3590 | 32.1M | utf, NULL); |
3591 | 32.1M | } |
3592 | 32.1M | CHECK_PARTIAL(); |
3593 | 32.1M | } |
3594 | 406k | } |
3595 | 2.64M | else |
3596 | 2.64M | #endif /* SUPPORT_UNICODE */ |
3597 | | |
3598 | | /* UTF mode for non-property testing character types. */ |
3599 | | |
3600 | 2.64M | #ifdef SUPPORT_UNICODE |
3601 | 2.64M | if (utf) |
3602 | 0 | { |
3603 | 0 | for (;;) |
3604 | 0 | { |
3605 | 0 | RMATCH(Fecode, RM219); |
3606 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3607 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
3608 | 0 | if (Feptr >= mb->end_subject) |
3609 | 0 | { |
3610 | 0 | SCHECK_PARTIAL(); |
3611 | 0 | RRETURN(MATCH_NOMATCH); |
3612 | 0 | } |
3613 | 0 | if (Lctype == OP_ANY && IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH); |
3614 | 0 | GETCHARINC(fc, Feptr); |
3615 | 0 | switch(Lctype) |
3616 | 0 | { |
3617 | 0 | case OP_ANY: /* This is the non-NL case */ |
3618 | 0 | if (mb->partial != 0 && /* Take care with CRLF partial */ |
3619 | 0 | Feptr >= mb->end_subject && |
3620 | 0 | NLBLOCK->nltype == NLTYPE_FIXED && |
3621 | 0 | NLBLOCK->nllen == 2 && |
3622 | 0 | fc == NLBLOCK->nl[0]) |
3623 | 0 | { |
3624 | 0 | mb->hitend = TRUE; |
3625 | 0 | if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; |
3626 | 0 | } |
3627 | 0 | break; |
3628 | | |
3629 | 0 | case OP_ALLANY: |
3630 | 0 | case OP_ANYBYTE: |
3631 | 0 | break; |
3632 | | |
3633 | 0 | case OP_ANYNL: |
3634 | 0 | switch(fc) |
3635 | 0 | { |
3636 | 0 | default: RRETURN(MATCH_NOMATCH); |
3637 | |
|
3638 | 0 | case CHAR_CR: |
3639 | 0 | if (Feptr < mb->end_subject && UCHAR21(Feptr) == CHAR_LF) Feptr++; |
3640 | 0 | break; |
3641 | | |
3642 | 0 | case CHAR_LF: |
3643 | 0 | break; |
3644 | | |
3645 | 0 | case CHAR_VT: |
3646 | 0 | case CHAR_FF: |
3647 | 0 | case CHAR_NEL: |
3648 | 0 | #ifndef EBCDIC |
3649 | 0 | case 0x2028: |
3650 | 0 | case 0x2029: |
3651 | 0 | #endif /* Not EBCDIC */ |
3652 | 0 | if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) |
3653 | 0 | RRETURN(MATCH_NOMATCH); |
3654 | 0 | break; |
3655 | 0 | } |
3656 | 0 | break; |
3657 | | |
3658 | 0 | case OP_NOT_HSPACE: |
3659 | 0 | switch(fc) |
3660 | 0 | { |
3661 | 0 | HSPACE_CASES: RRETURN(MATCH_NOMATCH); |
3662 | 0 | default: break; |
3663 | 0 | } |
3664 | 0 | break; |
3665 | | |
3666 | 0 | case OP_HSPACE: |
3667 | 0 | switch(fc) |
3668 | 0 | { |
3669 | 0 | HSPACE_CASES: break; |
3670 | 0 | default: RRETURN(MATCH_NOMATCH); |
3671 | 0 | } |
3672 | 0 | break; |
3673 | | |
3674 | 0 | case OP_NOT_VSPACE: |
3675 | 0 | switch(fc) |
3676 | 0 | { |
3677 | 0 | VSPACE_CASES: RRETURN(MATCH_NOMATCH); |
3678 | 0 | default: break; |
3679 | 0 | } |
3680 | 0 | break; |
3681 | | |
3682 | 0 | case OP_VSPACE: |
3683 | 0 | switch(fc) |
3684 | 0 | { |
3685 | 0 | VSPACE_CASES: break; |
3686 | 0 | default: RRETURN(MATCH_NOMATCH); |
3687 | 0 | } |
3688 | 0 | break; |
3689 | | |
3690 | 0 | case OP_NOT_DIGIT: |
3691 | 0 | if (fc < 256 && (mb->ctypes[fc] & ctype_digit) != 0) |
3692 | 0 | RRETURN(MATCH_NOMATCH); |
3693 | 0 | break; |
3694 | | |
3695 | 0 | case OP_DIGIT: |
3696 | 0 | if (fc >= 256 || (mb->ctypes[fc] & ctype_digit) == 0) |
3697 | 0 | RRETURN(MATCH_NOMATCH); |
3698 | 0 | break; |
3699 | | |
3700 | 0 | case OP_NOT_WHITESPACE: |
3701 | 0 | if (fc < 256 && (mb->ctypes[fc] & ctype_space) != 0) |
3702 | 0 | RRETURN(MATCH_NOMATCH); |
3703 | 0 | break; |
3704 | | |
3705 | 0 | case OP_WHITESPACE: |
3706 | 0 | if (fc >= 256 || (mb->ctypes[fc] & ctype_space) == 0) |
3707 | 0 | RRETURN(MATCH_NOMATCH); |
3708 | 0 | break; |
3709 | | |
3710 | 0 | case OP_NOT_WORDCHAR: |
3711 | 0 | if (fc < 256 && (mb->ctypes[fc] & ctype_word) != 0) |
3712 | 0 | RRETURN(MATCH_NOMATCH); |
3713 | 0 | break; |
3714 | | |
3715 | 0 | case OP_WORDCHAR: |
3716 | 0 | if (fc >= 256 || (mb->ctypes[fc] & ctype_word) == 0) |
3717 | 0 | RRETURN(MATCH_NOMATCH); |
3718 | 0 | break; |
3719 | | |
3720 | 0 | default: |
3721 | 0 | return PCRE2_ERROR_INTERNAL; |
3722 | 0 | } |
3723 | 0 | } |
3724 | 0 | } |
3725 | 2.64M | else |
3726 | 2.64M | #endif /* SUPPORT_UNICODE */ |
3727 | | |
3728 | | /* Not UTF mode */ |
3729 | 2.64M | { |
3730 | 2.64M | for (;;) |
3731 | 125M | { |
3732 | 125M | RMATCH(Fecode, RM33); |
3733 | 125M | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3734 | 125M | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
3735 | 124M | if (Feptr >= mb->end_subject) |
3736 | 50.3k | { |
3737 | 50.3k | SCHECK_PARTIAL(); |
3738 | 50.3k | RRETURN(MATCH_NOMATCH); |
3739 | 0 | } |
3740 | 124M | if (Lctype == OP_ANY && IS_NEWLINE(Feptr)) |
3741 | 124M | RRETURN(MATCH_NOMATCH); |
3742 | 124M | fc = *Feptr++; |
3743 | 124M | switch(Lctype) |
3744 | 124M | { |
3745 | 1.23M | case OP_ANY: /* This is the non-NL case */ |
3746 | 1.23M | if (mb->partial != 0 && /* Take care with CRLF partial */ |
3747 | 1.23M | Feptr >= mb->end_subject && |
3748 | 1.23M | NLBLOCK->nltype == NLTYPE_FIXED && |
3749 | 1.23M | NLBLOCK->nllen == 2 && |
3750 | 1.23M | fc == NLBLOCK->nl[0]) |
3751 | 0 | { |
3752 | 0 | mb->hitend = TRUE; |
3753 | 0 | if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; |
3754 | 0 | } |
3755 | 1.23M | break; |
3756 | | |
3757 | 1.23M | case OP_ALLANY: |
3758 | 432k | case OP_ANYBYTE: |
3759 | 432k | break; |
3760 | | |
3761 | 883 | case OP_ANYNL: |
3762 | 883 | switch(fc) |
3763 | 883 | { |
3764 | 600 | default: RRETURN(MATCH_NOMATCH); |
3765 | |
|
3766 | 189 | case CHAR_CR: |
3767 | 189 | if (Feptr < mb->end_subject && *Feptr == CHAR_LF) Feptr++; |
3768 | 189 | break; |
3769 | | |
3770 | 35 | case CHAR_LF: |
3771 | 35 | break; |
3772 | | |
3773 | 2 | case CHAR_VT: |
3774 | 4 | case CHAR_FF: |
3775 | 59 | case CHAR_NEL: |
3776 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
3777 | | case 0x2028: |
3778 | | case 0x2029: |
3779 | | #endif |
3780 | 59 | if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) |
3781 | 59 | RRETURN(MATCH_NOMATCH); |
3782 | 59 | break; |
3783 | 883 | } |
3784 | 283 | break; |
3785 | | |
3786 | 47.1M | case OP_NOT_HSPACE: |
3787 | 47.1M | switch(fc) |
3788 | 47.1M | { |
3789 | 46.6M | default: break; |
3790 | 46.6M | HSPACE_BYTE_CASES: |
3791 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
3792 | | HSPACE_MULTIBYTE_CASES: |
3793 | | #endif |
3794 | 477k | RRETURN(MATCH_NOMATCH); |
3795 | 47.1M | } |
3796 | 46.6M | break; |
3797 | | |
3798 | 46.6M | case OP_HSPACE: |
3799 | 6 | switch(fc) |
3800 | 6 | { |
3801 | 6 | default: RRETURN(MATCH_NOMATCH); |
3802 | 0 | HSPACE_BYTE_CASES: |
3803 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
3804 | | HSPACE_MULTIBYTE_CASES: |
3805 | | #endif |
3806 | 0 | break; |
3807 | 6 | } |
3808 | 0 | break; |
3809 | | |
3810 | 44.3M | case OP_NOT_VSPACE: |
3811 | 44.3M | switch(fc) |
3812 | 44.3M | { |
3813 | 43.1M | default: break; |
3814 | 43.1M | VSPACE_BYTE_CASES: |
3815 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
3816 | | VSPACE_MULTIBYTE_CASES: |
3817 | | #endif |
3818 | 1.19M | RRETURN(MATCH_NOMATCH); |
3819 | 44.3M | } |
3820 | 43.1M | break; |
3821 | | |
3822 | 43.1M | case OP_VSPACE: |
3823 | 0 | switch(fc) |
3824 | 0 | { |
3825 | 0 | default: RRETURN(MATCH_NOMATCH); |
3826 | 0 | VSPACE_BYTE_CASES: |
3827 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
3828 | | VSPACE_MULTIBYTE_CASES: |
3829 | | #endif |
3830 | 0 | break; |
3831 | 0 | } |
3832 | 0 | break; |
3833 | | |
3834 | 6.08k | case OP_NOT_DIGIT: |
3835 | 6.08k | if (MAX_255(fc) && (mb->ctypes[fc] & ctype_digit) != 0) |
3836 | 5.74k | RRETURN(MATCH_NOMATCH); |
3837 | 5.74k | break; |
3838 | | |
3839 | 0 | case OP_DIGIT: |
3840 | 0 | if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_digit) == 0) |
3841 | 0 | RRETURN(MATCH_NOMATCH); |
3842 | 0 | break; |
3843 | | |
3844 | 29.8M | case OP_NOT_WHITESPACE: |
3845 | 29.8M | if (MAX_255(fc) && (mb->ctypes[fc] & ctype_space) != 0) |
3846 | 29.7M | RRETURN(MATCH_NOMATCH); |
3847 | 29.7M | break; |
3848 | | |
3849 | 4 | case OP_WHITESPACE: |
3850 | 4 | if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_space) == 0) |
3851 | 4 | RRETURN(MATCH_NOMATCH); |
3852 | 0 | break; |
3853 | | |
3854 | 1.82M | case OP_NOT_WORDCHAR: |
3855 | 1.82M | if (MAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0) |
3856 | 1.74M | RRETURN(MATCH_NOMATCH); |
3857 | 1.74M | break; |
3858 | | |
3859 | 31.2k | case OP_WORDCHAR: |
3860 | 31.2k | if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_word) == 0) |
3861 | 23.6k | RRETURN(MATCH_NOMATCH); |
3862 | 23.6k | break; |
3863 | | |
3864 | 0 | default: |
3865 | 0 | return PCRE2_ERROR_INTERNAL; |
3866 | 124M | } |
3867 | 124M | } |
3868 | 2.64M | } |
3869 | | /* Control never gets here */ |
3870 | 3.05M | } |
3871 | | |
3872 | | /* If maximizing, it is worth using inline code for speed, doing the type |
3873 | | test once at the start (i.e. keep it out of the loop). */ |
3874 | | |
3875 | 68.5M | else |
3876 | 68.5M | { |
3877 | 68.5M | Lstart_eptr = Feptr; /* Remember where we started */ |
3878 | | |
3879 | 68.5M | #ifdef SUPPORT_UNICODE |
3880 | 68.5M | if (proptype >= 0) |
3881 | 156k | { |
3882 | 156k | switch(proptype) |
3883 | 156k | { |
3884 | 0 | case PT_ANY: |
3885 | 0 | for (i = Lmin; i < Lmax; i++) |
3886 | 0 | { |
3887 | 0 | int len = 1; |
3888 | 0 | if (Feptr >= mb->end_subject) |
3889 | 0 | { |
3890 | 0 | SCHECK_PARTIAL(); |
3891 | 0 | break; |
3892 | 0 | } |
3893 | 0 | GETCHARLENTEST(fc, Feptr, len); |
3894 | 0 | if (Lctype == OP_NOTPROP) break; |
3895 | 0 | Feptr+= len; |
3896 | 0 | } |
3897 | 0 | break; |
3898 | | |
3899 | 0 | case PT_LAMP: |
3900 | 0 | for (i = Lmin; i < Lmax; i++) |
3901 | 0 | { |
3902 | 0 | int chartype; |
3903 | 0 | int len = 1; |
3904 | 0 | if (Feptr >= mb->end_subject) |
3905 | 0 | { |
3906 | 0 | SCHECK_PARTIAL(); |
3907 | 0 | break; |
3908 | 0 | } |
3909 | 0 | GETCHARLENTEST(fc, Feptr, len); |
3910 | 0 | chartype = UCD_CHARTYPE(fc); |
3911 | 0 | if ((chartype == ucp_Lu || |
3912 | 0 | chartype == ucp_Ll || |
3913 | 0 | chartype == ucp_Lt) == (Lctype == OP_NOTPROP)) |
3914 | 0 | break; |
3915 | 0 | Feptr+= len; |
3916 | 0 | } |
3917 | 0 | break; |
3918 | | |
3919 | 156k | case PT_GC: |
3920 | 2.94M | for (i = Lmin; i < Lmax; i++) |
3921 | 2.94M | { |
3922 | 2.94M | int len = 1; |
3923 | 2.94M | if (Feptr >= mb->end_subject) |
3924 | 22.2k | { |
3925 | 22.2k | SCHECK_PARTIAL(); |
3926 | 22.2k | break; |
3927 | 22.2k | } |
3928 | 2.91M | GETCHARLENTEST(fc, Feptr, len); |
3929 | 2.91M | if ((UCD_CATEGORY(fc) == Lpropvalue) == (Lctype == OP_NOTPROP)) |
3930 | 128k | break; |
3931 | 2.79M | Feptr+= len; |
3932 | 2.79M | } |
3933 | 156k | break; |
3934 | | |
3935 | 156k | case PT_PC: |
3936 | 0 | for (i = Lmin; i < Lmax; i++) |
3937 | 0 | { |
3938 | 0 | int len = 1; |
3939 | 0 | if (Feptr >= mb->end_subject) |
3940 | 0 | { |
3941 | 0 | SCHECK_PARTIAL(); |
3942 | 0 | break; |
3943 | 0 | } |
3944 | 0 | GETCHARLENTEST(fc, Feptr, len); |
3945 | 0 | if ((UCD_CHARTYPE(fc) == Lpropvalue) == (Lctype == OP_NOTPROP)) |
3946 | 0 | break; |
3947 | 0 | Feptr+= len; |
3948 | 0 | } |
3949 | 0 | break; |
3950 | | |
3951 | 0 | case PT_SC: |
3952 | 0 | for (i = Lmin; i < Lmax; i++) |
3953 | 0 | { |
3954 | 0 | int len = 1; |
3955 | 0 | if (Feptr >= mb->end_subject) |
3956 | 0 | { |
3957 | 0 | SCHECK_PARTIAL(); |
3958 | 0 | break; |
3959 | 0 | } |
3960 | 0 | GETCHARLENTEST(fc, Feptr, len); |
3961 | 0 | if ((UCD_SCRIPT(fc) == Lpropvalue) == (Lctype == OP_NOTPROP)) |
3962 | 0 | break; |
3963 | 0 | Feptr+= len; |
3964 | 0 | } |
3965 | 0 | break; |
3966 | | |
3967 | 0 | case PT_ALNUM: |
3968 | 0 | for (i = Lmin; i < Lmax; i++) |
3969 | 0 | { |
3970 | 0 | int category; |
3971 | 0 | int len = 1; |
3972 | 0 | if (Feptr >= mb->end_subject) |
3973 | 0 | { |
3974 | 0 | SCHECK_PARTIAL(); |
3975 | 0 | break; |
3976 | 0 | } |
3977 | 0 | GETCHARLENTEST(fc, Feptr, len); |
3978 | 0 | category = UCD_CATEGORY(fc); |
3979 | 0 | if ((category == ucp_L || category == ucp_N) == |
3980 | 0 | (Lctype == OP_NOTPROP)) |
3981 | 0 | break; |
3982 | 0 | Feptr+= len; |
3983 | 0 | } |
3984 | 0 | break; |
3985 | | |
3986 | | /* Perl space used to exclude VT, but from Perl 5.18 it is included, |
3987 | | which means that Perl space and POSIX space are now identical. PCRE |
3988 | | was changed at release 8.34. */ |
3989 | | |
3990 | 0 | case PT_SPACE: /* Perl space */ |
3991 | 0 | case PT_PXSPACE: /* POSIX space */ |
3992 | 0 | for (i = Lmin; i < Lmax; i++) |
3993 | 0 | { |
3994 | 0 | int len = 1; |
3995 | 0 | if (Feptr >= mb->end_subject) |
3996 | 0 | { |
3997 | 0 | SCHECK_PARTIAL(); |
3998 | 0 | break; |
3999 | 0 | } |
4000 | 0 | GETCHARLENTEST(fc, Feptr, len); |
4001 | 0 | switch(fc) |
4002 | 0 | { |
4003 | 0 | HSPACE_CASES: |
4004 | 0 | VSPACE_CASES: |
4005 | 0 | if (Lctype == OP_NOTPROP) goto ENDLOOP99; /* Break the loop */ |
4006 | 0 | break; |
4007 | | |
4008 | 0 | default: |
4009 | 0 | if ((UCD_CATEGORY(fc) == ucp_Z) == (Lctype == OP_NOTPROP)) |
4010 | 0 | goto ENDLOOP99; /* Break the loop */ |
4011 | 0 | break; |
4012 | 0 | } |
4013 | 0 | Feptr+= len; |
4014 | 0 | } |
4015 | 0 | ENDLOOP99: |
4016 | 0 | break; |
4017 | | |
4018 | 0 | case PT_WORD: |
4019 | 0 | for (i = Lmin; i < Lmax; i++) |
4020 | 0 | { |
4021 | 0 | int category; |
4022 | 0 | int len = 1; |
4023 | 0 | if (Feptr >= mb->end_subject) |
4024 | 0 | { |
4025 | 0 | SCHECK_PARTIAL(); |
4026 | 0 | break; |
4027 | 0 | } |
4028 | 0 | GETCHARLENTEST(fc, Feptr, len); |
4029 | 0 | category = UCD_CATEGORY(fc); |
4030 | 0 | if ((category == ucp_L || category == ucp_N || |
4031 | 0 | fc == CHAR_UNDERSCORE) == (Lctype == OP_NOTPROP)) |
4032 | 0 | break; |
4033 | 0 | Feptr+= len; |
4034 | 0 | } |
4035 | 0 | break; |
4036 | | |
4037 | 0 | case PT_CLIST: |
4038 | 0 | for (i = Lmin; i < Lmax; i++) |
4039 | 0 | { |
4040 | 0 | const uint32_t *cp; |
4041 | 0 | int len = 1; |
4042 | 0 | if (Feptr >= mb->end_subject) |
4043 | 0 | { |
4044 | 0 | SCHECK_PARTIAL(); |
4045 | 0 | break; |
4046 | 0 | } |
4047 | 0 | GETCHARLENTEST(fc, Feptr, len); |
4048 | 0 | cp = PRIV(ucd_caseless_sets) + Lpropvalue; |
4049 | 0 | for (;;) |
4050 | 0 | { |
4051 | 0 | if (fc < *cp) |
4052 | 0 | { if (Lctype == OP_NOTPROP) break; else goto GOT_MAX; } |
4053 | 0 | if (fc == *cp++) |
4054 | 0 | { if (Lctype == OP_NOTPROP) goto GOT_MAX; else break; } |
4055 | 0 | } |
4056 | 0 | Feptr += len; |
4057 | 0 | } |
4058 | 0 | GOT_MAX: |
4059 | 0 | break; |
4060 | | |
4061 | 0 | case PT_UCNC: |
4062 | 0 | for (i = Lmin; i < Lmax; i++) |
4063 | 0 | { |
4064 | 0 | int len = 1; |
4065 | 0 | if (Feptr >= mb->end_subject) |
4066 | 0 | { |
4067 | 0 | SCHECK_PARTIAL(); |
4068 | 0 | break; |
4069 | 0 | } |
4070 | 0 | GETCHARLENTEST(fc, Feptr, len); |
4071 | 0 | if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT || |
4072 | 0 | fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) || |
4073 | 0 | fc >= 0xe000) == (Lctype == OP_NOTPROP)) |
4074 | 0 | break; |
4075 | 0 | Feptr += len; |
4076 | 0 | } |
4077 | 0 | break; |
4078 | | |
4079 | 0 | default: |
4080 | 0 | return PCRE2_ERROR_INTERNAL; |
4081 | 156k | } |
4082 | | |
4083 | | /* Feptr is now past the end of the maximum run */ |
4084 | | |
4085 | 156k | if (reptype == REPTYPE_POS) continue; /* No backtracking */ |
4086 | | |
4087 | | /* After \C in UTF mode, Lstart_eptr might be in the middle of a |
4088 | | Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't |
4089 | | go too far. */ |
4090 | | |
4091 | 143k | for(;;) |
4092 | 2.92M | { |
4093 | 2.92M | if (Feptr <= Lstart_eptr) break; |
4094 | 2.78M | RMATCH(Fecode, RM222); |
4095 | 2.78M | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
4096 | 2.78M | Feptr--; |
4097 | 2.78M | if (utf) BACKCHAR(Feptr); |
4098 | 2.78M | } |
4099 | 143k | } |
4100 | | |
4101 | | /* Match extended Unicode grapheme clusters. We will get here only if the |
4102 | | support is in the binary; otherwise a compile-time error occurs. */ |
4103 | | |
4104 | 68.3M | else if (Lctype == OP_EXTUNI) |
4105 | 121k | { |
4106 | 73.4M | for (i = Lmin; i < Lmax; i++) |
4107 | 73.4M | { |
4108 | 73.4M | if (Feptr >= mb->end_subject) |
4109 | 113k | { |
4110 | 113k | SCHECK_PARTIAL(); |
4111 | 113k | break; |
4112 | 113k | } |
4113 | 73.3M | else |
4114 | 73.3M | { |
4115 | 73.3M | GETCHARINCTEST(fc, Feptr); |
4116 | 73.3M | Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject, |
4117 | 73.3M | utf, NULL); |
4118 | 73.3M | } |
4119 | 73.3M | CHECK_PARTIAL(); |
4120 | 73.3M | } |
4121 | | |
4122 | | /* Feptr is now past the end of the maximum run */ |
4123 | | |
4124 | 121k | if (reptype == REPTYPE_POS) continue; /* No backtracking */ |
4125 | | |
4126 | | /* We use <= Lstart_eptr rather than == Lstart_eptr to detect the start |
4127 | | of the run while backtracking because the use of \C in UTF mode can |
4128 | | cause BACKCHAR to move back past Lstart_eptr. This is just palliative; |
4129 | | the use of \C in UTF mode is fraught with danger. */ |
4130 | | |
4131 | 120k | for(;;) |
4132 | 72.7M | { |
4133 | 72.7M | int lgb, rgb; |
4134 | 72.7M | PCRE2_SPTR fptr; |
4135 | | |
4136 | 72.7M | if (Feptr <= Lstart_eptr) break; /* At start of char run */ |
4137 | 72.5M | RMATCH(Fecode, RM220); |
4138 | 72.5M | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
4139 | | |
4140 | | /* Backtracking over an extended grapheme cluster involves inspecting |
4141 | | the previous two characters (if present) to see if a break is |
4142 | | permitted between them. */ |
4143 | | |
4144 | 72.5M | Feptr--; |
4145 | 72.5M | if (!utf) fc = *Feptr; else |
4146 | 0 | { |
4147 | 0 | BACKCHAR(Feptr); |
4148 | 0 | GETCHAR(fc, Feptr); |
4149 | 0 | } |
4150 | 72.5M | rgb = UCD_GRAPHBREAK(fc); |
4151 | | |
4152 | 72.5M | for (;;) |
4153 | 72.6M | { |
4154 | 72.6M | if (Feptr <= Lstart_eptr) break; /* At start of char run */ |
4155 | 72.5M | fptr = Feptr - 1; |
4156 | 72.5M | if (!utf) fc = *fptr; else |
4157 | 0 | { |
4158 | 0 | BACKCHAR(fptr); |
4159 | 0 | GETCHAR(fc, fptr); |
4160 | 0 | } |
4161 | 72.5M | lgb = UCD_GRAPHBREAK(fc); |
4162 | 72.5M | if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break; |
4163 | 65.5k | Feptr = fptr; |
4164 | 65.5k | rgb = lgb; |
4165 | 65.5k | } |
4166 | 72.5M | } |
4167 | 120k | } |
4168 | | |
4169 | 68.2M | else |
4170 | 68.2M | #endif /* SUPPORT_UNICODE */ |
4171 | | |
4172 | 68.2M | #ifdef SUPPORT_UNICODE |
4173 | 68.2M | if (utf) |
4174 | 0 | { |
4175 | 0 | switch(Lctype) |
4176 | 0 | { |
4177 | 0 | case OP_ANY: |
4178 | 0 | for (i = Lmin; i < Lmax; i++) |
4179 | 0 | { |
4180 | 0 | if (Feptr >= mb->end_subject) |
4181 | 0 | { |
4182 | 0 | SCHECK_PARTIAL(); |
4183 | 0 | break; |
4184 | 0 | } |
4185 | 0 | if (IS_NEWLINE(Feptr)) break; |
4186 | 0 | if (mb->partial != 0 && /* Take care with CRLF partial */ |
4187 | 0 | Feptr + 1 >= mb->end_subject && |
4188 | 0 | NLBLOCK->nltype == NLTYPE_FIXED && |
4189 | 0 | NLBLOCK->nllen == 2 && |
4190 | 0 | UCHAR21(Feptr) == NLBLOCK->nl[0]) |
4191 | 0 | { |
4192 | 0 | mb->hitend = TRUE; |
4193 | 0 | if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; |
4194 | 0 | } |
4195 | 0 | Feptr++; |
4196 | 0 | ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++); |
4197 | 0 | } |
4198 | 0 | break; |
4199 | | |
4200 | 0 | case OP_ALLANY: |
4201 | 0 | if (Lmax < UINT32_MAX) |
4202 | 0 | { |
4203 | 0 | for (i = Lmin; i < Lmax; i++) |
4204 | 0 | { |
4205 | 0 | if (Feptr >= mb->end_subject) |
4206 | 0 | { |
4207 | 0 | SCHECK_PARTIAL(); |
4208 | 0 | break; |
4209 | 0 | } |
4210 | 0 | Feptr++; |
4211 | 0 | ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++); |
4212 | 0 | } |
4213 | 0 | } |
4214 | 0 | else |
4215 | 0 | { |
4216 | 0 | Feptr = mb->end_subject; /* Unlimited UTF-8 repeat */ |
4217 | 0 | SCHECK_PARTIAL(); |
4218 | 0 | } |
4219 | 0 | break; |
4220 | | |
4221 | | /* The "byte" (i.e. "code unit") case is the same as non-UTF */ |
4222 | | |
4223 | 0 | case OP_ANYBYTE: |
4224 | 0 | fc = Lmax - Lmin; |
4225 | 0 | if (fc > (uint32_t)(mb->end_subject - Feptr)) |
4226 | 0 | { |
4227 | 0 | Feptr = mb->end_subject; |
4228 | 0 | SCHECK_PARTIAL(); |
4229 | 0 | } |
4230 | 0 | else Feptr += fc; |
4231 | 0 | break; |
4232 | | |
4233 | 0 | case OP_ANYNL: |
4234 | 0 | for (i = Lmin; i < Lmax; i++) |
4235 | 0 | { |
4236 | 0 | int len = 1; |
4237 | 0 | if (Feptr >= mb->end_subject) |
4238 | 0 | { |
4239 | 0 | SCHECK_PARTIAL(); |
4240 | 0 | break; |
4241 | 0 | } |
4242 | 0 | GETCHARLEN(fc, Feptr, len); |
4243 | 0 | if (fc == CHAR_CR) |
4244 | 0 | { |
4245 | 0 | if (++Feptr >= mb->end_subject) break; |
4246 | 0 | if (UCHAR21(Feptr) == CHAR_LF) Feptr++; |
4247 | 0 | } |
4248 | 0 | else |
4249 | 0 | { |
4250 | 0 | if (fc != CHAR_LF && |
4251 | 0 | (mb->bsr_convention == PCRE2_BSR_ANYCRLF || |
4252 | 0 | (fc != CHAR_VT && fc != CHAR_FF && fc != CHAR_NEL |
4253 | 0 | #ifndef EBCDIC |
4254 | 0 | && fc != 0x2028 && fc != 0x2029 |
4255 | 0 | #endif /* Not EBCDIC */ |
4256 | 0 | ))) |
4257 | 0 | break; |
4258 | 0 | Feptr += len; |
4259 | 0 | } |
4260 | 0 | } |
4261 | 0 | break; |
4262 | | |
4263 | 0 | case OP_NOT_HSPACE: |
4264 | 0 | case OP_HSPACE: |
4265 | 0 | for (i = Lmin; i < Lmax; i++) |
4266 | 0 | { |
4267 | 0 | BOOL gotspace; |
4268 | 0 | int len = 1; |
4269 | 0 | if (Feptr >= mb->end_subject) |
4270 | 0 | { |
4271 | 0 | SCHECK_PARTIAL(); |
4272 | 0 | break; |
4273 | 0 | } |
4274 | 0 | GETCHARLEN(fc, Feptr, len); |
4275 | 0 | switch(fc) |
4276 | 0 | { |
4277 | 0 | HSPACE_CASES: gotspace = TRUE; break; |
4278 | 0 | default: gotspace = FALSE; break; |
4279 | 0 | } |
4280 | 0 | if (gotspace == (Lctype == OP_NOT_HSPACE)) break; |
4281 | 0 | Feptr += len; |
4282 | 0 | } |
4283 | 0 | break; |
4284 | | |
4285 | 0 | case OP_NOT_VSPACE: |
4286 | 0 | case OP_VSPACE: |
4287 | 0 | for (i = Lmin; i < Lmax; i++) |
4288 | 0 | { |
4289 | 0 | BOOL gotspace; |
4290 | 0 | int len = 1; |
4291 | 0 | if (Feptr >= mb->end_subject) |
4292 | 0 | { |
4293 | 0 | SCHECK_PARTIAL(); |
4294 | 0 | break; |
4295 | 0 | } |
4296 | 0 | GETCHARLEN(fc, Feptr, len); |
4297 | 0 | switch(fc) |
4298 | 0 | { |
4299 | 0 | VSPACE_CASES: gotspace = TRUE; break; |
4300 | 0 | default: gotspace = FALSE; break; |
4301 | 0 | } |
4302 | 0 | if (gotspace == (Lctype == OP_NOT_VSPACE)) break; |
4303 | 0 | Feptr += len; |
4304 | 0 | } |
4305 | 0 | break; |
4306 | | |
4307 | 0 | case OP_NOT_DIGIT: |
4308 | 0 | for (i = Lmin; i < Lmax; i++) |
4309 | 0 | { |
4310 | 0 | int len = 1; |
4311 | 0 | if (Feptr >= mb->end_subject) |
4312 | 0 | { |
4313 | 0 | SCHECK_PARTIAL(); |
4314 | 0 | break; |
4315 | 0 | } |
4316 | 0 | GETCHARLEN(fc, Feptr, len); |
4317 | 0 | if (fc < 256 && (mb->ctypes[fc] & ctype_digit) != 0) break; |
4318 | 0 | Feptr+= len; |
4319 | 0 | } |
4320 | 0 | break; |
4321 | | |
4322 | 0 | case OP_DIGIT: |
4323 | 0 | for (i = Lmin; i < Lmax; i++) |
4324 | 0 | { |
4325 | 0 | int len = 1; |
4326 | 0 | if (Feptr >= mb->end_subject) |
4327 | 0 | { |
4328 | 0 | SCHECK_PARTIAL(); |
4329 | 0 | break; |
4330 | 0 | } |
4331 | 0 | GETCHARLEN(fc, Feptr, len); |
4332 | 0 | if (fc >= 256 ||(mb->ctypes[fc] & ctype_digit) == 0) break; |
4333 | 0 | Feptr+= len; |
4334 | 0 | } |
4335 | 0 | break; |
4336 | | |
4337 | 0 | case OP_NOT_WHITESPACE: |
4338 | 0 | for (i = Lmin; i < Lmax; i++) |
4339 | 0 | { |
4340 | 0 | int len = 1; |
4341 | 0 | if (Feptr >= mb->end_subject) |
4342 | 0 | { |
4343 | 0 | SCHECK_PARTIAL(); |
4344 | 0 | break; |
4345 | 0 | } |
4346 | 0 | GETCHARLEN(fc, Feptr, len); |
4347 | 0 | if (fc < 256 && (mb->ctypes[fc] & ctype_space) != 0) break; |
4348 | 0 | Feptr+= len; |
4349 | 0 | } |
4350 | 0 | break; |
4351 | | |
4352 | 0 | case OP_WHITESPACE: |
4353 | 0 | for (i = Lmin; i < Lmax; i++) |
4354 | 0 | { |
4355 | 0 | int len = 1; |
4356 | 0 | if (Feptr >= mb->end_subject) |
4357 | 0 | { |
4358 | 0 | SCHECK_PARTIAL(); |
4359 | 0 | break; |
4360 | 0 | } |
4361 | 0 | GETCHARLEN(fc, Feptr, len); |
4362 | 0 | if (fc >= 256 ||(mb->ctypes[fc] & ctype_space) == 0) break; |
4363 | 0 | Feptr+= len; |
4364 | 0 | } |
4365 | 0 | break; |
4366 | | |
4367 | 0 | case OP_NOT_WORDCHAR: |
4368 | 0 | for (i = Lmin; i < Lmax; i++) |
4369 | 0 | { |
4370 | 0 | int len = 1; |
4371 | 0 | if (Feptr >= mb->end_subject) |
4372 | 0 | { |
4373 | 0 | SCHECK_PARTIAL(); |
4374 | 0 | break; |
4375 | 0 | } |
4376 | 0 | GETCHARLEN(fc, Feptr, len); |
4377 | 0 | if (fc < 256 && (mb->ctypes[fc] & ctype_word) != 0) break; |
4378 | 0 | Feptr+= len; |
4379 | 0 | } |
4380 | 0 | break; |
4381 | | |
4382 | 0 | case OP_WORDCHAR: |
4383 | 0 | for (i = Lmin; i < Lmax; i++) |
4384 | 0 | { |
4385 | 0 | int len = 1; |
4386 | 0 | if (Feptr >= mb->end_subject) |
4387 | 0 | { |
4388 | 0 | SCHECK_PARTIAL(); |
4389 | 0 | break; |
4390 | 0 | } |
4391 | 0 | GETCHARLEN(fc, Feptr, len); |
4392 | 0 | if (fc >= 256 || (mb->ctypes[fc] & ctype_word) == 0) break; |
4393 | 0 | Feptr+= len; |
4394 | 0 | } |
4395 | 0 | break; |
4396 | | |
4397 | 0 | default: |
4398 | 0 | return PCRE2_ERROR_INTERNAL; |
4399 | 0 | } |
4400 | | |
4401 | 0 | if (reptype == REPTYPE_POS) continue; /* No backtracking */ |
4402 | | |
4403 | | /* After \C in UTF mode, Lstart_eptr might be in the middle of a |
4404 | | Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't go |
4405 | | too far. */ |
4406 | | |
4407 | 0 | for(;;) |
4408 | 0 | { |
4409 | 0 | if (Feptr <= Lstart_eptr) break; |
4410 | 0 | RMATCH(Fecode, RM221); |
4411 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
4412 | 0 | Feptr--; |
4413 | 0 | BACKCHAR(Feptr); |
4414 | 0 | if (Lctype == OP_ANYNL && Feptr > Lstart_eptr && |
4415 | 0 | UCHAR21(Feptr) == CHAR_NL && UCHAR21(Feptr - 1) == CHAR_CR) |
4416 | 0 | Feptr--; |
4417 | 0 | } |
4418 | 0 | } |
4419 | 68.2M | else |
4420 | 68.2M | #endif /* SUPPORT_UNICODE */ |
4421 | | |
4422 | | /* Not UTF mode */ |
4423 | 68.2M | { |
4424 | 68.2M | switch(Lctype) |
4425 | 68.2M | { |
4426 | 5.41M | case OP_ANY: |
4427 | 363M | for (i = Lmin; i < Lmax; i++) |
4428 | 363M | { |
4429 | 363M | if (Feptr >= mb->end_subject) |
4430 | 1.63M | { |
4431 | 1.63M | SCHECK_PARTIAL(); |
4432 | 1.63M | break; |
4433 | 1.63M | } |
4434 | 362M | if (IS_NEWLINE(Feptr)) break; |
4435 | 358M | if (mb->partial != 0 && /* Take care with CRLF partial */ |
4436 | 358M | Feptr + 1 >= mb->end_subject && |
4437 | 358M | NLBLOCK->nltype == NLTYPE_FIXED && |
4438 | 358M | NLBLOCK->nllen == 2 && |
4439 | 358M | *Feptr == NLBLOCK->nl[0]) |
4440 | 0 | { |
4441 | 0 | mb->hitend = TRUE; |
4442 | 0 | if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; |
4443 | 0 | } |
4444 | 358M | Feptr++; |
4445 | 358M | } |
4446 | 5.41M | break; |
4447 | | |
4448 | 5.41M | case OP_ALLANY: |
4449 | 26.6k | case OP_ANYBYTE: |
4450 | 26.6k | fc = Lmax - Lmin; |
4451 | 26.6k | if (fc > (uint32_t)(mb->end_subject - Feptr)) |
4452 | 24.8k | { |
4453 | 24.8k | Feptr = mb->end_subject; |
4454 | 24.8k | SCHECK_PARTIAL(); |
4455 | 24.8k | } |
4456 | 1.78k | else Feptr += fc; |
4457 | 26.6k | break; |
4458 | | |
4459 | 11.3M | case OP_ANYNL: |
4460 | 16.9M | for (i = Lmin; i < Lmax; i++) |
4461 | 16.7M | { |
4462 | 16.7M | if (Feptr >= mb->end_subject) |
4463 | 15.1k | { |
4464 | 15.1k | SCHECK_PARTIAL(); |
4465 | 15.1k | break; |
4466 | 15.1k | } |
4467 | 16.7M | fc = *Feptr; |
4468 | 16.7M | if (fc == CHAR_CR) |
4469 | 689k | { |
4470 | 689k | if (++Feptr >= mb->end_subject) break; |
4471 | 689k | if (*Feptr == CHAR_LF) Feptr++; |
4472 | 689k | } |
4473 | 16.0M | else |
4474 | 16.0M | { |
4475 | 16.0M | if (fc != CHAR_LF && (mb->bsr_convention == PCRE2_BSR_ANYCRLF || |
4476 | 11.3M | (fc != CHAR_VT && fc != CHAR_FF && fc != CHAR_NEL |
4477 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
4478 | | && fc != 0x2028 && fc != 0x2029 |
4479 | | #endif |
4480 | 11.3M | ))) break; |
4481 | 4.95M | Feptr++; |
4482 | 4.95M | } |
4483 | 16.7M | } |
4484 | 11.3M | break; |
4485 | | |
4486 | 11.3M | case OP_NOT_HSPACE: |
4487 | 77.9M | for (i = Lmin; i < Lmax; i++) |
4488 | 77.8M | { |
4489 | 77.8M | if (Feptr >= mb->end_subject) |
4490 | 141k | { |
4491 | 141k | SCHECK_PARTIAL(); |
4492 | 141k | break; |
4493 | 141k | } |
4494 | 77.6M | switch(*Feptr) |
4495 | 77.6M | { |
4496 | 76.2M | default: Feptr++; break; |
4497 | 2.99M | HSPACE_BYTE_CASES: |
4498 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
4499 | | HSPACE_MULTIBYTE_CASES: |
4500 | | #endif |
4501 | 2.99M | goto ENDLOOP00; |
4502 | 77.6M | } |
4503 | 77.6M | } |
4504 | 1.70M | ENDLOOP00: |
4505 | 1.70M | break; |
4506 | | |
4507 | 765k | case OP_HSPACE: |
4508 | 793k | for (i = Lmin; i < Lmax; i++) |
4509 | 788k | { |
4510 | 788k | if (Feptr >= mb->end_subject) |
4511 | 1.43k | { |
4512 | 1.43k | SCHECK_PARTIAL(); |
4513 | 1.43k | break; |
4514 | 1.43k | } |
4515 | 787k | switch(*Feptr) |
4516 | 787k | { |
4517 | 759k | default: goto ENDLOOP01; |
4518 | 759k | HSPACE_BYTE_CASES: |
4519 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
4520 | | HSPACE_MULTIBYTE_CASES: |
4521 | | #endif |
4522 | 63.9k | Feptr++; break; |
4523 | 787k | } |
4524 | 787k | } |
4525 | 765k | ENDLOOP01: |
4526 | 765k | break; |
4527 | | |
4528 | 33.2M | case OP_NOT_VSPACE: |
4529 | 124M | for (i = Lmin; i < Lmax; i++) |
4530 | 92.3M | { |
4531 | 92.3M | if (Feptr >= mb->end_subject) |
4532 | 39.9k | { |
4533 | 39.9k | SCHECK_PARTIAL(); |
4534 | 39.9k | break; |
4535 | 39.9k | } |
4536 | 92.2M | switch(*Feptr) |
4537 | 92.2M | { |
4538 | 90.8M | default: Feptr++; break; |
4539 | 5.31M | VSPACE_BYTE_CASES: |
4540 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
4541 | | VSPACE_MULTIBYTE_CASES: |
4542 | | #endif |
4543 | 5.31M | goto ENDLOOP02; |
4544 | 92.2M | } |
4545 | 92.2M | } |
4546 | 33.2M | ENDLOOP02: |
4547 | 33.2M | break; |
4548 | | |
4549 | 31.9M | case OP_VSPACE: |
4550 | 289k | for (i = Lmin; i < Lmax; i++) |
4551 | 282k | { |
4552 | 282k | if (Feptr >= mb->end_subject) |
4553 | 1.97k | { |
4554 | 1.97k | SCHECK_PARTIAL(); |
4555 | 1.97k | break; |
4556 | 1.97k | } |
4557 | 280k | switch(*Feptr) |
4558 | 280k | { |
4559 | 117k | default: goto ENDLOOP03; |
4560 | 607k | VSPACE_BYTE_CASES: |
4561 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
4562 | | VSPACE_MULTIBYTE_CASES: |
4563 | | #endif |
4564 | 607k | Feptr++; break; |
4565 | 280k | } |
4566 | 280k | } |
4567 | 126k | ENDLOOP03: |
4568 | 126k | break; |
4569 | | |
4570 | 3.72M | case OP_NOT_DIGIT: |
4571 | 689M | for (i = Lmin; i < Lmax; i++) |
4572 | 688M | { |
4573 | 688M | if (Feptr >= mb->end_subject) |
4574 | 76.0k | { |
4575 | 76.0k | SCHECK_PARTIAL(); |
4576 | 76.0k | break; |
4577 | 76.0k | } |
4578 | 688M | if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_digit) != 0) |
4579 | 3.07M | break; |
4580 | 685M | Feptr++; |
4581 | 685M | } |
4582 | 3.72M | break; |
4583 | | |
4584 | 3.72M | case OP_DIGIT: |
4585 | 1.63M | for (i = Lmin; i < Lmax; i++) |
4586 | 1.63M | { |
4587 | 1.63M | if (Feptr >= mb->end_subject) |
4588 | 34.9k | { |
4589 | 34.9k | SCHECK_PARTIAL(); |
4590 | 34.9k | break; |
4591 | 34.9k | } |
4592 | 1.59M | if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_digit) == 0) |
4593 | 290k | break; |
4594 | 1.30M | Feptr++; |
4595 | 1.30M | } |
4596 | 328k | break; |
4597 | | |
4598 | 328k | case OP_NOT_WHITESPACE: |
4599 | 3.21M | for (i = Lmin; i < Lmax; i++) |
4600 | 2.94M | { |
4601 | 2.94M | if (Feptr >= mb->end_subject) |
4602 | 2.14k | { |
4603 | 2.14k | SCHECK_PARTIAL(); |
4604 | 2.14k | break; |
4605 | 2.14k | } |
4606 | 2.94M | if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_space) != 0) |
4607 | 20.9k | break; |
4608 | 2.92M | Feptr++; |
4609 | 2.92M | } |
4610 | 287k | break; |
4611 | | |
4612 | 10.5M | case OP_WHITESPACE: |
4613 | 103M | for (i = Lmin; i < Lmax; i++) |
4614 | 103M | { |
4615 | 103M | if (Feptr >= mb->end_subject) |
4616 | 2.50M | { |
4617 | 2.50M | SCHECK_PARTIAL(); |
4618 | 2.50M | break; |
4619 | 2.50M | } |
4620 | 101M | if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_space) == 0) |
4621 | 8.03M | break; |
4622 | 93.2M | Feptr++; |
4623 | 93.2M | } |
4624 | 10.5M | break; |
4625 | | |
4626 | 10.5M | case OP_NOT_WORDCHAR: |
4627 | 1.80G | for (i = Lmin; i < Lmax; i++) |
4628 | 1.80G | { |
4629 | 1.80G | if (Feptr >= mb->end_subject) |
4630 | 110k | { |
4631 | 110k | SCHECK_PARTIAL(); |
4632 | 110k | break; |
4633 | 110k | } |
4634 | 1.80G | if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_word) != 0) |
4635 | 495k | break; |
4636 | 1.80G | Feptr++; |
4637 | 1.80G | } |
4638 | 685k | break; |
4639 | | |
4640 | 685k | case OP_WORDCHAR: |
4641 | 278k | for (i = Lmin; i < Lmax; i++) |
4642 | 277k | { |
4643 | 277k | if (Feptr >= mb->end_subject) |
4644 | 8.12k | { |
4645 | 8.12k | SCHECK_PARTIAL(); |
4646 | 8.12k | break; |
4647 | 8.12k | } |
4648 | 269k | if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_word) == 0) |
4649 | 54.9k | break; |
4650 | 214k | Feptr++; |
4651 | 214k | } |
4652 | 64.3k | break; |
4653 | | |
4654 | 64.3k | default: |
4655 | 0 | return PCRE2_ERROR_INTERNAL; |
4656 | 68.2M | } |
4657 | | |
4658 | 68.2M | if (reptype == REPTYPE_POS) continue; /* No backtracking */ |
4659 | | |
4660 | 50.4M | for (;;) |
4661 | 679M | { |
4662 | 679M | if (Feptr == Lstart_eptr) break; |
4663 | 629M | RMATCH(Fecode, RM34); |
4664 | 628M | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
4665 | 628M | Feptr--; |
4666 | 628M | if (Lctype == OP_ANYNL && Feptr > Lstart_eptr && *Feptr == CHAR_LF && |
4667 | 628M | Feptr[-1] == CHAR_CR) Feptr--; |
4668 | 628M | } |
4669 | 50.4M | } |
4670 | 68.5M | } |
4671 | 49.8M | break; /* End of repeat character type processing */ |
4672 | | |
4673 | 49.8M | #undef Lstart_eptr |
4674 | 49.8M | #undef Lmin |
4675 | 49.8M | #undef Lmax |
4676 | 49.8M | #undef Lctype |
4677 | 49.8M | #undef Lpropvalue |
4678 | | |
4679 | | |
4680 | | /* ===================================================================== */ |
4681 | | /* Match a back reference, possibly repeatedly. Look past the end of the |
4682 | | item to see if there is repeat information following. The OP_REF and |
4683 | | OP_REFI opcodes are used for a reference to a numbered group or to a |
4684 | | non-duplicated named group. For a duplicated named group, OP_DNREF and |
4685 | | OP_DNREFI are used. In this case we must scan the list of groups to which |
4686 | | the name refers, and use the first one that is set. */ |
4687 | | |
4688 | 49.8M | #define Lmin F->temp_32[0] |
4689 | 49.8M | #define Lmax F->temp_32[1] |
4690 | 49.8M | #define Lcaseless F->temp_32[2] |
4691 | 49.8M | #define Lstart F->temp_sptr[0] |
4692 | 49.8M | #define Loffset F->temp_size |
4693 | | |
4694 | 49.8M | case OP_DNREF: |
4695 | 0 | case OP_DNREFI: |
4696 | 0 | Lcaseless = (Fop == OP_DNREFI); |
4697 | 0 | { |
4698 | 0 | int count = GET2(Fecode, 1+IMM2_SIZE); |
4699 | 0 | PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size; |
4700 | 0 | Fecode += 1 + 2*IMM2_SIZE; |
4701 | |
|
4702 | 0 | while (count-- > 0) |
4703 | 0 | { |
4704 | 0 | Loffset = (GET2(slot, 0) << 1) - 2; |
4705 | 0 | if (Loffset < Foffset_top && Fovector[Loffset] != PCRE2_UNSET) break; |
4706 | 0 | slot += mb->name_entry_size; |
4707 | 0 | } |
4708 | 0 | } |
4709 | 0 | goto REF_REPEAT; |
4710 | | |
4711 | 2.60M | case OP_REF: |
4712 | 7.77M | case OP_REFI: |
4713 | 7.77M | Lcaseless = (Fop == OP_REFI); |
4714 | 7.77M | Loffset = (GET2(Fecode, 1) << 1) - 2; |
4715 | 7.77M | Fecode += 1 + IMM2_SIZE; |
4716 | | |
4717 | | /* Set up for repetition, or handle the non-repeated case. The maximum and |
4718 | | minimum must be in the heap frame, but as they are short-term values, we |
4719 | | use temporary fields. */ |
4720 | | |
4721 | 7.77M | REF_REPEAT: |
4722 | 7.77M | switch (*Fecode) |
4723 | 7.77M | { |
4724 | 232 | case OP_CRSTAR: |
4725 | 232 | case OP_CRMINSTAR: |
4726 | 330 | case OP_CRPLUS: |
4727 | 330 | case OP_CRMINPLUS: |
4728 | 330 | case OP_CRQUERY: |
4729 | 330 | case OP_CRMINQUERY: |
4730 | 330 | fc = *Fecode++ - OP_CRSTAR; |
4731 | 330 | Lmin = rep_min[fc]; |
4732 | 330 | Lmax = rep_max[fc]; |
4733 | 330 | reptype = rep_typ[fc]; |
4734 | 330 | break; |
4735 | | |
4736 | 0 | case OP_CRRANGE: |
4737 | 0 | case OP_CRMINRANGE: |
4738 | 0 | Lmin = GET2(Fecode, 1); |
4739 | 0 | Lmax = GET2(Fecode, 1 + IMM2_SIZE); |
4740 | 0 | reptype = rep_typ[*Fecode - OP_CRSTAR]; |
4741 | 0 | if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */ |
4742 | 0 | Fecode += 1 + 2 * IMM2_SIZE; |
4743 | 0 | break; |
4744 | | |
4745 | 7.77M | default: /* No repeat follows */ |
4746 | 7.77M | { |
4747 | 7.77M | rrc = match_ref(Loffset, Lcaseless, F, mb, &length); |
4748 | 7.77M | if (rrc != 0) |
4749 | 5.99M | { |
4750 | 5.99M | if (rrc > 0) Feptr = mb->end_subject; /* Partial match */ |
4751 | 5.99M | CHECK_PARTIAL(); |
4752 | 5.99M | RRETURN(MATCH_NOMATCH); |
4753 | 0 | } |
4754 | 7.77M | } |
4755 | 1.78M | Feptr += length; |
4756 | 1.78M | continue; /* With the main loop */ |
4757 | 7.77M | } |
4758 | | |
4759 | | /* Handle repeated back references. If a set group has length zero, just |
4760 | | continue with the main loop, because it matches however many times. For an |
4761 | | unset reference, if the minimum is zero, we can also just continue. We can |
4762 | | also continue if PCRE2_MATCH_UNSET_BACKREF is set, because this makes an |
4763 | | unset group behave as a zero-length group. For any other unset cases, carrying |
4764 | | on will result in NOMATCH. */ |
4765 | | |
4766 | 330 | if (Loffset < Foffset_top && Fovector[Loffset] != PCRE2_UNSET) |
4767 | 0 | { |
4768 | 0 | if (Fovector[Loffset] == Fovector[Loffset + 1]) continue; |
4769 | 0 | } |
4770 | 330 | else /* Group is not set */ |
4771 | 330 | { |
4772 | 330 | if (Lmin == 0 || (mb->poptions & PCRE2_MATCH_UNSET_BACKREF) != 0) |
4773 | 232 | continue; |
4774 | 330 | } |
4775 | | |
4776 | | /* First, ensure the minimum number of matches are present. */ |
4777 | | |
4778 | 98 | for (i = 1; i <= Lmin; i++) |
4779 | 98 | { |
4780 | 98 | PCRE2_SIZE slength; |
4781 | 98 | rrc = match_ref(Loffset, Lcaseless, F, mb, &slength); |
4782 | 98 | if (rrc != 0) |
4783 | 98 | { |
4784 | 98 | if (rrc > 0) Feptr = mb->end_subject; /* Partial match */ |
4785 | 98 | CHECK_PARTIAL(); |
4786 | 98 | RRETURN(MATCH_NOMATCH); |
4787 | 0 | } |
4788 | 0 | Feptr += slength; |
4789 | 0 | } |
4790 | | |
4791 | | /* If min = max, we are done. They are not both allowed to be zero. */ |
4792 | | |
4793 | 0 | if (Lmin == Lmax) continue; |
4794 | | |
4795 | | /* If minimizing, keep trying and advancing the pointer. */ |
4796 | | |
4797 | 0 | if (reptype == REPTYPE_MIN) |
4798 | 0 | { |
4799 | 0 | for (;;) |
4800 | 0 | { |
4801 | 0 | PCRE2_SIZE slength; |
4802 | 0 | RMATCH(Fecode, RM20); |
4803 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
4804 | 0 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
4805 | 0 | rrc = match_ref(Loffset, Lcaseless, F, mb, &slength); |
4806 | 0 | if (rrc != 0) |
4807 | 0 | { |
4808 | 0 | if (rrc > 0) Feptr = mb->end_subject; /* Partial match */ |
4809 | 0 | CHECK_PARTIAL(); |
4810 | 0 | RRETURN(MATCH_NOMATCH); |
4811 | 0 | } |
4812 | 0 | Feptr += slength; |
4813 | 0 | } |
4814 | | /* Control never gets here */ |
4815 | 0 | } |
4816 | | |
4817 | | /* If maximizing, find the longest string and work backwards, as long as |
4818 | | the matched lengths for each iteration are the same. */ |
4819 | | |
4820 | 0 | else |
4821 | 0 | { |
4822 | 0 | BOOL samelengths = TRUE; |
4823 | 0 | Lstart = Feptr; /* Starting position */ |
4824 | 0 | Flength = Fovector[Loffset+1] - Fovector[Loffset]; |
4825 | |
|
4826 | 0 | for (i = Lmin; i < Lmax; i++) |
4827 | 0 | { |
4828 | 0 | PCRE2_SIZE slength; |
4829 | 0 | rrc = match_ref(Loffset, Lcaseless, F, mb, &slength); |
4830 | 0 | if (rrc != 0) |
4831 | 0 | { |
4832 | | /* Can't use CHECK_PARTIAL because we don't want to update Feptr in |
4833 | | the soft partial matching case. */ |
4834 | |
|
4835 | 0 | if (rrc > 0 && mb->partial != 0 && |
4836 | 0 | mb->end_subject > mb->start_used_ptr) |
4837 | 0 | { |
4838 | 0 | mb->hitend = TRUE; |
4839 | 0 | if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; |
4840 | 0 | } |
4841 | 0 | break; |
4842 | 0 | } |
4843 | | |
4844 | 0 | if (slength != Flength) samelengths = FALSE; |
4845 | 0 | Feptr += slength; |
4846 | 0 | } |
4847 | | |
4848 | | /* If the length matched for each repetition is the same as the length of |
4849 | | the captured group, we can easily work backwards. This is the normal |
4850 | | case. However, in caseless UTF-8 mode there are pairs of case-equivalent |
4851 | | characters whose lengths (in terms of code units) differ. Because this |
4852 | | is very rare, we handle it by re-matching fewer and fewer times. */ |
4853 | | |
4854 | 0 | if (samelengths) |
4855 | 0 | { |
4856 | 0 | while (Feptr >= Lstart) |
4857 | 0 | { |
4858 | 0 | RMATCH(Fecode, RM21); |
4859 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
4860 | 0 | Feptr -= Flength; |
4861 | 0 | } |
4862 | 0 | } |
4863 | | |
4864 | | /* The rare case of non-matching lengths. Re-scan the repetition for each |
4865 | | iteration. We know that match_ref() will succeed every time. */ |
4866 | | |
4867 | 0 | else |
4868 | 0 | { |
4869 | 0 | Lmax = i; |
4870 | 0 | for (;;) |
4871 | 0 | { |
4872 | 0 | RMATCH(Fecode, RM22); |
4873 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
4874 | 0 | if (Feptr == Lstart) break; /* Failed after minimal repetition */ |
4875 | 0 | Feptr = Lstart; |
4876 | 0 | Lmax--; |
4877 | 0 | for (i = Lmin; i < Lmax; i++) |
4878 | 0 | { |
4879 | 0 | PCRE2_SIZE slength; |
4880 | 0 | (void)match_ref(Loffset, Lcaseless, F, mb, &slength); |
4881 | 0 | Feptr += slength; |
4882 | 0 | } |
4883 | 0 | } |
4884 | 0 | } |
4885 | | |
4886 | 0 | RRETURN(MATCH_NOMATCH); |
4887 | 0 | } |
4888 | | /* Control never gets here */ |
4889 | | |
4890 | 0 | #undef Lcaseless |
4891 | 0 | #undef Lmin |
4892 | 0 | #undef Lmax |
4893 | 0 | #undef Lstart |
4894 | 0 | #undef Loffset |
4895 | | |
4896 | | |
4897 | | |
4898 | | /* ========================================================================= */ |
4899 | | /* Opcodes for the start of various parenthesized items */ |
4900 | | /* ========================================================================= */ |
4901 | | |
4902 | | /* In all cases, if the result of RMATCH() is MATCH_THEN, check whether the |
4903 | | (*THEN) is within the current branch by comparing the address of OP_THEN |
4904 | | that is passed back with the end of the branch. If (*THEN) is within the |
4905 | | current branch, and the branch is one of two or more alternatives (it |
4906 | | either starts or ends with OP_ALT), we have reached the limit of THEN's |
4907 | | action, so convert the return code to NOMATCH, which will cause normal |
4908 | | backtracking to happen from now on. Otherwise, THEN is passed back to an |
4909 | | outer alternative. This implements Perl's treatment of parenthesized |
4910 | | groups, where a group not containing | does not affect the current |
4911 | | alternative, that is, (X) is NOT the same as (X|(*F)). */ |
4912 | | |
4913 | | |
4914 | | /* ===================================================================== */ |
4915 | | /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a non-possessive |
4916 | | bracket group, indicating that it may occur zero times. It may repeat |
4917 | | infinitely, or not at all - i.e. it could be ()* or ()? or even (){0} in |
4918 | | the pattern. Brackets with fixed upper repeat limits are compiled as a |
4919 | | number of copies, with the optional ones preceded by BRAZERO or BRAMINZERO. |
4920 | | Possessive groups with possible zero repeats are preceded by BRAPOSZERO. */ |
4921 | | |
4922 | 186M | #define Lnext_ecode F->temp_sptr[0] |
4923 | | |
4924 | 16.0M | case OP_BRAZERO: |
4925 | 16.0M | Lnext_ecode = Fecode + 1; |
4926 | 16.0M | RMATCH(Lnext_ecode, RM9); |
4927 | 15.3M | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
4928 | 77.1M | do Lnext_ecode += GET(Lnext_ecode, 1); while (*Lnext_ecode == OP_ALT); |
4929 | 15.3M | Fecode = Lnext_ecode + 1 + LINK_SIZE; |
4930 | 15.3M | break; |
4931 | | |
4932 | 51.1k | case OP_BRAMINZERO: |
4933 | 51.1k | Lnext_ecode = Fecode + 1; |
4934 | 161k | do Lnext_ecode += GET(Lnext_ecode, 1); while (*Lnext_ecode == OP_ALT); |
4935 | 51.1k | RMATCH(Lnext_ecode + 1 + LINK_SIZE, RM10); |
4936 | 51.1k | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
4937 | 51.1k | Fecode++; |
4938 | 51.1k | break; |
4939 | | |
4940 | 0 | #undef Lnext_ecode |
4941 | | |
4942 | 0 | case OP_SKIPZERO: |
4943 | 0 | Fecode++; |
4944 | 0 | do Fecode += GET(Fecode,1); while (*Fecode == OP_ALT); |
4945 | 0 | Fecode += 1 + LINK_SIZE; |
4946 | 0 | break; |
4947 | | |
4948 | | |
4949 | | /* ===================================================================== */ |
4950 | | /* Handle possessive brackets with an unlimited repeat. The end of these |
4951 | | brackets will always be OP_KETRPOS, which returns MATCH_KETRPOS without |
4952 | | going further in the pattern. */ |
4953 | | |
4954 | 4.39k | #define Lframe_type F->temp_32[0] |
4955 | 2.64k | #define Lmatched_once F->temp_32[1] |
4956 | 1.73k | #define Lzero_allowed F->temp_32[2] |
4957 | 3.53k | #define Lstart_eptr F->temp_sptr[0] |
4958 | 892 | #define Lstart_group F->temp_sptr[1] |
4959 | | |
4960 | 876 | case OP_BRAPOSZERO: |
4961 | 876 | Lzero_allowed = TRUE; /* Zero repeat is allowed */ |
4962 | 876 | Fecode += 1; |
4963 | 876 | if (*Fecode == OP_CBRAPOS || *Fecode == OP_SCBRAPOS) |
4964 | 0 | goto POSSESSIVE_CAPTURE; |
4965 | 876 | goto POSSESSIVE_NON_CAPTURE; |
4966 | | |
4967 | 876 | case OP_BRAPOS: |
4968 | 0 | case OP_SBRAPOS: |
4969 | 0 | Lzero_allowed = FALSE; /* Zero repeat not allowed */ |
4970 | |
|
4971 | 876 | POSSESSIVE_NON_CAPTURE: |
4972 | 876 | Lframe_type = GF_NOCAPTURE; /* Remembered frame type */ |
4973 | 876 | goto POSSESSIVE_GROUP; |
4974 | | |
4975 | 0 | case OP_CBRAPOS: |
4976 | 0 | case OP_SCBRAPOS: |
4977 | 0 | Lzero_allowed = FALSE; /* Zero repeat not allowed */ |
4978 | |
|
4979 | 0 | POSSESSIVE_CAPTURE: |
4980 | 0 | number = GET2(Fecode, 1+LINK_SIZE); |
4981 | 0 | Lframe_type = GF_CAPTURE | number; /* Remembered frame type */ |
4982 | |
|
4983 | 876 | POSSESSIVE_GROUP: |
4984 | 876 | Lmatched_once = FALSE; /* Never matched */ |
4985 | 876 | Lstart_group = Fecode; /* Start of this group */ |
4986 | | |
4987 | 876 | for (;;) |
4988 | 3.52k | { |
4989 | 3.52k | Lstart_eptr = Feptr; /* Position at group start */ |
4990 | 3.52k | group_frame_type = Lframe_type; |
4991 | 3.52k | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM8); |
4992 | 3.52k | if (rrc == MATCH_KETRPOS) |
4993 | 16 | { |
4994 | 16 | Lmatched_once = TRUE; /* Matched at least once */ |
4995 | 16 | if (Feptr == Lstart_eptr) /* Empty match; skip to end */ |
4996 | 0 | { |
4997 | 0 | do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT); |
4998 | 0 | break; |
4999 | 0 | } |
5000 | | |
5001 | 16 | Fecode = Lstart_group; |
5002 | 16 | continue; |
5003 | 16 | } |
5004 | | |
5005 | | /* See comment above about handling THEN. */ |
5006 | | |
5007 | 3.50k | if (rrc == MATCH_THEN) |
5008 | 0 | { |
5009 | 0 | PCRE2_SPTR next_ecode = Fecode + GET(Fecode,1); |
5010 | 0 | if (mb->verb_ecode_ptr < next_ecode && |
5011 | 0 | (*Fecode == OP_ALT || *next_ecode == OP_ALT)) |
5012 | 0 | rrc = MATCH_NOMATCH; |
5013 | 0 | } |
5014 | | |
5015 | 3.50k | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
5016 | 3.50k | Fecode += GET(Fecode, 1); |
5017 | 3.50k | if (*Fecode != OP_ALT) break; |
5018 | 3.50k | } |
5019 | | |
5020 | | /* Success if matched something or zero repeat allowed */ |
5021 | | |
5022 | 876 | if (Lmatched_once || Lzero_allowed) |
5023 | 876 | { |
5024 | 876 | Fecode += 1 + LINK_SIZE; |
5025 | 876 | break; |
5026 | 876 | } |
5027 | | |
5028 | 876 | RRETURN(MATCH_NOMATCH); |
5029 | |
|
5030 | 0 | #undef Lmatched_once |
5031 | 0 | #undef Lzero_allowed |
5032 | 0 | #undef Lframe_type |
5033 | 0 | #undef Lstart_eptr |
5034 | 0 | #undef Lstart_group |
5035 | | |
5036 | | |
5037 | | /* ===================================================================== */ |
5038 | | /* Handle non-capturing brackets that cannot match an empty string. When we |
5039 | | get to the final alternative within the brackets, as long as there are no |
5040 | | THEN's in the pattern, we can optimize by not recording a new backtracking |
5041 | | point. (Ideally we should test for a THEN within this group, but we don't |
5042 | | have that information.) Don't do this if we are at the very top level, |
5043 | | however, because that would make handling assertions and once-only brackets |
5044 | | messier when there is nothing to go back to. */ |
5045 | |
|
5046 | 221M | #define Lframe_type F->temp_32[0] /* Set for all that use GROUPLOOP */ |
5047 | 129M | #define Lnext_branch F->temp_sptr[0] /* Used only in OP_BRA handling */ |
5048 | |
|
5049 | 69.0M | case OP_BRA: |
5050 | 69.0M | if (mb->hasthen || Frdepth == 0) |
5051 | 51.8M | { |
5052 | 51.8M | Lframe_type = 0; |
5053 | 51.8M | goto GROUPLOOP; |
5054 | 51.8M | } |
5055 | | |
5056 | 17.1M | for (;;) |
5057 | 48.7M | { |
5058 | 48.7M | Lnext_branch = Fecode + GET(Fecode, 1); |
5059 | 48.7M | if (*Lnext_branch != OP_ALT) break; |
5060 | | |
5061 | | /* This is never the final branch. We do not need to test for MATCH_THEN |
5062 | | here because this code is not used when there is a THEN in the pattern. */ |
5063 | | |
5064 | 31.6M | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM1); |
5065 | 31.6M | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
5066 | 31.6M | Fecode = Lnext_branch; |
5067 | 31.6M | } |
5068 | | |
5069 | | /* Hit the start of the final branch. Continue at this level. */ |
5070 | | |
5071 | 17.1M | Fecode += PRIV(OP_lengths)[*Fecode]; |
5072 | 17.1M | break; |
5073 | | |
5074 | 0 | #undef Lnext_branch |
5075 | | |
5076 | | |
5077 | | /* ===================================================================== */ |
5078 | | /* Handle a capturing bracket, other than those that are possessive with an |
5079 | | unlimited repeat. */ |
5080 | | |
5081 | 6.11M | case OP_CBRA: |
5082 | 6.14M | case OP_SCBRA: |
5083 | 6.14M | Lframe_type = GF_CAPTURE | GET2(Fecode, 1+LINK_SIZE); |
5084 | 6.14M | goto GROUPLOOP; |
5085 | | |
5086 | | |
5087 | | /* ===================================================================== */ |
5088 | | /* Atomic groups and non-capturing brackets that can match an empty string |
5089 | | must record a backtracking point and also set up a chained frame. */ |
5090 | | |
5091 | 716 | case OP_ONCE: |
5092 | 716 | case OP_SCRIPT_RUN: |
5093 | 402k | case OP_SBRA: |
5094 | 402k | Lframe_type = GF_NOCAPTURE | Fop; |
5095 | | |
5096 | 58.4M | GROUPLOOP: |
5097 | 58.4M | for (;;) |
5098 | 163M | { |
5099 | 163M | group_frame_type = Lframe_type; |
5100 | 163M | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM2); |
5101 | 155M | if (rrc == MATCH_THEN) |
5102 | 0 | { |
5103 | 0 | PCRE2_SPTR next_ecode = Fecode + GET(Fecode,1); |
5104 | 0 | if (mb->verb_ecode_ptr < next_ecode && |
5105 | 0 | (*Fecode == OP_ALT || *next_ecode == OP_ALT)) |
5106 | 0 | rrc = MATCH_NOMATCH; |
5107 | 0 | } |
5108 | 155M | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
5109 | 155M | Fecode += GET(Fecode, 1); |
5110 | 155M | if (*Fecode != OP_ALT) RRETURN(MATCH_NOMATCH); |
5111 | 104M | } |
5112 | | /* Control never reaches here. */ |
5113 | | |
5114 | 0 | #undef Lframe_type |
5115 | | |
5116 | | |
5117 | | /* ===================================================================== */ |
5118 | | /* Recursion either matches the current regex, or some subexpression. The |
5119 | | offset data is the offset to the starting bracket from the start of the |
5120 | | whole pattern. (This is so that it works from duplicated subpatterns.) */ |
5121 | | |
5122 | 0 | #define Lframe_type F->temp_32[0] |
5123 | 0 | #define Lstart_branch F->temp_sptr[0] |
5124 | | |
5125 | 0 | case OP_RECURSE: |
5126 | 0 | bracode = mb->start_code + GET(Fecode, 1); |
5127 | 0 | number = (bracode == mb->start_code)? 0 : GET2(bracode, 1 + LINK_SIZE); |
5128 | | |
5129 | | /* If we are already in a recursion, check for repeating the same one |
5130 | | without advancing the subject pointer. This should catch convoluted mutual |
5131 | | recursions. (Some simple cases are caught at compile time.) */ |
5132 | |
|
5133 | 0 | if (Fcurrent_recurse != RECURSE_UNSET) |
5134 | 0 | { |
5135 | 0 | offset = Flast_group_offset; |
5136 | 0 | while (offset != PCRE2_UNSET) |
5137 | 0 | { |
5138 | 0 | N = (heapframe *)((char *)mb->match_frames + offset); |
5139 | 0 | P = (heapframe *)((char *)N - frame_size); |
5140 | 0 | if (N->group_frame_type == (GF_RECURSE | number)) |
5141 | 0 | { |
5142 | 0 | if (Feptr == P->eptr) return PCRE2_ERROR_RECURSELOOP; |
5143 | 0 | break; |
5144 | 0 | } |
5145 | 0 | offset = P->last_group_offset; |
5146 | 0 | } |
5147 | 0 | } |
5148 | | |
5149 | | /* Now run the recursion, branch by branch. */ |
5150 | | |
5151 | 0 | Lstart_branch = bracode; |
5152 | 0 | Lframe_type = GF_RECURSE | number; |
5153 | |
|
5154 | 0 | for (;;) |
5155 | 0 | { |
5156 | 0 | PCRE2_SPTR next_ecode; |
5157 | |
|
5158 | 0 | group_frame_type = Lframe_type; |
5159 | 0 | RMATCH(Lstart_branch + PRIV(OP_lengths)[*Lstart_branch], RM11); |
5160 | 0 | next_ecode = Lstart_branch + GET(Lstart_branch,1); |
5161 | | |
5162 | | /* Handle backtracking verbs, which are defined in a range that can |
5163 | | easily be tested for. PCRE does not allow THEN, SKIP, PRUNE or COMMIT to |
5164 | | escape beyond a recursion; they cause a NOMATCH for the entire recursion. |
5165 | | |
5166 | | When one of these verbs triggers, the current recursion group number is |
5167 | | recorded. If it matches the recursion we are processing, the verb |
5168 | | happened within the recursion and we must deal with it. Otherwise it must |
5169 | | have happened after the recursion completed, and so has to be passed |
5170 | | back. See comment above about handling THEN. */ |
5171 | |
|
5172 | 0 | if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX && |
5173 | 0 | mb->verb_current_recurse == (Lframe_type ^ GF_RECURSE)) |
5174 | 0 | { |
5175 | 0 | if (rrc == MATCH_THEN && mb->verb_ecode_ptr < next_ecode && |
5176 | 0 | (*Lstart_branch == OP_ALT || *next_ecode == OP_ALT)) |
5177 | 0 | rrc = MATCH_NOMATCH; |
5178 | 0 | else RRETURN(MATCH_NOMATCH); |
5179 | 0 | } |
5180 | | |
5181 | | /* Note that carrying on after (*ACCEPT) in a recursion is handled in the |
5182 | | OP_ACCEPT code. Nothing needs to be done here. */ |
5183 | | |
5184 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
5185 | 0 | Lstart_branch = next_ecode; |
5186 | 0 | if (*Lstart_branch != OP_ALT) RRETURN(MATCH_NOMATCH); |
5187 | 0 | } |
5188 | | /* Control never reaches here. */ |
5189 | | |
5190 | 0 | #undef Lframe_type |
5191 | 0 | #undef Lstart_branch |
5192 | | |
5193 | | |
5194 | | /* ===================================================================== */ |
5195 | | /* Positive assertions are like other groups except that PCRE doesn't allow |
5196 | | the effect of (*THEN) to escape beyond an assertion; it is therefore |
5197 | | treated as NOMATCH. (*ACCEPT) is treated as successful assertion, with its |
5198 | | captures and mark retained. Any other return is an error. */ |
5199 | | |
5200 | 697k | #define Lframe_type F->temp_32[0] |
5201 | | |
5202 | 270k | case OP_ASSERT: |
5203 | 270k | case OP_ASSERTBACK: |
5204 | 270k | case OP_ASSERT_NA: |
5205 | 271k | case OP_ASSERTBACK_NA: |
5206 | 271k | Lframe_type = GF_NOCAPTURE | Fop; |
5207 | 271k | for (;;) |
5208 | 426k | { |
5209 | 426k | group_frame_type = Lframe_type; |
5210 | 426k | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM3); |
5211 | 426k | if (rrc == MATCH_ACCEPT) |
5212 | 0 | { |
5213 | 0 | memcpy(Fovector, |
5214 | 0 | (char *)assert_accept_frame + offsetof(heapframe, ovector), |
5215 | 0 | assert_accept_frame->offset_top * sizeof(PCRE2_SIZE)); |
5216 | 0 | Foffset_top = assert_accept_frame->offset_top; |
5217 | 0 | Fmark = assert_accept_frame->mark; |
5218 | 0 | break; |
5219 | 0 | } |
5220 | 426k | if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); |
5221 | 426k | Fecode += GET(Fecode, 1); |
5222 | 426k | if (*Fecode != OP_ALT) RRETURN(MATCH_NOMATCH); |
5223 | 155k | } |
5224 | | |
5225 | 0 | do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT); |
5226 | 0 | Fecode += 1 + LINK_SIZE; |
5227 | 0 | break; |
5228 | | |
5229 | 0 | #undef Lframe_type |
5230 | | |
5231 | | |
5232 | | /* ===================================================================== */ |
5233 | | /* Handle negative assertions. Loop for each non-matching branch as for |
5234 | | positive assertions. */ |
5235 | | |
5236 | 32.9M | #define Lframe_type F->temp_32[0] |
5237 | | |
5238 | 0 | case OP_ASSERT_NOT: |
5239 | 16.4M | case OP_ASSERTBACK_NOT: |
5240 | 16.4M | Lframe_type = GF_NOCAPTURE | Fop; |
5241 | | |
5242 | 16.4M | for (;;) |
5243 | 16.4M | { |
5244 | 16.4M | group_frame_type = Lframe_type; |
5245 | 16.4M | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM4); |
5246 | 16.4M | switch(rrc) |
5247 | 16.4M | { |
5248 | 0 | case MATCH_ACCEPT: /* Assertion matched, therefore it fails. */ |
5249 | 270k | case MATCH_MATCH: |
5250 | 270k | RRETURN (MATCH_NOMATCH); |
5251 | |
|
5252 | 16.2M | case MATCH_NOMATCH: /* Branch failed, try next if present. */ |
5253 | 16.2M | case MATCH_THEN: |
5254 | 16.2M | Fecode += GET(Fecode, 1); |
5255 | 16.2M | if (*Fecode != OP_ALT) goto ASSERT_NOT_FAILED; |
5256 | 1.42k | break; |
5257 | | |
5258 | 1.42k | case MATCH_COMMIT: /* Assertion forced to fail, therefore continue. */ |
5259 | 0 | case MATCH_SKIP: |
5260 | 0 | case MATCH_PRUNE: |
5261 | 0 | do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT); |
5262 | 0 | goto ASSERT_NOT_FAILED; |
5263 | | |
5264 | 0 | default: /* Pass back any other return */ |
5265 | 0 | RRETURN(rrc); |
5266 | 16.4M | } |
5267 | 16.4M | } |
5268 | | |
5269 | | /* None of the branches have matched or there was a backtrack to (*COMMIT), |
5270 | | (*SKIP), (*PRUNE), or (*THEN) in the last branch. This is success for a |
5271 | | negative assertion, so carry on. */ |
5272 | | |
5273 | 16.1M | ASSERT_NOT_FAILED: |
5274 | 16.1M | Fecode += 1 + LINK_SIZE; |
5275 | 16.1M | break; |
5276 | | |
5277 | 0 | #undef Lframe_type |
5278 | | |
5279 | | |
5280 | | /* ===================================================================== */ |
5281 | | /* The callout item calls an external function, if one is provided, passing |
5282 | | details of the match so far. This is mainly for debugging, though the |
5283 | | function is able to force a failure. */ |
5284 | | |
5285 | 0 | case OP_CALLOUT: |
5286 | 0 | case OP_CALLOUT_STR: |
5287 | 0 | rrc = do_callout(F, mb, &length); |
5288 | 0 | if (rrc > 0) RRETURN(MATCH_NOMATCH); |
5289 | 0 | if (rrc < 0) RRETURN(rrc); |
5290 | 0 | Fecode += length; |
5291 | 0 | break; |
5292 | | |
5293 | | |
5294 | | /* ===================================================================== */ |
5295 | | /* Conditional group: compilation checked that there are no more than two |
5296 | | branches. If the condition is false, skipping the first branch takes us |
5297 | | past the end of the item if there is only one branch, but that's exactly |
5298 | | what we want. */ |
5299 | | |
5300 | 0 | case OP_COND: |
5301 | 0 | case OP_SCOND: |
5302 | | |
5303 | | /* The variable Flength will be added to Fecode when the condition is |
5304 | | false, to get to the second branch. Setting it to the offset to the ALT or |
5305 | | KET, then incrementing Fecode achieves this effect. However, if the second |
5306 | | branch is non-existent, we must point to the KET so that the end of the |
5307 | | group is correctly processed. We now have Fecode pointing to the condition |
5308 | | or callout. */ |
5309 | |
|
5310 | 0 | Flength = GET(Fecode, 1); /* Offset to the second branch */ |
5311 | 0 | if (Fecode[Flength] != OP_ALT) Flength -= 1 + LINK_SIZE; |
5312 | 0 | Fecode += 1 + LINK_SIZE; /* From this opcode */ |
5313 | | |
5314 | | /* Because of the way auto-callout works during compile, a callout item is |
5315 | | inserted between OP_COND and an assertion condition. Such a callout can |
5316 | | also be inserted manually. */ |
5317 | |
|
5318 | 0 | if (*Fecode == OP_CALLOUT || *Fecode == OP_CALLOUT_STR) |
5319 | 0 | { |
5320 | 0 | rrc = do_callout(F, mb, &length); |
5321 | 0 | if (rrc > 0) RRETURN(MATCH_NOMATCH); |
5322 | 0 | if (rrc < 0) RRETURN(rrc); |
5323 | | |
5324 | | /* Advance Fecode past the callout, so it now points to the condition. We |
5325 | | must adjust Flength so that the value of Fecode+Flength is unchanged. */ |
5326 | |
|
5327 | 0 | Fecode += length; |
5328 | 0 | Flength -= length; |
5329 | 0 | } |
5330 | | |
5331 | | /* Test the various possible conditions */ |
5332 | | |
5333 | 0 | condition = FALSE; |
5334 | 0 | switch(*Fecode) |
5335 | 0 | { |
5336 | 0 | case OP_RREF: /* Group recursion test */ |
5337 | 0 | if (Fcurrent_recurse != RECURSE_UNSET) |
5338 | 0 | { |
5339 | 0 | number = GET2(Fecode, 1); |
5340 | 0 | condition = (number == RREF_ANY || number == Fcurrent_recurse); |
5341 | 0 | } |
5342 | 0 | break; |
5343 | | |
5344 | 0 | case OP_DNRREF: /* Duplicate named group recursion test */ |
5345 | 0 | if (Fcurrent_recurse != RECURSE_UNSET) |
5346 | 0 | { |
5347 | 0 | int count = GET2(Fecode, 1 + IMM2_SIZE); |
5348 | 0 | PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size; |
5349 | 0 | while (count-- > 0) |
5350 | 0 | { |
5351 | 0 | number = GET2(slot, 0); |
5352 | 0 | condition = number == Fcurrent_recurse; |
5353 | 0 | if (condition) break; |
5354 | 0 | slot += mb->name_entry_size; |
5355 | 0 | } |
5356 | 0 | } |
5357 | 0 | break; |
5358 | | |
5359 | 0 | case OP_CREF: /* Numbered group used test */ |
5360 | 0 | offset = (GET2(Fecode, 1) << 1) - 2; /* Doubled ref number */ |
5361 | 0 | condition = offset < Foffset_top && Fovector[offset] != PCRE2_UNSET; |
5362 | 0 | break; |
5363 | | |
5364 | 0 | case OP_DNCREF: /* Duplicate named group used test */ |
5365 | 0 | { |
5366 | 0 | int count = GET2(Fecode, 1 + IMM2_SIZE); |
5367 | 0 | PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size; |
5368 | 0 | while (count-- > 0) |
5369 | 0 | { |
5370 | 0 | offset = (GET2(slot, 0) << 1) - 2; |
5371 | 0 | condition = offset < Foffset_top && Fovector[offset] != PCRE2_UNSET; |
5372 | 0 | if (condition) break; |
5373 | 0 | slot += mb->name_entry_size; |
5374 | 0 | } |
5375 | 0 | } |
5376 | 0 | break; |
5377 | | |
5378 | 0 | case OP_FALSE: |
5379 | 0 | case OP_FAIL: /* The assertion (?!) becomes OP_FAIL */ |
5380 | 0 | break; |
5381 | | |
5382 | 0 | case OP_TRUE: |
5383 | 0 | condition = TRUE; |
5384 | 0 | break; |
5385 | | |
5386 | | /* The condition is an assertion. Run code similar to the assertion code |
5387 | | above. */ |
5388 | | |
5389 | 0 | #define Lpositive F->temp_32[0] |
5390 | 0 | #define Lstart_branch F->temp_sptr[0] |
5391 | | |
5392 | 0 | default: |
5393 | 0 | Lpositive = (*Fecode == OP_ASSERT || *Fecode == OP_ASSERTBACK); |
5394 | 0 | Lstart_branch = Fecode; |
5395 | |
|
5396 | 0 | for (;;) |
5397 | 0 | { |
5398 | 0 | group_frame_type = GF_CONDASSERT | *Fecode; |
5399 | 0 | RMATCH(Lstart_branch + PRIV(OP_lengths)[*Lstart_branch], RM5); |
5400 | |
|
5401 | 0 | switch(rrc) |
5402 | 0 | { |
5403 | 0 | case MATCH_ACCEPT: /* Save captures */ |
5404 | 0 | memcpy(Fovector, |
5405 | 0 | (char *)assert_accept_frame + offsetof(heapframe, ovector), |
5406 | 0 | assert_accept_frame->offset_top * sizeof(PCRE2_SIZE)); |
5407 | 0 | Foffset_top = assert_accept_frame->offset_top; |
5408 | | |
5409 | | /* Fall through */ |
5410 | | /* In the case of a match, the captures have already been put into |
5411 | | the current frame. */ |
5412 | |
|
5413 | 0 | case MATCH_MATCH: |
5414 | 0 | condition = Lpositive; /* TRUE for positive assertion */ |
5415 | 0 | break; |
5416 | | |
5417 | | /* PCRE doesn't allow the effect of (*THEN) to escape beyond an |
5418 | | assertion; it is therefore always treated as NOMATCH. */ |
5419 | | |
5420 | 0 | case MATCH_NOMATCH: |
5421 | 0 | case MATCH_THEN: |
5422 | 0 | Lstart_branch += GET(Lstart_branch, 1); |
5423 | 0 | if (*Lstart_branch == OP_ALT) continue; /* Try next branch */ |
5424 | 0 | condition = !Lpositive; /* TRUE for negative assertion */ |
5425 | 0 | break; |
5426 | | |
5427 | | /* These force no match without checking other branches. */ |
5428 | | |
5429 | 0 | case MATCH_COMMIT: |
5430 | 0 | case MATCH_SKIP: |
5431 | 0 | case MATCH_PRUNE: |
5432 | 0 | condition = !Lpositive; |
5433 | 0 | break; |
5434 | | |
5435 | 0 | default: |
5436 | 0 | RRETURN(rrc); |
5437 | 0 | } |
5438 | 0 | break; /* Out of the branch loop */ |
5439 | 0 | } |
5440 | | |
5441 | | /* If the condition is true, find the end of the assertion so that |
5442 | | advancing past it gets us to the start of the first branch. */ |
5443 | | |
5444 | 0 | if (condition) |
5445 | 0 | { |
5446 | 0 | do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT); |
5447 | 0 | } |
5448 | 0 | break; /* End of assertion condition */ |
5449 | 0 | } |
5450 | | |
5451 | 0 | #undef Lpositive |
5452 | 0 | #undef Lstart_branch |
5453 | | |
5454 | | /* Choose branch according to the condition. */ |
5455 | | |
5456 | 0 | Fecode += condition? PRIV(OP_lengths)[*Fecode] : Flength; |
5457 | | |
5458 | | /* If the opcode is OP_SCOND it means we are at a repeated conditional |
5459 | | group that might match an empty string. We must therefore descend a level |
5460 | | so that the start is remembered for checking. For OP_COND we can just |
5461 | | continue at this level. */ |
5462 | |
|
5463 | 0 | if (Fop == OP_SCOND) |
5464 | 0 | { |
5465 | 0 | group_frame_type = GF_NOCAPTURE | Fop; |
5466 | 0 | RMATCH(Fecode, RM35); |
5467 | 0 | RRETURN(rrc); |
5468 | 0 | } |
5469 | 0 | break; |
5470 | | |
5471 | | |
5472 | | |
5473 | | /* ========================================================================= */ |
5474 | | /* End of start of parenthesis opcodes */ |
5475 | | /* ========================================================================= */ |
5476 | | |
5477 | | |
5478 | | /* ===================================================================== */ |
5479 | | /* Move the subject pointer back. This occurs only at the start of each |
5480 | | branch of a lookbehind assertion. If we are too close to the start to move |
5481 | | back, fail. When working with UTF-8 we move back a number of characters, |
5482 | | not bytes. */ |
5483 | | |
5484 | 16.4M | case OP_REVERSE: |
5485 | 16.4M | number = GET(Fecode, 1); |
5486 | 16.4M | #ifdef SUPPORT_UNICODE |
5487 | 16.4M | if (utf) |
5488 | 0 | { |
5489 | 0 | while (number-- > 0) |
5490 | 0 | { |
5491 | 0 | if (Feptr <= mb->check_subject) RRETURN(MATCH_NOMATCH); |
5492 | 0 | Feptr--; |
5493 | 0 | BACKCHAR(Feptr); |
5494 | 0 | } |
5495 | 0 | } |
5496 | 16.4M | else |
5497 | 16.4M | #endif |
5498 | | |
5499 | | /* No UTF-8 support, or not in UTF-8 mode: count is code unit count */ |
5500 | | |
5501 | 16.4M | { |
5502 | 16.4M | if ((ptrdiff_t)number > Feptr - mb->start_subject) RRETURN(MATCH_NOMATCH); |
5503 | 15.8M | Feptr -= number; |
5504 | 15.8M | } |
5505 | | |
5506 | | /* Save the earliest consulted character, then skip to next opcode */ |
5507 | | |
5508 | 15.8M | if (Feptr < mb->start_used_ptr) mb->start_used_ptr = Feptr; |
5509 | 15.8M | Fecode += 1 + LINK_SIZE; |
5510 | 15.8M | break; |
5511 | | |
5512 | | |
5513 | | /* ===================================================================== */ |
5514 | | /* An alternation is the end of a branch; scan along to find the end of the |
5515 | | bracketed group. */ |
5516 | | |
5517 | 44.9M | case OP_ALT: |
5518 | 1.51G | do Fecode += GET(Fecode,1); while (*Fecode == OP_ALT); |
5519 | 44.9M | break; |
5520 | | |
5521 | | |
5522 | | /* ===================================================================== */ |
5523 | | /* The end of a parenthesized group. For all but OP_BRA and OP_COND, the |
5524 | | starting frame was added to the chained frames in order to remember the |
5525 | | starting subject position for the group. */ |
5526 | | |
5527 | 28.4M | case OP_KET: |
5528 | 28.4M | case OP_KETRMIN: |
5529 | 61.6M | case OP_KETRMAX: |
5530 | 61.6M | case OP_KETRPOS: |
5531 | | |
5532 | 61.6M | bracode = Fecode - GET(Fecode, 1); |
5533 | | |
5534 | | /* Point N to the frame at the start of the most recent group. |
5535 | | Remember the subject pointer at the start of the group. */ |
5536 | | |
5537 | 61.6M | if (*bracode != OP_BRA && *bracode != OP_COND) |
5538 | 56.2M | { |
5539 | 56.2M | N = (heapframe *)((char *)mb->match_frames + Flast_group_offset); |
5540 | 56.2M | P = (heapframe *)((char *)N - frame_size); |
5541 | 56.2M | Flast_group_offset = P->last_group_offset; |
5542 | | |
5543 | | #ifdef DEBUG_SHOW_RMATCH |
5544 | | fprintf(stderr, "++ KET for frame=%d type=%x prev char offset=%lu\n", |
5545 | | N->rdepth, N->group_frame_type, |
5546 | | (char *)P->eptr - (char *)mb->start_subject); |
5547 | | #endif |
5548 | | |
5549 | | /* If we are at the end of an assertion that is a condition, return a |
5550 | | match, discarding any intermediate backtracking points. Copy back the |
5551 | | mark setting and the captures into the frame before N so that they are |
5552 | | set on return. Doing this for all assertions, both positive and negative, |
5553 | | seems to match what Perl does. */ |
5554 | | |
5555 | 56.2M | if (GF_IDMASK(N->group_frame_type) == GF_CONDASSERT) |
5556 | 0 | { |
5557 | 0 | memcpy((char *)P + offsetof(heapframe, ovector), Fovector, |
5558 | 0 | Foffset_top * sizeof(PCRE2_SIZE)); |
5559 | 0 | P->offset_top = Foffset_top; |
5560 | 0 | P->mark = Fmark; |
5561 | 0 | Fback_frame = (char *)F - (char *)P; |
5562 | 0 | RRETURN(MATCH_MATCH); |
5563 | 0 | } |
5564 | 56.2M | } |
5565 | 5.42M | else P = NULL; /* Indicates starting frame not recorded */ |
5566 | | |
5567 | | /* The group was not a conditional assertion. */ |
5568 | | |
5569 | 61.6M | switch (*bracode) |
5570 | 61.6M | { |
5571 | 5.42M | case OP_BRA: /* No need to do anything for these */ |
5572 | 5.42M | case OP_COND: |
5573 | 5.42M | case OP_SCOND: |
5574 | 5.42M | break; |
5575 | | |
5576 | | /* Non-atomic positive assertions are like OP_BRA, except that the |
5577 | | subject pointer must be put back to where it was at the start of the |
5578 | | assertion. */ |
5579 | | |
5580 | 0 | case OP_ASSERT_NA: |
5581 | 10 | case OP_ASSERTBACK_NA: |
5582 | 10 | if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr; |
5583 | 10 | Feptr = P->eptr; |
5584 | 10 | break; |
5585 | | |
5586 | | /* Atomic positive assertions are like OP_ONCE, except that in addition |
5587 | | the subject pointer must be put back to where it was at the start of the |
5588 | | assertion. */ |
5589 | | |
5590 | 154k | case OP_ASSERT: |
5591 | 154k | case OP_ASSERTBACK: |
5592 | 154k | if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr; |
5593 | 154k | Feptr = P->eptr; |
5594 | | /* Fall through */ |
5595 | | |
5596 | | /* For an atomic group, discard internal backtracking points. We must |
5597 | | also ensure that any remaining branches within the top-level of the group |
5598 | | are not tried. Do this by adjusting the code pointer within the backtrack |
5599 | | frame so that it points to the final branch. */ |
5600 | | |
5601 | 154k | case OP_ONCE: |
5602 | 154k | Fback_frame = ((char *)F - (char *)P); |
5603 | 154k | for (;;) |
5604 | 6.50M | { |
5605 | 6.50M | uint32_t y = GET(P->ecode,1); |
5606 | 6.50M | if ((P->ecode)[y] != OP_ALT) break; |
5607 | 6.35M | P->ecode += y; |
5608 | 6.35M | } |
5609 | 154k | break; |
5610 | | |
5611 | | /* A matching negative assertion returns MATCH, which is turned into |
5612 | | NOMATCH at the assertion level. */ |
5613 | | |
5614 | 0 | case OP_ASSERT_NOT: |
5615 | 270k | case OP_ASSERTBACK_NOT: |
5616 | 270k | RRETURN(MATCH_MATCH); |
5617 | | |
5618 | | /* At the end of a script run, apply the script-checking rules. This code |
5619 | | will never be exercised if Unicode support is not compiled, because in |
5620 | | that environment script runs cause an error at compile time. */ |
5621 | |
|
5622 | 0 | case OP_SCRIPT_RUN: |
5623 | 0 | if (!PRIV(script_run)(P->eptr, Feptr, utf)) RRETURN(MATCH_NOMATCH); |
5624 | 0 | break; |
5625 | | |
5626 | | /* Whole-pattern recursion is coded as a recurse into group 0, so it |
5627 | | won't be picked up here. Instead, we catch it when the OP_END is reached. |
5628 | | Other recursion is handled here. */ |
5629 | | |
5630 | 23.9M | case OP_CBRA: |
5631 | 23.9M | case OP_CBRAPOS: |
5632 | 25.7M | case OP_SCBRA: |
5633 | 25.7M | case OP_SCBRAPOS: |
5634 | 25.7M | number = GET2(bracode, 1+LINK_SIZE); |
5635 | | |
5636 | | /* Handle a recursively called group. We reinstate the previous set of |
5637 | | captures and then carry on after the recursion call. */ |
5638 | | |
5639 | 25.7M | if (Fcurrent_recurse == number) |
5640 | 0 | { |
5641 | 0 | P = (heapframe *)((char *)N - frame_size); |
5642 | 0 | memcpy((char *)F + offsetof(heapframe, ovector), P->ovector, |
5643 | 0 | P->offset_top * sizeof(PCRE2_SIZE)); |
5644 | 0 | Foffset_top = P->offset_top; |
5645 | 0 | Fcapture_last = P->capture_last; |
5646 | 0 | Fcurrent_recurse = P->current_recurse; |
5647 | 0 | Fecode = P->ecode + 1 + LINK_SIZE; |
5648 | 0 | continue; /* With next opcode */ |
5649 | 0 | } |
5650 | | |
5651 | | /* Deal with actual capturing. */ |
5652 | | |
5653 | 25.7M | offset = (number << 1) - 2; |
5654 | 25.7M | Fcapture_last = number; |
5655 | 25.7M | Fovector[offset] = P->eptr - mb->start_subject; |
5656 | 25.7M | Fovector[offset+1] = Feptr - mb->start_subject; |
5657 | 25.7M | if (offset >= Foffset_top) Foffset_top = offset + 2; |
5658 | 25.7M | break; |
5659 | 61.6M | } /* End actions relating to the starting opcode */ |
5660 | | |
5661 | | /* OP_KETRPOS is a possessive repeating ket. Remember the current position, |
5662 | | and return the MATCH_KETRPOS. This makes it possible to do the repeats one |
5663 | | at a time from the outer level. This must precede the empty string test - |
5664 | | in this case that test is done at the outer level. */ |
5665 | | |
5666 | 61.3M | if (*Fecode == OP_KETRPOS) |
5667 | 16 | { |
5668 | 16 | memcpy((char *)P + offsetof(heapframe, eptr), |
5669 | 16 | (char *)F + offsetof(heapframe, eptr), |
5670 | 16 | frame_copy_size); |
5671 | 16 | RRETURN(MATCH_KETRPOS); |
5672 | 0 | } |
5673 | | |
5674 | | /* Handle the different kinds of closing brackets. A non-repeating ket |
5675 | | needs no special action, just continuing at this level. This also happens |
5676 | | for the repeating kets if the group matched no characters, in order to |
5677 | | forcibly break infinite loops. Otherwise, the repeating kets try the rest |
5678 | | of the pattern or restart from the preceding bracket, in the appropriate |
5679 | | order. */ |
5680 | | |
5681 | 61.3M | if (Fop != OP_KET && (P == NULL || Feptr != P->eptr)) |
5682 | 1.39M | { |
5683 | 1.39M | if (Fop == OP_KETRMIN) |
5684 | 5.77k | { |
5685 | 5.77k | RMATCH(Fecode + 1 + LINK_SIZE, RM6); |
5686 | 5.77k | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
5687 | 5.77k | Fecode -= GET(Fecode, 1); |
5688 | 5.77k | break; /* End of ket processing */ |
5689 | 5.77k | } |
5690 | | |
5691 | | /* Repeat the maximum number of times (KETRMAX) */ |
5692 | | |
5693 | 1.38M | RMATCH(bracode, RM7); |
5694 | 1.38M | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
5695 | 1.38M | } |
5696 | | |
5697 | | /* Carry on at this level for a non-repeating ket, or after matching an |
5698 | | empty string, or after repeating for a maximum number of times. */ |
5699 | | |
5700 | 61.3M | Fecode += 1 + LINK_SIZE; |
5701 | 61.3M | break; |
5702 | | |
5703 | | |
5704 | | /* ===================================================================== */ |
5705 | | /* Start and end of line assertions, not multiline mode. */ |
5706 | | |
5707 | 2.85M | case OP_CIRC: /* Start of line, unless PCRE2_NOTBOL is set. */ |
5708 | 2.85M | if (Feptr != mb->start_subject || (mb->moptions & PCRE2_NOTBOL) != 0) |
5709 | 1.74M | RRETURN(MATCH_NOMATCH); |
5710 | 1.74M | Fecode++; |
5711 | 1.74M | break; |
5712 | | |
5713 | 145k | case OP_SOD: /* Unconditional start of subject */ |
5714 | 145k | if (Feptr != mb->start_subject) RRETURN(MATCH_NOMATCH); |
5715 | 680 | Fecode++; |
5716 | 680 | break; |
5717 | | |
5718 | | /* When PCRE2_NOTEOL is unset, assert before the subject end, or a |
5719 | | terminating newline unless PCRE2_DOLLAR_ENDONLY is set. */ |
5720 | | |
5721 | 177M | case OP_DOLL: |
5722 | 177M | if ((mb->moptions & PCRE2_NOTEOL) != 0) RRETURN(MATCH_NOMATCH); |
5723 | 177M | if ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0) goto ASSERT_NL_OR_EOS; |
5724 | | |
5725 | | /* Fall through */ |
5726 | | /* Unconditional end of subject assertion (\z) */ |
5727 | | |
5728 | 6.63k | case OP_EOD: |
5729 | 6.63k | if (Feptr < mb->end_subject) RRETURN(MATCH_NOMATCH); |
5730 | 1.13k | if (mb->partial != 0) |
5731 | 0 | { |
5732 | 0 | mb->hitend = TRUE; |
5733 | 0 | if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; |
5734 | 0 | } |
5735 | 1.13k | Fecode++; |
5736 | 1.13k | break; |
5737 | | |
5738 | | /* End of subject or ending \n assertion (\Z) */ |
5739 | | |
5740 | 61.1k | case OP_EODN: |
5741 | 177M | ASSERT_NL_OR_EOS: |
5742 | 177M | if (Feptr < mb->end_subject && |
5743 | 177M | (!IS_NEWLINE(Feptr) || Feptr != mb->end_subject - mb->nllen)) |
5744 | 176M | { |
5745 | 176M | if (mb->partial != 0 && |
5746 | 176M | Feptr + 1 >= mb->end_subject && |
5747 | 176M | NLBLOCK->nltype == NLTYPE_FIXED && |
5748 | 176M | NLBLOCK->nllen == 2 && |
5749 | 176M | UCHAR21TEST(Feptr) == NLBLOCK->nl[0]) |
5750 | 0 | { |
5751 | 0 | mb->hitend = TRUE; |
5752 | 0 | if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; |
5753 | 0 | } |
5754 | 176M | RRETURN(MATCH_NOMATCH); |
5755 | 0 | } |
5756 | | |
5757 | | /* Either at end of string or \n before end. */ |
5758 | | |
5759 | 1.27M | if (mb->partial != 0) |
5760 | 0 | { |
5761 | 0 | mb->hitend = TRUE; |
5762 | 0 | if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; |
5763 | 0 | } |
5764 | 1.27M | Fecode++; |
5765 | 1.27M | break; |
5766 | | |
5767 | | |
5768 | | /* ===================================================================== */ |
5769 | | /* Start and end of line assertions, multiline mode. */ |
5770 | | |
5771 | | /* Start of subject unless notbol, or after any newline except for one at |
5772 | | the very end, unless PCRE2_ALT_CIRCUMFLEX is set. */ |
5773 | | |
5774 | 8.87M | case OP_CIRCM: |
5775 | 8.87M | if ((mb->moptions & PCRE2_NOTBOL) != 0 && Feptr == mb->start_subject) |
5776 | 8.87M | RRETURN(MATCH_NOMATCH); |
5777 | 8.87M | if (Feptr != mb->start_subject && |
5778 | 8.87M | ((Feptr == mb->end_subject && |
5779 | 8.80M | (mb->poptions & PCRE2_ALT_CIRCUMFLEX) == 0) || |
5780 | 8.80M | !WAS_NEWLINE(Feptr))) |
5781 | 8.63M | RRETURN(MATCH_NOMATCH); |
5782 | 233k | Fecode++; |
5783 | 233k | break; |
5784 | | |
5785 | | /* Assert before any newline, or before end of subject unless noteol is |
5786 | | set. */ |
5787 | | |
5788 | 121k | case OP_DOLLM: |
5789 | 121k | if (Feptr < mb->end_subject) |
5790 | 120k | { |
5791 | 120k | if (!IS_NEWLINE(Feptr)) |
5792 | 118k | { |
5793 | 118k | if (mb->partial != 0 && |
5794 | 118k | Feptr + 1 >= mb->end_subject && |
5795 | 118k | NLBLOCK->nltype == NLTYPE_FIXED && |
5796 | 118k | NLBLOCK->nllen == 2 && |
5797 | 118k | UCHAR21TEST(Feptr) == NLBLOCK->nl[0]) |
5798 | 0 | { |
5799 | 0 | mb->hitend = TRUE; |
5800 | 0 | if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; |
5801 | 0 | } |
5802 | 118k | RRETURN(MATCH_NOMATCH); |
5803 | 0 | } |
5804 | 120k | } |
5805 | 1.60k | else |
5806 | 1.60k | { |
5807 | 1.60k | if ((mb->moptions & PCRE2_NOTEOL) != 0) RRETURN(MATCH_NOMATCH); |
5808 | 1.60k | SCHECK_PARTIAL(); |
5809 | 1.60k | } |
5810 | 3.73k | Fecode++; |
5811 | 3.73k | break; |
5812 | | |
5813 | | |
5814 | | /* ===================================================================== */ |
5815 | | /* Start of match assertion */ |
5816 | | |
5817 | 19.1M | case OP_SOM: |
5818 | 19.1M | if (Feptr != mb->start_subject + mb->start_offset) RRETURN(MATCH_NOMATCH); |
5819 | 22.2k | Fecode++; |
5820 | 22.2k | break; |
5821 | | |
5822 | | |
5823 | | /* ===================================================================== */ |
5824 | | /* Reset the start of match point */ |
5825 | | |
5826 | 334k | case OP_SET_SOM: |
5827 | 334k | Fstart_match = Feptr; |
5828 | 334k | Fecode++; |
5829 | 334k | break; |
5830 | | |
5831 | | |
5832 | | /* ===================================================================== */ |
5833 | | /* Word boundary assertions. Find out if the previous and current |
5834 | | characters are "word" characters. It takes a bit more work in UTF mode. |
5835 | | Characters > 255 are assumed to be "non-word" characters when PCRE2_UCP is |
5836 | | not set. When it is set, use Unicode properties if available, even when not |
5837 | | in UTF mode. Remember the earliest and latest consulted characters. */ |
5838 | | |
5839 | 157k | case OP_NOT_WORD_BOUNDARY: |
5840 | 5.85M | case OP_WORD_BOUNDARY: |
5841 | 5.85M | if (Feptr == mb->check_subject) prev_is_word = FALSE; else |
5842 | 5.84M | { |
5843 | 5.84M | PCRE2_SPTR lastptr = Feptr - 1; |
5844 | 5.84M | #ifdef SUPPORT_UNICODE |
5845 | 5.84M | if (utf) |
5846 | 0 | { |
5847 | 0 | BACKCHAR(lastptr); |
5848 | 0 | GETCHAR(fc, lastptr); |
5849 | 0 | } |
5850 | 5.84M | else |
5851 | 5.84M | #endif /* SUPPORT_UNICODE */ |
5852 | 5.84M | fc = *lastptr; |
5853 | 5.84M | if (lastptr < mb->start_used_ptr) mb->start_used_ptr = lastptr; |
5854 | 5.84M | #ifdef SUPPORT_UNICODE |
5855 | 5.84M | if ((mb->poptions & PCRE2_UCP) != 0) |
5856 | 0 | { |
5857 | 0 | if (fc == '_') prev_is_word = TRUE; else |
5858 | 0 | { |
5859 | 0 | int cat = UCD_CATEGORY(fc); |
5860 | 0 | prev_is_word = (cat == ucp_L || cat == ucp_N); |
5861 | 0 | } |
5862 | 0 | } |
5863 | 5.84M | else |
5864 | 5.84M | #endif /* SUPPORT_UNICODE */ |
5865 | 5.84M | prev_is_word = CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0; |
5866 | 5.84M | } |
5867 | | |
5868 | | /* Get status of next character */ |
5869 | | |
5870 | 5.85M | if (Feptr >= mb->end_subject) |
5871 | 36.0k | { |
5872 | 36.0k | SCHECK_PARTIAL(); |
5873 | 36.0k | cur_is_word = FALSE; |
5874 | 36.0k | } |
5875 | 5.81M | else |
5876 | 5.81M | { |
5877 | 5.81M | PCRE2_SPTR nextptr = Feptr + 1; |
5878 | 5.81M | #ifdef SUPPORT_UNICODE |
5879 | 5.81M | if (utf) |
5880 | 0 | { |
5881 | 0 | FORWARDCHARTEST(nextptr, mb->end_subject); |
5882 | 0 | GETCHAR(fc, Feptr); |
5883 | 0 | } |
5884 | 5.81M | else |
5885 | 5.81M | #endif /* SUPPORT_UNICODE */ |
5886 | 5.81M | fc = *Feptr; |
5887 | 5.81M | if (nextptr > mb->last_used_ptr) mb->last_used_ptr = nextptr; |
5888 | 5.81M | #ifdef SUPPORT_UNICODE |
5889 | 5.81M | if ((mb->poptions & PCRE2_UCP) != 0) |
5890 | 0 | { |
5891 | 0 | if (fc == '_') cur_is_word = TRUE; else |
5892 | 0 | { |
5893 | 0 | int cat = UCD_CATEGORY(fc); |
5894 | 0 | cur_is_word = (cat == ucp_L || cat == ucp_N); |
5895 | 0 | } |
5896 | 0 | } |
5897 | 5.81M | else |
5898 | 5.81M | #endif /* SUPPORT_UNICODE */ |
5899 | 5.81M | cur_is_word = CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0; |
5900 | 5.81M | } |
5901 | | |
5902 | | /* Now see if the situation is what we want */ |
5903 | | |
5904 | 5.85M | if ((*Fecode++ == OP_WORD_BOUNDARY)? |
5905 | 5.69M | cur_is_word == prev_is_word : cur_is_word != prev_is_word) |
5906 | 4.08M | RRETURN(MATCH_NOMATCH); |
5907 | 1.76M | break; |
5908 | | |
5909 | | |
5910 | | /* ===================================================================== */ |
5911 | | /* Backtracking (*VERB)s, with and without arguments. Note that if the |
5912 | | pattern is successfully matched, we do not come back from RMATCH. */ |
5913 | | |
5914 | 0 | case OP_MARK: |
5915 | 0 | Fmark = mb->nomatch_mark = Fecode + 2; |
5916 | 0 | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM12); |
5917 | | |
5918 | | /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an |
5919 | | argument, and we must check whether that argument matches this MARK's |
5920 | | argument. It is passed back in mb->verb_skip_ptr. If it does match, we |
5921 | | return MATCH_SKIP with mb->verb_skip_ptr now pointing to the subject |
5922 | | position that corresponds to this mark. Otherwise, pass back the return |
5923 | | code unaltered. */ |
5924 | |
|
5925 | 0 | if (rrc == MATCH_SKIP_ARG && |
5926 | 0 | PRIV(strcmp)(Fecode + 2, mb->verb_skip_ptr) == 0) |
5927 | 0 | { |
5928 | 0 | mb->verb_skip_ptr = Feptr; /* Pass back current position */ |
5929 | 0 | RRETURN(MATCH_SKIP); |
5930 | 0 | } |
5931 | 0 | RRETURN(rrc); |
5932 | |
|
5933 | 0 | case OP_FAIL: |
5934 | 0 | RRETURN(MATCH_NOMATCH); |
5935 | | |
5936 | | /* Record the current recursing group number in mb->verb_current_recurse |
5937 | | when a backtracking return such as MATCH_COMMIT is given. This enables the |
5938 | | recurse processing to catch verbs from within the recursion. */ |
5939 | |
|
5940 | 0 | case OP_COMMIT: |
5941 | 0 | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM13); |
5942 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
5943 | 0 | mb->verb_current_recurse = Fcurrent_recurse; |
5944 | 0 | RRETURN(MATCH_COMMIT); |
5945 | |
|
5946 | 0 | case OP_COMMIT_ARG: |
5947 | 0 | Fmark = mb->nomatch_mark = Fecode + 2; |
5948 | 0 | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM36); |
5949 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
5950 | 0 | mb->verb_current_recurse = Fcurrent_recurse; |
5951 | 0 | RRETURN(MATCH_COMMIT); |
5952 | |
|
5953 | 0 | case OP_PRUNE: |
5954 | 0 | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM14); |
5955 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
5956 | 0 | mb->verb_current_recurse = Fcurrent_recurse; |
5957 | 0 | RRETURN(MATCH_PRUNE); |
5958 | |
|
5959 | 0 | case OP_PRUNE_ARG: |
5960 | 0 | Fmark = mb->nomatch_mark = Fecode + 2; |
5961 | 0 | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM15); |
5962 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
5963 | 0 | mb->verb_current_recurse = Fcurrent_recurse; |
5964 | 0 | RRETURN(MATCH_PRUNE); |
5965 | |
|
5966 | 0 | case OP_SKIP: |
5967 | 0 | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM16); |
5968 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
5969 | 0 | mb->verb_skip_ptr = Feptr; /* Pass back current position */ |
5970 | 0 | mb->verb_current_recurse = Fcurrent_recurse; |
5971 | 0 | RRETURN(MATCH_SKIP); |
5972 | | |
5973 | | /* Note that, for Perl compatibility, SKIP with an argument does NOT set |
5974 | | nomatch_mark. When a pattern match ends with a SKIP_ARG for which there was |
5975 | | not a matching mark, we have to re-run the match, ignoring the SKIP_ARG |
5976 | | that failed and any that precede it (either they also failed, or were not |
5977 | | triggered). To do this, we maintain a count of executed SKIP_ARGs. If a |
5978 | | SKIP_ARG gets to top level, the match is re-run with mb->ignore_skip_arg |
5979 | | set to the count of the one that failed. */ |
5980 | |
|
5981 | 0 | case OP_SKIP_ARG: |
5982 | 0 | mb->skip_arg_count++; |
5983 | 0 | if (mb->skip_arg_count <= mb->ignore_skip_arg) |
5984 | 0 | { |
5985 | 0 | Fecode += PRIV(OP_lengths)[*Fecode] + Fecode[1]; |
5986 | 0 | break; |
5987 | 0 | } |
5988 | 0 | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM17); |
5989 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
5990 | | |
5991 | | /* Pass back the current skip name and return the special MATCH_SKIP_ARG |
5992 | | return code. This will either be caught by a matching MARK, or get to the |
5993 | | top, where it causes a rematch with mb->ignore_skip_arg set to the value of |
5994 | | mb->skip_arg_count. */ |
5995 | |
|
5996 | 0 | mb->verb_skip_ptr = Fecode + 2; |
5997 | 0 | mb->verb_current_recurse = Fcurrent_recurse; |
5998 | 0 | RRETURN(MATCH_SKIP_ARG); |
5999 | | |
6000 | | /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that |
6001 | | the branch in which it occurs can be determined. */ |
6002 | |
|
6003 | 0 | case OP_THEN: |
6004 | 0 | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM18); |
6005 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
6006 | 0 | mb->verb_ecode_ptr = Fecode; |
6007 | 0 | mb->verb_current_recurse = Fcurrent_recurse; |
6008 | 0 | RRETURN(MATCH_THEN); |
6009 | |
|
6010 | 0 | case OP_THEN_ARG: |
6011 | 0 | Fmark = mb->nomatch_mark = Fecode + 2; |
6012 | 0 | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM19); |
6013 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
6014 | 0 | mb->verb_ecode_ptr = Fecode; |
6015 | 0 | mb->verb_current_recurse = Fcurrent_recurse; |
6016 | 0 | RRETURN(MATCH_THEN); |
6017 | | |
6018 | | |
6019 | | /* ===================================================================== */ |
6020 | | /* There's been some horrible disaster. Arrival here can only mean there is |
6021 | | something seriously wrong in the code above or the OP_xxx definitions. */ |
6022 | |
|
6023 | 0 | default: |
6024 | 0 | return PCRE2_ERROR_INTERNAL; |
6025 | 1.77G | } |
6026 | | |
6027 | | /* Do not insert any code in here without much thought; it is assumed |
6028 | | that "continue" in the code above comes out to here to repeat the main |
6029 | | loop. */ |
6030 | | |
6031 | 1.77G | } /* End of main loop */ |
6032 | | /* Control never reaches here */ |
6033 | | |
6034 | | |
6035 | | /* ========================================================================= */ |
6036 | | /* The RRETURN() macro jumps here. The number that is saved in Freturn_id |
6037 | | indicates which label we actually want to return to. The value in Frdepth is |
6038 | | the index number of the frame in the vector. The return value has been placed |
6039 | | in rrc. */ |
6040 | | |
6041 | 1.08G | #define LBL(val) case val: goto L_RM##val; |
6042 | | |
6043 | 1.13G | RETURN_SWITCH: |
6044 | 1.13G | if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr; |
6045 | 1.13G | if (Frdepth == 0) return rrc; /* Exit from the top level */ |
6046 | 1.08G | F = (heapframe *)((char *)F - Fback_frame); /* Backtrack */ |
6047 | 1.08G | mb->cb->callout_flags |= PCRE2_CALLOUT_BACKTRACK; /* Note for callouts */ |
6048 | | |
6049 | | #ifdef DEBUG_SHOW_RMATCH |
6050 | | fprintf(stderr, "++ RETURN %d to %d\n", rrc, Freturn_id); |
6051 | | #endif |
6052 | | |
6053 | 1.08G | switch (Freturn_id) |
6054 | 1.08G | { |
6055 | 155M | LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8) |
6056 | 15.3M | LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(16) |
6057 | 2.57M | LBL(17) LBL(18) LBL(19) LBL(20) LBL(21) LBL(22) LBL(23) LBL(24) |
6058 | 143k | LBL(25) LBL(26) LBL(27) LBL(28) LBL(29) LBL(30) LBL(31) LBL(32) |
6059 | 628M | LBL(33) LBL(34) LBL(35) LBL(36) |
6060 | | |
6061 | 0 | #ifdef SUPPORT_WIDE_CHARS |
6062 | 615 | LBL(100) LBL(101) |
6063 | 0 | #endif |
6064 | | |
6065 | 0 | #ifdef SUPPORT_UNICODE |
6066 | 0 | LBL(200) LBL(201) LBL(202) LBL(203) LBL(204) LBL(205) LBL(206) |
6067 | 134k | LBL(207) LBL(208) LBL(209) LBL(210) LBL(211) LBL(212) LBL(213) |
6068 | 72.5M | LBL(214) LBL(215) LBL(216) LBL(217) LBL(218) LBL(219) LBL(220) |
6069 | 2.78M | LBL(221) LBL(222) |
6070 | 0 | #endif |
6071 | | |
6072 | 0 | default: |
6073 | 0 | return PCRE2_ERROR_INTERNAL; |
6074 | 1.08G | } |
6075 | 1.08G | #undef LBL |
6076 | 1.08G | } |
6077 | | |
6078 | | |
6079 | | /************************************************* |
6080 | | * Match a Regular Expression * |
6081 | | *************************************************/ |
6082 | | |
6083 | | /* This function applies a compiled pattern to a subject string and picks out |
6084 | | portions of the string if it matches. Two elements in the vector are set for |
6085 | | each substring: the offsets to the start and end of the substring. |
6086 | | |
6087 | | Arguments: |
6088 | | code points to the compiled expression |
6089 | | subject points to the subject string |
6090 | | length length of subject string (may contain binary zeros) |
6091 | | start_offset where to start in the subject string |
6092 | | options option bits |
6093 | | match_data points to a match_data block |
6094 | | mcontext points to a PCRE2 match context |
6095 | | |
6096 | | Returns: > 0 => success; value is the number of ovector pairs filled |
6097 | | = 0 => success, but ovector is not big enough |
6098 | | = -1 => failed to match (PCRE2_ERROR_NOMATCH) |
6099 | | = -2 => partial match (PCRE2_ERROR_PARTIAL) |
6100 | | < -2 => some kind of unexpected problem |
6101 | | */ |
6102 | | |
6103 | | PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION |
6104 | | pcre2_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, |
6105 | | PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data, |
6106 | | pcre2_match_context *mcontext) |
6107 | 3.71M | { |
6108 | 3.71M | int rc; |
6109 | 3.71M | int was_zero_terminated = 0; |
6110 | 3.71M | const uint8_t *start_bits = NULL; |
6111 | 3.71M | const pcre2_real_code *re = (const pcre2_real_code *)code; |
6112 | | |
6113 | 3.71M | BOOL anchored; |
6114 | 3.71M | BOOL firstline; |
6115 | 3.71M | BOOL has_first_cu = FALSE; |
6116 | 3.71M | BOOL has_req_cu = FALSE; |
6117 | 3.71M | BOOL startline; |
6118 | | |
6119 | 3.71M | #if PCRE2_CODE_UNIT_WIDTH == 8 |
6120 | 3.71M | PCRE2_SPTR memchr_found_first_cu; |
6121 | 3.71M | PCRE2_SPTR memchr_found_first_cu2; |
6122 | 3.71M | #endif |
6123 | | |
6124 | 3.71M | PCRE2_UCHAR first_cu = 0; |
6125 | 3.71M | PCRE2_UCHAR first_cu2 = 0; |
6126 | 3.71M | PCRE2_UCHAR req_cu = 0; |
6127 | 3.71M | PCRE2_UCHAR req_cu2 = 0; |
6128 | | |
6129 | 3.71M | PCRE2_SPTR bumpalong_limit; |
6130 | 3.71M | PCRE2_SPTR end_subject; |
6131 | 3.71M | PCRE2_SPTR true_end_subject; |
6132 | 3.71M | PCRE2_SPTR start_match = subject + start_offset; |
6133 | 3.71M | PCRE2_SPTR req_cu_ptr = start_match - 1; |
6134 | 3.71M | PCRE2_SPTR start_partial; |
6135 | 3.71M | PCRE2_SPTR match_partial; |
6136 | | |
6137 | | #ifdef SUPPORT_JIT |
6138 | | BOOL use_jit; |
6139 | | #endif |
6140 | | |
6141 | | /* This flag is needed even when Unicode is not supported for convenience |
6142 | | (it is used by the IS_NEWLINE macro). */ |
6143 | | |
6144 | 3.71M | BOOL utf = FALSE; |
6145 | | |
6146 | 3.71M | #ifdef SUPPORT_UNICODE |
6147 | 3.71M | BOOL ucp = FALSE; |
6148 | 3.71M | BOOL allow_invalid; |
6149 | 3.71M | uint32_t fragment_options = 0; |
6150 | | #ifdef SUPPORT_JIT |
6151 | | BOOL jit_checked_utf = FALSE; |
6152 | | #endif |
6153 | 3.71M | #endif /* SUPPORT_UNICODE */ |
6154 | | |
6155 | 3.71M | PCRE2_SIZE frame_size; |
6156 | | |
6157 | | /* We need to have mb as a pointer to a match block, because the IS_NEWLINE |
6158 | | macro is used below, and it expects NLBLOCK to be defined as a pointer. */ |
6159 | | |
6160 | 3.71M | pcre2_callout_block cb; |
6161 | 3.71M | match_block actual_match_block; |
6162 | 3.71M | match_block *mb = &actual_match_block; |
6163 | | |
6164 | | /* Allocate an initial vector of backtracking frames on the stack. If this |
6165 | | proves to be too small, it is replaced by a larger one on the heap. To get a |
6166 | | vector of the size required that is aligned for pointers, allocate it as a |
6167 | | vector of pointers. */ |
6168 | | |
6169 | 3.71M | PCRE2_SPTR stack_frames_vector[START_FRAMES_SIZE/sizeof(PCRE2_SPTR)] |
6170 | 3.71M | PCRE2_KEEP_UNINITIALIZED; |
6171 | 3.71M | mb->stack_frames = (heapframe *)stack_frames_vector; |
6172 | | |
6173 | | /* A length equal to PCRE2_ZERO_TERMINATED implies a zero-terminated |
6174 | | subject string. */ |
6175 | | |
6176 | 3.71M | if (length == PCRE2_ZERO_TERMINATED) |
6177 | 0 | { |
6178 | 0 | length = PRIV(strlen)(subject); |
6179 | 0 | was_zero_terminated = 1; |
6180 | 0 | } |
6181 | 3.71M | true_end_subject = end_subject = subject + length; |
6182 | | |
6183 | | /* Plausibility checks */ |
6184 | | |
6185 | 3.71M | if ((options & ~PUBLIC_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION; |
6186 | 3.71M | if (code == NULL || subject == NULL || match_data == NULL) |
6187 | 0 | return PCRE2_ERROR_NULL; |
6188 | 3.71M | if (start_offset > length) return PCRE2_ERROR_BADOFFSET; |
6189 | | |
6190 | | /* Check that the first field in the block is the magic number. */ |
6191 | | |
6192 | 3.71M | if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC; |
6193 | | |
6194 | | /* Check the code unit width. */ |
6195 | | |
6196 | 3.71M | if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8) |
6197 | 0 | return PCRE2_ERROR_BADMODE; |
6198 | | |
6199 | | /* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the |
6200 | | options variable for this function. Users of PCRE2 who are not calling the |
6201 | | function directly would like to have a way of setting these flags, in the same |
6202 | | way that they can set pcre2_compile() flags like PCRE2_NO_AUTOPOSSESS with |
6203 | | constructions like (*NO_AUTOPOSSESS). To enable this, (*NOTEMPTY) and |
6204 | | (*NOTEMPTY_ATSTART) set bits in the pattern's "flags" field which we now |
6205 | | transfer to the options for this function. The bits are guaranteed to be |
6206 | | adjacent, but do not have the same values. This bit of Boolean trickery assumes |
6207 | | that the match-time bits are not more significant than the flag bits. If by |
6208 | | accident this is not the case, a compile-time division by zero error will |
6209 | | occur. */ |
6210 | | |
6211 | 11.1M | #define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET) |
6212 | 7.42M | #define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART) |
6213 | 3.71M | options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1))); |
6214 | 3.71M | #undef FF |
6215 | 3.71M | #undef OO |
6216 | | |
6217 | | /* If the pattern was successfully studied with JIT support, we will run the |
6218 | | JIT executable instead of the rest of this function. Most options must be set |
6219 | | at compile time for the JIT code to be usable. */ |
6220 | | |
6221 | | #ifdef SUPPORT_JIT |
6222 | | use_jit = (re->executable_jit != NULL && |
6223 | | (options & ~PUBLIC_JIT_MATCH_OPTIONS) == 0); |
6224 | | #endif |
6225 | | |
6226 | | /* Initialize UTF/UCP parameters. */ |
6227 | | |
6228 | 3.71M | #ifdef SUPPORT_UNICODE |
6229 | 3.71M | utf = (re->overall_options & PCRE2_UTF) != 0; |
6230 | 3.71M | allow_invalid = (re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0; |
6231 | 3.71M | ucp = (re->overall_options & PCRE2_UCP) != 0; |
6232 | 3.71M | #endif /* SUPPORT_UNICODE */ |
6233 | | |
6234 | | /* Convert the partial matching flags into an integer. */ |
6235 | | |
6236 | 3.71M | mb->partial = ((options & PCRE2_PARTIAL_HARD) != 0)? 2 : |
6237 | 3.71M | ((options & PCRE2_PARTIAL_SOFT) != 0)? 1 : 0; |
6238 | | |
6239 | | /* Partial matching and PCRE2_ENDANCHORED are currently not allowed at the same |
6240 | | time. */ |
6241 | | |
6242 | 3.71M | if (mb->partial != 0 && |
6243 | 3.71M | ((re->overall_options | options) & PCRE2_ENDANCHORED) != 0) |
6244 | 0 | return PCRE2_ERROR_BADOPTION; |
6245 | | |
6246 | | /* It is an error to set an offset limit without setting the flag at compile |
6247 | | time. */ |
6248 | | |
6249 | 3.71M | if (mcontext != NULL && mcontext->offset_limit != PCRE2_UNSET && |
6250 | 3.71M | (re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0) |
6251 | 0 | return PCRE2_ERROR_BADOFFSETLIMIT; |
6252 | | |
6253 | | /* If the match data block was previously used with PCRE2_COPY_MATCHED_SUBJECT, |
6254 | | free the memory that was obtained. Set the field to NULL for no match cases. */ |
6255 | | |
6256 | 3.71M | if ((match_data->flags & PCRE2_MD_COPIED_SUBJECT) != 0) |
6257 | 0 | { |
6258 | 0 | match_data->memctl.free((void *)match_data->subject, |
6259 | 0 | match_data->memctl.memory_data); |
6260 | 0 | match_data->flags &= ~PCRE2_MD_COPIED_SUBJECT; |
6261 | 0 | } |
6262 | 3.71M | match_data->subject = NULL; |
6263 | | |
6264 | | /* Zero the error offset in case the first code unit is invalid UTF. */ |
6265 | | |
6266 | 3.71M | match_data->startchar = 0; |
6267 | | |
6268 | | |
6269 | | /* ============================= JIT matching ============================== */ |
6270 | | |
6271 | | /* Prepare for JIT matching. Check a UTF string for validity unless no check is |
6272 | | requested or invalid UTF can be handled. We check only the portion of the |
6273 | | subject that might be inspected during matching - from the offset minus the |
6274 | | maximum lookbehind to the given length. This saves time when a small part of a |
6275 | | large subject is being matched by the use of a starting offset. Note that the |
6276 | | maximum lookbehind is a number of characters, not code units. */ |
6277 | | |
6278 | | #ifdef SUPPORT_JIT |
6279 | | if (use_jit) |
6280 | | { |
6281 | | #ifdef SUPPORT_UNICODE |
6282 | | if (utf && (options & PCRE2_NO_UTF_CHECK) == 0 && !allow_invalid) |
6283 | | { |
6284 | | #if PCRE2_CODE_UNIT_WIDTH != 32 |
6285 | | unsigned int i; |
6286 | | #endif |
6287 | | |
6288 | | /* For 8-bit and 16-bit UTF, check that the first code unit is a valid |
6289 | | character start. */ |
6290 | | |
6291 | | #if PCRE2_CODE_UNIT_WIDTH != 32 |
6292 | | if (start_match < end_subject && NOT_FIRSTCU(*start_match)) |
6293 | | { |
6294 | | if (start_offset > 0) return PCRE2_ERROR_BADUTFOFFSET; |
6295 | | #if PCRE2_CODE_UNIT_WIDTH == 8 |
6296 | | return PCRE2_ERROR_UTF8_ERR20; /* Isolated 0x80 byte */ |
6297 | | #else |
6298 | | return PCRE2_ERROR_UTF16_ERR3; /* Isolated low surrogate */ |
6299 | | #endif |
6300 | | } |
6301 | | #endif /* WIDTH != 32 */ |
6302 | | |
6303 | | /* Move back by the maximum lookbehind, just in case it happens at the very |
6304 | | start of matching. */ |
6305 | | |
6306 | | #if PCRE2_CODE_UNIT_WIDTH != 32 |
6307 | | for (i = re->max_lookbehind; i > 0 && start_match > subject; i--) |
6308 | | { |
6309 | | start_match--; |
6310 | | while (start_match > subject && |
6311 | | #if PCRE2_CODE_UNIT_WIDTH == 8 |
6312 | | (*start_match & 0xc0) == 0x80) |
6313 | | #else /* 16-bit */ |
6314 | | (*start_match & 0xfc00) == 0xdc00) |
6315 | | #endif |
6316 | | start_match--; |
6317 | | } |
6318 | | #else /* PCRE2_CODE_UNIT_WIDTH != 32 */ |
6319 | | |
6320 | | /* In the 32-bit library, one code unit equals one character. However, |
6321 | | we cannot just subtract the lookbehind and then compare pointers, because |
6322 | | a very large lookbehind could create an invalid pointer. */ |
6323 | | |
6324 | | if (start_offset >= re->max_lookbehind) |
6325 | | start_match -= re->max_lookbehind; |
6326 | | else |
6327 | | start_match = subject; |
6328 | | #endif /* PCRE2_CODE_UNIT_WIDTH != 32 */ |
6329 | | |
6330 | | /* Validate the relevant portion of the subject. Adjust the offset of an |
6331 | | invalid code point to be an absolute offset in the whole string. */ |
6332 | | |
6333 | | match_data->rc = PRIV(valid_utf)(start_match, |
6334 | | length - (start_match - subject), &(match_data->startchar)); |
6335 | | if (match_data->rc != 0) |
6336 | | { |
6337 | | match_data->startchar += start_match - subject; |
6338 | | return match_data->rc; |
6339 | | } |
6340 | | jit_checked_utf = TRUE; |
6341 | | } |
6342 | | #endif /* SUPPORT_UNICODE */ |
6343 | | |
6344 | | /* If JIT returns BADOPTION, which means that the selected complete or |
6345 | | partial matching mode was not compiled, fall through to the interpreter. */ |
6346 | | |
6347 | | rc = pcre2_jit_match(code, subject, length, start_offset, options, |
6348 | | match_data, mcontext); |
6349 | | if (rc != PCRE2_ERROR_JIT_BADOPTION) |
6350 | | { |
6351 | | if (rc >= 0 && (options & PCRE2_COPY_MATCHED_SUBJECT) != 0) |
6352 | | { |
6353 | | length = CU2BYTES(length + was_zero_terminated); |
6354 | | match_data->subject = match_data->memctl.malloc(length, |
6355 | | match_data->memctl.memory_data); |
6356 | | if (match_data->subject == NULL) return PCRE2_ERROR_NOMEMORY; |
6357 | | memcpy((void *)match_data->subject, subject, length); |
6358 | | match_data->flags |= PCRE2_MD_COPIED_SUBJECT; |
6359 | | } |
6360 | | return rc; |
6361 | | } |
6362 | | } |
6363 | | #endif /* SUPPORT_JIT */ |
6364 | | |
6365 | | /* ========================= End of JIT matching ========================== */ |
6366 | | |
6367 | | |
6368 | | /* Proceed with non-JIT matching. The default is to allow lookbehinds to the |
6369 | | start of the subject. A UTF check when there is a non-zero offset may change |
6370 | | this. */ |
6371 | | |
6372 | 3.71M | mb->check_subject = subject; |
6373 | | |
6374 | | /* If a UTF subject string was not checked for validity in the JIT code above, |
6375 | | check it here, and handle support for invalid UTF strings. The check above |
6376 | | happens only when invalid UTF is not supported and PCRE2_NO_UTF_CHECK is unset. |
6377 | | If we get here in those circumstances, it means the subject string is valid, |
6378 | | but for some reason JIT matching was not successful. There is no need to check |
6379 | | the subject again. |
6380 | | |
6381 | | We check only the portion of the subject that might be inspected during |
6382 | | matching - from the offset minus the maximum lookbehind to the given length. |
6383 | | This saves time when a small part of a large subject is being matched by the |
6384 | | use of a starting offset. Note that the maximum lookbehind is a number of |
6385 | | characters, not code units. |
6386 | | |
6387 | | Note also that support for invalid UTF forces a check, overriding the setting |
6388 | | of PCRE2_NO_UTF_CHECK. */ |
6389 | | |
6390 | 3.71M | #ifdef SUPPORT_UNICODE |
6391 | 3.71M | if (utf && |
6392 | | #ifdef SUPPORT_JIT |
6393 | | !jit_checked_utf && |
6394 | | #endif |
6395 | 3.71M | ((options & PCRE2_NO_UTF_CHECK) == 0 || allow_invalid)) |
6396 | 0 | { |
6397 | 0 | #if PCRE2_CODE_UNIT_WIDTH != 32 |
6398 | 0 | BOOL skipped_bad_start = FALSE; |
6399 | 0 | #endif |
6400 | | |
6401 | | /* For 8-bit and 16-bit UTF, check that the first code unit is a valid |
6402 | | character start. If we are handling invalid UTF, just skip over such code |
6403 | | units. Otherwise, give an appropriate error. */ |
6404 | |
|
6405 | 0 | #if PCRE2_CODE_UNIT_WIDTH != 32 |
6406 | 0 | if (allow_invalid) |
6407 | 0 | { |
6408 | 0 | while (start_match < end_subject && NOT_FIRSTCU(*start_match)) |
6409 | 0 | { |
6410 | 0 | start_match++; |
6411 | 0 | skipped_bad_start = TRUE; |
6412 | 0 | } |
6413 | 0 | } |
6414 | 0 | else if (start_match < end_subject && NOT_FIRSTCU(*start_match)) |
6415 | 0 | { |
6416 | 0 | if (start_offset > 0) return PCRE2_ERROR_BADUTFOFFSET; |
6417 | 0 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
6418 | 0 | return PCRE2_ERROR_UTF8_ERR20; /* Isolated 0x80 byte */ |
6419 | | #else |
6420 | | return PCRE2_ERROR_UTF16_ERR3; /* Isolated low surrogate */ |
6421 | | #endif |
6422 | 0 | } |
6423 | 0 | #endif /* WIDTH != 32 */ |
6424 | | |
6425 | | /* The mb->check_subject field points to the start of UTF checking; |
6426 | | lookbehinds can go back no further than this. */ |
6427 | | |
6428 | 0 | mb->check_subject = start_match; |
6429 | | |
6430 | | /* Move back by the maximum lookbehind, just in case it happens at the very |
6431 | | start of matching, but don't do this if we skipped bad 8-bit or 16-bit code |
6432 | | units above. */ |
6433 | |
|
6434 | 0 | #if PCRE2_CODE_UNIT_WIDTH != 32 |
6435 | 0 | if (!skipped_bad_start) |
6436 | 0 | { |
6437 | 0 | unsigned int i; |
6438 | 0 | for (i = re->max_lookbehind; i > 0 && mb->check_subject > subject; i--) |
6439 | 0 | { |
6440 | 0 | mb->check_subject--; |
6441 | 0 | while (mb->check_subject > subject && |
6442 | 0 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
6443 | 0 | (*mb->check_subject & 0xc0) == 0x80) |
6444 | | #else /* 16-bit */ |
6445 | | (*mb->check_subject & 0xfc00) == 0xdc00) |
6446 | | #endif |
6447 | 0 | mb->check_subject--; |
6448 | 0 | } |
6449 | 0 | } |
6450 | | #else /* PCRE2_CODE_UNIT_WIDTH != 32 */ |
6451 | | |
6452 | | /* In the 32-bit library, one code unit equals one character. However, |
6453 | | we cannot just subtract the lookbehind and then compare pointers, because |
6454 | | a very large lookbehind could create an invalid pointer. */ |
6455 | | |
6456 | | if (start_offset >= re->max_lookbehind) |
6457 | | mb->check_subject -= re->max_lookbehind; |
6458 | | else |
6459 | | mb->check_subject = subject; |
6460 | | #endif /* PCRE2_CODE_UNIT_WIDTH != 32 */ |
6461 | | |
6462 | | /* Validate the relevant portion of the subject. There's a loop in case we |
6463 | | encounter bad UTF in the characters preceding start_match which we are |
6464 | | scanning because of a lookbehind. */ |
6465 | |
|
6466 | 0 | for (;;) |
6467 | 0 | { |
6468 | 0 | match_data->rc = PRIV(valid_utf)(mb->check_subject, |
6469 | 0 | length - (mb->check_subject - subject), &(match_data->startchar)); |
6470 | |
|
6471 | 0 | if (match_data->rc == 0) break; /* Valid UTF string */ |
6472 | | |
6473 | | /* Invalid UTF string. Adjust the offset to be an absolute offset in the |
6474 | | whole string. If we are handling invalid UTF strings, set end_subject to |
6475 | | stop before the bad code unit, and set the options to "not end of line". |
6476 | | Otherwise return the error. */ |
6477 | | |
6478 | 0 | match_data->startchar += mb->check_subject - subject; |
6479 | 0 | if (!allow_invalid || match_data->rc > 0) return match_data->rc; |
6480 | 0 | end_subject = subject + match_data->startchar; |
6481 | | |
6482 | | /* If the end precedes start_match, it means there is invalid UTF in the |
6483 | | extra code units we reversed over because of a lookbehind. Advance past the |
6484 | | first bad code unit, and then skip invalid character starting code units in |
6485 | | 8-bit and 16-bit modes, and try again. */ |
6486 | |
|
6487 | 0 | if (end_subject < start_match) |
6488 | 0 | { |
6489 | 0 | mb->check_subject = end_subject + 1; |
6490 | 0 | #if PCRE2_CODE_UNIT_WIDTH != 32 |
6491 | 0 | while (mb->check_subject < start_match && NOT_FIRSTCU(*mb->check_subject)) |
6492 | 0 | mb->check_subject++; |
6493 | 0 | #endif |
6494 | 0 | } |
6495 | | |
6496 | | /* Otherwise, set the not end of line option, and do the match. */ |
6497 | | |
6498 | 0 | else |
6499 | 0 | { |
6500 | 0 | fragment_options = PCRE2_NOTEOL; |
6501 | 0 | break; |
6502 | 0 | } |
6503 | 0 | } |
6504 | 0 | } |
6505 | 3.71M | #endif /* SUPPORT_UNICODE */ |
6506 | | |
6507 | | /* A NULL match context means "use a default context", but we take the memory |
6508 | | control functions from the pattern. */ |
6509 | | |
6510 | 3.71M | if (mcontext == NULL) |
6511 | 1.51M | { |
6512 | 1.51M | mcontext = (pcre2_match_context *)(&PRIV(default_match_context)); |
6513 | 1.51M | mb->memctl = re->memctl; |
6514 | 1.51M | } |
6515 | 2.19M | else mb->memctl = mcontext->memctl; |
6516 | | |
6517 | 3.71M | anchored = ((re->overall_options | options) & PCRE2_ANCHORED) != 0; |
6518 | 3.71M | firstline = (re->overall_options & PCRE2_FIRSTLINE) != 0; |
6519 | 3.71M | startline = (re->flags & PCRE2_STARTLINE) != 0; |
6520 | 3.71M | bumpalong_limit = (mcontext->offset_limit == PCRE2_UNSET)? |
6521 | 3.71M | true_end_subject : subject + mcontext->offset_limit; |
6522 | | |
6523 | | /* Initialize and set up the fixed fields in the callout block, with a pointer |
6524 | | in the match block. */ |
6525 | | |
6526 | 3.71M | mb->cb = &cb; |
6527 | 3.71M | cb.version = 2; |
6528 | 3.71M | cb.subject = subject; |
6529 | 3.71M | cb.subject_length = (PCRE2_SIZE)(end_subject - subject); |
6530 | 3.71M | cb.callout_flags = 0; |
6531 | | |
6532 | | /* Fill in the remaining fields in the match block, except for moptions, which |
6533 | | gets set later. */ |
6534 | | |
6535 | 3.71M | mb->callout = mcontext->callout; |
6536 | 3.71M | mb->callout_data = mcontext->callout_data; |
6537 | | |
6538 | 3.71M | mb->start_subject = subject; |
6539 | 3.71M | mb->start_offset = start_offset; |
6540 | 3.71M | mb->end_subject = end_subject; |
6541 | 3.71M | mb->hasthen = (re->flags & PCRE2_HASTHEN) != 0; |
6542 | 3.71M | mb->allowemptypartial = (re->max_lookbehind > 0) || |
6543 | 3.71M | (re->flags & PCRE2_MATCH_EMPTY) != 0; |
6544 | 3.71M | mb->poptions = re->overall_options; /* Pattern options */ |
6545 | 3.71M | mb->ignore_skip_arg = 0; |
6546 | 3.71M | mb->mark = mb->nomatch_mark = NULL; /* In case never set */ |
6547 | | |
6548 | | /* The name table is needed for finding all the numbers associated with a |
6549 | | given name, for condition testing. The code follows the name table. */ |
6550 | | |
6551 | 3.71M | mb->name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)); |
6552 | 3.71M | mb->name_count = re->name_count; |
6553 | 3.71M | mb->name_entry_size = re->name_entry_size; |
6554 | 3.71M | mb->start_code = mb->name_table + re->name_count * re->name_entry_size; |
6555 | | |
6556 | | /* Process the \R and newline settings. */ |
6557 | | |
6558 | 3.71M | mb->bsr_convention = re->bsr_convention; |
6559 | 3.71M | mb->nltype = NLTYPE_FIXED; |
6560 | 3.71M | switch(re->newline_convention) |
6561 | 3.71M | { |
6562 | 0 | case PCRE2_NEWLINE_CR: |
6563 | 0 | mb->nllen = 1; |
6564 | 0 | mb->nl[0] = CHAR_CR; |
6565 | 0 | break; |
6566 | | |
6567 | 3.71M | case PCRE2_NEWLINE_LF: |
6568 | 3.71M | mb->nllen = 1; |
6569 | 3.71M | mb->nl[0] = CHAR_NL; |
6570 | 3.71M | break; |
6571 | | |
6572 | 0 | case PCRE2_NEWLINE_NUL: |
6573 | 0 | mb->nllen = 1; |
6574 | 0 | mb->nl[0] = CHAR_NUL; |
6575 | 0 | break; |
6576 | | |
6577 | 0 | case PCRE2_NEWLINE_CRLF: |
6578 | 0 | mb->nllen = 2; |
6579 | 0 | mb->nl[0] = CHAR_CR; |
6580 | 0 | mb->nl[1] = CHAR_NL; |
6581 | 0 | break; |
6582 | | |
6583 | 0 | case PCRE2_NEWLINE_ANY: |
6584 | 0 | mb->nltype = NLTYPE_ANY; |
6585 | 0 | break; |
6586 | | |
6587 | 0 | case PCRE2_NEWLINE_ANYCRLF: |
6588 | 0 | mb->nltype = NLTYPE_ANYCRLF; |
6589 | 0 | break; |
6590 | | |
6591 | 0 | default: return PCRE2_ERROR_INTERNAL; |
6592 | 3.71M | } |
6593 | | |
6594 | | /* The backtracking frames have fixed data at the front, and a PCRE2_SIZE |
6595 | | vector at the end, whose size depends on the number of capturing parentheses in |
6596 | | the pattern. It is not used at all if there are no capturing parentheses. |
6597 | | |
6598 | | frame_size is the total size of each frame |
6599 | | mb->frame_vector_size is the total usable size of the vector (rounded down |
6600 | | to a whole number of frames) |
6601 | | |
6602 | | The last of these is changed within the match() function if the frame vector |
6603 | | has to be expanded. We therefore put it into the match block so that it is |
6604 | | correct when calling match() more than once for non-anchored patterns. */ |
6605 | | |
6606 | 3.71M | frame_size = offsetof(heapframe, ovector) + |
6607 | 3.71M | re->top_bracket * 2 * sizeof(PCRE2_SIZE); |
6608 | | |
6609 | | /* Limits set in the pattern override the match context only if they are |
6610 | | smaller. */ |
6611 | | |
6612 | 3.71M | mb->heap_limit = (mcontext->heap_limit < re->limit_heap)? |
6613 | 3.71M | mcontext->heap_limit : re->limit_heap; |
6614 | | |
6615 | 3.71M | mb->match_limit = (mcontext->match_limit < re->limit_match)? |
6616 | 3.71M | mcontext->match_limit : re->limit_match; |
6617 | | |
6618 | 3.71M | mb->match_limit_depth = (mcontext->depth_limit < re->limit_depth)? |
6619 | 3.71M | mcontext->depth_limit : re->limit_depth; |
6620 | | |
6621 | | /* If a pattern has very many capturing parentheses, the frame size may be very |
6622 | | large. Ensure that there are at least 10 available frames by getting an initial |
6623 | | vector on the heap if necessary, except when the heap limit prevents this. Get |
6624 | | fewer if possible. (The heap limit is in kibibytes.) */ |
6625 | | |
6626 | 3.71M | if (frame_size <= START_FRAMES_SIZE/10) |
6627 | 3.71M | { |
6628 | 3.71M | mb->match_frames = mb->stack_frames; /* Initial frame vector on the stack */ |
6629 | 3.71M | mb->frame_vector_size = ((START_FRAMES_SIZE/frame_size) * frame_size); |
6630 | 3.71M | } |
6631 | 0 | else |
6632 | 0 | { |
6633 | 0 | mb->frame_vector_size = frame_size * 10; |
6634 | 0 | if ((mb->frame_vector_size / 1024) > mb->heap_limit) |
6635 | 0 | { |
6636 | 0 | if (frame_size > mb->heap_limit * 1024) return PCRE2_ERROR_HEAPLIMIT; |
6637 | 0 | mb->frame_vector_size = ((mb->heap_limit * 1024)/frame_size) * frame_size; |
6638 | 0 | } |
6639 | 0 | mb->match_frames = mb->memctl.malloc(mb->frame_vector_size, |
6640 | 0 | mb->memctl.memory_data); |
6641 | 0 | if (mb->match_frames == NULL) return PCRE2_ERROR_NOMEMORY; |
6642 | 0 | } |
6643 | | |
6644 | 3.71M | mb->match_frames_top = |
6645 | 3.71M | (heapframe *)((char *)mb->match_frames + mb->frame_vector_size); |
6646 | | |
6647 | | /* Write to the ovector within the first frame to mark every capture unset and |
6648 | | to avoid uninitialized memory read errors when it is copied to a new frame. */ |
6649 | | |
6650 | 3.71M | memset((char *)(mb->match_frames) + offsetof(heapframe, ovector), 0xff, |
6651 | 3.71M | re->top_bracket * 2 * sizeof(PCRE2_SIZE)); |
6652 | | |
6653 | | /* Pointers to the individual character tables */ |
6654 | | |
6655 | 3.71M | mb->lcc = re->tables + lcc_offset; |
6656 | 3.71M | mb->fcc = re->tables + fcc_offset; |
6657 | 3.71M | mb->ctypes = re->tables + ctypes_offset; |
6658 | | |
6659 | | /* Set up the first code unit to match, if available. If there's no first code |
6660 | | unit there may be a bitmap of possible first characters. */ |
6661 | | |
6662 | 3.71M | if ((re->flags & PCRE2_FIRSTSET) != 0) |
6663 | 1.47M | { |
6664 | 1.47M | has_first_cu = TRUE; |
6665 | 1.47M | first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit); |
6666 | 1.47M | if ((re->flags & PCRE2_FIRSTCASELESS) != 0) |
6667 | 7.20k | { |
6668 | 7.20k | first_cu2 = TABLE_GET(first_cu, mb->fcc, first_cu); |
6669 | 7.20k | #ifdef SUPPORT_UNICODE |
6670 | 7.20k | #if PCRE2_CODE_UNIT_WIDTH == 8 |
6671 | 7.20k | if (first_cu > 127 && ucp && !utf) first_cu2 = UCD_OTHERCASE(first_cu); |
6672 | | #else |
6673 | | if (first_cu > 127 && (utf || ucp)) first_cu2 = UCD_OTHERCASE(first_cu); |
6674 | | #endif |
6675 | 7.20k | #endif /* SUPPORT_UNICODE */ |
6676 | 7.20k | } |
6677 | 1.47M | } |
6678 | 2.23M | else |
6679 | 2.23M | if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0) |
6680 | 1.84M | start_bits = re->start_bitmap; |
6681 | | |
6682 | | /* There may also be a "last known required character" set. */ |
6683 | | |
6684 | 3.71M | if ((re->flags & PCRE2_LASTSET) != 0) |
6685 | 2.47M | { |
6686 | 2.47M | has_req_cu = TRUE; |
6687 | 2.47M | req_cu = req_cu2 = (PCRE2_UCHAR)(re->last_codeunit); |
6688 | 2.47M | if ((re->flags & PCRE2_LASTCASELESS) != 0) |
6689 | 41.0k | { |
6690 | 41.0k | req_cu2 = TABLE_GET(req_cu, mb->fcc, req_cu); |
6691 | 41.0k | #ifdef SUPPORT_UNICODE |
6692 | 41.0k | #if PCRE2_CODE_UNIT_WIDTH == 8 |
6693 | 41.0k | if (req_cu > 127 && ucp && !utf) req_cu2 = UCD_OTHERCASE(req_cu); |
6694 | | #else |
6695 | | if (req_cu > 127 && (utf || ucp)) req_cu2 = UCD_OTHERCASE(req_cu); |
6696 | | #endif |
6697 | 41.0k | #endif /* SUPPORT_UNICODE */ |
6698 | 41.0k | } |
6699 | 2.47M | } |
6700 | | |
6701 | | |
6702 | | /* ==========================================================================*/ |
6703 | | |
6704 | | /* Loop for handling unanchored repeated matching attempts; for anchored regexs |
6705 | | the loop runs just once. */ |
6706 | | |
6707 | 3.71M | #ifdef SUPPORT_UNICODE |
6708 | 3.71M | FRAGMENT_RESTART: |
6709 | 3.71M | #endif |
6710 | | |
6711 | 3.71M | start_partial = match_partial = NULL; |
6712 | 3.71M | mb->hitend = FALSE; |
6713 | | |
6714 | 3.71M | #if PCRE2_CODE_UNIT_WIDTH == 8 |
6715 | 3.71M | memchr_found_first_cu = NULL; |
6716 | 3.71M | memchr_found_first_cu2 = NULL; |
6717 | 3.71M | #endif |
6718 | | |
6719 | 3.71M | for(;;) |
6720 | 52.9M | { |
6721 | 52.9M | PCRE2_SPTR new_start_match; |
6722 | | |
6723 | | /* ----------------- Start of match optimizations ---------------- */ |
6724 | | |
6725 | | /* There are some optimizations that avoid running the match if a known |
6726 | | starting point is not found, or if a known later code unit is not present. |
6727 | | However, there is an option (settable at compile time) that disables these, |
6728 | | for testing and for ensuring that all callouts do actually occur. */ |
6729 | | |
6730 | 52.9M | if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0) |
6731 | 52.9M | { |
6732 | | /* If firstline is TRUE, the start of the match is constrained to the first |
6733 | | line of a multiline string. That is, the match must be before or at the |
6734 | | first newline following the start of matching. Temporarily adjust |
6735 | | end_subject so that we stop the scans for a first code unit at a newline. |
6736 | | If the match fails at the newline, later code breaks the loop. */ |
6737 | | |
6738 | 52.9M | if (firstline) |
6739 | 0 | { |
6740 | 0 | PCRE2_SPTR t = start_match; |
6741 | 0 | #ifdef SUPPORT_UNICODE |
6742 | 0 | if (utf) |
6743 | 0 | { |
6744 | 0 | while (t < end_subject && !IS_NEWLINE(t)) |
6745 | 0 | { |
6746 | 0 | t++; |
6747 | 0 | ACROSSCHAR(t < end_subject, t, t++); |
6748 | 0 | } |
6749 | 0 | } |
6750 | 0 | else |
6751 | 0 | #endif |
6752 | 0 | while (t < end_subject && !IS_NEWLINE(t)) t++; |
6753 | 0 | end_subject = t; |
6754 | 0 | } |
6755 | | |
6756 | | /* Anchored: check the first code unit if one is recorded. This may seem |
6757 | | pointless but it can help in detecting a no match case without scanning for |
6758 | | the required code unit. */ |
6759 | | |
6760 | 52.9M | if (anchored) |
6761 | 1.73M | { |
6762 | 1.73M | if (has_first_cu || start_bits != NULL) |
6763 | 1.71M | { |
6764 | 1.71M | BOOL ok = start_match < end_subject; |
6765 | 1.71M | if (ok) |
6766 | 1.71M | { |
6767 | 1.71M | PCRE2_UCHAR c = UCHAR21TEST(start_match); |
6768 | 1.71M | ok = has_first_cu && (c == first_cu || c == first_cu2); |
6769 | 1.71M | if (!ok && start_bits != NULL) |
6770 | 1.71M | { |
6771 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
6772 | | if (c > 255) c = 255; |
6773 | | #endif |
6774 | 1.71M | ok = (start_bits[c/8] & (1u << (c&7))) != 0; |
6775 | 1.71M | } |
6776 | 1.71M | } |
6777 | 1.71M | if (!ok) |
6778 | 17.2k | { |
6779 | 17.2k | rc = MATCH_NOMATCH; |
6780 | 17.2k | break; |
6781 | 17.2k | } |
6782 | 1.71M | } |
6783 | 1.73M | } |
6784 | | |
6785 | | /* Not anchored. Advance to a unique first code unit if there is one. */ |
6786 | | |
6787 | 51.2M | else |
6788 | 51.2M | { |
6789 | 51.2M | if (has_first_cu) |
6790 | 2.25M | { |
6791 | 2.25M | if (first_cu != first_cu2) /* Caseless */ |
6792 | 194k | { |
6793 | | /* In 16-bit and 32-bit modes we have to do our own search, so can |
6794 | | look for both cases at once. */ |
6795 | | |
6796 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
6797 | | PCRE2_UCHAR smc; |
6798 | | while (start_match < end_subject && |
6799 | | (smc = UCHAR21TEST(start_match)) != first_cu && |
6800 | | smc != first_cu2) |
6801 | | start_match++; |
6802 | | #else |
6803 | | /* In 8-bit mode, the use of memchr() gives a big speed up, even |
6804 | | though we have to call it twice in order to find the earliest |
6805 | | occurrence of the code unit in either of its cases. Caching is used |
6806 | | to remember the positions of previously found code units. This can |
6807 | | make a huge difference when the strings are very long and only one |
6808 | | case is actually present. */ |
6809 | | |
6810 | 194k | PCRE2_SPTR pp1 = NULL; |
6811 | 194k | PCRE2_SPTR pp2 = NULL; |
6812 | 194k | PCRE2_SIZE searchlength = end_subject - start_match; |
6813 | | |
6814 | | /* If we haven't got a previously found position for first_cu, or if |
6815 | | the current starting position is later, we need to do a search. If |
6816 | | the code unit is not found, set it to the end. */ |
6817 | | |
6818 | 194k | if (memchr_found_first_cu == NULL || |
6819 | 194k | start_match > memchr_found_first_cu) |
6820 | 110k | { |
6821 | 110k | pp1 = memchr(start_match, first_cu, searchlength); |
6822 | 110k | memchr_found_first_cu = (pp1 == NULL)? end_subject : pp1; |
6823 | 110k | } |
6824 | | |
6825 | | /* If the start is before a previously found position, use the |
6826 | | previous position, or NULL if a previous search failed. */ |
6827 | | |
6828 | 84.1k | else pp1 = (memchr_found_first_cu == end_subject)? NULL : |
6829 | 84.1k | memchr_found_first_cu; |
6830 | | |
6831 | | /* Do the same thing for the other case. */ |
6832 | | |
6833 | 194k | if (memchr_found_first_cu2 == NULL || |
6834 | 194k | start_match > memchr_found_first_cu2) |
6835 | 90.6k | { |
6836 | 90.6k | pp2 = memchr(start_match, first_cu2, searchlength); |
6837 | 90.6k | memchr_found_first_cu2 = (pp2 == NULL)? end_subject : pp2; |
6838 | 90.6k | } |
6839 | | |
6840 | 103k | else pp2 = (memchr_found_first_cu2 == end_subject)? NULL : |
6841 | 103k | memchr_found_first_cu2; |
6842 | | |
6843 | | /* Set the start to the end of the subject if neither case was found. |
6844 | | Otherwise, use the earlier found point. */ |
6845 | | |
6846 | 194k | if (pp1 == NULL) |
6847 | 16.3k | start_match = (pp2 == NULL)? end_subject : pp2; |
6848 | 178k | else |
6849 | 178k | start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2; |
6850 | | |
6851 | 194k | #endif /* 8-bit handling */ |
6852 | 194k | } |
6853 | | |
6854 | | /* The caseful case is much simpler. */ |
6855 | | |
6856 | 2.06M | else |
6857 | 2.06M | { |
6858 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
6859 | | while (start_match < end_subject && UCHAR21TEST(start_match) != |
6860 | | first_cu) |
6861 | | start_match++; |
6862 | | #else |
6863 | 2.06M | start_match = memchr(start_match, first_cu, end_subject - start_match); |
6864 | 2.06M | if (start_match == NULL) start_match = end_subject; |
6865 | 2.06M | #endif |
6866 | 2.06M | } |
6867 | | |
6868 | | /* If we can't find the required first code unit, having reached the |
6869 | | true end of the subject, break the bumpalong loop, to force a match |
6870 | | failure, except when doing partial matching, when we let the next cycle |
6871 | | run at the end of the subject. To see why, consider the pattern |
6872 | | /(?<=abc)def/, which partially matches "abc", even though the string |
6873 | | does not contain the starting character "d". If we have not reached the |
6874 | | true end of the subject (PCRE2_FIRSTLINE caused end_subject to be |
6875 | | temporarily modified) we also let the cycle run, because the matching |
6876 | | string is legitimately allowed to start with the first code unit of a |
6877 | | newline. */ |
6878 | | |
6879 | 2.25M | if (mb->partial == 0 && start_match >= mb->end_subject) |
6880 | 245k | { |
6881 | 245k | rc = MATCH_NOMATCH; |
6882 | 245k | break; |
6883 | 245k | } |
6884 | 2.25M | } |
6885 | | |
6886 | | /* If there's no first code unit, advance to just after a linebreak for a |
6887 | | multiline match if required. */ |
6888 | | |
6889 | 48.9M | else if (startline) |
6890 | 1.05M | { |
6891 | 1.05M | if (start_match > mb->start_subject + start_offset) |
6892 | 922k | { |
6893 | 922k | #ifdef SUPPORT_UNICODE |
6894 | 922k | if (utf) |
6895 | 0 | { |
6896 | 0 | while (start_match < end_subject && !WAS_NEWLINE(start_match)) |
6897 | 0 | { |
6898 | 0 | start_match++; |
6899 | 0 | ACROSSCHAR(start_match < end_subject, start_match, start_match++); |
6900 | 0 | } |
6901 | 0 | } |
6902 | 922k | else |
6903 | 922k | #endif |
6904 | 34.8M | while (start_match < end_subject && !WAS_NEWLINE(start_match)) |
6905 | 33.9M | start_match++; |
6906 | | |
6907 | | /* If we have just passed a CR and the newline option is ANY or |
6908 | | ANYCRLF, and we are now at a LF, advance the match position by one |
6909 | | more code unit. */ |
6910 | | |
6911 | 922k | if (start_match[-1] == CHAR_CR && |
6912 | 922k | (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) && |
6913 | 922k | start_match < end_subject && |
6914 | 922k | UCHAR21TEST(start_match) == CHAR_NL) |
6915 | 0 | start_match++; |
6916 | 922k | } |
6917 | 1.05M | } |
6918 | | |
6919 | | /* If there's no first code unit or a requirement for a multiline line |
6920 | | start, advance to a non-unique first code unit if any have been |
6921 | | identified. The bitmap contains only 256 bits. When code units are 16 or |
6922 | | 32 bits wide, all code units greater than 254 set the 255 bit. */ |
6923 | | |
6924 | 47.9M | else if (start_bits != NULL) |
6925 | 15.9M | { |
6926 | 106M | while (start_match < end_subject) |
6927 | 106M | { |
6928 | 106M | uint32_t c = UCHAR21TEST(start_match); |
6929 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
6930 | | if (c > 255) c = 255; |
6931 | | #endif |
6932 | 106M | if ((start_bits[c/8] & (1u << (c&7))) != 0) break; |
6933 | 90.3M | start_match++; |
6934 | 90.3M | } |
6935 | | |
6936 | | /* See comment above in first_cu checking about the next few lines. */ |
6937 | | |
6938 | 15.9M | if (mb->partial == 0 && start_match >= mb->end_subject) |
6939 | 22.7k | { |
6940 | 22.7k | rc = MATCH_NOMATCH; |
6941 | 22.7k | break; |
6942 | 22.7k | } |
6943 | 15.9M | } |
6944 | 51.2M | } /* End first code unit handling */ |
6945 | | |
6946 | | /* Restore fudged end_subject */ |
6947 | | |
6948 | 52.6M | end_subject = mb->end_subject; |
6949 | | |
6950 | | /* The following two optimizations must be disabled for partial matching. */ |
6951 | | |
6952 | 52.6M | if (mb->partial == 0) |
6953 | 52.6M | { |
6954 | 52.6M | PCRE2_SPTR p; |
6955 | | |
6956 | | /* The minimum matching length is a lower bound; no string of that length |
6957 | | may actually match the pattern. Although the value is, strictly, in |
6958 | | characters, we treat it as code units to avoid spending too much time in |
6959 | | this optimization. */ |
6960 | | |
6961 | 52.6M | if (end_subject - start_match < re->minlength) |
6962 | 271k | { |
6963 | 271k | rc = MATCH_NOMATCH; |
6964 | 271k | break; |
6965 | 271k | } |
6966 | | |
6967 | | /* If req_cu is set, we know that that code unit must appear in the |
6968 | | subject for the (non-partial) match to succeed. If the first code unit is |
6969 | | set, req_cu must be later in the subject; otherwise the test starts at |
6970 | | the match point. This optimization can save a huge amount of backtracking |
6971 | | in patterns with nested unlimited repeats that aren't going to match. |
6972 | | Writing separate code for caseful/caseless versions makes it go faster, |
6973 | | as does using an autoincrement and backing off on a match. As in the case |
6974 | | of the first code unit, using memchr() in the 8-bit library gives a big |
6975 | | speed up. Unlike the first_cu check above, we do not need to call |
6976 | | memchr() twice in the caseless case because we only need to check for the |
6977 | | presence of the character in either case, not find the first occurrence. |
6978 | | |
6979 | | The search can be skipped if the code unit was found later than the |
6980 | | current starting point in a previous iteration of the bumpalong loop. |
6981 | | |
6982 | | HOWEVER: when the subject string is very, very long, searching to its end |
6983 | | can take a long time, and give bad performance on quite ordinary |
6984 | | anchored patterns. This showed up when somebody was matching something |
6985 | | like /^\d+C/ on a 32-megabyte string... so we don't do this when the |
6986 | | string is sufficiently long, but it's worth searching a lot more for |
6987 | | unanchored patterns. */ |
6988 | | |
6989 | 52.3M | p = start_match + (has_first_cu? 1:0); |
6990 | 52.3M | if (has_req_cu && p > req_cu_ptr) |
6991 | 2.80M | { |
6992 | 2.80M | PCRE2_SIZE check_length = end_subject - start_match; |
6993 | | |
6994 | 2.80M | if (check_length < REQ_CU_MAX || |
6995 | 2.80M | (!anchored && check_length < REQ_CU_MAX * 1000)) |
6996 | 2.80M | { |
6997 | 2.80M | if (req_cu != req_cu2) /* Caseless */ |
6998 | 138k | { |
6999 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
7000 | | while (p < end_subject) |
7001 | | { |
7002 | | uint32_t pp = UCHAR21INCTEST(p); |
7003 | | if (pp == req_cu || pp == req_cu2) { p--; break; } |
7004 | | } |
7005 | | #else /* 8-bit code units */ |
7006 | 138k | PCRE2_SPTR pp = p; |
7007 | 138k | p = memchr(pp, req_cu, end_subject - pp); |
7008 | 138k | if (p == NULL) |
7009 | 39.8k | { |
7010 | 39.8k | p = memchr(pp, req_cu2, end_subject - pp); |
7011 | 39.8k | if (p == NULL) p = end_subject; |
7012 | 39.8k | } |
7013 | 138k | #endif /* PCRE2_CODE_UNIT_WIDTH != 8 */ |
7014 | 138k | } |
7015 | | |
7016 | | /* The caseful case */ |
7017 | | |
7018 | 2.66M | else |
7019 | 2.66M | { |
7020 | | #if PCRE2_CODE_UNIT_WIDTH != 8 |
7021 | | while (p < end_subject) |
7022 | | { |
7023 | | if (UCHAR21INCTEST(p) == req_cu) { p--; break; } |
7024 | | } |
7025 | | |
7026 | | #else /* 8-bit code units */ |
7027 | 2.66M | p = memchr(p, req_cu, end_subject - p); |
7028 | 2.66M | if (p == NULL) p = end_subject; |
7029 | 2.66M | #endif |
7030 | 2.66M | } |
7031 | | |
7032 | | /* If we can't find the required code unit, break the bumpalong loop, |
7033 | | forcing a match failure. */ |
7034 | | |
7035 | 2.80M | if (p >= end_subject) |
7036 | 504k | { |
7037 | 504k | rc = MATCH_NOMATCH; |
7038 | 504k | break; |
7039 | 504k | } |
7040 | | |
7041 | | /* If we have found the required code unit, save the point where we |
7042 | | found it, so that we don't search again next time round the bumpalong |
7043 | | loop if the start hasn't yet passed this code unit. */ |
7044 | | |
7045 | 2.29M | req_cu_ptr = p; |
7046 | 2.29M | } |
7047 | 2.80M | } |
7048 | 52.3M | } |
7049 | 52.6M | } |
7050 | | |
7051 | | /* ------------ End of start of match optimizations ------------ */ |
7052 | | |
7053 | | /* Give no match if we have passed the bumpalong limit. */ |
7054 | | |
7055 | 51.8M | if (start_match > bumpalong_limit) |
7056 | 0 | { |
7057 | 0 | rc = MATCH_NOMATCH; |
7058 | 0 | break; |
7059 | 0 | } |
7060 | | |
7061 | | /* OK, we can now run the match. If "hitend" is set afterwards, remember the |
7062 | | first starting point for which a partial match was found. */ |
7063 | | |
7064 | 51.8M | cb.start_match = (PCRE2_SIZE)(start_match - subject); |
7065 | 51.8M | cb.callout_flags |= PCRE2_CALLOUT_STARTMATCH; |
7066 | | |
7067 | 51.8M | mb->start_used_ptr = start_match; |
7068 | 51.8M | mb->last_used_ptr = start_match; |
7069 | 51.8M | #ifdef SUPPORT_UNICODE |
7070 | 51.8M | mb->moptions = options | fragment_options; |
7071 | | #else |
7072 | | mb->moptions = options; |
7073 | | #endif |
7074 | 51.8M | mb->match_call_count = 0; |
7075 | 51.8M | mb->end_offset_top = 0; |
7076 | 51.8M | mb->skip_arg_count = 0; |
7077 | | |
7078 | 51.8M | rc = match(start_match, mb->start_code, match_data->ovector, |
7079 | 51.8M | match_data->oveccount, re->top_bracket, frame_size, mb); |
7080 | | |
7081 | 51.8M | if (mb->hitend && start_partial == NULL) |
7082 | 0 | { |
7083 | 0 | start_partial = mb->start_used_ptr; |
7084 | 0 | match_partial = start_match; |
7085 | 0 | } |
7086 | | |
7087 | 51.8M | switch(rc) |
7088 | 51.8M | { |
7089 | | /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched |
7090 | | the SKIP's arg was not found. In this circumstance, Perl ignores the SKIP |
7091 | | entirely. The only way we can do that is to re-do the match at the same |
7092 | | point, with a flag to force SKIP with an argument to be ignored. Just |
7093 | | treating this case as NOMATCH does not work because it does not check other |
7094 | | alternatives in patterns such as A(*SKIP:A)B|AC when the subject is AC. */ |
7095 | | |
7096 | 0 | case MATCH_SKIP_ARG: |
7097 | 0 | new_start_match = start_match; |
7098 | 0 | mb->ignore_skip_arg = mb->skip_arg_count; |
7099 | 0 | break; |
7100 | | |
7101 | | /* SKIP passes back the next starting point explicitly, but if it is no |
7102 | | greater than the match we have just done, treat it as NOMATCH. */ |
7103 | | |
7104 | 0 | case MATCH_SKIP: |
7105 | 0 | if (mb->verb_skip_ptr > start_match) |
7106 | 0 | { |
7107 | 0 | new_start_match = mb->verb_skip_ptr; |
7108 | 0 | break; |
7109 | 0 | } |
7110 | | /* Fall through */ |
7111 | | |
7112 | | /* NOMATCH and PRUNE advance by one character. THEN at this level acts |
7113 | | exactly like PRUNE. Unset ignore SKIP-with-argument. */ |
7114 | | |
7115 | 49.3M | case MATCH_NOMATCH: |
7116 | 49.3M | case MATCH_PRUNE: |
7117 | 49.3M | case MATCH_THEN: |
7118 | 49.3M | mb->ignore_skip_arg = 0; |
7119 | 49.3M | new_start_match = start_match + 1; |
7120 | 49.3M | #ifdef SUPPORT_UNICODE |
7121 | 49.3M | if (utf) |
7122 | 0 | ACROSSCHAR(new_start_match < end_subject, new_start_match, |
7123 | 49.3M | new_start_match++); |
7124 | 49.3M | #endif |
7125 | 49.3M | break; |
7126 | | |
7127 | | /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */ |
7128 | | |
7129 | 0 | case MATCH_COMMIT: |
7130 | 0 | rc = MATCH_NOMATCH; |
7131 | 0 | goto ENDLOOP; |
7132 | | |
7133 | | /* Any other return is either a match, or some kind of error. */ |
7134 | | |
7135 | 2.58M | default: |
7136 | 2.58M | goto ENDLOOP; |
7137 | 51.8M | } |
7138 | | |
7139 | | /* Control reaches here for the various types of "no match at this point" |
7140 | | result. Reset the code to MATCH_NOMATCH for subsequent checking. */ |
7141 | | |
7142 | 49.3M | rc = MATCH_NOMATCH; |
7143 | | |
7144 | | /* If PCRE2_FIRSTLINE is set, the match must happen before or at the first |
7145 | | newline in the subject (though it may continue over the newline). Therefore, |
7146 | | if we have just failed to match, starting at a newline, do not continue. */ |
7147 | | |
7148 | 49.3M | if (firstline && IS_NEWLINE(start_match)) break; |
7149 | | |
7150 | | /* Advance to new matching position */ |
7151 | | |
7152 | 49.3M | start_match = new_start_match; |
7153 | | |
7154 | | /* Break the loop if the pattern is anchored or if we have passed the end of |
7155 | | the subject. */ |
7156 | | |
7157 | 49.3M | if (anchored || start_match > end_subject) break; |
7158 | | |
7159 | | /* If we have just passed a CR and we are now at a LF, and the pattern does |
7160 | | not contain any explicit matches for \r or \n, and the newline option is CRLF |
7161 | | or ANY or ANYCRLF, advance the match position by one more code unit. In |
7162 | | normal matching start_match will always be greater than the first position at |
7163 | | this stage, but a failed *SKIP can cause a return at the same point, which is |
7164 | | why the first test exists. */ |
7165 | | |
7166 | 49.2M | if (start_match > subject + start_offset && |
7167 | 49.2M | start_match[-1] == CHAR_CR && |
7168 | 49.2M | start_match < end_subject && |
7169 | 49.2M | *start_match == CHAR_NL && |
7170 | 49.2M | (re->flags & PCRE2_HASCRORLF) == 0 && |
7171 | 49.2M | (mb->nltype == NLTYPE_ANY || |
7172 | 373k | mb->nltype == NLTYPE_ANYCRLF || |
7173 | 373k | mb->nllen == 2)) |
7174 | 0 | start_match++; |
7175 | | |
7176 | 49.2M | mb->mark = NULL; /* Reset for start of next match attempt */ |
7177 | 49.2M | } /* End of for(;;) "bumpalong" loop */ |
7178 | | |
7179 | | /* ==========================================================================*/ |
7180 | | |
7181 | | /* When we reach here, one of the following stopping conditions is true: |
7182 | | |
7183 | | (1) The match succeeded, either completely, or partially; |
7184 | | |
7185 | | (2) The pattern is anchored or the match was failed after (*COMMIT); |
7186 | | |
7187 | | (3) We are past the end of the subject or the bumpalong limit; |
7188 | | |
7189 | | (4) PCRE2_FIRSTLINE is set and we have failed to match at a newline, because |
7190 | | this option requests that a match occur at or before the first newline in |
7191 | | the subject. |
7192 | | |
7193 | | (5) Some kind of error occurred. |
7194 | | |
7195 | | */ |
7196 | | |
7197 | 3.71M | ENDLOOP: |
7198 | | |
7199 | | /* If end_subject != true_end_subject, it means we are handling invalid UTF, |
7200 | | and have just processed a non-terminal fragment. If this resulted in no match |
7201 | | or a partial match we must carry on to the next fragment (a partial match is |
7202 | | returned to the caller only at the very end of the subject). A loop is used to |
7203 | | avoid trying to match against empty fragments; if the pattern can match an |
7204 | | empty string it would have done so already. */ |
7205 | | |
7206 | 3.71M | #ifdef SUPPORT_UNICODE |
7207 | 3.71M | if (utf && end_subject != true_end_subject && |
7208 | 3.71M | (rc == MATCH_NOMATCH || rc == PCRE2_ERROR_PARTIAL)) |
7209 | 0 | { |
7210 | 0 | for (;;) |
7211 | 0 | { |
7212 | | /* Advance past the first bad code unit, and then skip invalid character |
7213 | | starting code units in 8-bit and 16-bit modes. */ |
7214 | |
|
7215 | 0 | start_match = end_subject + 1; |
7216 | |
|
7217 | 0 | #if PCRE2_CODE_UNIT_WIDTH != 32 |
7218 | 0 | while (start_match < true_end_subject && NOT_FIRSTCU(*start_match)) |
7219 | 0 | start_match++; |
7220 | 0 | #endif |
7221 | | |
7222 | | /* If we have hit the end of the subject, there isn't another non-empty |
7223 | | fragment, so give up. */ |
7224 | |
|
7225 | 0 | if (start_match >= true_end_subject) |
7226 | 0 | { |
7227 | 0 | rc = MATCH_NOMATCH; /* In case it was partial */ |
7228 | 0 | break; |
7229 | 0 | } |
7230 | | |
7231 | | /* Check the rest of the subject */ |
7232 | | |
7233 | 0 | mb->check_subject = start_match; |
7234 | 0 | rc = PRIV(valid_utf)(start_match, length - (start_match - subject), |
7235 | 0 | &(match_data->startchar)); |
7236 | | |
7237 | | /* The rest of the subject is valid UTF. */ |
7238 | |
|
7239 | 0 | if (rc == 0) |
7240 | 0 | { |
7241 | 0 | mb->end_subject = end_subject = true_end_subject; |
7242 | 0 | fragment_options = PCRE2_NOTBOL; |
7243 | 0 | goto FRAGMENT_RESTART; |
7244 | 0 | } |
7245 | | |
7246 | | /* A subsequent UTF error has been found; if the next fragment is |
7247 | | non-empty, set up to process it. Otherwise, let the loop advance. */ |
7248 | | |
7249 | 0 | else if (rc < 0) |
7250 | 0 | { |
7251 | 0 | mb->end_subject = end_subject = start_match + match_data->startchar; |
7252 | 0 | if (end_subject > start_match) |
7253 | 0 | { |
7254 | 0 | fragment_options = PCRE2_NOTBOL|PCRE2_NOTEOL; |
7255 | 0 | goto FRAGMENT_RESTART; |
7256 | 0 | } |
7257 | 0 | } |
7258 | 0 | } |
7259 | 0 | } |
7260 | 3.71M | #endif /* SUPPORT_UNICODE */ |
7261 | | |
7262 | | /* Release an enlarged frame vector that is on the heap. */ |
7263 | | |
7264 | 3.71M | if (mb->match_frames != mb->stack_frames) |
7265 | 6 | mb->memctl.free(mb->match_frames, mb->memctl.memory_data); |
7266 | | |
7267 | | /* Fill in fields that are always returned in the match data. */ |
7268 | | |
7269 | 3.71M | match_data->code = re; |
7270 | 3.71M | match_data->mark = mb->mark; |
7271 | 3.71M | match_data->matchedby = PCRE2_MATCHEDBY_INTERPRETER; |
7272 | | |
7273 | | /* Handle a fully successful match. Set the return code to the number of |
7274 | | captured strings, or 0 if there were too many to fit into the ovector, and then |
7275 | | set the remaining returned values before returning. Make a copy of the subject |
7276 | | string if requested. */ |
7277 | | |
7278 | 3.71M | if (rc == MATCH_MATCH) |
7279 | 2.56M | { |
7280 | 2.56M | match_data->rc = ((int)mb->end_offset_top >= 2 * match_data->oveccount)? |
7281 | 2.56M | 0 : (int)mb->end_offset_top/2 + 1; |
7282 | 2.56M | match_data->startchar = start_match - subject; |
7283 | 2.56M | match_data->leftchar = mb->start_used_ptr - subject; |
7284 | 2.56M | match_data->rightchar = ((mb->last_used_ptr > mb->end_match_ptr)? |
7285 | 2.14M | mb->last_used_ptr : mb->end_match_ptr) - subject; |
7286 | 2.56M | if ((options & PCRE2_COPY_MATCHED_SUBJECT) != 0) |
7287 | 0 | { |
7288 | 0 | length = CU2BYTES(length + was_zero_terminated); |
7289 | 0 | match_data->subject = match_data->memctl.malloc(length, |
7290 | 0 | match_data->memctl.memory_data); |
7291 | 0 | if (match_data->subject == NULL) return PCRE2_ERROR_NOMEMORY; |
7292 | 0 | memcpy((void *)match_data->subject, subject, length); |
7293 | 0 | match_data->flags |= PCRE2_MD_COPIED_SUBJECT; |
7294 | 0 | } |
7295 | 2.56M | else match_data->subject = subject; |
7296 | 2.56M | return match_data->rc; |
7297 | 2.56M | } |
7298 | | |
7299 | | /* Control gets here if there has been a partial match, an error, or if the |
7300 | | overall match attempt has failed at all permitted starting positions. Any mark |
7301 | | data is in the nomatch_mark field. */ |
7302 | | |
7303 | 1.14M | match_data->mark = mb->nomatch_mark; |
7304 | | |
7305 | | /* For anything other than nomatch or partial match, just return the code. */ |
7306 | | |
7307 | 1.14M | if (rc != MATCH_NOMATCH && rc != PCRE2_ERROR_PARTIAL) match_data->rc = rc; |
7308 | | |
7309 | | /* Handle a partial match. If a "soft" partial match was requested, searching |
7310 | | for a complete match will have continued, and the value of rc at this point |
7311 | | will be MATCH_NOMATCH. For a "hard" partial match, it will already be |
7312 | | PCRE2_ERROR_PARTIAL. */ |
7313 | | |
7314 | 1.12M | else if (match_partial != NULL) |
7315 | 0 | { |
7316 | 0 | match_data->subject = subject; |
7317 | 0 | match_data->ovector[0] = match_partial - subject; |
7318 | 0 | match_data->ovector[1] = end_subject - subject; |
7319 | 0 | match_data->startchar = match_partial - subject; |
7320 | 0 | match_data->leftchar = start_partial - subject; |
7321 | 0 | match_data->rightchar = end_subject - subject; |
7322 | 0 | match_data->rc = PCRE2_ERROR_PARTIAL; |
7323 | 0 | } |
7324 | | |
7325 | | /* Else this is the classic nomatch case. */ |
7326 | | |
7327 | 1.12M | else match_data->rc = PCRE2_ERROR_NOMATCH; |
7328 | | |
7329 | 1.14M | return match_data->rc; |
7330 | 3.71M | } |
7331 | | |
7332 | | /* End of pcre2_match.c */ |