/src/libgit2/deps/pcre/pcre_exec.c
Line | Count | Source (jump to first uncovered line) |
1 | | /************************************************* |
2 | | * Perl-Compatible Regular Expressions * |
3 | | *************************************************/ |
4 | | |
5 | | /* PCRE is a library of functions to support regular expressions whose syntax |
6 | | and semantics are as close as possible to those of the Perl 5 language. |
7 | | |
8 | | Written by Philip Hazel |
9 | | Copyright (c) 1997-2021 University of Cambridge |
10 | | |
11 | | ----------------------------------------------------------------------------- |
12 | | Redistribution and use in source and binary forms, with or without |
13 | | modification, are permitted provided that the following conditions are met: |
14 | | |
15 | | * Redistributions of source code must retain the above copyright notice, |
16 | | this list of conditions and the following disclaimer. |
17 | | |
18 | | * Redistributions in binary form must reproduce the above copyright |
19 | | notice, this list of conditions and the following disclaimer in the |
20 | | documentation and/or other materials provided with the distribution. |
21 | | |
22 | | * Neither the name of the University of Cambridge nor the names of its |
23 | | contributors may be used to endorse or promote products derived from |
24 | | this software without specific prior written permission. |
25 | | |
26 | | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
27 | | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
28 | | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
29 | | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
30 | | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
31 | | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
32 | | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
33 | | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
34 | | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
35 | | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
36 | | POSSIBILITY OF SUCH DAMAGE. |
37 | | ----------------------------------------------------------------------------- |
38 | | */ |
39 | | |
40 | | /* This module contains pcre_exec(), the externally visible function that does |
41 | | pattern matching using an NFA algorithm, trying to mimic Perl as closely as |
42 | | possible. There are also some static supporting functions. */ |
43 | | |
44 | | #ifdef HAVE_CONFIG_H |
45 | | #include "config.h" |
46 | | #endif |
47 | | |
48 | 0 | #define NLBLOCK md /* Block containing newline information */ |
49 | 0 | #define PSSTART start_subject /* Field containing processed string start */ |
50 | 0 | #define PSEND end_subject /* Field containing processed string end */ |
51 | | |
52 | | #include "pcre_internal.h" |
53 | | |
54 | | /* Undefine some potentially clashing cpp symbols */ |
55 | | |
56 | | #undef min |
57 | | #undef max |
58 | | |
59 | | /* The md->capture_last field uses the lower 16 bits for the last captured |
60 | | substring (which can never be greater than 65535) and a bit in the top half |
61 | | to mean "capture vector overflowed". This odd way of doing things was |
62 | | implemented when it was realized that preserving and restoring the overflow bit |
63 | | whenever the last capture number was saved/restored made for a neater |
64 | | interface, and doing it this way saved on (a) another variable, which would |
65 | | have increased the stack frame size (a big NO-NO in PCRE) and (b) another |
66 | | separate set of save/restore instructions. The following defines are used in |
67 | | implementing this. */ |
68 | | |
69 | 0 | #define CAPLMASK 0x0000ffff /* Low 16 bits of md->capture_last: number of the last captured substring */ |
70 | 0 | #define OVFLMASK 0xffff0000 /* High 16 bits of md->capture_last: the half that holds the overflow flag */ |
71 | 0 | #define OVFLBIT 0x00010000 /* Bit 16 (lowest bit of the top half): set for "capture vector overflowed" */ |
72 | | |
73 | | /* Values for setting in md->match_function_type to indicate two special types |
74 | | of call to match(). We do it this way to save on using another stack variable, |
75 | | as stack usage is to be discouraged. match() sets the field back to 0 once it has acted on MATCH_CBEGROUP. */ |
76 | | |
77 | 0 | #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */ |
78 | 0 | #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */ |
79 | | |
80 | | /* Non-error returns from the match() function. Error returns are externally |
81 | | defined PCRE_ERROR_xxx codes, which are all negative. */ |
82 | | |
83 | 56.8k | #define MATCH_MATCH 1 |
84 | 85.3k | #define MATCH_NOMATCH 0 |
85 | | |
86 | | /* Special internal returns from the match() function. Make them sufficiently |
87 | | negative to avoid the external error codes. */ |
88 | | |
89 | 28.4k | #define MATCH_ACCEPT (-999) |
90 | 0 | #define MATCH_KETRPOS (-998) |
91 | 0 | #define MATCH_ONCE (-997) |
92 | | /* The next 5 (MATCH_COMMIT to MATCH_THEN, i.e. -996 to -992) must be kept together and in sequence so that a test that checks |
93 | | for any one of them can use the range MATCH_BACKTRACK_MIN to MATCH_BACKTRACK_MAX. */ |
94 | 0 | #define MATCH_COMMIT (-996) |
95 | 0 | #define MATCH_PRUNE (-995) |
96 | 0 | #define MATCH_SKIP (-994) |
97 | 0 | #define MATCH_SKIP_ARG (-993) |
98 | 0 | #define MATCH_THEN (-992) |
99 | 0 | #define MATCH_BACKTRACK_MAX MATCH_THEN /* Upper bound (-992) of the range above */ |
100 | 0 | #define MATCH_BACKTRACK_MIN MATCH_COMMIT /* Lower bound (-996) of the range above */ |
101 | | |
102 | | /* Maximum number of ints of offset to save on the stack for recursive calls. |
103 | | If the offset vector is bigger, malloc is used. This should be a multiple of 3, |
104 | | because the offset vector is always a multiple of 3 long. It is also the dimension of the stacksave[] array in match() and of Xstacksave[] in the heap frame. */ |
105 | | |
106 | 0 | #define REC_STACK_SAVE_MAX 30 |
107 | | |
108 | | /* Min and max values for the common repeats; for the maxima, 0 => infinity. Each table has 11 entries. NOTE(review): presumably entry i of rep_min pairs with entry i of rep_max - confirm at the point of use. */ |
109 | | |
110 | | static const char rep_min[] = { 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, }; |
111 | | static const char rep_max[] = { 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, }; |
112 | | |
113 | | #ifdef PCRE_DEBUG |
114 | | /************************************************* |
115 | | * Debugging function to print chars * |
116 | | *************************************************/ |
117 | | |
118 | | /* Print a sequence of chars in printable format, stopping at the end of the |
119 | | subject if requested. Characters for which isprint() is true are printed as themselves; all others are printed as \x{hh}. |
120 | | |
121 | | Arguments: |
122 | | p points to characters |
123 | | length number to print; a value <= 0 prints nothing |
124 | | is_subject TRUE if printing from within the subject; length is then clamped to md->end_subject - p |
125 | | md pointer to matching data block; md->utf is read unconditionally, so this must be valid even when is_subject is FALSE |
126 | | |
127 | | Returns: nothing |
128 | | */ |
129 | | |
130 | | static void |
131 | | pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md) |
132 | | { |
133 | | pcre_uint32 c; |
134 | | BOOL utf = md->utf; /* Not referenced by name below; NOTE(review): presumably consumed by the UCHAR21INCTEST macro - confirm */ |
135 | | if (is_subject && length > md->end_subject - p) length = md->end_subject - p; |
136 | | while (length-- > 0) |
137 | | if (isprint(c = UCHAR21INCTEST(p))) printf("%c", (char)c); else printf("\\x{%02x}", c); |
138 | | } |
139 | | #endif |
140 | | |
141 | | |
142 | | |
143 | | /************************************************* |
144 | | * Match a back-reference * |
145 | | *************************************************/ |
146 | | |
147 | | /* Normally, if a back reference hasn't been set, the length that is passed is |
148 | | negative, so the match always fails. However, in JavaScript compatibility mode, |
149 | | the length passed is zero; the comparison loops then do not run and 0 is returned. Note that in caseless UTF-8 mode, the number of |
150 | | subject bytes matched may be different to the number of reference bytes. |
151 | | |
152 | | Arguments: |
153 | | offset index into the offset vector; the reference text starts at md->start_subject + md->offset_vector[offset] |
154 | | eptr pointer into the subject |
155 | | length length of reference to be matched (number of bytes) |
156 | | md points to match data block |
157 | | caseless TRUE if caseless |
158 | | |
159 | | Returns: >= 0 the number of subject bytes matched (how far eptr advanced from its initial value) |
160 | | -1 no match (also returned at once when length is negative) |
161 | | -2 partial match; always given if the end of the subject (md->end_subject) is reached before the reference has been fully compared |
162 | | */ |
163 | | |
164 | | static int |
165 | | match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md, |
166 | | BOOL caseless) |
167 | 0 | { |
168 | 0 | PCRE_PUCHAR eptr_start = eptr; /* Kept so that the matched length can be computed on return */ |
169 | 0 | register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset]; /* Start of the referenced text */ |
170 | | #if defined SUPPORT_UTF && defined SUPPORT_UCP |
171 | | BOOL utf = md->utf; |
172 | | #endif |
173 |

174 | | #ifdef PCRE_DEBUG |
175 | | if (eptr >= md->end_subject) |
176 | | printf("matching subject <null>"); |
177 | | else |
178 | | { |
179 | | printf("matching subject "); |
180 | | pchars(eptr, length, TRUE, md); |
181 | | } |
182 | | printf(" against backref "); |
183 | | pchars(p, length, FALSE, md); |
184 | | printf("\n"); |
185 | | #endif |
186 | | |
187 | | /* Always fail if reference not set (and not JavaScript compatible - in that |
188 | | case the length is passed as zero). */ |
189 |

190 | 0 | if (length < 0) return -1; |
191 | | |
192 | | /* Separate the caseless case for speed. In UTF-8 mode we can only do this |
193 | | properly if Unicode properties are supported. Otherwise, we can check only |
194 | | ASCII characters. */ |
195 | | |
196 | 0 | if (caseless) |
197 | 0 | { |
198 | | #if defined SUPPORT_UTF && defined SUPPORT_UCP |
199 | | if (utf) |
200 | | { |
201 | | /* Match characters up to the end of the reference. NOTE: the number of |
202 | | data units matched may differ, because in UTF-8 there are some characters |
203 | | whose upper and lower case versions have different numbers of bytes. |
204 | | For example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65 |
205 | | (3 bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a |
206 | | sequence of two of the latter. It is important, therefore, to check the |
207 | | length along the reference, not along the subject (earlier code did this |
208 | | wrong). */ |
209 | | |
210 | | PCRE_PUCHAR endptr = p + length; |
211 | | while (p < endptr) |
212 | | { |
213 | | pcre_uint32 c, d; |
214 | | const ucd_record *ur; |
215 | | if (eptr >= md->end_subject) return -2; /* Partial match */ |
216 | | GETCHARINC(c, eptr); |
217 | | GETCHARINC(d, p); |
218 | | ur = GET_UCD(d); |
219 | | if (c != d && c != d + ur->other_case) |
220 | | { |
221 | | const pcre_uint32 *pp = PRIV(ucd_caseless_sets) + ur->caseset; |
222 | | for (;;) /* NOTE(review): this scan presumably relies on each caseless set being in ascending order and ending with a value larger than any character - confirm against the UCD tables */ |
223 | | { |
224 | | if (c < *pp) return -1; |
225 | | if (c == *pp++) break; |
226 | | } |
227 | | } |
228 | | } |
229 | | } |
230 | | else |
231 | | #endif |
232 | | |
233 | | /* The same code works when not in UTF-8 mode and in UTF-8 mode when there |
234 | | is no UCP support. Each unit of the reference and of the subject is mapped through md->lcc before being compared. */ |
235 | 0 | { |
236 | 0 | while (length-- > 0) |
237 | 0 | { |
238 | 0 | pcre_uint32 cc, cp; |
239 | 0 | if (eptr >= md->end_subject) return -2; /* Partial match */ |
240 | 0 | cc = UCHAR21TEST(eptr); |
241 | 0 | cp = UCHAR21TEST(p); |
242 | 0 | if (TABLE_GET(cp, md->lcc, cp) != TABLE_GET(cc, md->lcc, cc)) return -1; |
243 | 0 | p++; |
244 | 0 | eptr++; |
245 | 0 | } |
246 | 0 | } |
247 | 0 | } |
248 | | |
249 | | /* In the caseful case, we can just compare the bytes, whether or not we |
250 | | are in UTF-8 mode. */ |
251 | | |
252 | 0 | else |
253 | 0 | { |
254 | 0 | while (length-- > 0) |
255 | 0 | { |
256 | 0 | if (eptr >= md->end_subject) return -2; /* Partial match */ |
257 | 0 | if (UCHAR21INCTEST(p) != UCHAR21INCTEST(eptr)) return -1; |
258 | 0 | } |
259 | 0 | } |
260 | | |
261 | 0 | return (int)(eptr - eptr_start); |
262 | 0 | } |
263 | | |
264 | | |
265 | | |
266 | | /*************************************************************************** |
267 | | **************************************************************************** |
268 | | RECURSION IN THE match() FUNCTION |
269 | | |
270 | | The match() function is highly recursive, though not every recursive call |
271 | | increases the recursive depth. Nevertheless, some regular expressions can cause |
272 | | it to recurse to a great depth. I was writing for Unix, so I just let it call |
273 | | itself recursively. This uses the stack for saving everything that has to be |
274 | | saved for a recursive call. On Unix, the stack can be large, and this works |
275 | | fine. |
276 | | |
277 | | It turns out that on some non-Unix-like systems there are problems with |
278 | | programs that use a lot of stack. (This despite the fact that every last chip |
279 | | has oodles of memory these days, and techniques for extending the stack have |
280 | | been known for decades.) So.... |
281 | | |
282 | | There is a fudge, triggered by defining NO_RECURSE, which avoids recursive |
283 | | calls by keeping local variables that need to be preserved in blocks of memory |
284 | | obtained from malloc() instead of on the stack. Macros are used to |
285 | | achieve this so that the actual code doesn't look very different to what it |
286 | | always used to. |
287 | | |
288 | | The original heap-recursive code used longjmp(). However, it seems that this |
289 | | can be very slow on some operating systems. Following a suggestion from Stan |
290 | | Switzer, the use of longjmp() has been abolished, at the cost of having to |
291 | | provide a unique number for each call to RMATCH. There is no way of generating |
292 | | a sequence of numbers at compile time in C. I have given them names, to make |
293 | | them stand out more clearly. |
294 | | |
295 | | Crude tests on x86 Linux show a small speedup of around 5-8%. However, on |
296 | | FreeBSD, avoiding longjmp() more than halves the time taken to run the standard |
297 | | tests. Furthermore, not using longjmp() means that local dynamic variables |
298 | | don't have indeterminate values; this has meant that the frame size can be |
299 | | reduced because the result can be "passed back" by straight setting of the |
300 | | variable instead of being passed in the frame. |
301 | | **************************************************************************** |
302 | | ***************************************************************************/ |
303 | | |
304 | | /* Numbers for RMATCH calls, running consecutively from RM1 = 1 to RM67 = 67. Each one is passed as the "rw" argument of RMATCH; in the NO_RECURSE build it is stored in the calling frame's Xwhere field and forms the resume label L_##rw. When this list is changed, the code at HEAP_RETURN |
305 | | below must be updated in sync. */ |
306 | | |
307 | | enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10, |
308 | | RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20, |
309 | | RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30, |
310 | | RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40, |
311 | | RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50, |
312 | | RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60, |
313 | | RM61, RM62, RM63, RM64, RM65, RM66, RM67 }; |
314 | | |
315 | | /* These versions of the macros use the stack, as normal. There are debugging |
316 | | versions and production versions. Note that the "rw" argument of RMATCH isn't |
317 | | actually used in this definition. RMATCH is a real recursive call to match() that stores the result in rrc; it takes mstart, rdepth and rrc from the scope in which it is expanded rather than as arguments. In the debugging RRETURN, "ra" is evaluated twice (once for printf, once for return). */ |
318 | | |
319 | | #ifndef NO_RECURSE |
320 | | #define REGISTER register |
321 | | |
322 | | #ifdef PCRE_DEBUG |
323 | | #define RMATCH(ra,rb,rc,rd,re,rw) \ |
324 | | { \ |
325 | | printf("match() called in line %d\n", __LINE__); \ |
326 | | rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \ |
327 | | printf("to line %d\n", __LINE__); \ |
328 | | } |
329 | | #define RRETURN(ra) \ |
330 | | { \ |
331 | | printf("match() returned %d from line %d\n", ra, __LINE__); \ |
332 | | return ra; \ |
333 | | } |
334 | | #else |
335 | | #define RMATCH(ra,rb,rc,rd,re,rw) \ |
336 | | rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1) |
337 | | #define RRETURN(ra) return ra |
338 | | #endif |
339 | | |
340 | | #else |
341 | | |
342 | | |
343 | | /* These versions of the macros manage a private stack on the heap. Note that |
344 | | the "rd" argument of RMATCH isn't actually used in this definition. It's the md |
345 | | argument of match(), which never changes. RMATCH pushes a frame and jumps to HEAP_RECURSE; RRETURN pops a frame and either jumps to HEAP_RETURN or really returns. Popped frames are not freed here: they stay linked through Xnextframe and are reused by later RMATCH calls. */ |
346 | | |
347 | | #define REGISTER |
348 | | |
349 | | #define RMATCH(ra,rb,rc,rd,re,rw)\ |
350 | 0 | {\ |
351 | 0 | heapframe *newframe = frame->Xnextframe; /* reuse the already-linked next frame if there is one */\ |
352 | 0 | if (newframe == NULL)\ |
353 | 0 | {\ |
354 | 0 | newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\ |
355 | 0 | if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\ |
356 | 0 | newframe->Xnextframe = NULL;\ |
357 | 0 | frame->Xnextframe = newframe; /* link it in so that later calls can reuse it */\ |
358 | 0 | }\ |
359 | 0 | frame->Xwhere = rw; /* record this call site in the calling frame */\ |
360 | 0 | newframe->Xeptr = ra;\ |
361 | 0 | newframe->Xecode = rb;\ |
362 | 0 | newframe->Xmstart = mstart;\ |
363 | 0 | newframe->Xoffset_top = rc;\ |
364 | 0 | newframe->Xeptrb = re;\ |
365 | 0 | newframe->Xrdepth = frame->Xrdepth + 1;\ |
366 | 0 | newframe->Xprevframe = frame;\ |
367 | 0 | frame = newframe; /* the new frame becomes the current one */\ |
368 | 0 | DPRINTF(("restarting from line %d\n", __LINE__));\ |
369 | 0 | goto HEAP_RECURSE;\ |
370 | 0 | L_##rw: /* label at which this call site is resumed */\ |
371 | 0 | DPRINTF(("jumped back to line %d\n", __LINE__));\ |
372 | 0 | } |
373 | | |
374 | | #define RRETURN(ra)\ |
375 | 0 | {\ |
376 | 0 | heapframe *oldframe = frame;\ |
377 | 0 | frame = oldframe->Xprevframe; /* pop back to the calling frame */\ |
378 | 0 | if (frame != NULL)\ |
379 | 0 | {\ |
380 | 0 | rrc = ra; /* pass the result back through rrc */\ |
381 | 0 | goto HEAP_RETURN;\ |
382 | 0 | }\ |
383 | 0 | return ra; /* no calling frame: really return from match() */\ |
384 | 0 | } |
385 | | |
386 | | |
387 | | /* Structure for remembering the local variables in a private frame. One frame stands in for one level of "recursion" of match() when NO_RECURSE is defined. */ |
388 | | |
389 | | typedef struct heapframe { |
390 | | struct heapframe *Xprevframe; /* Calling frame; RRETURN really returns from match() when this is NULL */ |
391 | | struct heapframe *Xnextframe; /* Next frame in the chain, reused by RMATCH; NULL until one has been allocated */ |
392 | | |
393 | | /* Function arguments that may change */ |
394 | | |
395 | | PCRE_PUCHAR Xeptr; |
396 | | const pcre_uchar *Xecode; |
397 | | PCRE_PUCHAR Xmstart; |
398 | | int Xoffset_top; |
399 | | eptrblock *Xeptrb; |
400 | | unsigned int Xrdepth; /* RMATCH sets this to the calling frame's Xrdepth + 1 */ |
401 | | |
402 | | /* Function local variables */ |
403 | | |
404 | | PCRE_PUCHAR Xcallpat; |
405 | | #ifdef SUPPORT_UTF |
406 | | PCRE_PUCHAR Xcharptr; |
407 | | #endif |
408 | | PCRE_PUCHAR Xdata; |
409 | | PCRE_PUCHAR Xnext; |
410 | | PCRE_PUCHAR Xpp; |
411 | | PCRE_PUCHAR Xprev; |
412 | | PCRE_PUCHAR Xsaved_eptr; |
413 | | |
414 | | recursion_info Xnew_recursive; |
415 | | |
416 | | BOOL Xcur_is_word; |
417 | | BOOL Xcondition; |
418 | | BOOL Xprev_is_word; |
419 | | |
420 | | #ifdef SUPPORT_UCP |
421 | | int Xprop_type; |
422 | | unsigned int Xprop_value; |
423 | | int Xprop_fail_result; |
424 | | int Xoclength; |
425 | | pcre_uchar Xocchars[6]; |
426 | | #endif |
427 | | |
428 | | int Xcodelink; |
429 | | int Xctype; |
430 | | unsigned int Xfc; |
431 | | int Xfi; |
432 | | int Xlength; |
433 | | int Xmax; |
434 | | int Xmin; |
435 | | unsigned int Xnumber; |
436 | | int Xoffset; |
437 | | unsigned int Xop; |
438 | | pcre_int32 Xsave_capture_last; |
439 | | int Xsave_offset1, Xsave_offset2, Xsave_offset3; |
440 | | int Xstacksave[REC_STACK_SAVE_MAX]; |
441 | | |
442 | | eptrblock Xnewptrb; |
443 | | |
444 | | /* Where to jump back to: the RMn number that RMATCH stored before switching to the next frame */ |
445 | | |
446 | | int Xwhere; |
447 | | |
448 | | } heapframe; |
449 | | |
450 | | #endif |
451 | | |
452 | | |
453 | | /*************************************************************************** |
454 | | ***************************************************************************/ |
455 | | |
456 | | |
457 | | |
458 | | /************************************************* |
459 | | * Match from current position * |
460 | | *************************************************/ |
461 | | |
462 | | /* This function is called recursively in many circumstances. Whenever it |
463 | | returns a negative (error) response, the outer incarnation must also return the |
464 | | same response. */ |
465 | | |
466 | | /* These macros pack up tests that are used for partial matching, and which |
467 | | appear several times in the code. When partial matching is enabled (md->partial != 0), we set the "hit end" flag (md->hitend) if the pointer is |
468 | | at the end of the subject (eptr >= md->end_subject) and also past md->start_used_ptr (i.e. |
469 | | something has been matched). For hard partial matching (md->partial > 1), we then return PCRE_ERROR_PARTIAL |
470 | | immediately. The second one omits the end-of-subject test; it is used when we already know we are past the end of |
471 | | the subject. Both expand to a bare "if" statement and use the md and eptr of the code in which they are expanded. */ |
472 | | |
473 | | #define CHECK_PARTIAL()\ |
474 | 0 | if (md->partial != 0 && eptr >= md->end_subject && \ |
475 | 0 | eptr > md->start_used_ptr) \ |
476 | 0 | { \ |
477 | 0 | md->hitend = TRUE; \ |
478 | 0 | if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \ |
479 | 0 | } |
480 | | |
481 | | #define SCHECK_PARTIAL()\ |
482 | 0 | if (md->partial != 0 && eptr > md->start_used_ptr) \ |
483 | 0 | { \ |
484 | 0 | md->hitend = TRUE; \ |
485 | 0 | if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \ |
486 | 0 | } |
487 | | |
488 | | |
489 | | /* Performance note: It might be tempting to extract commonly used fields from |
490 | | the md structure (e.g. utf, end_subject) into individual variables to improve |
491 | | performance. Tests using gcc on a SPARC disproved this; in the first case, it |
492 | | made performance worse. |
493 | | |
494 | | Arguments: |
495 | | eptr pointer to current character in subject |
496 | | ecode pointer to current position in compiled code |
497 | | mstart pointer to the current match start position (can be modified |
498 | | by encountering \K) |
499 | | offset_top current top pointer |
500 | | md pointer to "static" info for the match |
501 | | eptrb pointer to chain of blocks containing eptr at start of |
502 | | brackets - for testing for empty matches |
503 | | rdepth the recursion depth |
504 | | |
505 | | Returns: MATCH_MATCH if matched ) these values are >= 0 |
506 | | MATCH_NOMATCH if failed to match ) |
507 | | a negative MATCH_xxx value for PRUNE, SKIP, etc |
508 | | a negative PCRE_ERROR_xxx value if aborted by an error condition |
509 | | (e.g. stopped by repeated call or recursion limit) |
510 | | */ |
511 | | |
512 | | static int |
513 | | match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode, |
514 | | PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb, |
515 | | unsigned int rdepth) |
516 | 0 | { |
517 | | /* These variables do not need to be preserved over recursion in this function, |
518 | | so they can be ordinary variables in all cases. Mark some of them with |
519 | | "register" because they are used a lot in loops. */ |
520 | |
|
521 | 0 | register int rrc; /* Returns from recursive calls */ |
522 | 0 | register int i; /* Used for loops not involving calls to RMATCH() */ |
523 | 0 | register pcre_uint32 c; /* Character values not kept over RMATCH() calls */ |
524 | 0 | register BOOL utf; /* Local copy of UTF flag for speed */ |
525 | |
|
526 | 0 | BOOL minimize, possessive; /* Quantifier options */ |
527 | 0 | BOOL caseless; |
528 | 0 | int condcode; |
529 | | |
530 | | /* When recursion is not being used, all "local" variables that have to be |
531 | | preserved over calls to RMATCH() are part of a "frame". We set up the top-level |
532 | | frame on the stack here; subsequent instantiations are obtained from the heap |
533 | | whenever RMATCH() does a "recursion". See the macro definitions above. Putting |
534 | | the top-level on the stack rather than malloc-ing them all gives a performance |
535 | | boost in many cases where there is not much "recursion". */ |
536 | |
|
537 | 0 | #ifdef NO_RECURSE |
538 | 0 | heapframe *frame = (heapframe *)md->match_frames_base; |
539 | | |
540 | | /* Copy in the original argument variables */ |
541 | |
|
542 | 0 | frame->Xeptr = eptr; |
543 | 0 | frame->Xecode = ecode; |
544 | 0 | frame->Xmstart = mstart; |
545 | 0 | frame->Xoffset_top = offset_top; |
546 | 0 | frame->Xeptrb = eptrb; |
547 | 0 | frame->Xrdepth = rdepth; |
548 | | |
549 | | /* This is where control jumps back to in order to effect "recursion" */ |
550 | |
|
551 | 0 | HEAP_RECURSE: |
552 | | |
553 | | /* Macros make the argument variables come from the current frame */ |
554 | |
|
555 | 0 | #define eptr frame->Xeptr |
556 | 0 | #define ecode frame->Xecode |
557 | 0 | #define mstart frame->Xmstart |
558 | 0 | #define offset_top frame->Xoffset_top |
559 | 0 | #define eptrb frame->Xeptrb |
560 | 0 | #define rdepth frame->Xrdepth |
561 | | |
562 | | /* Ditto for the local variables */ |
563 | |
|
564 | | #ifdef SUPPORT_UTF |
565 | | #define charptr frame->Xcharptr |
566 | | #endif |
567 | 0 | #define callpat frame->Xcallpat |
568 | 0 | #define codelink frame->Xcodelink |
569 | 0 | #define data frame->Xdata |
570 | 0 | #define next frame->Xnext |
571 | 0 | #define pp frame->Xpp |
572 | 0 | #define prev frame->Xprev |
573 | 0 | #define saved_eptr frame->Xsaved_eptr |
574 | |
|
575 | 0 | #define new_recursive frame->Xnew_recursive |
576 | |
|
577 | 0 | #define cur_is_word frame->Xcur_is_word |
578 | 0 | #define condition frame->Xcondition |
579 | 0 | #define prev_is_word frame->Xprev_is_word |
580 | |
|
581 | | #ifdef SUPPORT_UCP |
582 | | #define prop_type frame->Xprop_type |
583 | | #define prop_value frame->Xprop_value |
584 | | #define prop_fail_result frame->Xprop_fail_result |
585 | | #define oclength frame->Xoclength |
586 | | #define occhars frame->Xocchars |
587 | | #endif |
588 | |
|
589 | 0 | #define ctype frame->Xctype |
590 | 0 | #define fc frame->Xfc |
591 | 0 | #define fi frame->Xfi |
592 | 0 | #define length frame->Xlength |
593 | 0 | #define max frame->Xmax |
594 | 0 | #define min frame->Xmin |
595 | 0 | #define number frame->Xnumber |
596 | 0 | #define offset frame->Xoffset |
597 | 0 | #define op frame->Xop |
598 | 0 | #define save_capture_last frame->Xsave_capture_last |
599 | 0 | #define save_offset1 frame->Xsave_offset1 |
600 | 0 | #define save_offset2 frame->Xsave_offset2 |
601 | 0 | #define save_offset3 frame->Xsave_offset3 |
602 | 0 | #define stacksave frame->Xstacksave |
603 | |
|
604 | 0 | #define newptrb frame->Xnewptrb |
605 | | |
606 | | /* When recursion is being used, local variables are allocated on the stack and |
607 | | get preserved during recursion in the normal way. In this environment, fi and |
608 | | i, and fc and c, can be the same variables. */ |
609 | |
|
610 | | #else /* NO_RECURSE not defined */ |
611 | | #define fi i |
612 | | #define fc c |
613 | | |
614 | | /* Many of the following variables are used only in small blocks of the code. |
615 | | My normal style of coding would have declared them within each of those blocks. |
616 | | However, in order to accommodate the version of this code that uses an external |
617 | | "stack" implemented on the heap, it is easier to declare them all here, so the |
618 | | declarations can be cut out in a block. The only declarations within blocks |
619 | | below are for variables that do not have to be preserved over a recursive call |
620 | | to RMATCH(). */ |
621 | | |
622 | | #ifdef SUPPORT_UTF |
623 | | const pcre_uchar *charptr; |
624 | | #endif |
625 | | const pcre_uchar *callpat; |
626 | | const pcre_uchar *data; |
627 | | const pcre_uchar *next; |
628 | | PCRE_PUCHAR pp; |
629 | | const pcre_uchar *prev; |
630 | | PCRE_PUCHAR saved_eptr; |
631 | | |
632 | | recursion_info new_recursive; |
633 | | |
634 | | BOOL cur_is_word; |
635 | | BOOL condition; |
636 | | BOOL prev_is_word; |
637 | | |
638 | | #ifdef SUPPORT_UCP |
639 | | int prop_type; |
640 | | unsigned int prop_value; |
641 | | int prop_fail_result; |
642 | | int oclength; |
643 | | pcre_uchar occhars[6]; |
644 | | #endif |
645 | | |
646 | | int codelink; |
647 | | int ctype; |
648 | | int length; |
649 | | int max; |
650 | | int min; |
651 | | unsigned int number; |
652 | | int offset; |
653 | | unsigned int op; |
654 | | pcre_int32 save_capture_last; |
655 | | int save_offset1, save_offset2, save_offset3; |
656 | | int stacksave[REC_STACK_SAVE_MAX]; |
657 | | |
658 | | eptrblock newptrb; |
659 | | |
660 | | /* There is a special fudge for calling match() in a way that causes it to |
661 | | measure the size of its basic stack frame when the stack is being used for |
662 | | recursion. The second argument (ecode) being NULL triggers this behaviour. It |
663 | | cannot normally ever be NULL. The return is the negated value of the frame |
664 | | size. */ |
665 | | |
666 | | if (ecode == NULL) |
667 | | { |
668 | | if (rdepth == 0) |
669 | | return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1); |
670 | | else |
671 | | { |
672 | | int len = (int)((char *)&rdepth - (char *)eptr); |
673 | | return (len > 0)? -len : len; |
674 | | } |
675 | | } |
676 | | #endif /* NO_RECURSE */ |
677 | | |
678 | | /* To save space on the stack and in the heap frame, I have doubled up on some |
679 | | of the local variables that are used only in localised parts of the code, but |
680 | | still need to be preserved over recursive calls of match(). These macros define |
681 | | the alternative names that are used. */ |
682 | |
|
683 | 0 | #define allow_zero cur_is_word |
684 | 0 | #define cbegroup condition |
685 | 0 | #define code_offset codelink |
686 | 0 | #define condassert condition |
687 | 0 | #define matched_once prev_is_word |
688 | 0 | #define foc number |
689 | 0 | #define save_mark data |
690 | | |
691 | | /* These statements are here to stop the compiler complaining about uninitialized |
692 | | variables. */ |
693 | |
|
694 | | #ifdef SUPPORT_UCP |
695 | | prop_value = 0; |
696 | | prop_fail_result = 0; |
697 | | #endif |
698 | | |
699 | | |
700 | | /* This label is used for tail recursion, which is used in a few cases even |
701 | | when NO_RECURSE is not defined, in order to reduce the amount of stack that is |
702 | | used. Thanks to Ian Taylor for noticing this possibility and sending the |
703 | | original patch. */ |
704 | |
|
705 | 0 | TAIL_RECURSE: |
706 | | |
707 | | /* OK, now we can get on with the real code of the function. Recursive calls |
708 | | are specified by the macro RMATCH and RRETURN is used to return. When |
709 | | NO_RECURSE is *not* defined, these just turn into a recursive call to match() |
710 | | and a "return", respectively (possibly with some debugging if PCRE_DEBUG is |
711 | | defined). However, RMATCH isn't like a function call because it's quite a |
712 | | complicated macro. It has to be used in one particular way. This shouldn't, |
713 | | however, impact performance when true recursion is being used. */ |
714 | |
|
715 | | #ifdef SUPPORT_UTF |
716 | | utf = md->utf; /* Local copy of the flag */ |
717 | | #else |
718 | 0 | utf = FALSE; |
719 | 0 | #endif |
720 | | |
721 | | /* First check that we haven't called match() too many times, or that we |
722 | | haven't exceeded the recursive call limit. */ |
723 | |
|
724 | 0 | if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT); |
725 | 0 | if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT); |
726 | | |
727 | | /* At the start of a group with an unlimited repeat that may match an empty |
728 | | string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is |
729 | | done this way to save having to use another function argument, which would take |
730 | | up space on the stack. See also MATCH_CONDASSERT below. |
731 | | |
732 | | When MATCH_CBEGROUP is set, add the current subject pointer to the chain of |
733 | | such remembered pointers, to be checked when we hit the closing ket, in order |
734 | | to break infinite loops that match no characters. When match() is called in |
735 | | other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must |
736 | | NOT be used with tail recursion, because the memory block that is used is on |
737 | | the stack, so a new one may be required for each match(). */ |
738 | |
|
739 | 0 | if (md->match_function_type == MATCH_CBEGROUP) |
740 | 0 | { |
741 | 0 | newptrb.epb_saved_eptr = eptr; |
742 | 0 | newptrb.epb_prev = eptrb; |
743 | 0 | eptrb = &newptrb; |
744 | 0 | md->match_function_type = 0; |
745 | 0 | } |
746 | | |
747 | | /* Now start processing the opcodes. */ |
748 | |
|
749 | 0 | for (;;) |
750 | 0 | { |
751 | 0 | minimize = possessive = FALSE; |
752 | 0 | op = *ecode; |
753 | |
|
754 | 0 | switch(op) |
755 | 0 | { |
756 | 0 | case OP_MARK: |
757 | 0 | md->nomatch_mark = ecode + 2; |
758 | 0 | md->mark = NULL; /* In case previously set by assertion */ |
759 | 0 | RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md, |
760 | 0 | eptrb, RM55); |
761 | 0 | if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT || rrc == MATCH_KETRPOS) && |
762 | 0 | md->mark == NULL) md->mark = ecode + 2; |
763 | | |
764 | | /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an |
765 | | argument, and we must check whether that argument matches this MARK's |
766 | | argument. It is passed back in md->start_match_ptr (an overloading of that |
767 | | variable). If it does match, we reset that variable to the current subject |
768 | | position and return MATCH_SKIP. Otherwise, pass back the return code |
769 | | unaltered. */ |
770 | | |
771 | 0 | else if (rrc == MATCH_SKIP_ARG && |
772 | 0 | STRCMP_UC_UC_TEST(ecode + 2, md->start_match_ptr) == 0) |
773 | 0 | { |
774 | 0 | md->start_match_ptr = eptr; |
775 | 0 | RRETURN(MATCH_SKIP); |
776 | 0 | } |
777 | 0 | RRETURN(rrc); |
778 | |
|
779 | 0 | case OP_FAIL: |
780 | 0 | RRETURN(MATCH_NOMATCH); |
781 | |
|
782 | 0 | case OP_COMMIT: |
783 | 0 | RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, |
784 | 0 | eptrb, RM52); |
785 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
786 | 0 | RRETURN(MATCH_COMMIT); |
787 | |
|
788 | 0 | case OP_PRUNE: |
789 | 0 | RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, |
790 | 0 | eptrb, RM51); |
791 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
792 | 0 | RRETURN(MATCH_PRUNE); |
793 | |
|
794 | 0 | case OP_PRUNE_ARG: |
795 | 0 | md->nomatch_mark = ecode + 2; |
796 | 0 | md->mark = NULL; /* In case previously set by assertion */ |
797 | 0 | RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md, |
798 | 0 | eptrb, RM56); |
799 | 0 | if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) && |
800 | 0 | md->mark == NULL) md->mark = ecode + 2; |
801 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
802 | 0 | RRETURN(MATCH_PRUNE); |
803 | |
|
804 | 0 | case OP_SKIP: |
805 | 0 | RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, |
806 | 0 | eptrb, RM53); |
807 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
808 | 0 | md->start_match_ptr = eptr; /* Pass back current position */ |
809 | 0 | RRETURN(MATCH_SKIP); |
810 | | |
811 | | /* Note that, for Perl compatibility, SKIP with an argument does NOT set |
812 | | nomatch_mark. When a pattern match ends with a SKIP_ARG for which there was |
813 | | not a matching mark, we have to re-run the match, ignoring the SKIP_ARG |
814 | | that failed and any that precede it (either they also failed, or were not |
815 | | triggered). To do this, we maintain a count of executed SKIP_ARGs. If a |
816 | | SKIP_ARG gets to top level, the match is re-run with md->ignore_skip_arg |
817 | | set to the count of the one that failed. */ |
818 | |
|
819 | 0 | case OP_SKIP_ARG: |
820 | 0 | md->skip_arg_count++; |
821 | 0 | if (md->skip_arg_count <= md->ignore_skip_arg) |
822 | 0 | { |
823 | 0 | ecode += PRIV(OP_lengths)[*ecode] + ecode[1]; |
824 | 0 | break; |
825 | 0 | } |
826 | 0 | RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md, |
827 | 0 | eptrb, RM57); |
828 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
829 | | |
830 | | /* Pass back the current skip name by overloading md->start_match_ptr and |
831 | | returning the special MATCH_SKIP_ARG return code. This will either be |
832 | | caught by a matching MARK, or get to the top, where it causes a rematch |
833 | | with md->ignore_skip_arg set to the value of md->skip_arg_count. */ |
834 | |
|
835 | 0 | md->start_match_ptr = ecode + 2; |
836 | 0 | RRETURN(MATCH_SKIP_ARG); |
837 | | |
838 | | /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that |
839 | | the branch in which it occurs can be determined. Overload the start of |
840 | | match pointer to do this. */ |
841 | |
|
842 | 0 | case OP_THEN: |
843 | 0 | RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, |
844 | 0 | eptrb, RM54); |
845 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
846 | 0 | md->start_match_ptr = ecode; |
847 | 0 | RRETURN(MATCH_THEN); |
848 | |
|
849 | 0 | case OP_THEN_ARG: |
850 | 0 | md->nomatch_mark = ecode + 2; |
851 | 0 | md->mark = NULL; /* In case previously set by assertion */ |
852 | 0 | RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, |
853 | 0 | md, eptrb, RM58); |
854 | 0 | if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) && |
855 | 0 | md->mark == NULL) md->mark = ecode + 2; |
856 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
857 | 0 | md->start_match_ptr = ecode; |
858 | 0 | RRETURN(MATCH_THEN); |
859 | | |
860 | | /* Handle an atomic group that does not contain any capturing parentheses. |
861 | | This can be handled like an assertion. Prior to 8.13, all atomic groups |
862 | | were handled this way. In 8.13, the code was changed as below for ONCE, so |
863 | | that backups pass through the group and thereby reset captured values. |
864 | | However, this uses a lot more stack, so in 8.20, atomic groups that do not |
865 | | contain any captures generate OP_ONCE_NC, which can be handled in the old, |
866 | | less stack intensive way. |
867 | | |
868 | | Check the alternative branches in turn - the matching won't pass the KET |
869 | | for this kind of subpattern. If any one branch matches, we carry on as at |
870 | | the end of a normal bracket, leaving the subject pointer, but resetting |
871 | | the start-of-match value in case it was changed by \K. */ |
872 | |
|
873 | 0 | case OP_ONCE_NC: |
874 | 0 | prev = ecode; |
875 | 0 | saved_eptr = eptr; |
876 | 0 | save_mark = md->mark; |
877 | 0 | do |
878 | 0 | { |
879 | 0 | RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64); |
880 | 0 | if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */ |
881 | 0 | { |
882 | 0 | mstart = md->start_match_ptr; |
883 | 0 | break; |
884 | 0 | } |
885 | 0 | if (rrc == MATCH_THEN) |
886 | 0 | { |
887 | 0 | next = ecode + GET(ecode,1); |
888 | 0 | if (md->start_match_ptr < next && |
889 | 0 | (*ecode == OP_ALT || *next == OP_ALT)) |
890 | 0 | rrc = MATCH_NOMATCH; |
891 | 0 | } |
892 | |
|
893 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
894 | 0 | ecode += GET(ecode,1); |
895 | 0 | md->mark = save_mark; |
896 | 0 | } |
897 | 0 | while (*ecode == OP_ALT); |
898 | | |
899 | | /* If hit the end of the group (which could be repeated), fail */ |
900 | | |
901 | 0 | if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH); |
902 | | |
903 | | /* Continue as from after the group, updating the offsets high water |
904 | | mark, since extracts may have been taken. */ |
905 | |
|
906 | 0 | do ecode += GET(ecode, 1); while (*ecode == OP_ALT); |
907 | |
|
908 | 0 | offset_top = md->end_offset_top; |
909 | 0 | eptr = md->end_match_ptr; |
910 | | |
911 | | /* For a non-repeating ket, just continue at this level. This also |
912 | | happens for a repeating ket if no characters were matched in the group. |
913 | | This is the forcible breaking of infinite loops as implemented in Perl |
914 | | 5.005. */ |
915 | |
|
916 | 0 | if (*ecode == OP_KET || eptr == saved_eptr) |
917 | 0 | { |
918 | 0 | ecode += 1+LINK_SIZE; |
919 | 0 | break; |
920 | 0 | } |
921 | | |
922 | | /* The repeating kets try the rest of the pattern or restart from the |
923 | | preceding bracket, in the appropriate order. The second "call" of match() |
924 | | uses tail recursion, to avoid using another stack frame. */ |
925 | | |
926 | 0 | if (*ecode == OP_KETRMIN) |
927 | 0 | { |
928 | 0 | RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65); |
929 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
930 | 0 | ecode = prev; |
931 | 0 | goto TAIL_RECURSE; |
932 | 0 | } |
933 | 0 | else /* OP_KETRMAX */ |
934 | 0 | { |
935 | 0 | RMATCH(eptr, prev, offset_top, md, eptrb, RM66); |
936 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
937 | 0 | ecode += 1 + LINK_SIZE; |
938 | 0 | goto TAIL_RECURSE; |
939 | 0 | } |
940 | | /* Control never gets here */ |
941 | | |
942 | | /* Handle a capturing bracket, other than those that are possessive with an |
943 | | unlimited repeat. If there is space in the offset vector, save the current |
944 | | subject position in the working slot at the top of the vector. We mustn't |
945 | | change the current values of the data slot, because they may be set from a |
946 | | previous iteration of this group, and be referred to by a reference inside |
947 | | the group. A failure to match might occur after the group has succeeded, |
948 | | if something later on doesn't match. For this reason, we need to restore |
949 | | the working value and also the values of the final offsets, in case they |
950 | | were set by a previous iteration of the same bracket. |
951 | | |
952 | | If there isn't enough space in the offset vector, treat this as if it were |
953 | | a non-capturing bracket. Don't worry about setting the flag for the error |
954 | | case here; that is handled in the code for KET. */ |
955 | | |
956 | 0 | case OP_CBRA: |
957 | 0 | case OP_SCBRA: |
958 | 0 | number = GET2(ecode, 1+LINK_SIZE); |
959 | 0 | offset = number << 1; |
960 | |
|
961 | | #ifdef PCRE_DEBUG |
962 | | printf("start bracket %d\n", number); |
963 | | printf("subject="); |
964 | | pchars(eptr, 16, TRUE, md); |
965 | | printf("\n"); |
966 | | #endif |
967 | |
|
968 | 0 | if (offset < md->offset_max) |
969 | 0 | { |
970 | 0 | save_offset1 = md->offset_vector[offset]; |
971 | 0 | save_offset2 = md->offset_vector[offset+1]; |
972 | 0 | save_offset3 = md->offset_vector[md->offset_end - number]; |
973 | 0 | save_capture_last = md->capture_last; |
974 | 0 | save_mark = md->mark; |
975 | |
|
976 | 0 | DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3)); |
977 | 0 | md->offset_vector[md->offset_end - number] = |
978 | 0 | (int)(eptr - md->start_subject); |
979 | |
|
980 | 0 | for (;;) |
981 | 0 | { |
982 | 0 | if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP; |
983 | 0 | RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, |
984 | 0 | eptrb, RM1); |
985 | 0 | if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */ |
986 | | |
987 | | /* If we backed up to a THEN, check whether it is within the current |
988 | | branch by comparing the address of the THEN that is passed back with |
989 | | the end of the branch. If it is within the current branch, and the |
990 | | branch is one of two or more alternatives (it either starts or ends |
991 | | with OP_ALT), we have reached the limit of THEN's action, so convert |
992 | | the return code to NOMATCH, which will cause normal backtracking to |
993 | | happen from now on. Otherwise, THEN is passed back to an outer |
994 | | alternative. This implements Perl's treatment of parenthesized groups, |
995 | | where a group not containing | does not affect the current alternative, |
996 | | that is, (X) is NOT the same as (X|(*F)). */ |
997 | | |
998 | 0 | if (rrc == MATCH_THEN) |
999 | 0 | { |
1000 | 0 | next = ecode + GET(ecode,1); |
1001 | 0 | if (md->start_match_ptr < next && |
1002 | 0 | (*ecode == OP_ALT || *next == OP_ALT)) |
1003 | 0 | rrc = MATCH_NOMATCH; |
1004 | 0 | } |
1005 | | |
1006 | | /* Anything other than NOMATCH is passed back. */ |
1007 | |
|
1008 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1009 | 0 | md->capture_last = save_capture_last; |
1010 | 0 | ecode += GET(ecode, 1); |
1011 | 0 | md->mark = save_mark; |
1012 | 0 | if (*ecode != OP_ALT) break; |
1013 | 0 | } |
1014 | | |
1015 | 0 | DPRINTF(("bracket %d failed\n", number)); |
1016 | 0 | md->offset_vector[offset] = save_offset1; |
1017 | 0 | md->offset_vector[offset+1] = save_offset2; |
1018 | 0 | md->offset_vector[md->offset_end - number] = save_offset3; |
1019 | | |
1020 | | /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */ |
1021 | |
|
1022 | 0 | RRETURN(rrc); |
1023 | 0 | } |
1024 | | |
1025 | | /* FALL THROUGH ... Insufficient room for saving captured contents. Treat |
1026 | | as a non-capturing bracket. */ |
1027 | | |
1028 | | /* VVVVVVVVVVVVVVVVVVVVVVVVV */ |
1029 | | /* VVVVVVVVVVVVVVVVVVVVVVVVV */ |
1030 | | |
1031 | 0 | DPRINTF(("insufficient capture room: treat as non-capturing\n")); |
1032 | | |
1033 | | /* VVVVVVVVVVVVVVVVVVVVVVVVV */ |
1034 | | /* VVVVVVVVVVVVVVVVVVVVVVVVV */ |
1035 | | |
1036 | | /* Non-capturing or atomic group, except for possessive with unlimited |
1037 | | repeat and ONCE group with no captures. Loop for all the alternatives. |
1038 | | |
1039 | | When we get to the final alternative within the brackets, we used to return |
1040 | | the result of a recursive call to match() whatever happened so it was |
1041 | | possible to reduce stack usage by turning this into a tail recursion, |
1042 | | except in the case of a possibly empty group. However, now that there is |
1043 | | the possibility of (*THEN) occurring in the final alternative, this |
1044 | | optimization is no longer always possible. |
1045 | | |
1046 | | We can optimize if we know there are no (*THEN)s in the pattern; at present |
1047 | | this is the best that can be done. |
1048 | | |
1049 | | MATCH_ONCE is returned when the end of an atomic group is successfully |
1050 | | reached, but subsequent matching fails. It passes back up the tree (causing |
1051 | | captured values to be reset) until the original atomic group level is |
1052 | | reached. This is tested by comparing md->once_target with the start of the |
1053 | | group. At this point, the return is converted into MATCH_NOMATCH so that |
1054 | | previous backup points can be taken. */ |
1055 | |
|
1056 | 0 | case OP_ONCE: |
1057 | 0 | case OP_BRA: |
1058 | 0 | case OP_SBRA: |
1059 | 0 | DPRINTF(("start non-capturing bracket\n")); |
1060 | |
|
1061 | 0 | for (;;) |
1062 | 0 | { |
1063 | 0 | if (op >= OP_SBRA || op == OP_ONCE) |
1064 | 0 | md->match_function_type = MATCH_CBEGROUP; |
1065 | | |
1066 | | /* If this is not a possibly empty group, and there are no (*THEN)s in |
1067 | | the pattern, and this is the final alternative, optimize as described |
1068 | | above. */ |
1069 | | |
1070 | 0 | else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT) |
1071 | 0 | { |
1072 | 0 | ecode += PRIV(OP_lengths)[*ecode]; |
1073 | 0 | goto TAIL_RECURSE; |
1074 | 0 | } |
1075 | | |
1076 | | /* In all other cases, we have to make another call to match(). */ |
1077 | | |
1078 | 0 | save_mark = md->mark; |
1079 | 0 | save_capture_last = md->capture_last; |
1080 | 0 | RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb, |
1081 | 0 | RM2); |
1082 | | |
1083 | | /* See comment in the code for capturing groups above about handling |
1084 | | THEN. */ |
1085 | |
|
1086 | 0 | if (rrc == MATCH_THEN) |
1087 | 0 | { |
1088 | 0 | next = ecode + GET(ecode,1); |
1089 | 0 | if (md->start_match_ptr < next && |
1090 | 0 | (*ecode == OP_ALT || *next == OP_ALT)) |
1091 | 0 | rrc = MATCH_NOMATCH; |
1092 | 0 | } |
1093 | |
|
1094 | 0 | if (rrc != MATCH_NOMATCH) |
1095 | 0 | { |
1096 | 0 | if (rrc == MATCH_ONCE) |
1097 | 0 | { |
1098 | 0 | const pcre_uchar *scode = ecode; |
1099 | 0 | if (*scode != OP_ONCE) /* If not at start, find it */ |
1100 | 0 | { |
1101 | 0 | while (*scode == OP_ALT) scode += GET(scode, 1); |
1102 | 0 | scode -= GET(scode, 1); |
1103 | 0 | } |
1104 | 0 | if (md->once_target == scode) rrc = MATCH_NOMATCH; |
1105 | 0 | } |
1106 | 0 | RRETURN(rrc); |
1107 | 0 | } |
1108 | 0 | ecode += GET(ecode, 1); |
1109 | 0 | md->mark = save_mark; |
1110 | 0 | if (*ecode != OP_ALT) break; |
1111 | 0 | md->capture_last = save_capture_last; |
1112 | 0 | } |
1113 | | |
1114 | 0 | RRETURN(MATCH_NOMATCH); |
1115 | | |
1116 | | /* Handle possessive capturing brackets with an unlimited repeat. We come |
1117 | | here from BRAZERO with allow_zero set TRUE. The offset_vector values are |
1118 | | handled similarly to the normal case above. However, the matching is |
1119 | | different. The end of these brackets will always be OP_KETRPOS, which |
1120 | | returns MATCH_KETRPOS without going further in the pattern. By this means |
1121 | | we can handle the group by iteration rather than recursion, thereby |
1122 | | reducing the amount of stack needed. */ |
1123 | |
|
1124 | 0 | case OP_CBRAPOS: |
1125 | 0 | case OP_SCBRAPOS: |
1126 | 0 | allow_zero = FALSE; |
1127 | |
|
1128 | 0 | POSSESSIVE_CAPTURE: |
1129 | 0 | number = GET2(ecode, 1+LINK_SIZE); |
1130 | 0 | offset = number << 1; |
1131 | |
|
1132 | | #ifdef PCRE_DEBUG |
1133 | | printf("start possessive bracket %d\n", number); |
1134 | | printf("subject="); |
1135 | | pchars(eptr, 16, TRUE, md); |
1136 | | printf("\n"); |
1137 | | #endif |
1138 | |
|
1139 | 0 | if (offset >= md->offset_max) goto POSSESSIVE_NON_CAPTURE; |
1140 | | |
1141 | 0 | matched_once = FALSE; |
1142 | 0 | code_offset = (int)(ecode - md->start_code); |
1143 | |
|
1144 | 0 | save_offset1 = md->offset_vector[offset]; |
1145 | 0 | save_offset2 = md->offset_vector[offset+1]; |
1146 | 0 | save_offset3 = md->offset_vector[md->offset_end - number]; |
1147 | 0 | save_capture_last = md->capture_last; |
1148 | |
|
1149 | 0 | DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3)); |
1150 | | |
1151 | | /* Each time round the loop, save the current subject position for use |
1152 | | when the group matches. For MATCH_MATCH, the group has matched, so we |
1153 | | restart it with a new subject starting position, remembering that we had |
1154 | | at least one match. For MATCH_NOMATCH, carry on with the alternatives, as |
1155 | | usual. If we haven't matched any alternatives in any iteration, check to |
1156 | | see if a previous iteration matched. If so, the group has matched; |
1157 | | continue from afterwards. Otherwise it has failed; restore the previous |
1158 | | capture values before returning NOMATCH. */ |
1159 | |
|
1160 | 0 | for (;;) |
1161 | 0 | { |
1162 | 0 | md->offset_vector[md->offset_end - number] = |
1163 | 0 | (int)(eptr - md->start_subject); |
1164 | 0 | if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP; |
1165 | 0 | RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, |
1166 | 0 | eptrb, RM63); |
1167 | 0 | if (rrc == MATCH_KETRPOS) |
1168 | 0 | { |
1169 | 0 | offset_top = md->end_offset_top; |
1170 | 0 | ecode = md->start_code + code_offset; |
1171 | 0 | save_capture_last = md->capture_last; |
1172 | 0 | matched_once = TRUE; |
1173 | 0 | mstart = md->start_match_ptr; /* In case \K changed it */ |
1174 | 0 | if (eptr == md->end_match_ptr) /* Matched an empty string */ |
1175 | 0 | { |
1176 | 0 | do ecode += GET(ecode, 1); while (*ecode == OP_ALT); |
1177 | 0 | break; |
1178 | 0 | } |
1179 | 0 | eptr = md->end_match_ptr; |
1180 | 0 | continue; |
1181 | 0 | } |
1182 | | |
1183 | | /* See comment in the code for capturing groups above about handling |
1184 | | THEN. */ |
1185 | | |
1186 | 0 | if (rrc == MATCH_THEN) |
1187 | 0 | { |
1188 | 0 | next = ecode + GET(ecode,1); |
1189 | 0 | if (md->start_match_ptr < next && |
1190 | 0 | (*ecode == OP_ALT || *next == OP_ALT)) |
1191 | 0 | rrc = MATCH_NOMATCH; |
1192 | 0 | } |
1193 | |
|
1194 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1195 | 0 | md->capture_last = save_capture_last; |
1196 | 0 | ecode += GET(ecode, 1); |
1197 | 0 | if (*ecode != OP_ALT) break; |
1198 | 0 | } |
1199 | | |
1200 | 0 | if (!matched_once) |
1201 | 0 | { |
1202 | 0 | md->offset_vector[offset] = save_offset1; |
1203 | 0 | md->offset_vector[offset+1] = save_offset2; |
1204 | 0 | md->offset_vector[md->offset_end - number] = save_offset3; |
1205 | 0 | } |
1206 | |
|
1207 | 0 | if (allow_zero || matched_once) |
1208 | 0 | { |
1209 | 0 | ecode += 1 + LINK_SIZE; |
1210 | 0 | break; |
1211 | 0 | } |
1212 | | |
1213 | 0 | RRETURN(MATCH_NOMATCH); |
1214 | | |
1215 | | /* Non-capturing possessive bracket with unlimited repeat. We come here |
1216 | | from BRAZERO with allow_zero = TRUE. The code is similar to the above, |
1217 | | without the capturing complication. It is written out separately for speed |
1218 | | and cleanliness. */ |
1219 | |
|
1220 | 0 | case OP_BRAPOS: |
1221 | 0 | case OP_SBRAPOS: |
1222 | 0 | allow_zero = FALSE; |
1223 | |
|
1224 | 0 | POSSESSIVE_NON_CAPTURE: |
1225 | 0 | matched_once = FALSE; |
1226 | 0 | code_offset = (int)(ecode - md->start_code); |
1227 | 0 | save_capture_last = md->capture_last; |
1228 | |
|
1229 | 0 | for (;;) |
1230 | 0 | { |
1231 | 0 | if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP; |
1232 | 0 | RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, |
1233 | 0 | eptrb, RM48); |
1234 | 0 | if (rrc == MATCH_KETRPOS) |
1235 | 0 | { |
1236 | 0 | offset_top = md->end_offset_top; |
1237 | 0 | ecode = md->start_code + code_offset; |
1238 | 0 | matched_once = TRUE; |
1239 | 0 | mstart = md->start_match_ptr; /* In case \K reset it */ |
1240 | 0 | if (eptr == md->end_match_ptr) /* Matched an empty string */ |
1241 | 0 | { |
1242 | 0 | do ecode += GET(ecode, 1); while (*ecode == OP_ALT); |
1243 | 0 | break; |
1244 | 0 | } |
1245 | 0 | eptr = md->end_match_ptr; |
1246 | 0 | continue; |
1247 | 0 | } |
1248 | | |
1249 | | /* See comment in the code for capturing groups above about handling |
1250 | | THEN. */ |
1251 | | |
1252 | 0 | if (rrc == MATCH_THEN) |
1253 | 0 | { |
1254 | 0 | next = ecode + GET(ecode,1); |
1255 | 0 | if (md->start_match_ptr < next && |
1256 | 0 | (*ecode == OP_ALT || *next == OP_ALT)) |
1257 | 0 | rrc = MATCH_NOMATCH; |
1258 | 0 | } |
1259 | |
|
1260 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1261 | 0 | ecode += GET(ecode, 1); |
1262 | 0 | if (*ecode != OP_ALT) break; |
1263 | 0 | md->capture_last = save_capture_last; |
1264 | 0 | } |
1265 | | |
1266 | 0 | if (matched_once || allow_zero) |
1267 | 0 | { |
1268 | 0 | ecode += 1 + LINK_SIZE; |
1269 | 0 | break; |
1270 | 0 | } |
1271 | 0 | RRETURN(MATCH_NOMATCH); |
1272 | | |
1273 | | /* Control never reaches here. */ |
1274 | | |
1275 | | /* Conditional group: compilation checked that there are no more than two |
1276 | | branches. If the condition is false, skipping the first branch takes us |
1277 | | past the end of the item if there is only one branch, but that's exactly |
1278 | | what we want. */ |
1279 | |
|
1280 | 0 | case OP_COND: |
1281 | 0 | case OP_SCOND: |
1282 | | |
1283 | | /* The variable codelink will be added to ecode when the condition is |
1284 | | false, to get to the second branch. Setting it to the offset to the ALT |
1285 | | or KET, then incrementing ecode achieves this effect. We now have ecode |
1286 | | pointing to the condition or callout. */ |
1287 | |
|
1288 | 0 | codelink = GET(ecode, 1); /* Offset to the second branch */ |
1289 | 0 | ecode += 1 + LINK_SIZE; /* From this opcode */ |
1290 | | |
1291 | | /* Because of the way auto-callout works during compile, a callout item is |
1292 | | inserted between OP_COND and an assertion condition. */ |
1293 | |
|
1294 | 0 | if (*ecode == OP_CALLOUT) |
1295 | 0 | { |
1296 | 0 | if (PUBL(callout) != NULL) |
1297 | 0 | { |
1298 | 0 | PUBL(callout_block) cb; |
1299 | 0 | cb.version = 2; /* Version 2 of the callout block */ |
1300 | 0 | cb.callout_number = ecode[1]; |
1301 | 0 | cb.offset_vector = md->offset_vector; |
1302 | 0 | #if defined COMPILE_PCRE8 |
1303 | 0 | cb.subject = (PCRE_SPTR)md->start_subject; |
1304 | | #elif defined COMPILE_PCRE16 |
1305 | | cb.subject = (PCRE_SPTR16)md->start_subject; |
1306 | | #elif defined COMPILE_PCRE32 |
1307 | | cb.subject = (PCRE_SPTR32)md->start_subject; |
1308 | | #endif |
1309 | 0 | cb.subject_length = (int)(md->end_subject - md->start_subject); |
1310 | 0 | cb.start_match = (int)(mstart - md->start_subject); |
1311 | 0 | cb.current_position = (int)(eptr - md->start_subject); |
1312 | 0 | cb.pattern_position = GET(ecode, 2); |
1313 | 0 | cb.next_item_length = GET(ecode, 2 + LINK_SIZE); |
1314 | 0 | cb.capture_top = offset_top/2; |
1315 | 0 | cb.capture_last = md->capture_last & CAPLMASK; |
1316 | | /* Internal change requires this for API compatibility. */ |
1317 | 0 | if (cb.capture_last == 0) cb.capture_last = -1; |
1318 | 0 | cb.callout_data = md->callout_data; |
1319 | 0 | cb.mark = md->nomatch_mark; |
1320 | 0 | if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH); |
1321 | 0 | if (rrc < 0) RRETURN(rrc); |
1322 | 0 | } |
1323 | | |
1324 | | /* Advance ecode past the callout, so it now points to the condition. We |
1325 | | must adjust codelink so that the value of ecode+codelink is unchanged. */ |
1326 | | |
1327 | 0 | ecode += PRIV(OP_lengths)[OP_CALLOUT]; |
1328 | 0 | codelink -= PRIV(OP_lengths)[OP_CALLOUT]; |
1329 | 0 | } |
1330 | | |
1331 | | /* Test the various possible conditions */ |
1332 | | |
1333 | 0 | condition = FALSE; |
1334 | 0 | switch(condcode = *ecode) |
1335 | 0 | { |
1336 | 0 | case OP_RREF: /* Numbered group recursion test */ |
1337 | 0 | if (md->recursive != NULL) /* Not recursing => FALSE */ |
1338 | 0 | { |
1339 | 0 | unsigned int recno = GET2(ecode, 1); /* Recursion group number*/ |
1340 | 0 | condition = (recno == RREF_ANY || recno == md->recursive->group_num); |
1341 | 0 | } |
1342 | 0 | break; |
1343 | | |
1344 | 0 | case OP_DNRREF: /* Duplicate named group recursion test */ |
1345 | 0 | if (md->recursive != NULL) |
1346 | 0 | { |
1347 | 0 | int count = GET2(ecode, 1 + IMM2_SIZE); |
1348 | 0 | pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size; |
1349 | 0 | while (count-- > 0) |
1350 | 0 | { |
1351 | 0 | unsigned int recno = GET2(slot, 0); |
1352 | 0 | condition = recno == md->recursive->group_num; |
1353 | 0 | if (condition) break; |
1354 | 0 | slot += md->name_entry_size; |
1355 | 0 | } |
1356 | 0 | } |
1357 | 0 | break; |
1358 | | |
1359 | 0 | case OP_CREF: /* Numbered group used test */ |
1360 | 0 | offset = GET2(ecode, 1) << 1; /* Doubled ref number */ |
1361 | 0 | condition = offset < offset_top && md->offset_vector[offset] >= 0; |
1362 | 0 | break; |
1363 | | |
1364 | 0 | case OP_DNCREF: /* Duplicate named group used test */ |
1365 | 0 | { |
1366 | 0 | int count = GET2(ecode, 1 + IMM2_SIZE); |
1367 | 0 | pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size; |
1368 | 0 | while (count-- > 0) |
1369 | 0 | { |
1370 | 0 | offset = GET2(slot, 0) << 1; |
1371 | 0 | condition = offset < offset_top && md->offset_vector[offset] >= 0; |
1372 | 0 | if (condition) break; |
1373 | 0 | slot += md->name_entry_size; |
1374 | 0 | } |
1375 | 0 | } |
1376 | 0 | break; |
1377 | | |
1378 | 0 | case OP_DEF: /* DEFINE - always false */ |
1379 | 0 | case OP_FAIL: /* From optimized (?!) condition */ |
1380 | 0 | break; |
1381 | | |
1382 | | /* The condition is an assertion. Call match() to evaluate it - setting |
1383 | | md->match_function_type to MATCH_CONDASSERT causes it to stop at the end |
1384 | | of an assertion. */ |
1385 | | |
1386 | 0 | default: |
1387 | 0 | md->match_function_type = MATCH_CONDASSERT; |
1388 | 0 | RMATCH(eptr, ecode, offset_top, md, NULL, RM3); |
1389 | 0 | if (rrc == MATCH_MATCH) |
1390 | 0 | { |
1391 | 0 | if (md->end_offset_top > offset_top) |
1392 | 0 | offset_top = md->end_offset_top; /* Captures may have happened */ |
1393 | 0 | condition = TRUE; |
1394 | | |
1395 | | /* Advance ecode past the assertion to the start of the first branch, |
1396 | | but adjust it so that the general choosing code below works. If the |
1397 | | assertion has a quantifier that allows zero repeats we must skip over |
1398 | | the BRAZERO. This is a lunatic thing to do, but somebody did! */ |
1399 | |
|
1400 | 0 | if (*ecode == OP_BRAZERO) ecode++; |
1401 | 0 | ecode += GET(ecode, 1); |
1402 | 0 | while (*ecode == OP_ALT) ecode += GET(ecode, 1); |
1403 | 0 | ecode += 1 + LINK_SIZE - PRIV(OP_lengths)[condcode]; |
1404 | 0 | } |
1405 | | |
1406 | | /* PCRE doesn't allow the effect of (*THEN) to escape beyond an |
1407 | | assertion; it is therefore treated as NOMATCH. Any other return is an |
1408 | | error. */ |
1409 | | |
1410 | 0 | else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) |
1411 | 0 | { |
1412 | 0 | RRETURN(rrc); /* Need braces because of following else */ |
1413 | 0 | } |
1414 | 0 | break; |
1415 | 0 | } |
1416 | | |
1417 | | /* Choose branch according to the condition */ |
1418 | | |
1419 | 0 | ecode += condition? PRIV(OP_lengths)[condcode] : codelink; |
1420 | | |
1421 | | /* We are now at the branch that is to be obeyed. As there is only one, we |
1422 | | can use tail recursion to avoid using another stack frame, except when |
1423 | | there is unlimited repeat of a possibly empty group. In the latter case, a |
1424 | | recursive call to match() is always required, unless the second alternative |
1425 | | doesn't exist, in which case we can just plough on. Note that, for |
1426 | | compatibility with Perl, the | in a conditional group is NOT treated as |
1427 | | creating two alternatives. If a THEN is encountered in the branch, it |
1428 | | propagates out to the enclosing alternative (unless nested in a deeper set |
1429 | | of alternatives, of course). */ |
1430 | |
|
1431 | 0 | if (condition || ecode[-(1+LINK_SIZE)] == OP_ALT) |
1432 | 0 | { |
1433 | 0 | if (op != OP_SCOND) |
1434 | 0 | { |
1435 | 0 | goto TAIL_RECURSE; |
1436 | 0 | } |
1437 | | |
1438 | 0 | md->match_function_type = MATCH_CBEGROUP; |
1439 | 0 | RMATCH(eptr, ecode, offset_top, md, eptrb, RM49); |
1440 | 0 | RRETURN(rrc); |
1441 | 0 | } |
1442 | | |
1443 | | /* Condition false & no alternative; continue after the group. */ |
1444 | | |
1445 | 0 | else |
1446 | 0 | { |
1447 | 0 | } |
1448 | 0 | break; |
1449 | | |
1450 | | |
1451 | | /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes, |
1452 | | to close any currently open capturing brackets. */ |
1453 | | |
1454 | 0 | case OP_CLOSE: |
1455 | 0 | number = GET2(ecode, 1); /* Must be less than 65536 */ |
1456 | 0 | offset = number << 1; |
1457 | |
|
1458 | | #ifdef PCRE_DEBUG |
1459 | | printf("end bracket %d at *ACCEPT", number); |
1460 | | printf("\n"); |
1461 | | #endif |
1462 | |
|
1463 | 0 | md->capture_last = (md->capture_last & OVFLMASK) | number; |
1464 | 0 | if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else |
1465 | 0 | { |
1466 | 0 | md->offset_vector[offset] = |
1467 | 0 | md->offset_vector[md->offset_end - number]; |
1468 | 0 | md->offset_vector[offset+1] = (int)(eptr - md->start_subject); |
1469 | | |
1470 | | /* If this group is at or above the current highwater mark, ensure that |
1471 | | any groups between the current high water mark and this group are marked |
1472 | | unset and then update the high water mark. */ |
1473 | |
|
1474 | 0 | if (offset >= offset_top) |
1475 | 0 | { |
1476 | 0 | register int *iptr = md->offset_vector + offset_top; |
1477 | 0 | register int *iend = md->offset_vector + offset; |
1478 | 0 | while (iptr < iend) *iptr++ = -1; |
1479 | 0 | offset_top = offset + 2; |
1480 | 0 | } |
1481 | 0 | } |
1482 | 0 | ecode += 1 + IMM2_SIZE; |
1483 | 0 | break; |
1484 | | |
1485 | | |
1486 | | /* End of the pattern, either real or forced. */ |
1487 | | |
1488 | 0 | case OP_END: |
1489 | 0 | case OP_ACCEPT: |
1490 | 0 | case OP_ASSERT_ACCEPT: |
1491 | | |
1492 | | /* If we have matched an empty string, fail if not in an assertion and not |
1493 | | in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART |
1494 | | is set and we have matched at the start of the subject. In both cases, |
1495 | | backtracking will then try other alternatives, if any. */ |
1496 | |
|
1497 | 0 | if (eptr == mstart && op != OP_ASSERT_ACCEPT && |
1498 | 0 | md->recursive == NULL && |
1499 | 0 | (md->notempty || |
1500 | 0 | (md->notempty_atstart && |
1501 | 0 | mstart == md->start_subject + md->start_offset))) |
1502 | 0 | RRETURN(MATCH_NOMATCH); |
1503 | | |
1504 | | /* Otherwise, we have a match. */ |
1505 | |
|
1506 | 0 | md->end_match_ptr = eptr; /* Record where we ended */ |
1507 | 0 | md->end_offset_top = offset_top; /* and how many extracts were taken */ |
1508 | 0 | md->start_match_ptr = mstart; /* and the start (\K can modify) */ |
1509 | | |
1510 | | /* For some reason, the macros don't work properly if an expression is |
1511 | | given as the argument to RRETURN when the heap is in use. */ |
1512 | |
|
1513 | 0 | rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT; |
1514 | 0 | RRETURN(rrc); |
1515 | | |
1516 | | /* Assertion brackets. Check the alternative branches in turn - the |
1517 | | matching won't pass the KET for an assertion. If any one branch matches, |
1518 | | the assertion is true. Lookbehind assertions have an OP_REVERSE item at the |
1519 | | start of each branch to move the current point backwards, so the code at |
1520 | | this level is identical to the lookahead case. When the assertion is part |
1521 | | of a condition, we want to return immediately afterwards. The caller of |
1522 | | this incarnation of the match() function will have set MATCH_CONDASSERT in |
1523 | | md->match_function_type, and one of these opcodes will be the first opcode |
1524 | | that is processed. We use a local variable that is preserved over calls to |
1525 | | match() to remember this case. */ |
1526 | |
|
1527 | 0 | case OP_ASSERT: |
1528 | 0 | case OP_ASSERTBACK: |
1529 | 0 | save_mark = md->mark; |
1530 | 0 | if (md->match_function_type == MATCH_CONDASSERT) |
1531 | 0 | { |
1532 | 0 | condassert = TRUE; |
1533 | 0 | md->match_function_type = 0; |
1534 | 0 | } |
1535 | 0 | else condassert = FALSE; |
1536 | | |
1537 | | /* Loop for each branch */ |
1538 | |
|
1539 | 0 | do |
1540 | 0 | { |
1541 | 0 | RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4); |
1542 | | |
1543 | | /* A match means that the assertion is true; break out of the loop |
1544 | | that matches its alternatives. */ |
1545 | |
|
1546 | 0 | if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) |
1547 | 0 | { |
1548 | 0 | mstart = md->start_match_ptr; /* In case \K reset it */ |
1549 | 0 | break; |
1550 | 0 | } |
1551 | | |
1552 | | /* If not matched, restore the previous mark setting. */ |
1553 | | |
1554 | 0 | md->mark = save_mark; |
1555 | | |
1556 | | /* See comment in the code for capturing groups above about handling |
1557 | | THEN. */ |
1558 | |
|
1559 | 0 | if (rrc == MATCH_THEN) |
1560 | 0 | { |
1561 | 0 | next = ecode + GET(ecode,1); |
1562 | 0 | if (md->start_match_ptr < next && |
1563 | 0 | (*ecode == OP_ALT || *next == OP_ALT)) |
1564 | 0 | rrc = MATCH_NOMATCH; |
1565 | 0 | } |
1566 | | |
1567 | | /* Anything other than NOMATCH causes the entire assertion to fail, |
1568 | | passing back the return code. This includes COMMIT, SKIP, PRUNE and an |
1569 | | uncaptured THEN, which means they take their normal effect. This |
1570 | | consistent approach does not always have exactly the same effect as in |
1571 | | Perl. */ |
1572 | |
|
1573 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1574 | 0 | ecode += GET(ecode, 1); |
1575 | 0 | } |
1576 | 0 | while (*ecode == OP_ALT); /* Continue for next alternative */ |
1577 | | |
1578 | | /* If we have tried all the alternative branches, the assertion has |
1579 | | failed. If not, we broke out after a match. */ |
1580 | | |
1581 | 0 | if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH); |
1582 | | |
1583 | | /* If checking an assertion for a condition, return MATCH_MATCH. */ |
1584 | |
|
1585 | 0 | if (condassert) RRETURN(MATCH_MATCH); |
1586 | | |
1587 | | /* Continue from after a successful assertion, updating the offsets high |
1588 | | water mark, since extracts may have been taken during the assertion. */ |
1589 | |
|
1590 | 0 | do ecode += GET(ecode,1); while (*ecode == OP_ALT); |
1591 | 0 | ecode += 1 + LINK_SIZE; |
1592 | 0 | offset_top = md->end_offset_top; |
1593 | 0 | continue; |
1594 | | |
1595 | | /* Negative assertion: all branches must fail to match for the assertion to |
1596 | | succeed. */ |
1597 | | |
1598 | 0 | case OP_ASSERT_NOT: |
1599 | 0 | case OP_ASSERTBACK_NOT: |
1600 | 0 | save_mark = md->mark; |
1601 | 0 | if (md->match_function_type == MATCH_CONDASSERT) |
1602 | 0 | { |
1603 | 0 | condassert = TRUE; |
1604 | 0 | md->match_function_type = 0; |
1605 | 0 | } |
1606 | 0 | else condassert = FALSE; |
1607 | | |
1608 | | /* Loop for each alternative branch. */ |
1609 | |
|
1610 | 0 | do |
1611 | 0 | { |
1612 | 0 | RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5); |
1613 | 0 | md->mark = save_mark; /* Always restore the mark setting */ |
1614 | |
|
1615 | 0 | switch(rrc) |
1616 | 0 | { |
1617 | 0 | case MATCH_MATCH: /* A successful match means */ |
1618 | 0 | case MATCH_ACCEPT: /* the assertion has failed. */ |
1619 | 0 | RRETURN(MATCH_NOMATCH); |
1620 | |
|
1621 | 0 | case MATCH_NOMATCH: /* Carry on with next branch */ |
1622 | 0 | break; |
1623 | | |
1624 | | /* See comment in the code for capturing groups above about handling |
1625 | | THEN. */ |
1626 | | |
1627 | 0 | case MATCH_THEN: |
1628 | 0 | next = ecode + GET(ecode,1); |
1629 | 0 | if (md->start_match_ptr < next && |
1630 | 0 | (*ecode == OP_ALT || *next == OP_ALT)) |
1631 | 0 | { |
1632 | 0 | rrc = MATCH_NOMATCH; |
1633 | 0 | break; |
1634 | 0 | } |
1635 | | /* Otherwise fall through. */ |
1636 | | |
1637 | | /* COMMIT, SKIP, PRUNE, and an uncaptured THEN cause the whole |
1638 | | assertion to fail to match, without considering any more alternatives. |
1639 | | Failing to match means the assertion is true. This is a consistent |
1640 | | approach, but does not always have the same effect as in Perl. */ |
1641 | | |
1642 | 0 | case MATCH_COMMIT: |
1643 | 0 | case MATCH_SKIP: |
1644 | 0 | case MATCH_SKIP_ARG: |
1645 | 0 | case MATCH_PRUNE: |
1646 | 0 | do ecode += GET(ecode,1); while (*ecode == OP_ALT); |
1647 | 0 | goto NEG_ASSERT_TRUE; /* Break out of alternation loop */ |
1648 | | |
1649 | | /* Anything else is an error */ |
1650 | | |
1651 | 0 | default: |
1652 | 0 | RRETURN(rrc); |
1653 | 0 | } |
1654 | | |
1655 | | /* Continue with next branch */ |
1656 | | |
1657 | 0 | ecode += GET(ecode,1); |
1658 | 0 | } |
1659 | 0 | while (*ecode == OP_ALT); |
1660 | | |
1661 | | /* All branches in the assertion failed to match. */ |
1662 | | |
1663 | 0 | NEG_ASSERT_TRUE: |
1664 | 0 | if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */ |
1665 | 0 | ecode += 1 + LINK_SIZE; /* Continue with current branch */ |
1666 | 0 | continue; |
1667 | | |
1668 | | /* Move the subject pointer back. This occurs only at the start of |
1669 | | each branch of a lookbehind assertion. If we are too close to the start to |
1670 | | move back, this match function fails. When working with UTF-8 we move |
1671 | | back a number of characters, not bytes. */ |
1672 | | |
1673 | 0 | case OP_REVERSE: |
1674 | | #ifdef SUPPORT_UTF |
1675 | | if (utf) |
1676 | | { |
1677 | | i = GET(ecode, 1); |
1678 | | while (i-- > 0) |
1679 | | { |
1680 | | eptr--; |
1681 | | if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH); |
1682 | | BACKCHAR(eptr); |
1683 | | } |
1684 | | } |
1685 | | else |
1686 | | #endif |
1687 | | |
1688 | | /* No UTF-8 support, or not in UTF-8 mode: count is byte count */ |
1689 | |
|
1690 | 0 | { |
1691 | 0 | eptr -= GET(ecode, 1); |
1692 | 0 | if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH); |
1693 | 0 | } |
1694 | | |
1695 | | /* Save the earliest consulted character, then skip to next op code */ |
1696 | | |
1697 | 0 | if (eptr < md->start_used_ptr) md->start_used_ptr = eptr; |
1698 | 0 | ecode += 1 + LINK_SIZE; |
1699 | 0 | break; |
1700 | | |
1701 | | /* The callout item calls an external function, if one is provided, passing |
1702 | | details of the match so far. This is mainly for debugging, though the |
1703 | | function is able to force a failure. */ |
1704 | | |
1705 | 0 | case OP_CALLOUT: |
1706 | 0 | if (PUBL(callout) != NULL) |
1707 | 0 | { |
1708 | 0 | PUBL(callout_block) cb; |
1709 | 0 | cb.version = 2; /* Version 2 of the callout block */ |
1710 | 0 | cb.callout_number = ecode[1]; |
1711 | 0 | cb.offset_vector = md->offset_vector; |
1712 | 0 | #if defined COMPILE_PCRE8 |
1713 | 0 | cb.subject = (PCRE_SPTR)md->start_subject; |
1714 | | #elif defined COMPILE_PCRE16 |
1715 | | cb.subject = (PCRE_SPTR16)md->start_subject; |
1716 | | #elif defined COMPILE_PCRE32 |
1717 | | cb.subject = (PCRE_SPTR32)md->start_subject; |
1718 | | #endif |
1719 | 0 | cb.subject_length = (int)(md->end_subject - md->start_subject); |
1720 | 0 | cb.start_match = (int)(mstart - md->start_subject); |
1721 | 0 | cb.current_position = (int)(eptr - md->start_subject); |
1722 | 0 | cb.pattern_position = GET(ecode, 2); |
1723 | 0 | cb.next_item_length = GET(ecode, 2 + LINK_SIZE); |
1724 | 0 | cb.capture_top = offset_top/2; |
1725 | 0 | cb.capture_last = md->capture_last & CAPLMASK; |
1726 | | /* Internal change requires this for API compatibility. */ |
1727 | 0 | if (cb.capture_last == 0) cb.capture_last = -1; |
1728 | 0 | cb.callout_data = md->callout_data; |
1729 | 0 | cb.mark = md->nomatch_mark; |
1730 | 0 | if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH); |
1731 | 0 | if (rrc < 0) RRETURN(rrc); |
1732 | 0 | } |
1733 | 0 | ecode += 2 + 2*LINK_SIZE; |
1734 | 0 | break; |
1735 | | |
1736 | | /* Recursion either matches the current regex, or some subexpression. The |
1737 | | offset data is the offset to the starting bracket from the start of the |
1738 | | whole pattern. (This is so that it works from duplicated subpatterns.) |
1739 | | |
1740 | | The state of the capturing groups is preserved over recursion, and |
1741 | | re-instated afterwards. We don't know how many are started and not yet |
1742 | | finished (offset_top records the completed total) so we just have to save |
1743 | | all the potential data. There may be up to 65535 such values, which is too |
1744 | | large to put on the stack, but using malloc for small numbers seems |
1745 | | expensive. As a compromise, the stack is used when there are no more than |
1746 | | REC_STACK_SAVE_MAX values to store; otherwise malloc is used. |
1747 | | |
1748 | | There are also other values that have to be saved. We use a chained |
1749 | | sequence of blocks that actually live on the stack. Thanks to Robin Houston |
1750 | | for the original version of this logic. It has, however, been hacked around |
1751 | | a lot, so he is not to blame for the current way it works. */ |
1752 | | |
1753 | 0 | case OP_RECURSE: |
1754 | 0 | { |
1755 | 0 | recursion_info *ri; |
1756 | 0 | unsigned int recno; |
1757 | |
|
1758 | 0 | callpat = md->start_code + GET(ecode, 1); |
1759 | 0 | recno = (callpat == md->start_code)? 0 : |
1760 | 0 | GET2(callpat, 1 + LINK_SIZE); |
1761 | | |
1762 | | /* Check for repeating a recursion without advancing the subject pointer. |
1763 | | This should catch convoluted mutual recursions. (Some simple cases are |
1764 | | caught at compile time.) */ |
1765 | |
|
1766 | 0 | for (ri = md->recursive; ri != NULL; ri = ri->prevrec) |
1767 | 0 | if (recno == ri->group_num && eptr == ri->subject_position) |
1768 | 0 | RRETURN(PCRE_ERROR_RECURSELOOP); |
1769 | | |
1770 | | /* Add to "recursing stack" */ |
1771 | |
|
1772 | 0 | new_recursive.group_num = recno; |
1773 | 0 | new_recursive.saved_capture_last = md->capture_last; |
1774 | 0 | new_recursive.subject_position = eptr; |
1775 | 0 | new_recursive.prevrec = md->recursive; |
1776 | 0 | md->recursive = &new_recursive; |
1777 | | |
1778 | | /* Where to continue from afterwards */ |
1779 | |
|
1780 | 0 | ecode += 1 + LINK_SIZE; |
1781 | | |
1782 | | /* Now save the offset data */ |
1783 | |
|
1784 | 0 | new_recursive.saved_max = md->offset_end; |
1785 | 0 | if (new_recursive.saved_max <= REC_STACK_SAVE_MAX) |
1786 | 0 | new_recursive.offset_save = stacksave; |
1787 | 0 | else |
1788 | 0 | { |
1789 | 0 | new_recursive.offset_save = |
1790 | 0 | (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int)); |
1791 | 0 | if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY); |
1792 | 0 | } |
1793 | 0 | memcpy(new_recursive.offset_save, md->offset_vector, |
1794 | 0 | new_recursive.saved_max * sizeof(int)); |
1795 | | |
1796 | | /* OK, now we can do the recursion. After processing each alternative, |
1797 | | restore the offset data and the last captured value. If there were nested |
1798 | | recursions, md->recursive might be changed, so reset it before looping. |
1799 | | */ |
1800 | |
|
1801 | 0 | DPRINTF(("Recursing into group %d\n", new_recursive.group_num)); |
1802 | 0 | cbegroup = (*callpat >= OP_SBRA); |
1803 | 0 | do |
1804 | 0 | { |
1805 | 0 | if (cbegroup) md->match_function_type = MATCH_CBEGROUP; |
1806 | 0 | RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top, |
1807 | 0 | md, eptrb, RM6); |
1808 | 0 | memcpy(md->offset_vector, new_recursive.offset_save, |
1809 | 0 | new_recursive.saved_max * sizeof(int)); |
1810 | 0 | md->capture_last = new_recursive.saved_capture_last; |
1811 | 0 | md->recursive = new_recursive.prevrec; |
1812 | 0 | if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) |
1813 | 0 | { |
1814 | 0 | DPRINTF(("Recursion matched\n")); |
1815 | 0 | if (new_recursive.offset_save != stacksave) |
1816 | 0 | (PUBL(free))(new_recursive.offset_save); |
1817 | | |
1818 | | /* Set where we got to in the subject, and reset the start in case |
1819 | | it was changed by \K. This *is* propagated back out of a recursion, |
1820 | | for Perl compatibility. */ |
1821 | |
|
1822 | 0 | eptr = md->end_match_ptr; |
1823 | 0 | mstart = md->start_match_ptr; |
1824 | 0 | goto RECURSION_MATCHED; /* Exit loop; end processing */ |
1825 | 0 | } |
1826 | | |
1827 | | /* PCRE does not allow THEN, SKIP, PRUNE or COMMIT to escape beyond a |
1828 | | recursion; they cause a NOMATCH for the entire recursion. These codes |
1829 | | are defined in a range that can be tested for. */ |
1830 | | |
1831 | 0 | if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX) |
1832 | 0 | { |
1833 | 0 | if (new_recursive.offset_save != stacksave) |
1834 | 0 | (PUBL(free))(new_recursive.offset_save); |
1835 | 0 | RRETURN(MATCH_NOMATCH); |
1836 | 0 | } |
1837 | | |
1838 | | /* Any return code other than NOMATCH is an error. */ |
1839 | | |
1840 | 0 | if (rrc != MATCH_NOMATCH) |
1841 | 0 | { |
1842 | 0 | DPRINTF(("Recursion gave error %d\n", rrc)); |
1843 | 0 | if (new_recursive.offset_save != stacksave) |
1844 | 0 | (PUBL(free))(new_recursive.offset_save); |
1845 | 0 | RRETURN(rrc); |
1846 | 0 | } |
1847 | | |
1848 | 0 | md->recursive = &new_recursive; |
1849 | 0 | callpat += GET(callpat, 1); |
1850 | 0 | } |
1851 | 0 | while (*callpat == OP_ALT); |
1852 | | |
1853 | 0 | DPRINTF(("Recursion didn't match\n")); |
1854 | 0 | md->recursive = new_recursive.prevrec; |
1855 | 0 | if (new_recursive.offset_save != stacksave) |
1856 | 0 | (PUBL(free))(new_recursive.offset_save); |
1857 | 0 | RRETURN(MATCH_NOMATCH); |
1858 | 0 | } |
1859 | | |
1860 | 0 | RECURSION_MATCHED: |
1861 | 0 | break; |
1862 | | |
1863 | | /* An alternation is the end of a branch; scan along to find the end of the |
1864 | | bracketed group and go to there. */ |
1865 | | |
1866 | 0 | case OP_ALT: |
1867 | 0 | do ecode += GET(ecode,1); while (*ecode == OP_ALT); |
1868 | 0 | break; |
1869 | | |
1870 | | /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group, |
1871 | | indicating that it may occur zero times. It may repeat infinitely, or not |
1872 | | at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets |
1873 | | with fixed upper repeat limits are compiled as a number of copies, with the |
1874 | | optional ones preceded by BRAZERO or BRAMINZERO. */ |
1875 | | |
1876 | 0 | case OP_BRAZERO: |
1877 | 0 | next = ecode + 1; |
1878 | 0 | RMATCH(eptr, next, offset_top, md, eptrb, RM10); |
1879 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1880 | 0 | do next += GET(next, 1); while (*next == OP_ALT); |
1881 | 0 | ecode = next + 1 + LINK_SIZE; |
1882 | 0 | break; |
1883 | | |
1884 | 0 | case OP_BRAMINZERO: |
1885 | 0 | next = ecode + 1; |
1886 | 0 | do next += GET(next, 1); while (*next == OP_ALT); |
1887 | 0 | RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11); |
1888 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1889 | 0 | ecode++; |
1890 | 0 | break; |
1891 | | |
1892 | 0 | case OP_SKIPZERO: |
1893 | 0 | next = ecode+1; |
1894 | 0 | do next += GET(next,1); while (*next == OP_ALT); |
1895 | 0 | ecode = next + 1 + LINK_SIZE; |
1896 | 0 | break; |
1897 | | |
1898 | | /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything |
1899 | | here; just jump to the group, with allow_zero set TRUE. */ |
1900 | | |
1901 | 0 | case OP_BRAPOSZERO: |
1902 | 0 | op = *(++ecode); |
1903 | 0 | allow_zero = TRUE; |
1904 | 0 | if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE; |
1905 | 0 | goto POSSESSIVE_NON_CAPTURE; |
1906 | | |
1907 | | /* End of a group, repeated or non-repeating. */ |
1908 | | |
1909 | 0 | case OP_KET: |
1910 | 0 | case OP_KETRMIN: |
1911 | 0 | case OP_KETRMAX: |
1912 | 0 | case OP_KETRPOS: |
1913 | 0 | prev = ecode - GET(ecode, 1); |
1914 | | |
1915 | | /* If this was a group that remembered the subject start, in order to break |
1916 | | infinite repeats of empty string matches, retrieve the subject start from |
1917 | | the chain. Otherwise, set it NULL. */ |
1918 | |
|
1919 | 0 | if (*prev >= OP_SBRA || *prev == OP_ONCE) |
1920 | 0 | { |
1921 | 0 | saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */ |
1922 | 0 | eptrb = eptrb->epb_prev; /* Backup to previous group */ |
1923 | 0 | } |
1924 | 0 | else saved_eptr = NULL; |
1925 | | |
1926 | | /* If we are at the end of an assertion group or a non-capturing atomic |
1927 | | group, stop matching and return MATCH_MATCH, but record the current high |
1928 | | water mark for use by positive assertions. We also need to record the match |
1929 | | start in case it was changed by \K. */ |
1930 | |
|
1931 | 0 | if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) || |
1932 | 0 | *prev == OP_ONCE_NC) |
1933 | 0 | { |
1934 | 0 | md->end_match_ptr = eptr; /* For ONCE_NC */ |
1935 | 0 | md->end_offset_top = offset_top; |
1936 | 0 | md->start_match_ptr = mstart; |
1937 | 0 | RRETURN(MATCH_MATCH); /* Sets md->mark */ |
1938 | 0 | } |
1939 | | |
1940 | | /* For capturing groups we have to check the group number back at the start |
1941 | | and if necessary complete handling an extraction by setting the offsets and |
1942 | | bumping the high water mark. Whole-pattern recursion is coded as a recurse |
1943 | | into group 0, so it won't be picked up here. Instead, we catch it when the |
1944 | | OP_END is reached. Other recursion is handled here. We just have to record |
1945 | | the current subject position and start match pointer and give a MATCH |
1946 | | return. */ |
1947 | | |
1948 | 0 | if (*prev == OP_CBRA || *prev == OP_SCBRA || |
1949 | 0 | *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS) |
1950 | 0 | { |
1951 | 0 | number = GET2(prev, 1+LINK_SIZE); |
1952 | 0 | offset = number << 1; |
1953 | |
|
1954 | | #ifdef PCRE_DEBUG |
1955 | | printf("end bracket %d", number); |
1956 | | printf("\n"); |
1957 | | #endif |
1958 | | |
1959 | | /* Handle a recursively called group. */ |
1960 | |
|
1961 | 0 | if (md->recursive != NULL && md->recursive->group_num == number) |
1962 | 0 | { |
1963 | 0 | md->end_match_ptr = eptr; |
1964 | 0 | md->start_match_ptr = mstart; |
1965 | 0 | RRETURN(MATCH_MATCH); |
1966 | 0 | } |
1967 | | |
1968 | | /* Deal with capturing */ |
1969 | | |
1970 | 0 | md->capture_last = (md->capture_last & OVFLMASK) | number; |
1971 | 0 | if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else |
1972 | 0 | { |
1973 | | /* If offset is greater than offset_top, it means that we are |
1974 | | "skipping" a capturing group, and that group's offsets must be marked |
1975 | | unset. In earlier versions of PCRE, all the offsets were unset at the |
1976 | | start of matching, but this doesn't work because atomic groups and |
1977 | | assertions can cause a value to be set that should later be unset. |
1978 | | Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as |
1979 | | part of the atomic group, but this is not on the final matching path, |
1980 | | so must be unset when 2 is set. (If there is no group 2, there is no |
1981 | | problem, because offset_top will then be 2, indicating no capture.) */ |
1982 | |
|
1983 | 0 | if (offset > offset_top) |
1984 | 0 | { |
1985 | 0 | register int *iptr = md->offset_vector + offset_top; |
1986 | 0 | register int *iend = md->offset_vector + offset; |
1987 | 0 | while (iptr < iend) *iptr++ = -1; |
1988 | 0 | } |
1989 | | |
1990 | | /* Now make the extraction */ |
1991 | |
|
1992 | 0 | md->offset_vector[offset] = |
1993 | 0 | md->offset_vector[md->offset_end - number]; |
1994 | 0 | md->offset_vector[offset+1] = (int)(eptr - md->start_subject); |
1995 | 0 | if (offset_top <= offset) offset_top = offset + 2; |
1996 | 0 | } |
1997 | 0 | } |
1998 | | |
1999 | | /* OP_KETRPOS is a possessive repeating ket. Remember the current position, |
2000 | | and return the MATCH_KETRPOS. This makes it possible to do the repeats one |
2001 | | at a time from the outer level, thus saving stack. This must precede the |
2002 | | empty string test - in this case that test is done at the outer level. */ |
2003 | | |
2004 | 0 | if (*ecode == OP_KETRPOS) |
2005 | 0 | { |
2006 | 0 | md->start_match_ptr = mstart; /* In case \K reset it */ |
2007 | 0 | md->end_match_ptr = eptr; |
2008 | 0 | md->end_offset_top = offset_top; |
2009 | 0 | RRETURN(MATCH_KETRPOS); |
2010 | 0 | } |
2011 | | |
2012 | | /* For an ordinary non-repeating ket, just continue at this level. This |
2013 | | also happens for a repeating ket if no characters were matched in the |
2014 | | group. This is the forcible breaking of infinite loops as implemented in |
2015 | | Perl 5.005. For a non-repeating atomic group that includes captures, |
2016 | | establish a backup point by processing the rest of the pattern at a lower |
2017 | | level. If this results in a NOMATCH return, pass MATCH_ONCE back to the |
2018 | | original OP_ONCE level, thereby bypassing intermediate backup points, but |
2019 | | resetting any captures that happened along the way. */ |
2020 | | |
2021 | 0 | if (*ecode == OP_KET || eptr == saved_eptr) |
2022 | 0 | { |
2023 | 0 | if (*prev == OP_ONCE) |
2024 | 0 | { |
2025 | 0 | RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12); |
2026 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
2027 | 0 | md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */ |
2028 | 0 | RRETURN(MATCH_ONCE); |
2029 | 0 | } |
2030 | 0 | ecode += 1 + LINK_SIZE; /* Carry on at this level */ |
2031 | 0 | break; |
2032 | 0 | } |
2033 | | |
2034 | | /* The normal repeating kets try the rest of the pattern or restart from |
2035 | | the preceding bracket, in the appropriate order. In the second case, we can |
2036 | | use tail recursion to avoid using another stack frame, unless we have |
2037 | | an atomic group or an unlimited repeat of a group that can match an empty |
2038 | | string. */ |
2039 | | |
2040 | 0 | if (*ecode == OP_KETRMIN) |
2041 | 0 | { |
2042 | 0 | RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7); |
2043 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
2044 | 0 | if (*prev == OP_ONCE) |
2045 | 0 | { |
2046 | 0 | RMATCH(eptr, prev, offset_top, md, eptrb, RM8); |
2047 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
2048 | 0 | md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */ |
2049 | 0 | RRETURN(MATCH_ONCE); |
2050 | 0 | } |
2051 | 0 | if (*prev >= OP_SBRA) /* Could match an empty string */ |
2052 | 0 | { |
2053 | 0 | RMATCH(eptr, prev, offset_top, md, eptrb, RM50); |
2054 | 0 | RRETURN(rrc); |
2055 | 0 | } |
2056 | 0 | ecode = prev; |
2057 | 0 | goto TAIL_RECURSE; |
2058 | 0 | } |
2059 | 0 | else /* OP_KETRMAX */ |
2060 | 0 | { |
2061 | 0 | RMATCH(eptr, prev, offset_top, md, eptrb, RM13); |
2062 | 0 | if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH; |
2063 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
2064 | 0 | if (*prev == OP_ONCE) |
2065 | 0 | { |
2066 | 0 | RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9); |
2067 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
2068 | 0 | md->once_target = prev; |
2069 | 0 | RRETURN(MATCH_ONCE); |
2070 | 0 | } |
2071 | 0 | ecode += 1 + LINK_SIZE; |
2072 | 0 | goto TAIL_RECURSE; |
2073 | 0 | } |
2074 | | /* Control never gets here */ |
2075 | | |
2076 | | /* Not multiline mode: start of subject assertion, unless notbol. */ |
2077 | | |
2078 | 0 | case OP_CIRC: |
2079 | 0 | if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH); |
2080 | | |
2081 | | /* Start of subject assertion */ |
2082 | |
|
2083 | 0 | case OP_SOD: |
2084 | 0 | if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH); |
2085 | 0 | ecode++; |
2086 | 0 | break; |
2087 | | |
2088 | | /* Multiline mode: start of subject unless notbol, or after any newline. */ |
2089 | | |
2090 | 0 | case OP_CIRCM: |
2091 | 0 | if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH); |
2092 | 0 | if (eptr != md->start_subject && |
2093 | 0 | (eptr == md->end_subject || !WAS_NEWLINE(eptr))) |
2094 | 0 | RRETURN(MATCH_NOMATCH); |
2095 | 0 | ecode++; |
2096 | 0 | break; |
2097 | | |
2098 | | /* Start of match assertion */ |
2099 | | |
2100 | 0 | case OP_SOM: |
2101 | 0 | if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH); |
2102 | 0 | ecode++; |
2103 | 0 | break; |
2104 | | |
2105 | | /* Reset the start of match point */ |
2106 | | |
2107 | 0 | case OP_SET_SOM: |
2108 | 0 | mstart = eptr; |
2109 | 0 | ecode++; |
2110 | 0 | break; |
2111 | | |
2112 | | /* Multiline mode: assert before any newline, or before end of subject |
2113 | | unless noteol is set. */ |
2114 | | |
2115 | 0 | case OP_DOLLM: |
2116 | 0 | if (eptr < md->end_subject) |
2117 | 0 | { |
2118 | 0 | if (!IS_NEWLINE(eptr)) |
2119 | 0 | { |
2120 | 0 | if (md->partial != 0 && |
2121 | 0 | eptr + 1 >= md->end_subject && |
2122 | 0 | NLBLOCK->nltype == NLTYPE_FIXED && |
2123 | 0 | NLBLOCK->nllen == 2 && |
2124 | 0 | UCHAR21TEST(eptr) == NLBLOCK->nl[0]) |
2125 | 0 | { |
2126 | 0 | md->hitend = TRUE; |
2127 | 0 | if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); |
2128 | 0 | } |
2129 | 0 | RRETURN(MATCH_NOMATCH); |
2130 | 0 | } |
2131 | 0 | } |
2132 | 0 | else |
2133 | 0 | { |
2134 | 0 | if (md->noteol) RRETURN(MATCH_NOMATCH); |
2135 | 0 | SCHECK_PARTIAL(); |
2136 | 0 | } |
2137 | 0 | ecode++; |
2138 | 0 | break; |
2139 | | |
2140 | | /* Not multiline mode: assert before a terminating newline or before end of |
2141 | | subject unless noteol is set. */ |
2142 | | |
2143 | 0 | case OP_DOLL: |
2144 | 0 | if (md->noteol) RRETURN(MATCH_NOMATCH); |
2145 | 0 | if (!md->endonly) goto ASSERT_NL_OR_EOS; |
2146 | | |
2147 | | /* ... else fall through for endonly */ |
2148 | | |
2149 | | /* End of subject assertion (\z) */ |
2150 | | |
2151 | 0 | case OP_EOD: |
2152 | 0 | if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH); |
2153 | 0 | SCHECK_PARTIAL(); |
2154 | 0 | ecode++; |
2155 | 0 | break; |
2156 | | |
2157 | | /* End of subject or ending \n assertion (\Z) */ |
2158 | | |
2159 | 0 | case OP_EODN: |
2160 | 0 | ASSERT_NL_OR_EOS: |
2161 | 0 | if (eptr < md->end_subject && |
2162 | 0 | (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen)) |
2163 | 0 | { |
2164 | 0 | if (md->partial != 0 && |
2165 | 0 | eptr + 1 >= md->end_subject && |
2166 | 0 | NLBLOCK->nltype == NLTYPE_FIXED && |
2167 | 0 | NLBLOCK->nllen == 2 && |
2168 | 0 | UCHAR21TEST(eptr) == NLBLOCK->nl[0]) |
2169 | 0 | { |
2170 | 0 | md->hitend = TRUE; |
2171 | 0 | if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); |
2172 | 0 | } |
2173 | 0 | RRETURN(MATCH_NOMATCH); |
2174 | 0 | } |
2175 | | |
2176 | | /* Either at end of string or \n before end. */ |
2177 | | |
2178 | 0 | SCHECK_PARTIAL(); |
2179 | 0 | ecode++; |
2180 | 0 | break; |
2181 | | |
2182 | | /* Word boundary assertions */ |
2183 | | |
2184 | 0 | case OP_NOT_WORD_BOUNDARY: |
2185 | 0 | case OP_WORD_BOUNDARY: |
2186 | 0 | { |
2187 | | |
2188 | | /* Find out if the previous and current characters are "word" characters. |
2189 | | It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to |
2190 | | be "non-word" characters. Remember the earliest consulted character for |
2191 | | partial matching. */ |
2192 | |
|
2193 | | #ifdef SUPPORT_UTF |
2194 | | if (utf) |
2195 | | { |
2196 | | /* Get status of previous character */ |
2197 | | |
2198 | | if (eptr == md->start_subject) prev_is_word = FALSE; else |
2199 | | { |
2200 | | PCRE_PUCHAR lastptr = eptr - 1; |
2201 | | BACKCHAR(lastptr); |
2202 | | if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr; |
2203 | | GETCHAR(c, lastptr); |
2204 | | #ifdef SUPPORT_UCP |
2205 | | if (md->use_ucp) |
2206 | | { |
2207 | | if (c == '_') prev_is_word = TRUE; else |
2208 | | { |
2209 | | int cat = UCD_CATEGORY(c); |
2210 | | prev_is_word = (cat == ucp_L || cat == ucp_N); |
2211 | | } |
2212 | | } |
2213 | | else |
2214 | | #endif |
2215 | | prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0; |
2216 | | } |
2217 | | |
2218 | | /* Get status of next character */ |
2219 | | |
2220 | | if (eptr >= md->end_subject) |
2221 | | { |
2222 | | SCHECK_PARTIAL(); |
2223 | | cur_is_word = FALSE; |
2224 | | } |
2225 | | else |
2226 | | { |
2227 | | GETCHAR(c, eptr); |
2228 | | #ifdef SUPPORT_UCP |
2229 | | if (md->use_ucp) |
2230 | | { |
2231 | | if (c == '_') cur_is_word = TRUE; else |
2232 | | { |
2233 | | int cat = UCD_CATEGORY(c); |
2234 | | cur_is_word = (cat == ucp_L || cat == ucp_N); |
2235 | | } |
2236 | | } |
2237 | | else |
2238 | | #endif |
2239 | | cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0; |
2240 | | } |
2241 | | } |
2242 | | else |
2243 | | #endif |
2244 | | |
2245 | | /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for |
2246 | | consistency with the behaviour of \w we do use it in this case. */ |
2247 | |
|
2248 | 0 | { |
2249 | | /* Get status of previous character */ |
2250 | |
|
2251 | 0 | if (eptr == md->start_subject) prev_is_word = FALSE; else |
2252 | 0 | { |
2253 | 0 | if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1; |
2254 | | #ifdef SUPPORT_UCP |
2255 | | if (md->use_ucp) |
2256 | | { |
2257 | | c = eptr[-1]; |
2258 | | if (c == '_') prev_is_word = TRUE; else |
2259 | | { |
2260 | | int cat = UCD_CATEGORY(c); |
2261 | | prev_is_word = (cat == ucp_L || cat == ucp_N); |
2262 | | } |
2263 | | } |
2264 | | else |
2265 | | #endif |
2266 | 0 | prev_is_word = MAX_255(eptr[-1]) |
2267 | 0 | && ((md->ctypes[eptr[-1]] & ctype_word) != 0); |
2268 | 0 | } |
2269 | | |
2270 | | /* Get status of next character */ |
2271 | |
|
2272 | 0 | if (eptr >= md->end_subject) |
2273 | 0 | { |
2274 | 0 | SCHECK_PARTIAL(); |
2275 | 0 | cur_is_word = FALSE; |
2276 | 0 | } |
2277 | 0 | else |
2278 | | #ifdef SUPPORT_UCP |
2279 | | if (md->use_ucp) |
2280 | | { |
2281 | | c = *eptr; |
2282 | | if (c == '_') cur_is_word = TRUE; else |
2283 | | { |
2284 | | int cat = UCD_CATEGORY(c); |
2285 | | cur_is_word = (cat == ucp_L || cat == ucp_N); |
2286 | | } |
2287 | | } |
2288 | | else |
2289 | | #endif |
2290 | 0 | cur_is_word = MAX_255(*eptr) |
2291 | 0 | && ((md->ctypes[*eptr] & ctype_word) != 0); |
2292 | 0 | } |
2293 | | |
2294 | | /* Now see if the situation is what we want */ |
2295 | | |
2296 | 0 | if ((*ecode++ == OP_WORD_BOUNDARY)? |
2297 | 0 | cur_is_word == prev_is_word : cur_is_word != prev_is_word) |
2298 | 0 | RRETURN(MATCH_NOMATCH); |
2299 | 0 | } |
2300 | 0 | break; |
2301 | | |
2302 | | /* Match any single character type except newline; have to take care with |
2303 | | CRLF newlines and partial matching. */ |
2304 | | |
2305 | 0 | case OP_ANY: |
2306 | 0 | if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); |
2307 | 0 | if (md->partial != 0 && |
2308 | 0 | eptr == md->end_subject - 1 && |
2309 | 0 | NLBLOCK->nltype == NLTYPE_FIXED && |
2310 | 0 | NLBLOCK->nllen == 2 && |
2311 | 0 | UCHAR21TEST(eptr) == NLBLOCK->nl[0]) |
2312 | 0 | { |
2313 | 0 | md->hitend = TRUE; |
2314 | 0 | if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); |
2315 | 0 | } |
2316 | | |
2317 | | /* Fall through */ |
2318 | | |
2319 | | /* Match any single character whatsoever. */ |
2320 | | |
2321 | 0 | case OP_ALLANY: |
2322 | 0 | if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */ |
2323 | 0 | { /* not be updated before SCHECK_PARTIAL. */ |
2324 | 0 | SCHECK_PARTIAL(); |
2325 | 0 | RRETURN(MATCH_NOMATCH); |
2326 | 0 | } |
2327 | 0 | eptr++; |
2328 | | #ifdef SUPPORT_UTF |
2329 | | if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); |
2330 | | #endif |
2331 | 0 | ecode++; |
2332 | 0 | break; |
2333 | | |
2334 | | /* Match a single byte, even in UTF-8 mode. This opcode really does match |
2335 | | any byte, even newline, independent of the setting of PCRE_DOTALL. */ |
2336 | | |
2337 | 0 | case OP_ANYBYTE: |
2338 | 0 | if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */ |
2339 | 0 | { /* not be updated before SCHECK_PARTIAL. */ |
2340 | 0 | SCHECK_PARTIAL(); |
2341 | 0 | RRETURN(MATCH_NOMATCH); |
2342 | 0 | } |
2343 | 0 | eptr++; |
2344 | 0 | ecode++; |
2345 | 0 | break; |
2346 | | |
2347 | 0 | case OP_NOT_DIGIT: |
2348 | 0 | if (eptr >= md->end_subject) |
2349 | 0 | { |
2350 | 0 | SCHECK_PARTIAL(); |
2351 | 0 | RRETURN(MATCH_NOMATCH); |
2352 | 0 | } |
2353 | 0 | GETCHARINCTEST(c, eptr); |
2354 | 0 | if ( |
2355 | | #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8) |
2356 | | c < 256 && |
2357 | | #endif |
2358 | 0 | (md->ctypes[c] & ctype_digit) != 0 |
2359 | 0 | ) |
2360 | 0 | RRETURN(MATCH_NOMATCH); |
2361 | 0 | ecode++; |
2362 | 0 | break; |
2363 | | |
2364 | 0 | case OP_DIGIT: |
2365 | 0 | if (eptr >= md->end_subject) |
2366 | 0 | { |
2367 | 0 | SCHECK_PARTIAL(); |
2368 | 0 | RRETURN(MATCH_NOMATCH); |
2369 | 0 | } |
2370 | 0 | GETCHARINCTEST(c, eptr); |
2371 | 0 | if ( |
2372 | | #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8) |
2373 | | c > 255 || |
2374 | | #endif |
2375 | 0 | (md->ctypes[c] & ctype_digit) == 0 |
2376 | 0 | ) |
2377 | 0 | RRETURN(MATCH_NOMATCH); |
2378 | 0 | ecode++; |
2379 | 0 | break; |
2380 | | |
2381 | 0 | case OP_NOT_WHITESPACE: |
2382 | 0 | if (eptr >= md->end_subject) |
2383 | 0 | { |
2384 | 0 | SCHECK_PARTIAL(); |
2385 | 0 | RRETURN(MATCH_NOMATCH); |
2386 | 0 | } |
2387 | 0 | GETCHARINCTEST(c, eptr); |
2388 | 0 | if ( |
2389 | | #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8) |
2390 | | c < 256 && |
2391 | | #endif |
2392 | 0 | (md->ctypes[c] & ctype_space) != 0 |
2393 | 0 | ) |
2394 | 0 | RRETURN(MATCH_NOMATCH); |
2395 | 0 | ecode++; |
2396 | 0 | break; |
2397 | | |
2398 | 0 | case OP_WHITESPACE: |
2399 | 0 | if (eptr >= md->end_subject) |
2400 | 0 | { |
2401 | 0 | SCHECK_PARTIAL(); |
2402 | 0 | RRETURN(MATCH_NOMATCH); |
2403 | 0 | } |
2404 | 0 | GETCHARINCTEST(c, eptr); |
2405 | 0 | if ( |
2406 | | #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8) |
2407 | | c > 255 || |
2408 | | #endif |
2409 | 0 | (md->ctypes[c] & ctype_space) == 0 |
2410 | 0 | ) |
2411 | 0 | RRETURN(MATCH_NOMATCH); |
2412 | 0 | ecode++; |
2413 | 0 | break; |
2414 | | |
2415 | 0 | case OP_NOT_WORDCHAR: |
2416 | 0 | if (eptr >= md->end_subject) |
2417 | 0 | { |
2418 | 0 | SCHECK_PARTIAL(); |
2419 | 0 | RRETURN(MATCH_NOMATCH); |
2420 | 0 | } |
2421 | 0 | GETCHARINCTEST(c, eptr); |
2422 | 0 | if ( |
2423 | | #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8) |
2424 | | c < 256 && |
2425 | | #endif |
2426 | 0 | (md->ctypes[c] & ctype_word) != 0 |
2427 | 0 | ) |
2428 | 0 | RRETURN(MATCH_NOMATCH); |
2429 | 0 | ecode++; |
2430 | 0 | break; |
2431 | | |
2432 | 0 | case OP_WORDCHAR: |
2433 | 0 | if (eptr >= md->end_subject) |
2434 | 0 | { |
2435 | 0 | SCHECK_PARTIAL(); |
2436 | 0 | RRETURN(MATCH_NOMATCH); |
2437 | 0 | } |
2438 | 0 | GETCHARINCTEST(c, eptr); |
2439 | 0 | if ( |
2440 | | #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8) |
2441 | | c > 255 || |
2442 | | #endif |
2443 | 0 | (md->ctypes[c] & ctype_word) == 0 |
2444 | 0 | ) |
2445 | 0 | RRETURN(MATCH_NOMATCH); |
2446 | 0 | ecode++; |
2447 | 0 | break; |
2448 | | |
2449 | 0 | case OP_ANYNL: |
2450 | 0 | if (eptr >= md->end_subject) |
2451 | 0 | { |
2452 | 0 | SCHECK_PARTIAL(); |
2453 | 0 | RRETURN(MATCH_NOMATCH); |
2454 | 0 | } |
2455 | 0 | GETCHARINCTEST(c, eptr); |
2456 | 0 | switch(c) |
2457 | 0 | { |
2458 | 0 | default: RRETURN(MATCH_NOMATCH); |
2459 | |
|
2460 | 0 | case CHAR_CR: |
2461 | 0 | if (eptr >= md->end_subject) |
2462 | 0 | { |
2463 | 0 | SCHECK_PARTIAL(); |
2464 | 0 | } |
2465 | 0 | else if (UCHAR21TEST(eptr) == CHAR_LF) eptr++; |
2466 | 0 | break; |
2467 | | |
2468 | 0 | case CHAR_LF: |
2469 | 0 | break; |
2470 | | |
2471 | 0 | case CHAR_VT: |
2472 | 0 | case CHAR_FF: |
2473 | 0 | case CHAR_NEL: |
2474 | 0 | #ifndef EBCDIC |
2475 | 0 | case 0x2028: |
2476 | 0 | case 0x2029: |
2477 | 0 | #endif /* Not EBCDIC */ |
2478 | 0 | if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); |
2479 | 0 | break; |
2480 | 0 | } |
2481 | 0 | ecode++; |
2482 | 0 | break; |
2483 | | |
2484 | 0 | case OP_NOT_HSPACE: |
2485 | 0 | if (eptr >= md->end_subject) |
2486 | 0 | { |
2487 | 0 | SCHECK_PARTIAL(); |
2488 | 0 | RRETURN(MATCH_NOMATCH); |
2489 | 0 | } |
2490 | 0 | GETCHARINCTEST(c, eptr); |
2491 | 0 | switch(c) |
2492 | 0 | { |
2493 | 0 | HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */ |
2494 | 0 | default: break; |
2495 | 0 | } |
2496 | 0 | ecode++; |
2497 | 0 | break; |
2498 | | |
2499 | 0 | case OP_HSPACE: |
2500 | 0 | if (eptr >= md->end_subject) |
2501 | 0 | { |
2502 | 0 | SCHECK_PARTIAL(); |
2503 | 0 | RRETURN(MATCH_NOMATCH); |
2504 | 0 | } |
2505 | 0 | GETCHARINCTEST(c, eptr); |
2506 | 0 | switch(c) |
2507 | 0 | { |
2508 | 0 | HSPACE_CASES: break; /* Byte and multibyte cases */ |
2509 | 0 | default: RRETURN(MATCH_NOMATCH); |
2510 | 0 | } |
2511 | 0 | ecode++; |
2512 | 0 | break; |
2513 | | |
2514 | 0 | case OP_NOT_VSPACE: |
2515 | 0 | if (eptr >= md->end_subject) |
2516 | 0 | { |
2517 | 0 | SCHECK_PARTIAL(); |
2518 | 0 | RRETURN(MATCH_NOMATCH); |
2519 | 0 | } |
2520 | 0 | GETCHARINCTEST(c, eptr); |
2521 | 0 | switch(c) |
2522 | 0 | { |
2523 | 0 | VSPACE_CASES: RRETURN(MATCH_NOMATCH); |
2524 | 0 | default: break; |
2525 | 0 | } |
2526 | 0 | ecode++; |
2527 | 0 | break; |
2528 | | |
2529 | 0 | case OP_VSPACE: |
2530 | 0 | if (eptr >= md->end_subject) |
2531 | 0 | { |
2532 | 0 | SCHECK_PARTIAL(); |
2533 | 0 | RRETURN(MATCH_NOMATCH); |
2534 | 0 | } |
2535 | 0 | GETCHARINCTEST(c, eptr); |
2536 | 0 | switch(c) |
2537 | 0 | { |
2538 | 0 | VSPACE_CASES: break; |
2539 | 0 | default: RRETURN(MATCH_NOMATCH); |
2540 | 0 | } |
2541 | 0 | ecode++; |
2542 | 0 | break; |
2543 | | |
2544 | | #ifdef SUPPORT_UCP |
2545 | | /* Check the next character by Unicode property. We will get here only |
2546 | | if the support is in the binary; otherwise a compile-time error occurs. */ |
2547 | | |
2548 | | case OP_PROP: |
2549 | | case OP_NOTPROP: |
2550 | | if (eptr >= md->end_subject) |
2551 | | { |
2552 | | SCHECK_PARTIAL(); |
2553 | | RRETURN(MATCH_NOMATCH); |
2554 | | } |
2555 | | GETCHARINCTEST(c, eptr); |
2556 | | { |
2557 | | const pcre_uint32 *cp; |
2558 | | const ucd_record *prop = GET_UCD(c); |
2559 | | |
2560 | | switch(ecode[1]) |
2561 | | { |
2562 | | case PT_ANY: |
2563 | | if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH); |
2564 | | break; |
2565 | | |
2566 | | case PT_LAMP: |
2567 | | if ((prop->chartype == ucp_Lu || |
2568 | | prop->chartype == ucp_Ll || |
2569 | | prop->chartype == ucp_Lt) == (op == OP_NOTPROP)) |
2570 | | RRETURN(MATCH_NOMATCH); |
2571 | | break; |
2572 | | |
2573 | | case PT_GC: |
2574 | | if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP)) |
2575 | | RRETURN(MATCH_NOMATCH); |
2576 | | break; |
2577 | | |
2578 | | case PT_PC: |
2579 | | if ((ecode[2] != prop->chartype) == (op == OP_PROP)) |
2580 | | RRETURN(MATCH_NOMATCH); |
2581 | | break; |
2582 | | |
2583 | | case PT_SC: |
2584 | | if ((ecode[2] != prop->script) == (op == OP_PROP)) |
2585 | | RRETURN(MATCH_NOMATCH); |
2586 | | break; |
2587 | | |
2588 | | /* These are specials */ |
2589 | | |
2590 | | case PT_ALNUM: |
2591 | | if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L || |
2592 | | PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP)) |
2593 | | RRETURN(MATCH_NOMATCH); |
2594 | | break; |
2595 | | |
2596 | | /* Perl space used to exclude VT, but from Perl 5.18 it is included, |
2597 | | which means that Perl space and POSIX space are now identical. PCRE |
2598 | | was changed at release 8.34. */ |
2599 | | |
2600 | | case PT_SPACE: /* Perl space */ |
2601 | | case PT_PXSPACE: /* POSIX space */ |
2602 | | switch(c) |
2603 | | { |
2604 | | HSPACE_CASES: |
2605 | | VSPACE_CASES: |
2606 | | if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH); |
2607 | | break; |
2608 | | |
2609 | | default: |
2610 | | if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == |
2611 | | (op == OP_NOTPROP)) RRETURN(MATCH_NOMATCH); |
2612 | | break; |
2613 | | } |
2614 | | break; |
2615 | | |
2616 | | case PT_WORD: |
2617 | | if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L || |
2618 | | PRIV(ucp_gentype)[prop->chartype] == ucp_N || |
2619 | | c == CHAR_UNDERSCORE) == (op == OP_NOTPROP)) |
2620 | | RRETURN(MATCH_NOMATCH); |
2621 | | break; |
2622 | | |
2623 | | case PT_CLIST: |
2624 | | cp = PRIV(ucd_caseless_sets) + ecode[2]; |
2625 | | for (;;) |
2626 | | { |
2627 | | if (c < *cp) |
2628 | | { if (op == OP_PROP) { RRETURN(MATCH_NOMATCH); } else break; } |
2629 | | if (c == *cp++) |
2630 | | { if (op == OP_PROP) break; else { RRETURN(MATCH_NOMATCH); } } |
2631 | | } |
2632 | | break; |
2633 | | |
2634 | | case PT_UCNC: |
2635 | | if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || |
2636 | | c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || |
2637 | | c >= 0xe000) == (op == OP_NOTPROP)) |
2638 | | RRETURN(MATCH_NOMATCH); |
2639 | | break; |
2640 | | |
2641 | | /* This should never occur */ |
2642 | | |
2643 | | default: |
2644 | | RRETURN(PCRE_ERROR_INTERNAL); |
2645 | | } |
2646 | | |
2647 | | ecode += 3; |
2648 | | } |
2649 | | break; |
2650 | | |
2651 | | /* Match an extended Unicode sequence. We will get here only if the support |
2652 | | is in the binary; otherwise a compile-time error occurs. */ |
2653 | | |
2654 | | case OP_EXTUNI: |
2655 | | if (eptr >= md->end_subject) |
2656 | | { |
2657 | | SCHECK_PARTIAL(); |
2658 | | RRETURN(MATCH_NOMATCH); |
2659 | | } |
2660 | | else |
2661 | | { |
2662 | | int lgb, rgb; |
2663 | | GETCHARINCTEST(c, eptr); |
2664 | | lgb = UCD_GRAPHBREAK(c); |
2665 | | while (eptr < md->end_subject) |
2666 | | { |
2667 | | int len = 1; |
2668 | | if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } |
2669 | | rgb = UCD_GRAPHBREAK(c); |
2670 | | if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; |
2671 | | lgb = rgb; |
2672 | | eptr += len; |
2673 | | } |
2674 | | } |
2675 | | CHECK_PARTIAL(); |
2676 | | ecode++; |
2677 | | break; |
2678 | | #endif /* SUPPORT_UCP */ |
2679 | | |
2680 | | |
2681 | | /* Match a back reference, possibly repeatedly. Look past the end of the |
2682 | | item to see if there is repeat information following. The code is similar |
2683 | | to that for character classes, but repeated for efficiency. Then obey |
2684 | | similar code to character type repeats - written out again for speed. |
2685 | | However, if the referenced string is the empty string, always treat |
2686 | | it as matched, any number of times (otherwise there could be infinite |
2687 | | loops). If the reference is unset, there are two possibilities: |
2688 | | |
2689 | | (a) In the default, Perl-compatible state, set the length negative; |
2690 | | this ensures that every attempt at a match fails. We can't just fail |
2691 | | here, because of the possibility of quantifiers with zero minima. |
2692 | | |
2693 | | (b) If the JavaScript compatibility flag is set, set the length to zero |
2694 | | so that the back reference matches an empty string. |
2695 | | |
2696 | | Otherwise, set the length to the length of what was matched by the |
2697 | | referenced subpattern. |
2698 | | |
2699 | | The OP_REF and OP_REFI opcodes are used for a reference to a numbered group |
2700 | | or to a non-duplicated named group. For a duplicated named group, OP_DNREF |
2701 | | and OP_DNREFI are used. In this case we must scan the list of groups to |
2702 | | which the name refers, and use the first one that is set. */ |
2703 | | |
2704 | 0 | case OP_DNREF: |
2705 | 0 | case OP_DNREFI: |
2706 | 0 | caseless = op == OP_DNREFI; |
2707 | 0 | { |
2708 | 0 | int count = GET2(ecode, 1+IMM2_SIZE); |
2709 | 0 | pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size; |
2710 | 0 | ecode += 1 + 2*IMM2_SIZE; |
2711 | | |
2712 | | /* Setting the default length first and initializing 'offset' avoids |
2713 | | compiler warnings in the REF_REPEAT code. */ |
2714 | |
|
2715 | 0 | length = (md->jscript_compat)? 0 : -1; |
2716 | 0 | offset = 0; |
2717 | |
|
2718 | 0 | while (count-- > 0) |
2719 | 0 | { |
2720 | 0 | offset = GET2(slot, 0) << 1; |
2721 | 0 | if (offset < offset_top && md->offset_vector[offset] >= 0) |
2722 | 0 | { |
2723 | 0 | length = md->offset_vector[offset+1] - md->offset_vector[offset]; |
2724 | 0 | break; |
2725 | 0 | } |
2726 | 0 | slot += md->name_entry_size; |
2727 | 0 | } |
2728 | 0 | } |
2729 | 0 | goto REF_REPEAT; |
2730 | | |
2731 | 0 | case OP_REF: |
2732 | 0 | case OP_REFI: |
2733 | 0 | caseless = op == OP_REFI; |
2734 | 0 | offset = GET2(ecode, 1) << 1; /* Doubled ref number */ |
2735 | 0 | ecode += 1 + IMM2_SIZE; |
2736 | 0 | if (offset >= offset_top || md->offset_vector[offset] < 0) |
2737 | 0 | length = (md->jscript_compat)? 0 : -1; |
2738 | 0 | else |
2739 | 0 | length = md->offset_vector[offset+1] - md->offset_vector[offset]; |
2740 | | |
2741 | | /* Set up for repetition, or handle the non-repeated case */ |
2742 | |
|
2743 | 0 | REF_REPEAT: |
2744 | 0 | switch (*ecode) |
2745 | 0 | { |
2746 | 0 | case OP_CRSTAR: |
2747 | 0 | case OP_CRMINSTAR: |
2748 | 0 | case OP_CRPLUS: |
2749 | 0 | case OP_CRMINPLUS: |
2750 | 0 | case OP_CRQUERY: |
2751 | 0 | case OP_CRMINQUERY: |
2752 | 0 | c = *ecode++ - OP_CRSTAR; |
2753 | 0 | minimize = (c & 1) != 0; |
2754 | 0 | min = rep_min[c]; /* Pick up values from tables; */ |
2755 | 0 | max = rep_max[c]; /* zero for max => infinity */ |
2756 | 0 | if (max == 0) max = INT_MAX; |
2757 | 0 | break; |
2758 | | |
2759 | 0 | case OP_CRRANGE: |
2760 | 0 | case OP_CRMINRANGE: |
2761 | 0 | minimize = (*ecode == OP_CRMINRANGE); |
2762 | 0 | min = GET2(ecode, 1); |
2763 | 0 | max = GET2(ecode, 1 + IMM2_SIZE); |
2764 | 0 | if (max == 0) max = INT_MAX; |
2765 | 0 | ecode += 1 + 2 * IMM2_SIZE; |
2766 | 0 | break; |
2767 | | |
2768 | 0 | default: /* No repeat follows */ |
2769 | 0 | if ((length = match_ref(offset, eptr, length, md, caseless)) < 0) |
2770 | 0 | { |
2771 | 0 | if (length == -2) eptr = md->end_subject; /* Partial match */ |
2772 | 0 | CHECK_PARTIAL(); |
2773 | 0 | RRETURN(MATCH_NOMATCH); |
2774 | 0 | } |
2775 | 0 | eptr += length; |
2776 | 0 | continue; /* With the main loop */ |
2777 | 0 | } |
2778 | | |
2779 | | /* Handle repeated back references. If the length of the reference is |
2780 | | zero, just continue with the main loop. If the length is negative, it |
2781 | | means the reference is unset in non-JavaScript-compatible mode. If the minimum is |
2782 | | zero, we can continue at the same level without recursion. For any other |
2783 | | minimum, carrying on will result in NOMATCH. */ |
2784 | | |
2785 | 0 | if (length == 0) continue; |
2786 | 0 | if (length < 0 && min == 0) continue; |
2787 | | |
2788 | | /* First, ensure the minimum number of matches are present. We get back |
2789 | | the length of the reference string explicitly rather than passing the |
2790 | | address of eptr, so that eptr can be a register variable. */ |
2791 | | |
2792 | 0 | for (i = 1; i <= min; i++) |
2793 | 0 | { |
2794 | 0 | int slength; |
2795 | 0 | if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0) |
2796 | 0 | { |
2797 | 0 | if (slength == -2) eptr = md->end_subject; /* Partial match */ |
2798 | 0 | CHECK_PARTIAL(); |
2799 | 0 | RRETURN(MATCH_NOMATCH); |
2800 | 0 | } |
2801 | 0 | eptr += slength; |
2802 | 0 | } |
2803 | | |
2804 | | /* If min = max, continue at the same level without recursion. |
2805 | | They are not both allowed to be zero. */ |
2806 | | |
2807 | 0 | if (min == max) continue; |
2808 | | |
2809 | | /* If minimizing, keep trying and advancing the pointer */ |
2810 | | |
2811 | 0 | if (minimize) |
2812 | 0 | { |
2813 | 0 | for (fi = min;; fi++) |
2814 | 0 | { |
2815 | 0 | int slength; |
2816 | 0 | RMATCH(eptr, ecode, offset_top, md, eptrb, RM14); |
2817 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
2818 | 0 | if (fi >= max) RRETURN(MATCH_NOMATCH); |
2819 | 0 | if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0) |
2820 | 0 | { |
2821 | 0 | if (slength == -2) eptr = md->end_subject; /* Partial match */ |
2822 | 0 | CHECK_PARTIAL(); |
2823 | 0 | RRETURN(MATCH_NOMATCH); |
2824 | 0 | } |
2825 | 0 | eptr += slength; |
2826 | 0 | } |
2827 | | /* Control never gets here */ |
2828 | 0 | } |
2829 | | |
2830 | | /* If maximizing, find the longest string and work backwards */ |
2831 | | |
2832 | 0 | else |
2833 | 0 | { |
2834 | 0 | pp = eptr; |
2835 | 0 | for (i = min; i < max; i++) |
2836 | 0 | { |
2837 | 0 | int slength; |
2838 | 0 | if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0) |
2839 | 0 | { |
2840 | | /* Can't use CHECK_PARTIAL because we don't want to update eptr in |
2841 | | the soft partial matching case. */ |
2842 | |
|
2843 | 0 | if (slength == -2 && md->partial != 0 && |
2844 | 0 | md->end_subject > md->start_used_ptr) |
2845 | 0 | { |
2846 | 0 | md->hitend = TRUE; |
2847 | 0 | if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); |
2848 | 0 | } |
2849 | 0 | break; |
2850 | 0 | } |
2851 | 0 | eptr += slength; |
2852 | 0 | } |
2853 | | |
2854 | 0 | while (eptr >= pp) |
2855 | 0 | { |
2856 | 0 | RMATCH(eptr, ecode, offset_top, md, eptrb, RM15); |
2857 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
2858 | 0 | eptr -= length; |
2859 | 0 | } |
2860 | 0 | RRETURN(MATCH_NOMATCH); |
2861 | 0 | } |
2862 | | /* Control never gets here */ |
2863 | | |
2864 | | /* Match a bit-mapped character class, possibly repeatedly. This op code is |
2865 | | used when all the characters in the class have values in the range 0-255, |
2866 | | and either the matching is caseful, or the characters are in the range |
2867 | | 0-127 when UTF-8 processing is enabled. The only difference between |
2868 | | OP_CLASS and OP_NCLASS occurs when a data character outside the range is |
2869 | | encountered. |
2870 | | |
2871 | | First, look past the end of the item to see if there is repeat information |
2872 | | following. Then obey similar code to character type repeats - written out |
2873 | | again for speed. */ |
2874 | | |
2875 | 0 | case OP_NCLASS: |
2876 | 0 | case OP_CLASS: |
2877 | 0 | { |
2878 | | /* The data variable is saved across frames, so the byte map needs to |
2879 | | be stored there. */ |
2880 | 0 | #define BYTE_MAP ((pcre_uint8 *)data) |
2881 | 0 | data = ecode + 1; /* Save for matching */ |
2882 | 0 | ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */ |
2883 | |
|
2884 | 0 | switch (*ecode) |
2885 | 0 | { |
2886 | 0 | case OP_CRSTAR: |
2887 | 0 | case OP_CRMINSTAR: |
2888 | 0 | case OP_CRPLUS: |
2889 | 0 | case OP_CRMINPLUS: |
2890 | 0 | case OP_CRQUERY: |
2891 | 0 | case OP_CRMINQUERY: |
2892 | 0 | case OP_CRPOSSTAR: |
2893 | 0 | case OP_CRPOSPLUS: |
2894 | 0 | case OP_CRPOSQUERY: |
2895 | 0 | c = *ecode++ - OP_CRSTAR; |
2896 | 0 | if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0; |
2897 | 0 | else possessive = TRUE; |
2898 | 0 | min = rep_min[c]; /* Pick up values from tables; */ |
2899 | 0 | max = rep_max[c]; /* zero for max => infinity */ |
2900 | 0 | if (max == 0) max = INT_MAX; |
2901 | 0 | break; |
2902 | | |
2903 | 0 | case OP_CRRANGE: |
2904 | 0 | case OP_CRMINRANGE: |
2905 | 0 | case OP_CRPOSRANGE: |
2906 | 0 | minimize = (*ecode == OP_CRMINRANGE); |
2907 | 0 | possessive = (*ecode == OP_CRPOSRANGE); |
2908 | 0 | min = GET2(ecode, 1); |
2909 | 0 | max = GET2(ecode, 1 + IMM2_SIZE); |
2910 | 0 | if (max == 0) max = INT_MAX; |
2911 | 0 | ecode += 1 + 2 * IMM2_SIZE; |
2912 | 0 | break; |
2913 | | |
2914 | 0 | default: /* No repeat follows */ |
2915 | 0 | min = max = 1; |
2916 | 0 | break; |
2917 | 0 | } |
2918 | | |
2919 | | /* First, ensure the minimum number of matches are present. */ |
2920 | | |
2921 | | #ifdef SUPPORT_UTF |
2922 | | if (utf) |
2923 | | { |
2924 | | for (i = 1; i <= min; i++) |
2925 | | { |
2926 | | if (eptr >= md->end_subject) |
2927 | | { |
2928 | | SCHECK_PARTIAL(); |
2929 | | RRETURN(MATCH_NOMATCH); |
2930 | | } |
2931 | | GETCHARINC(c, eptr); |
2932 | | if (c > 255) |
2933 | | { |
2934 | | if (op == OP_CLASS) RRETURN(MATCH_NOMATCH); |
2935 | | } |
2936 | | else |
2937 | | if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); |
2938 | | } |
2939 | | } |
2940 | | else |
2941 | | #endif |
2942 | | /* Not UTF mode */ |
2943 | 0 | { |
2944 | 0 | for (i = 1; i <= min; i++) |
2945 | 0 | { |
2946 | 0 | if (eptr >= md->end_subject) |
2947 | 0 | { |
2948 | 0 | SCHECK_PARTIAL(); |
2949 | 0 | RRETURN(MATCH_NOMATCH); |
2950 | 0 | } |
2951 | 0 | c = *eptr++; |
2952 | | #ifndef COMPILE_PCRE8 |
2953 | | if (c > 255) |
2954 | | { |
2955 | | if (op == OP_CLASS) RRETURN(MATCH_NOMATCH); |
2956 | | } |
2957 | | else |
2958 | | #endif |
2959 | 0 | if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); |
2960 | 0 | } |
2961 | 0 | } |
2962 | | |
2963 | | /* If max == min we can continue with the main loop without the |
2964 | | need to recurse. */ |
2965 | | |
2966 | 0 | if (min == max) continue; |
2967 | | |
2968 | | /* If minimizing, keep testing the rest of the expression and advancing |
2969 | | the pointer while it matches the class. */ |
2970 | | |
2971 | 0 | if (minimize) |
2972 | 0 | { |
2973 | | #ifdef SUPPORT_UTF |
2974 | | if (utf) |
2975 | | { |
2976 | | for (fi = min;; fi++) |
2977 | | { |
2978 | | RMATCH(eptr, ecode, offset_top, md, eptrb, RM16); |
2979 | | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
2980 | | if (fi >= max) RRETURN(MATCH_NOMATCH); |
2981 | | if (eptr >= md->end_subject) |
2982 | | { |
2983 | | SCHECK_PARTIAL(); |
2984 | | RRETURN(MATCH_NOMATCH); |
2985 | | } |
2986 | | GETCHARINC(c, eptr); |
2987 | | if (c > 255) |
2988 | | { |
2989 | | if (op == OP_CLASS) RRETURN(MATCH_NOMATCH); |
2990 | | } |
2991 | | else |
2992 | | if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); |
2993 | | } |
2994 | | } |
2995 | | else |
2996 | | #endif |
2997 | | /* Not UTF mode */ |
2998 | 0 | { |
2999 | 0 | for (fi = min;; fi++) |
3000 | 0 | { |
3001 | 0 | RMATCH(eptr, ecode, offset_top, md, eptrb, RM17); |
3002 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3003 | 0 | if (fi >= max) RRETURN(MATCH_NOMATCH); |
3004 | 0 | if (eptr >= md->end_subject) |
3005 | 0 | { |
3006 | 0 | SCHECK_PARTIAL(); |
3007 | 0 | RRETURN(MATCH_NOMATCH); |
3008 | 0 | } |
3009 | 0 | c = *eptr++; |
3010 | | #ifndef COMPILE_PCRE8 |
3011 | | if (c > 255) |
3012 | | { |
3013 | | if (op == OP_CLASS) RRETURN(MATCH_NOMATCH); |
3014 | | } |
3015 | | else |
3016 | | #endif |
3017 | 0 | if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); |
3018 | 0 | } |
3019 | 0 | } |
3020 | | /* Control never gets here */ |
3021 | 0 | } |
3022 | | |
3023 | | /* If maximizing, find the longest possible run, then work backwards. */ |
3024 | | |
3025 | 0 | else |
3026 | 0 | { |
3027 | 0 | pp = eptr; |
3028 | |
|
3029 | | #ifdef SUPPORT_UTF |
3030 | | if (utf) |
3031 | | { |
3032 | | for (i = min; i < max; i++) |
3033 | | { |
3034 | | int len = 1; |
3035 | | if (eptr >= md->end_subject) |
3036 | | { |
3037 | | SCHECK_PARTIAL(); |
3038 | | break; |
3039 | | } |
3040 | | GETCHARLEN(c, eptr, len); |
3041 | | if (c > 255) |
3042 | | { |
3043 | | if (op == OP_CLASS) break; |
3044 | | } |
3045 | | else |
3046 | | if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break; |
3047 | | eptr += len; |
3048 | | } |
3049 | | |
3050 | | if (possessive) continue; /* No backtracking */ |
3051 | | |
3052 | | for (;;) |
3053 | | { |
3054 | | RMATCH(eptr, ecode, offset_top, md, eptrb, RM18); |
3055 | | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3056 | | if (eptr-- <= pp) break; /* Stop if tried at original pos */ |
3057 | | BACKCHAR(eptr); |
3058 | | } |
3059 | | } |
3060 | | else |
3061 | | #endif |
3062 | | /* Not UTF mode */ |
3063 | 0 | { |
3064 | 0 | for (i = min; i < max; i++) |
3065 | 0 | { |
3066 | 0 | if (eptr >= md->end_subject) |
3067 | 0 | { |
3068 | 0 | SCHECK_PARTIAL(); |
3069 | 0 | break; |
3070 | 0 | } |
3071 | 0 | c = *eptr; |
3072 | | #ifndef COMPILE_PCRE8 |
3073 | | if (c > 255) |
3074 | | { |
3075 | | if (op == OP_CLASS) break; |
3076 | | } |
3077 | | else |
3078 | | #endif |
3079 | 0 | if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break; |
3080 | 0 | eptr++; |
3081 | 0 | } |
3082 | | |
3083 | 0 | if (possessive) continue; /* No backtracking */ |
3084 | | |
3085 | 0 | while (eptr >= pp) |
3086 | 0 | { |
3087 | 0 | RMATCH(eptr, ecode, offset_top, md, eptrb, RM19); |
3088 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3089 | 0 | eptr--; |
3090 | 0 | } |
3091 | 0 | } |
3092 | | |
3093 | 0 | RRETURN(MATCH_NOMATCH); |
3094 | 0 | } |
3095 | 0 | #undef BYTE_MAP |
3096 | 0 | } |
3097 | | /* Control never gets here */ |
3098 | | |
3099 | | |
3100 | | /* Match an extended character class. In the 8-bit library, this opcode is |
3101 | | encountered only when UTF-8 mode is supported. In the 16-bit and |
3102 | | 32-bit libraries, codepoints greater than 255 may be encountered even when |
3103 | | UTF is not supported. */ |
3104 | | |
3105 | | #if defined SUPPORT_UTF || !defined COMPILE_PCRE8 |
3106 | | case OP_XCLASS: |
3107 | | { |
3108 | | data = ecode + 1 + LINK_SIZE; /* Save for matching */ |
3109 | | ecode += GET(ecode, 1); /* Advance past the item */ |
3110 | | |
3111 | | switch (*ecode) |
3112 | | { |
3113 | | case OP_CRSTAR: |
3114 | | case OP_CRMINSTAR: |
3115 | | case OP_CRPLUS: |
3116 | | case OP_CRMINPLUS: |
3117 | | case OP_CRQUERY: |
3118 | | case OP_CRMINQUERY: |
3119 | | case OP_CRPOSSTAR: |
3120 | | case OP_CRPOSPLUS: |
3121 | | case OP_CRPOSQUERY: |
3122 | | c = *ecode++ - OP_CRSTAR; |
3123 | | if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0; |
3124 | | else possessive = TRUE; |
3125 | | min = rep_min[c]; /* Pick up values from tables; */ |
3126 | | max = rep_max[c]; /* zero for max => infinity */ |
3127 | | if (max == 0) max = INT_MAX; |
3128 | | break; |
3129 | | |
3130 | | case OP_CRRANGE: |
3131 | | case OP_CRMINRANGE: |
3132 | | case OP_CRPOSRANGE: |
3133 | | minimize = (*ecode == OP_CRMINRANGE); |
3134 | | possessive = (*ecode == OP_CRPOSRANGE); |
3135 | | min = GET2(ecode, 1); |
3136 | | max = GET2(ecode, 1 + IMM2_SIZE); |
3137 | | if (max == 0) max = INT_MAX; |
3138 | | ecode += 1 + 2 * IMM2_SIZE; |
3139 | | break; |
3140 | | |
3141 | | default: /* No repeat follows */ |
3142 | | min = max = 1; |
3143 | | break; |
3144 | | } |
3145 | | |
3146 | | /* First, ensure the minimum number of matches are present. */ |
3147 | | |
3148 | | for (i = 1; i <= min; i++) |
3149 | | { |
3150 | | if (eptr >= md->end_subject) |
3151 | | { |
3152 | | SCHECK_PARTIAL(); |
3153 | | RRETURN(MATCH_NOMATCH); |
3154 | | } |
3155 | | GETCHARINCTEST(c, eptr); |
3156 | | if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH); |
3157 | | } |
3158 | | |
3159 | | /* If max == min we can continue with the main loop without the |
3160 | | need to recurse. */ |
3161 | | |
3162 | | if (min == max) continue; |
3163 | | |
3164 | | /* If minimizing, keep testing the rest of the expression and advancing |
3165 | | the pointer while it matches the class. */ |
3166 | | |
3167 | | if (minimize) |
3168 | | { |
3169 | | for (fi = min;; fi++) |
3170 | | { |
3171 | | RMATCH(eptr, ecode, offset_top, md, eptrb, RM20); |
3172 | | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3173 | | if (fi >= max) RRETURN(MATCH_NOMATCH); |
3174 | | if (eptr >= md->end_subject) |
3175 | | { |
3176 | | SCHECK_PARTIAL(); |
3177 | | RRETURN(MATCH_NOMATCH); |
3178 | | } |
3179 | | GETCHARINCTEST(c, eptr); |
3180 | | if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH); |
3181 | | } |
3182 | | /* Control never gets here */ |
3183 | | } |
3184 | | |
3185 | | /* If maximizing, find the longest possible run, then work backwards. */ |
3186 | | |
3187 | | else |
3188 | | { |
3189 | | pp = eptr; |
3190 | | for (i = min; i < max; i++) |
3191 | | { |
3192 | | int len = 1; |
3193 | | if (eptr >= md->end_subject) |
3194 | | { |
3195 | | SCHECK_PARTIAL(); |
3196 | | break; |
3197 | | } |
3198 | | #ifdef SUPPORT_UTF |
3199 | | GETCHARLENTEST(c, eptr, len); |
3200 | | #else |
3201 | | c = *eptr; |
3202 | | #endif |
3203 | | if (!PRIV(xclass)(c, data, utf)) break; |
3204 | | eptr += len; |
3205 | | } |
3206 | | |
3207 | | if (possessive) continue; /* No backtracking */ |
3208 | | |
3209 | | for(;;) |
3210 | | { |
3211 | | RMATCH(eptr, ecode, offset_top, md, eptrb, RM21); |
3212 | | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3213 | | if (eptr-- <= pp) break; /* Stop if tried at original pos */ |
3214 | | #ifdef SUPPORT_UTF |
3215 | | if (utf) BACKCHAR(eptr); |
3216 | | #endif |
3217 | | } |
3218 | | RRETURN(MATCH_NOMATCH); |
3219 | | } |
3220 | | |
3221 | | /* Control never gets here */ |
3222 | | } |
3223 | | #endif /* End of XCLASS */ |
3224 | | |
3225 | | /* Match a single character, casefully */ |
3226 | | |
3227 | 0 | case OP_CHAR: |
3228 | | #ifdef SUPPORT_UTF |
3229 | | if (utf) |
3230 | | { |
3231 | | length = 1; |
3232 | | ecode++; |
3233 | | GETCHARLEN(fc, ecode, length); |
3234 | | if (length > md->end_subject - eptr) |
3235 | | { |
3236 | | CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */ |
3237 | | RRETURN(MATCH_NOMATCH); |
3238 | | } |
3239 | | while (length-- > 0) if (*ecode++ != UCHAR21INC(eptr)) RRETURN(MATCH_NOMATCH); |
3240 | | } |
3241 | | else |
3242 | | #endif |
3243 | | /* Not UTF mode */ |
3244 | 0 | { |
3245 | 0 | if (md->end_subject - eptr < 1) |
3246 | 0 | { |
3247 | 0 | SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */ |
3248 | 0 | RRETURN(MATCH_NOMATCH); |
3249 | 0 | } |
3250 | 0 | if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH); |
3251 | 0 | ecode += 2; |
3252 | 0 | } |
3253 | 0 | break; |
3254 | | |
3255 | | /* Match a single character, caselessly. If we are at the end of the |
3256 | | subject, give up immediately. */ |
3257 | | |
3258 | 0 | case OP_CHARI: |
3259 | 0 | if (eptr >= md->end_subject) |
3260 | 0 | { |
3261 | 0 | SCHECK_PARTIAL(); |
3262 | 0 | RRETURN(MATCH_NOMATCH); |
3263 | 0 | } |
3264 | | |
3265 | | #ifdef SUPPORT_UTF |
3266 | | if (utf) |
3267 | | { |
3268 | | length = 1; |
3269 | | ecode++; |
3270 | | GETCHARLEN(fc, ecode, length); |
3271 | | |
3272 | | /* If the pattern character's value is < 128, we have only one byte, and |
3273 | | we know that its other case must also be one byte long, so we can use the |
3274 | | fast lookup table. We know that there is at least one byte left in the |
3275 | | subject. */ |
3276 | | |
3277 | | if (fc < 128) |
3278 | | { |
3279 | | pcre_uint32 cc = UCHAR21(eptr); |
3280 | | if (md->lcc[fc] != TABLE_GET(cc, md->lcc, cc)) RRETURN(MATCH_NOMATCH); |
3281 | | ecode++; |
3282 | | eptr++; |
3283 | | } |
3284 | | |
3285 | | /* Otherwise we must pick up the subject character. Note that we cannot |
3286 | | use the value of "length" to check for sufficient bytes left, because the |
3287 | | other case of the character may have more or fewer bytes. */ |
3288 | | |
3289 | | else |
3290 | | { |
3291 | | pcre_uint32 dc; |
3292 | | GETCHARINC(dc, eptr); |
3293 | | ecode += length; |
3294 | | |
3295 | | /* If we have Unicode property support, we can use it to test the other |
3296 | | case of the character, if there is one. */ |
3297 | | |
3298 | | if (fc != dc) |
3299 | | { |
3300 | | #ifdef SUPPORT_UCP |
3301 | | if (dc != UCD_OTHERCASE(fc)) |
3302 | | #endif |
3303 | | RRETURN(MATCH_NOMATCH); |
3304 | | } |
3305 | | } |
3306 | | } |
3307 | | else |
3308 | | #endif /* SUPPORT_UTF */ |
3309 | | |
3310 | | /* Not UTF mode */ |
3311 | 0 | { |
3312 | 0 | if (TABLE_GET(ecode[1], md->lcc, ecode[1]) |
3313 | 0 | != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH); |
3314 | 0 | eptr++; |
3315 | 0 | ecode += 2; |
3316 | 0 | } |
3317 | 0 | break; |
3318 | | |
3319 | | /* Match a single character repeatedly. */ |
3320 | | |
3321 | 0 | case OP_EXACT: |
3322 | 0 | case OP_EXACTI: |
3323 | 0 | min = max = GET2(ecode, 1); |
3324 | 0 | ecode += 1 + IMM2_SIZE; |
3325 | 0 | goto REPEATCHAR; |
3326 | | |
3327 | 0 | case OP_POSUPTO: |
3328 | 0 | case OP_POSUPTOI: |
3329 | 0 | possessive = TRUE; |
3330 | | /* Fall through */ |
3331 | |
|
3332 | 0 | case OP_UPTO: |
3333 | 0 | case OP_UPTOI: |
3334 | 0 | case OP_MINUPTO: |
3335 | 0 | case OP_MINUPTOI: |
3336 | 0 | min = 0; |
3337 | 0 | max = GET2(ecode, 1); |
3338 | 0 | minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI; |
3339 | 0 | ecode += 1 + IMM2_SIZE; |
3340 | 0 | goto REPEATCHAR; |
3341 | | |
3342 | 0 | case OP_POSSTAR: |
3343 | 0 | case OP_POSSTARI: |
3344 | 0 | possessive = TRUE; |
3345 | 0 | min = 0; |
3346 | 0 | max = INT_MAX; |
3347 | 0 | ecode++; |
3348 | 0 | goto REPEATCHAR; |
3349 | | |
3350 | 0 | case OP_POSPLUS: |
3351 | 0 | case OP_POSPLUSI: |
3352 | 0 | possessive = TRUE; |
3353 | 0 | min = 1; |
3354 | 0 | max = INT_MAX; |
3355 | 0 | ecode++; |
3356 | 0 | goto REPEATCHAR; |
3357 | | |
3358 | 0 | case OP_POSQUERY: |
3359 | 0 | case OP_POSQUERYI: |
3360 | 0 | possessive = TRUE; |
3361 | 0 | min = 0; |
3362 | 0 | max = 1; |
3363 | 0 | ecode++; |
3364 | 0 | goto REPEATCHAR; |
3365 | | |
3366 | 0 | case OP_STAR: |
3367 | 0 | case OP_STARI: |
3368 | 0 | case OP_MINSTAR: |
3369 | 0 | case OP_MINSTARI: |
3370 | 0 | case OP_PLUS: |
3371 | 0 | case OP_PLUSI: |
3372 | 0 | case OP_MINPLUS: |
3373 | 0 | case OP_MINPLUSI: |
3374 | 0 | case OP_QUERY: |
3375 | 0 | case OP_QUERYI: |
3376 | 0 | case OP_MINQUERY: |
3377 | 0 | case OP_MINQUERYI: |
3378 | 0 | c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI); |
3379 | 0 | minimize = (c & 1) != 0; |
3380 | 0 | min = rep_min[c]; /* Pick up values from tables; */ |
3381 | 0 | max = rep_max[c]; /* zero for max => infinity */ |
3382 | 0 | if (max == 0) max = INT_MAX; |
3383 | | |
3384 | | /* Common code for all repeated single-character matches. We first check |
3385 | | for the minimum number of characters. If the minimum equals the maximum, we |
3386 | | are done. Otherwise, if minimizing, check the rest of the pattern for a |
3387 | | match; if there isn't one, advance up to the maximum, one character at a |
3388 | | time. |
3389 | | |
3390 | | If maximizing, advance up to the maximum number of matching characters, |
3391 | | until eptr is past the end of the maximum run. If possessive, we are |
3392 | | then done (no backing up). Otherwise, match at this position; anything |
3393 | | other than no match is immediately returned. For nomatch, back up one |
3394 | | character, unless we are matching \R and the last thing matched was |
3395 | | \r\n, in which case, back up two bytes. When we reach the first optional |
3396 | | character position, we can save stack by doing a tail recurse. |
3397 | | |
3398 | | The various UTF/non-UTF and caseful/caseless cases are handled separately, |
3399 | | for speed. */ |
3400 | |
|
3401 | 0 | REPEATCHAR: |
3402 | | #ifdef SUPPORT_UTF |
3403 | | if (utf) |
3404 | | { |
3405 | | length = 1; |
3406 | | charptr = ecode; |
3407 | | GETCHARLEN(fc, ecode, length); |
3408 | | ecode += length; |
3409 | | |
3410 | | /* Handle multibyte character matching specially here. There is |
3411 | | support for caseless matching if UCP support is present. */ |
3412 | | |
3413 | | if (length > 1) |
3414 | | { |
3415 | | #ifdef SUPPORT_UCP |
3416 | | pcre_uint32 othercase; |
3417 | | if (op >= OP_STARI && /* Caseless */ |
3418 | | (othercase = UCD_OTHERCASE(fc)) != fc) |
3419 | | oclength = PRIV(ord2utf)(othercase, occhars); |
3420 | | else oclength = 0; |
3421 | | #endif /* SUPPORT_UCP */ |
3422 | | |
3423 | | for (i = 1; i <= min; i++) |
3424 | | { |
3425 | | if (eptr <= md->end_subject - length && |
3426 | | memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length; |
3427 | | #ifdef SUPPORT_UCP |
3428 | | else if (oclength > 0 && |
3429 | | eptr <= md->end_subject - oclength && |
3430 | | memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength; |
3431 | | #endif /* SUPPORT_UCP */ |
3432 | | else |
3433 | | { |
3434 | | CHECK_PARTIAL(); |
3435 | | RRETURN(MATCH_NOMATCH); |
3436 | | } |
3437 | | } |
3438 | | |
3439 | | if (min == max) continue; |
3440 | | |
3441 | | if (minimize) |
3442 | | { |
3443 | | for (fi = min;; fi++) |
3444 | | { |
3445 | | RMATCH(eptr, ecode, offset_top, md, eptrb, RM22); |
3446 | | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3447 | | if (fi >= max) RRETURN(MATCH_NOMATCH); |
3448 | | if (eptr <= md->end_subject - length && |
3449 | | memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length; |
3450 | | #ifdef SUPPORT_UCP |
3451 | | else if (oclength > 0 && |
3452 | | eptr <= md->end_subject - oclength && |
3453 | | memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength; |
3454 | | #endif /* SUPPORT_UCP */ |
3455 | | else |
3456 | | { |
3457 | | CHECK_PARTIAL(); |
3458 | | RRETURN(MATCH_NOMATCH); |
3459 | | } |
3460 | | } |
3461 | | /* Control never gets here */ |
3462 | | } |
3463 | | |
3464 | | else /* Maximize */ |
3465 | | { |
3466 | | pp = eptr; |
3467 | | for (i = min; i < max; i++) |
3468 | | { |
3469 | | if (eptr <= md->end_subject - length && |
3470 | | memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length; |
3471 | | #ifdef SUPPORT_UCP |
3472 | | else if (oclength > 0 && |
3473 | | eptr <= md->end_subject - oclength && |
3474 | | memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength; |
3475 | | #endif /* SUPPORT_UCP */ |
3476 | | else |
3477 | | { |
3478 | | CHECK_PARTIAL(); |
3479 | | break; |
3480 | | } |
3481 | | } |
3482 | | |
3483 | | if (possessive) continue; /* No backtracking */ |
3484 | | for(;;) |
3485 | | { |
3486 | | if (eptr <= pp) goto TAIL_RECURSE; |
3487 | | RMATCH(eptr, ecode, offset_top, md, eptrb, RM23); |
3488 | | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3489 | | #ifdef SUPPORT_UCP |
3490 | | eptr--; |
3491 | | BACKCHAR(eptr); |
3492 | | #else /* without SUPPORT_UCP */ |
3493 | | eptr -= length; |
3494 | | #endif /* SUPPORT_UCP */ |
3495 | | } |
3496 | | } |
3497 | | /* Control never gets here */ |
3498 | | } |
3499 | | |
3500 | | /* If the length of a UTF-8 character is 1, we fall through here, and |
3501 | | obey the code as for non-UTF-8 characters below, though in this case the |
3502 | | value of fc will always be < 128. */ |
3503 | | } |
3504 | | else |
3505 | | #endif /* SUPPORT_UTF */ |
3506 | | /* When not in UTF-8 mode, load a single-byte character. */ |
3507 | 0 | fc = *ecode++; |
3508 | | |
3509 | | /* The value of fc at this point is always one character, though we may |
3510 | | or may not be in UTF mode. The code is duplicated for the caseless and |
3511 | | caseful cases, for speed, since matching characters is likely to be quite |
3512 | | common. First, ensure the minimum number of matches are present. If min = |
3513 | | max, continue at the same level without recursing. Otherwise, if |
3514 | | minimizing, keep trying the rest of the expression and advancing one |
3515 | | matching character if failing, up to the maximum. Alternatively, if |
3516 | | maximizing, find the maximum number of characters and work backwards. */ |
3517 | |
|
3518 | 0 | DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max, |
3519 | 0 | max, (char *)eptr)); |
3520 | |
|
3521 | 0 | if (op >= OP_STARI) /* Caseless */ |
3522 | 0 | { |
3523 | 0 | #ifdef COMPILE_PCRE8 |
3524 | | /* fc must be < 128 if UTF is enabled. */ |
3525 | 0 | foc = md->fcc[fc]; |
3526 | | #else |
3527 | | #ifdef SUPPORT_UTF |
3528 | | #ifdef SUPPORT_UCP |
3529 | | if (utf && fc > 127) |
3530 | | foc = UCD_OTHERCASE(fc); |
3531 | | #else |
3532 | | if (utf && fc > 127) |
3533 | | foc = fc; |
3534 | | #endif /* SUPPORT_UCP */ |
3535 | | else |
3536 | | #endif /* SUPPORT_UTF */ |
3537 | | foc = TABLE_GET(fc, md->fcc, fc); |
3538 | | #endif /* COMPILE_PCRE8 */ |
3539 | |
|
3540 | 0 | for (i = 1; i <= min; i++) |
3541 | 0 | { |
3542 | 0 | pcre_uint32 cc; /* Faster than pcre_uchar */ |
3543 | 0 | if (eptr >= md->end_subject) |
3544 | 0 | { |
3545 | 0 | SCHECK_PARTIAL(); |
3546 | 0 | RRETURN(MATCH_NOMATCH); |
3547 | 0 | } |
3548 | 0 | cc = UCHAR21TEST(eptr); |
3549 | 0 | if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH); |
3550 | 0 | eptr++; |
3551 | 0 | } |
3552 | 0 | if (min == max) continue; |
3553 | 0 | if (minimize) |
3554 | 0 | { |
3555 | 0 | for (fi = min;; fi++) |
3556 | 0 | { |
3557 | 0 | pcre_uint32 cc; /* Faster than pcre_uchar */ |
3558 | 0 | RMATCH(eptr, ecode, offset_top, md, eptrb, RM24); |
3559 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3560 | 0 | if (fi >= max) RRETURN(MATCH_NOMATCH); |
3561 | 0 | if (eptr >= md->end_subject) |
3562 | 0 | { |
3563 | 0 | SCHECK_PARTIAL(); |
3564 | 0 | RRETURN(MATCH_NOMATCH); |
3565 | 0 | } |
3566 | 0 | cc = UCHAR21TEST(eptr); |
3567 | 0 | if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH); |
3568 | 0 | eptr++; |
3569 | 0 | } |
3570 | | /* Control never gets here */ |
3571 | 0 | } |
3572 | 0 | else /* Maximize */ |
3573 | 0 | { |
3574 | 0 | pp = eptr; |
3575 | 0 | for (i = min; i < max; i++) |
3576 | 0 | { |
3577 | 0 | pcre_uint32 cc; /* Faster than pcre_uchar */ |
3578 | 0 | if (eptr >= md->end_subject) |
3579 | 0 | { |
3580 | 0 | SCHECK_PARTIAL(); |
3581 | 0 | break; |
3582 | 0 | } |
3583 | 0 | cc = UCHAR21TEST(eptr); |
3584 | 0 | if (fc != cc && foc != cc) break; |
3585 | 0 | eptr++; |
3586 | 0 | } |
3587 | 0 | if (possessive) continue; /* No backtracking */ |
3588 | 0 | for (;;) |
3589 | 0 | { |
3590 | 0 | if (eptr == pp) goto TAIL_RECURSE; |
3591 | 0 | RMATCH(eptr, ecode, offset_top, md, eptrb, RM25); |
3592 | 0 | eptr--; |
3593 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3594 | 0 | } |
3595 | | /* Control never gets here */ |
3596 | 0 | } |
3597 | 0 | } |
3598 | | |
3599 | | /* Caseful comparisons (includes all multi-byte characters) */ |
3600 | | |
3601 | 0 | else |
3602 | 0 | { |
3603 | 0 | for (i = 1; i <= min; i++) |
3604 | 0 | { |
3605 | 0 | if (eptr >= md->end_subject) |
3606 | 0 | { |
3607 | 0 | SCHECK_PARTIAL(); |
3608 | 0 | RRETURN(MATCH_NOMATCH); |
3609 | 0 | } |
3610 | 0 | if (fc != UCHAR21INCTEST(eptr)) RRETURN(MATCH_NOMATCH); |
3611 | 0 | } |
3612 | | |
3613 | 0 | if (min == max) continue; |
3614 | | |
3615 | 0 | if (minimize) |
3616 | 0 | { |
3617 | 0 | for (fi = min;; fi++) |
3618 | 0 | { |
3619 | 0 | RMATCH(eptr, ecode, offset_top, md, eptrb, RM26); |
3620 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3621 | 0 | if (fi >= max) RRETURN(MATCH_NOMATCH); |
3622 | 0 | if (eptr >= md->end_subject) |
3623 | 0 | { |
3624 | 0 | SCHECK_PARTIAL(); |
3625 | 0 | RRETURN(MATCH_NOMATCH); |
3626 | 0 | } |
3627 | 0 | if (fc != UCHAR21INCTEST(eptr)) RRETURN(MATCH_NOMATCH); |
3628 | 0 | } |
3629 | | /* Control never gets here */ |
3630 | 0 | } |
3631 | 0 | else /* Maximize */ |
3632 | 0 | { |
3633 | 0 | pp = eptr; |
3634 | 0 | for (i = min; i < max; i++) |
3635 | 0 | { |
3636 | 0 | if (eptr >= md->end_subject) |
3637 | 0 | { |
3638 | 0 | SCHECK_PARTIAL(); |
3639 | 0 | break; |
3640 | 0 | } |
3641 | 0 | if (fc != UCHAR21TEST(eptr)) break; |
3642 | 0 | eptr++; |
3643 | 0 | } |
3644 | 0 | if (possessive) continue; /* No backtracking */ |
3645 | 0 | for (;;) |
3646 | 0 | { |
3647 | 0 | if (eptr == pp) goto TAIL_RECURSE; |
3648 | 0 | RMATCH(eptr, ecode, offset_top, md, eptrb, RM27); |
3649 | 0 | eptr--; |
3650 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3651 | 0 | } |
3652 | | /* Control never gets here */ |
3653 | 0 | } |
3654 | 0 | } |
3655 | | /* Control never gets here */ |
3656 | | |
3657 | | /* Match a negated single character. In UTF mode both the pattern |
3658 | | character and the subject character being checked can be multibyte. */ |
3659 | | |
3660 | 0 | case OP_NOT: |
3661 | 0 | case OP_NOTI: |
3662 | 0 | if (eptr >= md->end_subject) |
3663 | 0 | { |
3664 | 0 | SCHECK_PARTIAL(); |
3665 | 0 | RRETURN(MATCH_NOMATCH); |
3666 | 0 | } |
3667 | | #ifdef SUPPORT_UTF |
3668 | | if (utf) |
3669 | | { |
3670 | | register pcre_uint32 ch, och; |
3671 | | |
3672 | | ecode++; |
3673 | | GETCHARINC(ch, ecode); |
3674 | | GETCHARINC(c, eptr); |
3675 | | |
3676 | | if (op == OP_NOT) |
3677 | | { |
3678 | | if (ch == c) RRETURN(MATCH_NOMATCH); |
3679 | | } |
3680 | | else |
3681 | | { |
3682 | | #ifdef SUPPORT_UCP |
3683 | | if (ch > 127) |
3684 | | och = UCD_OTHERCASE(ch); |
3685 | | #else |
3686 | | if (ch > 127) |
3687 | | och = ch; |
3688 | | #endif /* SUPPORT_UCP */ |
3689 | | else |
3690 | | och = TABLE_GET(ch, md->fcc, ch); |
3691 | | if (ch == c || och == c) RRETURN(MATCH_NOMATCH); |
3692 | | } |
3693 | | } |
3694 | | else |
3695 | | #endif |
3696 | 0 | { |
3697 | 0 | register pcre_uint32 ch = ecode[1]; |
3698 | 0 | c = *eptr++; |
3699 | 0 | if (ch == c || (op == OP_NOTI && TABLE_GET(ch, md->fcc, ch) == c)) |
3700 | 0 | RRETURN(MATCH_NOMATCH); |
3701 | 0 | ecode += 2; |
3702 | 0 | } |
3703 | 0 | break; |
3704 | | |
3705 | | /* Match a negated single one-byte character repeatedly. This is almost a |
3706 | | repeat of the code for a repeated single character, but I haven't found a |
3707 | | nice way of commoning these up that doesn't require a test of the |
3708 | | positive/negative option for each character match. Maybe that wouldn't add |
3709 | | very much to the time taken, but character matching *is* what this is all |
3710 | | about... */ |
3711 | | |
3712 | 0 | case OP_NOTEXACT: |
3713 | 0 | case OP_NOTEXACTI: |
3714 | 0 | min = max = GET2(ecode, 1); |
3715 | 0 | ecode += 1 + IMM2_SIZE; |
3716 | 0 | goto REPEATNOTCHAR; |
3717 | | |
3718 | 0 | case OP_NOTUPTO: |
3719 | 0 | case OP_NOTUPTOI: |
3720 | 0 | case OP_NOTMINUPTO: |
3721 | 0 | case OP_NOTMINUPTOI: |
3722 | 0 | min = 0; |
3723 | 0 | max = GET2(ecode, 1); |
3724 | 0 | minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI; |
3725 | 0 | ecode += 1 + IMM2_SIZE; |
3726 | 0 | goto REPEATNOTCHAR; |
3727 | | |
3728 | 0 | case OP_NOTPOSSTAR: |
3729 | 0 | case OP_NOTPOSSTARI: |
3730 | 0 | possessive = TRUE; |
3731 | 0 | min = 0; |
3732 | 0 | max = INT_MAX; |
3733 | 0 | ecode++; |
3734 | 0 | goto REPEATNOTCHAR; |
3735 | | |
3736 | 0 | case OP_NOTPOSPLUS: |
3737 | 0 | case OP_NOTPOSPLUSI: |
3738 | 0 | possessive = TRUE; |
3739 | 0 | min = 1; |
3740 | 0 | max = INT_MAX; |
3741 | 0 | ecode++; |
3742 | 0 | goto REPEATNOTCHAR; |
3743 | | |
3744 | 0 | case OP_NOTPOSQUERY: |
3745 | 0 | case OP_NOTPOSQUERYI: |
3746 | 0 | possessive = TRUE; |
3747 | 0 | min = 0; |
3748 | 0 | max = 1; |
3749 | 0 | ecode++; |
3750 | 0 | goto REPEATNOTCHAR; |
3751 | | |
3752 | 0 | case OP_NOTPOSUPTO: |
3753 | 0 | case OP_NOTPOSUPTOI: |
3754 | 0 | possessive = TRUE; |
3755 | 0 | min = 0; |
3756 | 0 | max = GET2(ecode, 1); |
3757 | 0 | ecode += 1 + IMM2_SIZE; |
3758 | 0 | goto REPEATNOTCHAR; |
3759 | | |
3760 | 0 | case OP_NOTSTAR: |
3761 | 0 | case OP_NOTSTARI: |
3762 | 0 | case OP_NOTMINSTAR: |
3763 | 0 | case OP_NOTMINSTARI: |
3764 | 0 | case OP_NOTPLUS: |
3765 | 0 | case OP_NOTPLUSI: |
3766 | 0 | case OP_NOTMINPLUS: |
3767 | 0 | case OP_NOTMINPLUSI: |
3768 | 0 | case OP_NOTQUERY: |
3769 | 0 | case OP_NOTQUERYI: |
3770 | 0 | case OP_NOTMINQUERY: |
3771 | 0 | case OP_NOTMINQUERYI: |
3772 | 0 | c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR); |
3773 | 0 | minimize = (c & 1) != 0; |
3774 | 0 | min = rep_min[c]; /* Pick up values from tables; */ |
3775 | 0 | max = rep_max[c]; /* zero for max => infinity */ |
3776 | 0 | if (max == 0) max = INT_MAX; |
3777 | | |
3778 | | /* Common code for all repeated negated single-character matches. */ |
3779 | |
|
3780 | 0 | REPEATNOTCHAR: |
3781 | 0 | GETCHARINCTEST(fc, ecode); |
3782 | | |
3783 | | /* The code is duplicated for the caseless and caseful cases, for speed, |
3784 | | since matching characters is likely to be quite common. First, ensure the |
3785 | | minimum number of matches are present. If min = max, continue at the same |
3786 | | level without recursing. Otherwise, if minimizing, keep trying the rest of |
3787 | | the expression and advancing one matching character if failing, up to the |
3788 | | maximum. Alternatively, if maximizing, find the maximum number of |
3789 | | characters and work backwards. */ |
3790 | |
|
3791 | 0 | DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max, |
3792 | 0 | max, (char *)eptr)); |
3793 | |
|
3794 | 0 | if (op >= OP_NOTSTARI) /* Caseless */ |
3795 | 0 | { |
3796 | | #ifdef SUPPORT_UTF |
3797 | | #ifdef SUPPORT_UCP |
3798 | | if (utf && fc > 127) |
3799 | | foc = UCD_OTHERCASE(fc); |
3800 | | #else |
3801 | | if (utf && fc > 127) |
3802 | | foc = fc; |
3803 | | #endif /* SUPPORT_UCP */ |
3804 | | else |
3805 | | #endif /* SUPPORT_UTF */ |
3806 | 0 | foc = TABLE_GET(fc, md->fcc, fc); |
3807 | |
|
3808 | | #ifdef SUPPORT_UTF |
3809 | | if (utf) |
3810 | | { |
3811 | | register pcre_uint32 d; |
3812 | | for (i = 1; i <= min; i++) |
3813 | | { |
3814 | | if (eptr >= md->end_subject) |
3815 | | { |
3816 | | SCHECK_PARTIAL(); |
3817 | | RRETURN(MATCH_NOMATCH); |
3818 | | } |
3819 | | GETCHARINC(d, eptr); |
3820 | | if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH); |
3821 | | } |
3822 | | } |
3823 | | else |
3824 | | #endif /* SUPPORT_UTF */ |
3825 | | /* Not UTF mode */ |
3826 | 0 | { |
3827 | 0 | for (i = 1; i <= min; i++) |
3828 | 0 | { |
3829 | 0 | if (eptr >= md->end_subject) |
3830 | 0 | { |
3831 | 0 | SCHECK_PARTIAL(); |
3832 | 0 | RRETURN(MATCH_NOMATCH); |
3833 | 0 | } |
3834 | 0 | if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH); |
3835 | 0 | eptr++; |
3836 | 0 | } |
3837 | 0 | } |
3838 | | |
3839 | 0 | if (min == max) continue; |
3840 | | |
3841 | 0 | if (minimize) |
3842 | 0 | { |
3843 | | #ifdef SUPPORT_UTF |
3844 | | if (utf) |
3845 | | { |
3846 | | register pcre_uint32 d; |
3847 | | for (fi = min;; fi++) |
3848 | | { |
3849 | | RMATCH(eptr, ecode, offset_top, md, eptrb, RM28); |
3850 | | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3851 | | if (fi >= max) RRETURN(MATCH_NOMATCH); |
3852 | | if (eptr >= md->end_subject) |
3853 | | { |
3854 | | SCHECK_PARTIAL(); |
3855 | | RRETURN(MATCH_NOMATCH); |
3856 | | } |
3857 | | GETCHARINC(d, eptr); |
3858 | | if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH); |
3859 | | } |
3860 | | } |
3861 | | else |
3862 | | #endif /*SUPPORT_UTF */ |
3863 | | /* Not UTF mode */ |
3864 | 0 | { |
3865 | 0 | for (fi = min;; fi++) |
3866 | 0 | { |
3867 | 0 | RMATCH(eptr, ecode, offset_top, md, eptrb, RM29); |
3868 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3869 | 0 | if (fi >= max) RRETURN(MATCH_NOMATCH); |
3870 | 0 | if (eptr >= md->end_subject) |
3871 | 0 | { |
3872 | 0 | SCHECK_PARTIAL(); |
3873 | 0 | RRETURN(MATCH_NOMATCH); |
3874 | 0 | } |
3875 | 0 | if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH); |
3876 | 0 | eptr++; |
3877 | 0 | } |
3878 | 0 | } |
3879 | | /* Control never gets here */ |
3880 | 0 | } |
3881 | | |
3882 | | /* Maximize case */ |
3883 | | |
3884 | 0 | else |
3885 | 0 | { |
3886 | 0 | pp = eptr; |
3887 | |
|
3888 | | #ifdef SUPPORT_UTF |
3889 | | if (utf) |
3890 | | { |
3891 | | register pcre_uint32 d; |
3892 | | for (i = min; i < max; i++) |
3893 | | { |
3894 | | int len = 1; |
3895 | | if (eptr >= md->end_subject) |
3896 | | { |
3897 | | SCHECK_PARTIAL(); |
3898 | | break; |
3899 | | } |
3900 | | GETCHARLEN(d, eptr, len); |
3901 | | if (fc == d || (unsigned int)foc == d) break; |
3902 | | eptr += len; |
3903 | | } |
3904 | | if (possessive) continue; /* No backtracking */ |
3905 | | for(;;) |
3906 | | { |
3907 | | if (eptr <= pp) goto TAIL_RECURSE; |
3908 | | RMATCH(eptr, ecode, offset_top, md, eptrb, RM30); |
3909 | | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3910 | | eptr--; |
3911 | | BACKCHAR(eptr); |
3912 | | } |
3913 | | } |
3914 | | else |
3915 | | #endif /* SUPPORT_UTF */ |
3916 | | /* Not UTF mode */ |
3917 | 0 | { |
3918 | 0 | for (i = min; i < max; i++) |
3919 | 0 | { |
3920 | 0 | if (eptr >= md->end_subject) |
3921 | 0 | { |
3922 | 0 | SCHECK_PARTIAL(); |
3923 | 0 | break; |
3924 | 0 | } |
3925 | 0 | if (fc == *eptr || foc == *eptr) break; |
3926 | 0 | eptr++; |
3927 | 0 | } |
3928 | 0 | if (possessive) continue; /* No backtracking */ |
3929 | 0 | for (;;) |
3930 | 0 | { |
3931 | 0 | if (eptr == pp) goto TAIL_RECURSE; |
3932 | 0 | RMATCH(eptr, ecode, offset_top, md, eptrb, RM31); |
3933 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3934 | 0 | eptr--; |
3935 | 0 | } |
3936 | 0 | } |
3937 | | /* Control never gets here */ |
3938 | 0 | } |
3939 | 0 | } |
3940 | | |
3941 | | /* Caseful comparisons */ |
3942 | | |
3943 | 0 | else |
3944 | 0 | { |
3945 | | #ifdef SUPPORT_UTF |
3946 | | if (utf) |
3947 | | { |
3948 | | register pcre_uint32 d; |
3949 | | for (i = 1; i <= min; i++) |
3950 | | { |
3951 | | if (eptr >= md->end_subject) |
3952 | | { |
3953 | | SCHECK_PARTIAL(); |
3954 | | RRETURN(MATCH_NOMATCH); |
3955 | | } |
3956 | | GETCHARINC(d, eptr); |
3957 | | if (fc == d) RRETURN(MATCH_NOMATCH); |
3958 | | } |
3959 | | } |
3960 | | else |
3961 | | #endif |
3962 | | /* Not UTF mode */ |
3963 | 0 | { |
3964 | 0 | for (i = 1; i <= min; i++) |
3965 | 0 | { |
3966 | 0 | if (eptr >= md->end_subject) |
3967 | 0 | { |
3968 | 0 | SCHECK_PARTIAL(); |
3969 | 0 | RRETURN(MATCH_NOMATCH); |
3970 | 0 | } |
3971 | 0 | if (fc == *eptr++) RRETURN(MATCH_NOMATCH); |
3972 | 0 | } |
3973 | 0 | } |
3974 | | |
3975 | 0 | if (min == max) continue; |
3976 | | |
3977 | 0 | if (minimize) |
3978 | 0 | { |
3979 | | #ifdef SUPPORT_UTF |
3980 | | if (utf) |
3981 | | { |
3982 | | register pcre_uint32 d; |
3983 | | for (fi = min;; fi++) |
3984 | | { |
3985 | | RMATCH(eptr, ecode, offset_top, md, eptrb, RM32); |
3986 | | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3987 | | if (fi >= max) RRETURN(MATCH_NOMATCH); |
3988 | | if (eptr >= md->end_subject) |
3989 | | { |
3990 | | SCHECK_PARTIAL(); |
3991 | | RRETURN(MATCH_NOMATCH); |
3992 | | } |
3993 | | GETCHARINC(d, eptr); |
3994 | | if (fc == d) RRETURN(MATCH_NOMATCH); |
3995 | | } |
3996 | | } |
3997 | | else |
3998 | | #endif |
3999 | | /* Not UTF mode */ |
4000 | 0 | { |
4001 | 0 | for (fi = min;; fi++) |
4002 | 0 | { |
4003 | 0 | RMATCH(eptr, ecode, offset_top, md, eptrb, RM33); |
4004 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
4005 | 0 | if (fi >= max) RRETURN(MATCH_NOMATCH); |
4006 | 0 | if (eptr >= md->end_subject) |
4007 | 0 | { |
4008 | 0 | SCHECK_PARTIAL(); |
4009 | 0 | RRETURN(MATCH_NOMATCH); |
4010 | 0 | } |
4011 | 0 | if (fc == *eptr++) RRETURN(MATCH_NOMATCH); |
4012 | 0 | } |
4013 | 0 | } |
4014 | | /* Control never gets here */ |
4015 | 0 | } |
4016 | | |
4017 | | /* Maximize case */ |
4018 | | |
4019 | 0 | else |
4020 | 0 | { |
4021 | 0 | pp = eptr; |
4022 | |
|
4023 | | #ifdef SUPPORT_UTF |
4024 | | if (utf) |
4025 | | { |
4026 | | register pcre_uint32 d; |
4027 | | for (i = min; i < max; i++) |
4028 | | { |
4029 | | int len = 1; |
4030 | | if (eptr >= md->end_subject) |
4031 | | { |
4032 | | SCHECK_PARTIAL(); |
4033 | | break; |
4034 | | } |
4035 | | GETCHARLEN(d, eptr, len); |
4036 | | if (fc == d) break; |
4037 | | eptr += len; |
4038 | | } |
4039 | | if (possessive) continue; /* No backtracking */ |
4040 | | for(;;) |
4041 | | { |
4042 | | if (eptr <= pp) goto TAIL_RECURSE; |
4043 | | RMATCH(eptr, ecode, offset_top, md, eptrb, RM34); |
4044 | | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
4045 | | eptr--; |
4046 | | BACKCHAR(eptr); |
4047 | | } |
4048 | | } |
4049 | | else |
4050 | | #endif |
4051 | | /* Not UTF mode */ |
4052 | 0 | { |
4053 | 0 | for (i = min; i < max; i++) |
4054 | 0 | { |
4055 | 0 | if (eptr >= md->end_subject) |
4056 | 0 | { |
4057 | 0 | SCHECK_PARTIAL(); |
4058 | 0 | break; |
4059 | 0 | } |
4060 | 0 | if (fc == *eptr) break; |
4061 | 0 | eptr++; |
4062 | 0 | } |
4063 | 0 | if (possessive) continue; /* No backtracking */ |
4064 | 0 | for (;;) |
4065 | 0 | { |
4066 | 0 | if (eptr == pp) goto TAIL_RECURSE; |
4067 | 0 | RMATCH(eptr, ecode, offset_top, md, eptrb, RM35); |
4068 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
4069 | 0 | eptr--; |
4070 | 0 | } |
4071 | 0 | } |
4072 | | /* Control never gets here */ |
4073 | 0 | } |
4074 | 0 | } |
4075 | | /* Control never gets here */ |
4076 | | |
4077 | | /* Match a single character type repeatedly; several different opcodes |
4078 | | share code. This is very similar to the code for single characters, but we |
4079 | | repeat it in the interests of efficiency. */ |
4080 | | |
4081 | 0 | case OP_TYPEEXACT: |
4082 | 0 | min = max = GET2(ecode, 1); |
4083 | 0 | minimize = TRUE; |
4084 | 0 | ecode += 1 + IMM2_SIZE; |
4085 | 0 | goto REPEATTYPE; |
4086 | | |
4087 | 0 | case OP_TYPEUPTO: |
4088 | 0 | case OP_TYPEMINUPTO: |
4089 | 0 | min = 0; |
4090 | 0 | max = GET2(ecode, 1); |
4091 | 0 | minimize = *ecode == OP_TYPEMINUPTO; |
4092 | 0 | ecode += 1 + IMM2_SIZE; |
4093 | 0 | goto REPEATTYPE; |
4094 | | |
4095 | 0 | case OP_TYPEPOSSTAR: |
4096 | 0 | possessive = TRUE; |
4097 | 0 | min = 0; |
4098 | 0 | max = INT_MAX; |
4099 | 0 | ecode++; |
4100 | 0 | goto REPEATTYPE; |
4101 | | |
4102 | 0 | case OP_TYPEPOSPLUS: |
4103 | 0 | possessive = TRUE; |
4104 | 0 | min = 1; |
4105 | 0 | max = INT_MAX; |
4106 | 0 | ecode++; |
4107 | 0 | goto REPEATTYPE; |
4108 | | |
4109 | 0 | case OP_TYPEPOSQUERY: |
4110 | 0 | possessive = TRUE; |
4111 | 0 | min = 0; |
4112 | 0 | max = 1; |
4113 | 0 | ecode++; |
4114 | 0 | goto REPEATTYPE; |
4115 | | |
4116 | 0 | case OP_TYPEPOSUPTO: |
4117 | 0 | possessive = TRUE; |
4118 | 0 | min = 0; |
4119 | 0 | max = GET2(ecode, 1); |
4120 | 0 | ecode += 1 + IMM2_SIZE; |
4121 | 0 | goto REPEATTYPE; |
4122 | | |
4123 | 0 | case OP_TYPESTAR: |
4124 | 0 | case OP_TYPEMINSTAR: |
4125 | 0 | case OP_TYPEPLUS: |
4126 | 0 | case OP_TYPEMINPLUS: |
4127 | 0 | case OP_TYPEQUERY: |
4128 | 0 | case OP_TYPEMINQUERY: |
4129 | 0 | c = *ecode++ - OP_TYPESTAR; |
4130 | 0 | minimize = (c & 1) != 0; |
4131 | 0 | min = rep_min[c]; /* Pick up values from tables; */ |
4132 | 0 | max = rep_max[c]; /* zero for max => infinity */ |
4133 | 0 | if (max == 0) max = INT_MAX; |
4134 | | |
4135 | | /* Common code for all repeated single character type matches. Note that |
4136 | | in UTF-8 mode, '.' matches a character of any length, but for the other |
4137 | | character types, the valid characters are all one-byte long. */ |
4138 | |
|
4139 | 0 | REPEATTYPE: |
4140 | 0 | ctype = *ecode++; /* Code for the character type */ |
4141 | |
|
4142 | | #ifdef SUPPORT_UCP |
4143 | | if (ctype == OP_PROP || ctype == OP_NOTPROP) |
4144 | | { |
4145 | | prop_fail_result = ctype == OP_NOTPROP; |
4146 | | prop_type = *ecode++; |
4147 | | prop_value = *ecode++; |
4148 | | } |
4149 | | else prop_type = -1; |
4150 | | #endif |
4151 | | |
4152 | | /* First, ensure the minimum number of matches are present. Use inline |
4153 | | code for maximizing the speed, and do the type test once at the start |
4154 | | (i.e. keep it out of the loop). Separate the UTF-8 code completely as that |
4155 | | is tidier. Also separate the UCP code, which can be the same for both UTF-8 |
4156 | | and single-bytes. */ |
4157 | |
|
4158 | 0 | if (min > 0) |
4159 | 0 | { |
4160 | | #ifdef SUPPORT_UCP |
4161 | | if (prop_type >= 0) |
4162 | | { |
4163 | | switch(prop_type) |
4164 | | { |
4165 | | case PT_ANY: |
4166 | | if (prop_fail_result) RRETURN(MATCH_NOMATCH); |
4167 | | for (i = 1; i <= min; i++) |
4168 | | { |
4169 | | if (eptr >= md->end_subject) |
4170 | | { |
4171 | | SCHECK_PARTIAL(); |
4172 | | RRETURN(MATCH_NOMATCH); |
4173 | | } |
4174 | | GETCHARINCTEST(c, eptr); |
4175 | | } |
4176 | | break; |
4177 | | |
4178 | | case PT_LAMP: |
4179 | | for (i = 1; i <= min; i++) |
4180 | | { |
4181 | | int chartype; |
4182 | | if (eptr >= md->end_subject) |
4183 | | { |
4184 | | SCHECK_PARTIAL(); |
4185 | | RRETURN(MATCH_NOMATCH); |
4186 | | } |
4187 | | GETCHARINCTEST(c, eptr); |
4188 | | chartype = UCD_CHARTYPE(c); |
4189 | | if ((chartype == ucp_Lu || |
4190 | | chartype == ucp_Ll || |
4191 | | chartype == ucp_Lt) == prop_fail_result) |
4192 | | RRETURN(MATCH_NOMATCH); |
4193 | | } |
4194 | | break; |
4195 | | |
4196 | | case PT_GC: |
4197 | | for (i = 1; i <= min; i++) |
4198 | | { |
4199 | | if (eptr >= md->end_subject) |
4200 | | { |
4201 | | SCHECK_PARTIAL(); |
4202 | | RRETURN(MATCH_NOMATCH); |
4203 | | } |
4204 | | GETCHARINCTEST(c, eptr); |
4205 | | if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) |
4206 | | RRETURN(MATCH_NOMATCH); |
4207 | | } |
4208 | | break; |
4209 | | |
4210 | | case PT_PC: |
4211 | | for (i = 1; i <= min; i++) |
4212 | | { |
4213 | | if (eptr >= md->end_subject) |
4214 | | { |
4215 | | SCHECK_PARTIAL(); |
4216 | | RRETURN(MATCH_NOMATCH); |
4217 | | } |
4218 | | GETCHARINCTEST(c, eptr); |
4219 | | if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) |
4220 | | RRETURN(MATCH_NOMATCH); |
4221 | | } |
4222 | | break; |
4223 | | |
4224 | | case PT_SC: |
4225 | | for (i = 1; i <= min; i++) |
4226 | | { |
4227 | | if (eptr >= md->end_subject) |
4228 | | { |
4229 | | SCHECK_PARTIAL(); |
4230 | | RRETURN(MATCH_NOMATCH); |
4231 | | } |
4232 | | GETCHARINCTEST(c, eptr); |
4233 | | if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) |
4234 | | RRETURN(MATCH_NOMATCH); |
4235 | | } |
4236 | | break; |
4237 | | |
4238 | | case PT_ALNUM: |
4239 | | for (i = 1; i <= min; i++) |
4240 | | { |
4241 | | int category; |
4242 | | if (eptr >= md->end_subject) |
4243 | | { |
4244 | | SCHECK_PARTIAL(); |
4245 | | RRETURN(MATCH_NOMATCH); |
4246 | | } |
4247 | | GETCHARINCTEST(c, eptr); |
4248 | | category = UCD_CATEGORY(c); |
4249 | | if ((category == ucp_L || category == ucp_N) == prop_fail_result) |
4250 | | RRETURN(MATCH_NOMATCH); |
4251 | | } |
4252 | | break; |
4253 | | |
4254 | | /* Perl space used to exclude VT, but from Perl 5.18 it is included, |
4255 | | which means that Perl space and POSIX space are now identical. PCRE |
4256 | | was changed at release 8.34. */ |
4257 | | |
4258 | | case PT_SPACE: /* Perl space */ |
4259 | | case PT_PXSPACE: /* POSIX space */ |
4260 | | for (i = 1; i <= min; i++) |
4261 | | { |
4262 | | if (eptr >= md->end_subject) |
4263 | | { |
4264 | | SCHECK_PARTIAL(); |
4265 | | RRETURN(MATCH_NOMATCH); |
4266 | | } |
4267 | | GETCHARINCTEST(c, eptr); |
4268 | | switch(c) |
4269 | | { |
4270 | | HSPACE_CASES: |
4271 | | VSPACE_CASES: |
4272 | | if (prop_fail_result) RRETURN(MATCH_NOMATCH); |
4273 | | break; |
4274 | | |
4275 | | default: |
4276 | | if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result) |
4277 | | RRETURN(MATCH_NOMATCH); |
4278 | | break; |
4279 | | } |
4280 | | } |
4281 | | break; |
4282 | | |
4283 | | case PT_WORD: |
4284 | | for (i = 1; i <= min; i++) |
4285 | | { |
4286 | | int category; |
4287 | | if (eptr >= md->end_subject) |
4288 | | { |
4289 | | SCHECK_PARTIAL(); |
4290 | | RRETURN(MATCH_NOMATCH); |
4291 | | } |
4292 | | GETCHARINCTEST(c, eptr); |
4293 | | category = UCD_CATEGORY(c); |
4294 | | if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE) |
4295 | | == prop_fail_result) |
4296 | | RRETURN(MATCH_NOMATCH); |
4297 | | } |
4298 | | break; |
4299 | | |
4300 | | case PT_CLIST: |
4301 | | for (i = 1; i <= min; i++) |
4302 | | { |
4303 | | const pcre_uint32 *cp; |
4304 | | if (eptr >= md->end_subject) |
4305 | | { |
4306 | | SCHECK_PARTIAL(); |
4307 | | RRETURN(MATCH_NOMATCH); |
4308 | | } |
4309 | | GETCHARINCTEST(c, eptr); |
4310 | | cp = PRIV(ucd_caseless_sets) + prop_value; |
4311 | | for (;;) |
4312 | | { |
4313 | | if (c < *cp) |
4314 | | { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } } |
4315 | | if (c == *cp++) |
4316 | | { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; } |
4317 | | } |
4318 | | } |
4319 | | break; |
4320 | | |
4321 | | case PT_UCNC: |
4322 | | for (i = 1; i <= min; i++) |
4323 | | { |
4324 | | if (eptr >= md->end_subject) |
4325 | | { |
4326 | | SCHECK_PARTIAL(); |
4327 | | RRETURN(MATCH_NOMATCH); |
4328 | | } |
4329 | | GETCHARINCTEST(c, eptr); |
4330 | | if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || |
4331 | | c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || |
4332 | | c >= 0xe000) == prop_fail_result) |
4333 | | RRETURN(MATCH_NOMATCH); |
4334 | | } |
4335 | | break; |
4336 | | |
4337 | | /* This should not occur */ |
4338 | | |
4339 | | default: |
4340 | | RRETURN(PCRE_ERROR_INTERNAL); |
4341 | | } |
4342 | | } |
4343 | | |
4344 | | /* Match extended Unicode sequences. We will get here only if the |
4345 | | support is in the binary; otherwise a compile-time error occurs. */ |
4346 | | |
4347 | | else if (ctype == OP_EXTUNI) |
4348 | | { |
4349 | | for (i = 1; i <= min; i++) |
4350 | | { |
4351 | | if (eptr >= md->end_subject) |
4352 | | { |
4353 | | SCHECK_PARTIAL(); |
4354 | | RRETURN(MATCH_NOMATCH); |
4355 | | } |
4356 | | else |
4357 | | { |
4358 | | int lgb, rgb; |
4359 | | GETCHARINCTEST(c, eptr); |
4360 | | lgb = UCD_GRAPHBREAK(c); |
4361 | | while (eptr < md->end_subject) |
4362 | | { |
4363 | | int len = 1; |
4364 | | if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } |
4365 | | rgb = UCD_GRAPHBREAK(c); |
4366 | | if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; |
4367 | | lgb = rgb; |
4368 | | eptr += len; |
4369 | | } |
4370 | | } |
4371 | | CHECK_PARTIAL(); |
4372 | | } |
4373 | | } |
4374 | | |
4375 | | else |
4376 | | #endif /* SUPPORT_UCP */ |
4377 | | |
4378 | | /* Handle all other cases when UTF mode is in effect (the utf flag is set) */ |
4379 | |
|
4380 | | #ifdef SUPPORT_UTF |
4381 | | if (utf) switch(ctype) |
4382 | | { |
4383 | | case OP_ANY: |
4384 | | for (i = 1; i <= min; i++) |
4385 | | { |
4386 | | if (eptr >= md->end_subject) |
4387 | | { |
4388 | | SCHECK_PARTIAL(); |
4389 | | RRETURN(MATCH_NOMATCH); |
4390 | | } |
4391 | | if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); |
4392 | | if (md->partial != 0 && |
4393 | | eptr + 1 >= md->end_subject && |
4394 | | NLBLOCK->nltype == NLTYPE_FIXED && |
4395 | | NLBLOCK->nllen == 2 && |
4396 | | UCHAR21(eptr) == NLBLOCK->nl[0]) |
4397 | | { |
4398 | | md->hitend = TRUE; |
4399 | | if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); |
4400 | | } |
4401 | | eptr++; |
4402 | | ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); |
4403 | | } |
4404 | | break; |
4405 | | |
4406 | | case OP_ALLANY: |
4407 | | for (i = 1; i <= min; i++) |
4408 | | { |
4409 | | if (eptr >= md->end_subject) |
4410 | | { |
4411 | | SCHECK_PARTIAL(); |
4412 | | RRETURN(MATCH_NOMATCH); |
4413 | | } |
4414 | | eptr++; |
4415 | | ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); |
4416 | | } |
4417 | | break; |
4418 | | |
4419 | | case OP_ANYBYTE: |
4420 | | if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH); |
4421 | | eptr += min; |
4422 | | break; |
4423 | | |
4424 | | case OP_ANYNL: |
4425 | | for (i = 1; i <= min; i++) |
4426 | | { |
4427 | | if (eptr >= md->end_subject) |
4428 | | { |
4429 | | SCHECK_PARTIAL(); |
4430 | | RRETURN(MATCH_NOMATCH); |
4431 | | } |
4432 | | GETCHARINC(c, eptr); |
4433 | | switch(c) |
4434 | | { |
4435 | | default: RRETURN(MATCH_NOMATCH); |
4436 | | |
4437 | | case CHAR_CR: |
4438 | | if (eptr < md->end_subject && UCHAR21(eptr) == CHAR_LF) eptr++; |
4439 | | break; |
4440 | | |
4441 | | case CHAR_LF: |
4442 | | break; |
4443 | | |
4444 | | case CHAR_VT: |
4445 | | case CHAR_FF: |
4446 | | case CHAR_NEL: |
4447 | | #ifndef EBCDIC |
4448 | | case 0x2028: |
4449 | | case 0x2029: |
4450 | | #endif /* Not EBCDIC */ |
4451 | | if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); |
4452 | | break; |
4453 | | } |
4454 | | } |
4455 | | break; |
4456 | | |
4457 | | case OP_NOT_HSPACE: |
4458 | | for (i = 1; i <= min; i++) |
4459 | | { |
4460 | | if (eptr >= md->end_subject) |
4461 | | { |
4462 | | SCHECK_PARTIAL(); |
4463 | | RRETURN(MATCH_NOMATCH); |
4464 | | } |
4465 | | GETCHARINC(c, eptr); |
4466 | | switch(c) |
4467 | | { |
4468 | | HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */ |
4469 | | default: break; |
4470 | | } |
4471 | | } |
4472 | | break; |
4473 | | |
4474 | | case OP_HSPACE: |
4475 | | for (i = 1; i <= min; i++) |
4476 | | { |
4477 | | if (eptr >= md->end_subject) |
4478 | | { |
4479 | | SCHECK_PARTIAL(); |
4480 | | RRETURN(MATCH_NOMATCH); |
4481 | | } |
4482 | | GETCHARINC(c, eptr); |
4483 | | switch(c) |
4484 | | { |
4485 | | HSPACE_CASES: break; /* Byte and multibyte cases */ |
4486 | | default: RRETURN(MATCH_NOMATCH); |
4487 | | } |
4488 | | } |
4489 | | break; |
4490 | | |
4491 | | case OP_NOT_VSPACE: |
4492 | | for (i = 1; i <= min; i++) |
4493 | | { |
4494 | | if (eptr >= md->end_subject) |
4495 | | { |
4496 | | SCHECK_PARTIAL(); |
4497 | | RRETURN(MATCH_NOMATCH); |
4498 | | } |
4499 | | GETCHARINC(c, eptr); |
4500 | | switch(c) |
4501 | | { |
4502 | | VSPACE_CASES: RRETURN(MATCH_NOMATCH); |
4503 | | default: break; |
4504 | | } |
4505 | | } |
4506 | | break; |
4507 | | |
4508 | | case OP_VSPACE: |
4509 | | for (i = 1; i <= min; i++) |
4510 | | { |
4511 | | if (eptr >= md->end_subject) |
4512 | | { |
4513 | | SCHECK_PARTIAL(); |
4514 | | RRETURN(MATCH_NOMATCH); |
4515 | | } |
4516 | | GETCHARINC(c, eptr); |
4517 | | switch(c) |
4518 | | { |
4519 | | VSPACE_CASES: break; |
4520 | | default: RRETURN(MATCH_NOMATCH); |
4521 | | } |
4522 | | } |
4523 | | break; |
4524 | | |
4525 | | case OP_NOT_DIGIT: |
4526 | | for (i = 1; i <= min; i++) |
4527 | | { |
4528 | | if (eptr >= md->end_subject) |
4529 | | { |
4530 | | SCHECK_PARTIAL(); |
4531 | | RRETURN(MATCH_NOMATCH); |
4532 | | } |
4533 | | GETCHARINC(c, eptr); |
4534 | | if (c < 128 && (md->ctypes[c] & ctype_digit) != 0) |
4535 | | RRETURN(MATCH_NOMATCH); |
4536 | | } |
4537 | | break; |
4538 | | |
4539 | | case OP_DIGIT: |
4540 | | for (i = 1; i <= min; i++) |
4541 | | { |
4542 | | pcre_uint32 cc; |
4543 | | if (eptr >= md->end_subject) |
4544 | | { |
4545 | | SCHECK_PARTIAL(); |
4546 | | RRETURN(MATCH_NOMATCH); |
4547 | | } |
4548 | | cc = UCHAR21(eptr); |
4549 | | if (cc >= 128 || (md->ctypes[cc] & ctype_digit) == 0) |
4550 | | RRETURN(MATCH_NOMATCH); |
4551 | | eptr++; |
4552 | | /* No need to skip more bytes - we know it's a 1-byte character */ |
4553 | | } |
4554 | | break; |
4555 | | |
4556 | | case OP_NOT_WHITESPACE: |
4557 | | for (i = 1; i <= min; i++) |
4558 | | { |
4559 | | pcre_uint32 cc; |
4560 | | if (eptr >= md->end_subject) |
4561 | | { |
4562 | | SCHECK_PARTIAL(); |
4563 | | RRETURN(MATCH_NOMATCH); |
4564 | | } |
4565 | | cc = UCHAR21(eptr); |
4566 | | if (cc < 128 && (md->ctypes[cc] & ctype_space) != 0) |
4567 | | RRETURN(MATCH_NOMATCH); |
4568 | | eptr++; |
4569 | | ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); |
4570 | | } |
4571 | | break; |
4572 | | |
4573 | | case OP_WHITESPACE: |
4574 | | for (i = 1; i <= min; i++) |
4575 | | { |
4576 | | pcre_uint32 cc; |
4577 | | if (eptr >= md->end_subject) |
4578 | | { |
4579 | | SCHECK_PARTIAL(); |
4580 | | RRETURN(MATCH_NOMATCH); |
4581 | | } |
4582 | | cc = UCHAR21(eptr); |
4583 | | if (cc >= 128 || (md->ctypes[cc] & ctype_space) == 0) |
4584 | | RRETURN(MATCH_NOMATCH); |
4585 | | eptr++; |
4586 | | /* No need to skip more bytes - we know it's a 1-byte character */ |
4587 | | } |
4588 | | break; |
4589 | | |
4590 | | case OP_NOT_WORDCHAR: |
4591 | | for (i = 1; i <= min; i++) |
4592 | | { |
4593 | | pcre_uint32 cc; |
4594 | | if (eptr >= md->end_subject) |
4595 | | { |
4596 | | SCHECK_PARTIAL(); |
4597 | | RRETURN(MATCH_NOMATCH); |
4598 | | } |
4599 | | cc = UCHAR21(eptr); |
4600 | | if (cc < 128 && (md->ctypes[cc] & ctype_word) != 0) |
4601 | | RRETURN(MATCH_NOMATCH); |
4602 | | eptr++; |
4603 | | ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); |
4604 | | } |
4605 | | break; |
4606 | | |
4607 | | case OP_WORDCHAR: |
4608 | | for (i = 1; i <= min; i++) |
4609 | | { |
4610 | | pcre_uint32 cc; |
4611 | | if (eptr >= md->end_subject) |
4612 | | { |
4613 | | SCHECK_PARTIAL(); |
4614 | | RRETURN(MATCH_NOMATCH); |
4615 | | } |
4616 | | cc = UCHAR21(eptr); |
4617 | | if (cc >= 128 || (md->ctypes[cc] & ctype_word) == 0) |
4618 | | RRETURN(MATCH_NOMATCH); |
4619 | | eptr++; |
4620 | | /* No need to skip more bytes - we know it's a 1-byte character */ |
4621 | | } |
4622 | | break; |
4623 | | |
4624 | | default: |
4625 | | RRETURN(PCRE_ERROR_INTERNAL); |
4626 | | } /* End switch(ctype) */ |
4627 | | |
4628 | | else |
4629 | | #endif /* SUPPORT_UTF */ |
4630 | | |
4631 | | /* Code for the non-UTF case: match the mandatory first min repetitions of |
4632 | | operators other than OP_PROP and OP_NOTPROP. */ |
4633 | |
|
4634 | 0 | switch(ctype) |
4635 | 0 | { |
4636 | 0 | case OP_ANY: |
4637 | 0 | for (i = 1; i <= min; i++) |
4638 | 0 | { |
4639 | 0 | if (eptr >= md->end_subject) |
4640 | 0 | { |
4641 | 0 | SCHECK_PARTIAL(); |
4642 | 0 | RRETURN(MATCH_NOMATCH); |
4643 | 0 | } |
4644 | 0 | if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); |
4645 | 0 | if (md->partial != 0 && |
4646 | 0 | eptr + 1 >= md->end_subject && |
4647 | 0 | NLBLOCK->nltype == NLTYPE_FIXED && |
4648 | 0 | NLBLOCK->nllen == 2 && |
4649 | 0 | *eptr == NLBLOCK->nl[0]) |
4650 | 0 | { |
4651 | 0 | md->hitend = TRUE; |
4652 | 0 | if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); |
4653 | 0 | } |
4654 | 0 | eptr++; |
4655 | 0 | } |
4656 | 0 | break; |
4657 | | |
4658 | 0 | case OP_ALLANY: |
4659 | 0 | if (eptr > md->end_subject - min) |
4660 | 0 | { |
4661 | 0 | SCHECK_PARTIAL(); |
4662 | 0 | RRETURN(MATCH_NOMATCH); |
4663 | 0 | } |
4664 | 0 | eptr += min; |
4665 | 0 | break; |
4666 | | |
4667 | 0 | case OP_ANYBYTE: |
4668 | 0 | if (eptr > md->end_subject - min) |
4669 | 0 | { |
4670 | 0 | SCHECK_PARTIAL(); |
4671 | 0 | RRETURN(MATCH_NOMATCH); |
4672 | 0 | } |
4673 | 0 | eptr += min; |
4674 | 0 | break; |
4675 | | |
4676 | 0 | case OP_ANYNL: |
4677 | 0 | for (i = 1; i <= min; i++) |
4678 | 0 | { |
4679 | 0 | if (eptr >= md->end_subject) |
4680 | 0 | { |
4681 | 0 | SCHECK_PARTIAL(); |
4682 | 0 | RRETURN(MATCH_NOMATCH); |
4683 | 0 | } |
4684 | 0 | switch(*eptr++) |
4685 | 0 | { |
4686 | 0 | default: RRETURN(MATCH_NOMATCH); |
4687 | |
|
4688 | 0 | case CHAR_CR: |
4689 | 0 | if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++; |
4690 | 0 | break; |
4691 | | |
4692 | 0 | case CHAR_LF: |
4693 | 0 | break; |
4694 | | |
4695 | 0 | case CHAR_VT: |
4696 | 0 | case CHAR_FF: |
4697 | 0 | case CHAR_NEL: |
4698 | | #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 |
4699 | | case 0x2028: |
4700 | | case 0x2029: |
4701 | | #endif |
4702 | 0 | if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); |
4703 | 0 | break; |
4704 | 0 | } |
4705 | 0 | } |
4706 | 0 | break; |
4707 | | |
4708 | 0 | case OP_NOT_HSPACE: |
4709 | 0 | for (i = 1; i <= min; i++) |
4710 | 0 | { |
4711 | 0 | if (eptr >= md->end_subject) |
4712 | 0 | { |
4713 | 0 | SCHECK_PARTIAL(); |
4714 | 0 | RRETURN(MATCH_NOMATCH); |
4715 | 0 | } |
4716 | 0 | switch(*eptr++) |
4717 | 0 | { |
4718 | 0 | default: break; |
4719 | 0 | HSPACE_BYTE_CASES: |
4720 | | #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 |
4721 | | HSPACE_MULTIBYTE_CASES: |
4722 | | #endif |
4723 | 0 | RRETURN(MATCH_NOMATCH); |
4724 | 0 | } |
4725 | 0 | } |
4726 | 0 | break; |
4727 | | |
4728 | 0 | case OP_HSPACE: |
4729 | 0 | for (i = 1; i <= min; i++) |
4730 | 0 | { |
4731 | 0 | if (eptr >= md->end_subject) |
4732 | 0 | { |
4733 | 0 | SCHECK_PARTIAL(); |
4734 | 0 | RRETURN(MATCH_NOMATCH); |
4735 | 0 | } |
4736 | 0 | switch(*eptr++) |
4737 | 0 | { |
4738 | 0 | default: RRETURN(MATCH_NOMATCH); |
4739 | 0 | HSPACE_BYTE_CASES: |
4740 | | #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 |
4741 | | HSPACE_MULTIBYTE_CASES: |
4742 | | #endif |
4743 | 0 | break; |
4744 | 0 | } |
4745 | 0 | } |
4746 | 0 | break; |
4747 | | |
4748 | 0 | case OP_NOT_VSPACE: |
4749 | 0 | for (i = 1; i <= min; i++) |
4750 | 0 | { |
4751 | 0 | if (eptr >= md->end_subject) |
4752 | 0 | { |
4753 | 0 | SCHECK_PARTIAL(); |
4754 | 0 | RRETURN(MATCH_NOMATCH); |
4755 | 0 | } |
4756 | 0 | switch(*eptr++) |
4757 | 0 | { |
4758 | 0 | VSPACE_BYTE_CASES: |
4759 | | #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 |
4760 | | VSPACE_MULTIBYTE_CASES: |
4761 | | #endif |
4762 | 0 | RRETURN(MATCH_NOMATCH); |
4763 | 0 | default: break; |
4764 | 0 | } |
4765 | 0 | } |
4766 | 0 | break; |
4767 | | |
4768 | 0 | case OP_VSPACE: |
4769 | 0 | for (i = 1; i <= min; i++) |
4770 | 0 | { |
4771 | 0 | if (eptr >= md->end_subject) |
4772 | 0 | { |
4773 | 0 | SCHECK_PARTIAL(); |
4774 | 0 | RRETURN(MATCH_NOMATCH); |
4775 | 0 | } |
4776 | 0 | switch(*eptr++) |
4777 | 0 | { |
4778 | 0 | default: RRETURN(MATCH_NOMATCH); |
4779 | 0 | VSPACE_BYTE_CASES: |
4780 | | #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 |
4781 | | VSPACE_MULTIBYTE_CASES: |
4782 | | #endif |
4783 | 0 | break; |
4784 | 0 | } |
4785 | 0 | } |
4786 | 0 | break; |
4787 | | |
4788 | 0 | case OP_NOT_DIGIT: |
4789 | 0 | for (i = 1; i <= min; i++) |
4790 | 0 | { |
4791 | 0 | if (eptr >= md->end_subject) |
4792 | 0 | { |
4793 | 0 | SCHECK_PARTIAL(); |
4794 | 0 | RRETURN(MATCH_NOMATCH); |
4795 | 0 | } |
4796 | 0 | if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) |
4797 | 0 | RRETURN(MATCH_NOMATCH); |
4798 | 0 | eptr++; |
4799 | 0 | } |
4800 | 0 | break; |
4801 | | |
4802 | 0 | case OP_DIGIT: |
4803 | 0 | for (i = 1; i <= min; i++) |
4804 | 0 | { |
4805 | 0 | if (eptr >= md->end_subject) |
4806 | 0 | { |
4807 | 0 | SCHECK_PARTIAL(); |
4808 | 0 | RRETURN(MATCH_NOMATCH); |
4809 | 0 | } |
4810 | 0 | if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) |
4811 | 0 | RRETURN(MATCH_NOMATCH); |
4812 | 0 | eptr++; |
4813 | 0 | } |
4814 | 0 | break; |
4815 | | |
4816 | 0 | case OP_NOT_WHITESPACE: |
4817 | 0 | for (i = 1; i <= min; i++) |
4818 | 0 | { |
4819 | 0 | if (eptr >= md->end_subject) |
4820 | 0 | { |
4821 | 0 | SCHECK_PARTIAL(); |
4822 | 0 | RRETURN(MATCH_NOMATCH); |
4823 | 0 | } |
4824 | 0 | if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) |
4825 | 0 | RRETURN(MATCH_NOMATCH); |
4826 | 0 | eptr++; |
4827 | 0 | } |
4828 | 0 | break; |
4829 | | |
4830 | 0 | case OP_WHITESPACE: |
4831 | 0 | for (i = 1; i <= min; i++) |
4832 | 0 | { |
4833 | 0 | if (eptr >= md->end_subject) |
4834 | 0 | { |
4835 | 0 | SCHECK_PARTIAL(); |
4836 | 0 | RRETURN(MATCH_NOMATCH); |
4837 | 0 | } |
4838 | 0 | if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) |
4839 | 0 | RRETURN(MATCH_NOMATCH); |
4840 | 0 | eptr++; |
4841 | 0 | } |
4842 | 0 | break; |
4843 | | |
4844 | 0 | case OP_NOT_WORDCHAR: |
4845 | 0 | for (i = 1; i <= min; i++) |
4846 | 0 | { |
4847 | 0 | if (eptr >= md->end_subject) |
4848 | 0 | { |
4849 | 0 | SCHECK_PARTIAL(); |
4850 | 0 | RRETURN(MATCH_NOMATCH); |
4851 | 0 | } |
4852 | 0 | if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) |
4853 | 0 | RRETURN(MATCH_NOMATCH); |
4854 | 0 | eptr++; |
4855 | 0 | } |
4856 | 0 | break; |
4857 | | |
4858 | 0 | case OP_WORDCHAR: |
4859 | 0 | for (i = 1; i <= min; i++) |
4860 | 0 | { |
4861 | 0 | if (eptr >= md->end_subject) |
4862 | 0 | { |
4863 | 0 | SCHECK_PARTIAL(); |
4864 | 0 | RRETURN(MATCH_NOMATCH); |
4865 | 0 | } |
4866 | 0 | if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) |
4867 | 0 | RRETURN(MATCH_NOMATCH); |
4868 | 0 | eptr++; |
4869 | 0 | } |
4870 | 0 | break; |
4871 | | |
4872 | 0 | default: |
4873 | 0 | RRETURN(PCRE_ERROR_INTERNAL); |
4874 | 0 | } |
4875 | 0 | } |
4876 | | |
4877 | | /* If min = max, continue at the same level without recursing */ |
4878 | | |
4879 | 0 | if (min == max) continue; |
4880 | | |
4881 | | /* If minimizing, we have to test the rest of the pattern before each |
4882 | | subsequent match. Again, separate the UTF-8 case for speed, and also |
4883 | | separate the UCP cases. */ |
4884 | | |
4885 | 0 | if (minimize) |
4886 | 0 | { |
4887 | | #ifdef SUPPORT_UCP |
4888 | | if (prop_type >= 0) |
4889 | | { |
4890 | | switch(prop_type) |
4891 | | { |
4892 | | case PT_ANY: |
4893 | | for (fi = min;; fi++) |
4894 | | { |
4895 | | RMATCH(eptr, ecode, offset_top, md, eptrb, RM36); |
4896 | | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
4897 | | if (fi >= max) RRETURN(MATCH_NOMATCH); |
4898 | | if (eptr >= md->end_subject) |
4899 | | { |
4900 | | SCHECK_PARTIAL(); |
4901 | | RRETURN(MATCH_NOMATCH); |
4902 | | } |
4903 | | GETCHARINCTEST(c, eptr); |
4904 | | if (prop_fail_result) RRETURN(MATCH_NOMATCH); |
4905 | | } |
4906 | | /* Control never gets here */ |
4907 | | |
4908 | | case PT_LAMP: |
4909 | | for (fi = min;; fi++) |
4910 | | { |
4911 | | int chartype; |
4912 | | RMATCH(eptr, ecode, offset_top, md, eptrb, RM37); |
4913 | | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
4914 | | if (fi >= max) RRETURN(MATCH_NOMATCH); |
4915 | | if (eptr >= md->end_subject) |
4916 | | { |
4917 | | SCHECK_PARTIAL(); |
4918 | | RRETURN(MATCH_NOMATCH); |
4919 | | } |
4920 | | GETCHARINCTEST(c, eptr); |
4921 | | chartype = UCD_CHARTYPE(c); |
4922 | | if ((chartype == ucp_Lu || |
4923 | | chartype == ucp_Ll || |
4924 | | chartype == ucp_Lt) == prop_fail_result) |
4925 | | RRETURN(MATCH_NOMATCH); |
4926 | | } |
4927 | | /* Control never gets here */ |
4928 | | |
4929 | | case PT_GC: |
4930 | | for (fi = min;; fi++) |
4931 | | { |
4932 | | RMATCH(eptr, ecode, offset_top, md, eptrb, RM38); |
4933 | | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
4934 | | if (fi >= max) RRETURN(MATCH_NOMATCH); |
4935 | | if (eptr >= md->end_subject) |
4936 | | { |
4937 | | SCHECK_PARTIAL(); |
4938 | | RRETURN(MATCH_NOMATCH); |
4939 | | } |
4940 | | GETCHARINCTEST(c, eptr); |
4941 | | if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) |
4942 | | RRETURN(MATCH_NOMATCH); |
4943 | | } |
4944 | | /* Control never gets here */ |
4945 | | |
4946 | | case PT_PC: |
4947 | | for (fi = min;; fi++) |
4948 | | { |
4949 | | RMATCH(eptr, ecode, offset_top, md, eptrb, RM39); |
4950 | | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
4951 | | if (fi >= max) RRETURN(MATCH_NOMATCH); |
4952 | | if (eptr >= md->end_subject) |
4953 | | { |
4954 | | SCHECK_PARTIAL(); |
4955 | | RRETURN(MATCH_NOMATCH); |
4956 | | } |
4957 | | GETCHARINCTEST(c, eptr); |
4958 | | if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) |
4959 | | RRETURN(MATCH_NOMATCH); |
4960 | | } |
4961 | | /* Control never gets here */ |
4962 | | |
4963 | | case PT_SC: |
4964 | | for (fi = min;; fi++) |
4965 | | { |
4966 | | RMATCH(eptr, ecode, offset_top, md, eptrb, RM40); |
4967 | | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
4968 | | if (fi >= max) RRETURN(MATCH_NOMATCH); |
4969 | | if (eptr >= md->end_subject) |
4970 | | { |
4971 | | SCHECK_PARTIAL(); |
4972 | | RRETURN(MATCH_NOMATCH); |
4973 | | } |
4974 | | GETCHARINCTEST(c, eptr); |
4975 | | if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) |
4976 | | RRETURN(MATCH_NOMATCH); |
4977 | | } |
4978 | | /* Control never gets here */ |
4979 | | |
4980 | | case PT_ALNUM: |
4981 | | for (fi = min;; fi++) |
4982 | | { |
4983 | | int category; |
4984 | | RMATCH(eptr, ecode, offset_top, md, eptrb, RM59); |
4985 | | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
4986 | | if (fi >= max) RRETURN(MATCH_NOMATCH); |
4987 | | if (eptr >= md->end_subject) |
4988 | | { |
4989 | | SCHECK_PARTIAL(); |
4990 | | RRETURN(MATCH_NOMATCH); |
4991 | | } |
4992 | | GETCHARINCTEST(c, eptr); |
4993 | | category = UCD_CATEGORY(c); |
4994 | | if ((category == ucp_L || category == ucp_N) == prop_fail_result) |
4995 | | RRETURN(MATCH_NOMATCH); |
4996 | | } |
4997 | | /* Control never gets here */ |
4998 | | |
4999 | | /* Perl space used to exclude VT, but from Perl 5.18 it is included, |
5000 | | which means that Perl space and POSIX space are now identical. PCRE |
5001 | | was changed at release 8.34. */ |
5002 | | |
5003 | | case PT_SPACE: /* Perl space */ |
5004 | | case PT_PXSPACE: /* POSIX space */ |
5005 | | for (fi = min;; fi++) |
5006 | | { |
5007 | | RMATCH(eptr, ecode, offset_top, md, eptrb, RM61); |
5008 | | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
5009 | | if (fi >= max) RRETURN(MATCH_NOMATCH); |
5010 | | if (eptr >= md->end_subject) |
5011 | | { |
5012 | | SCHECK_PARTIAL(); |
5013 | | RRETURN(MATCH_NOMATCH); |
5014 | | } |
5015 | | GETCHARINCTEST(c, eptr); |
5016 | | switch(c) |
5017 | | { |
5018 | | HSPACE_CASES: |
5019 | | VSPACE_CASES: |
5020 | | if (prop_fail_result) RRETURN(MATCH_NOMATCH); |
5021 | | break; |
5022 | | |
5023 | | default: |
5024 | | if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result) |
5025 | | RRETURN(MATCH_NOMATCH); |
5026 | | break; |
5027 | | } |
5028 | | } |
5029 | | /* Control never gets here */ |
5030 | | |
5031 | | case PT_WORD: |
5032 | | for (fi = min;; fi++) |
5033 | | { |
5034 | | int category; |
5035 | | RMATCH(eptr, ecode, offset_top, md, eptrb, RM62); |
5036 | | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
5037 | | if (fi >= max) RRETURN(MATCH_NOMATCH); |
5038 | | if (eptr >= md->end_subject) |
5039 | | { |
5040 | | SCHECK_PARTIAL(); |
5041 | | RRETURN(MATCH_NOMATCH); |
5042 | | } |
5043 | | GETCHARINCTEST(c, eptr); |
5044 | | category = UCD_CATEGORY(c); |
5045 | | if ((category == ucp_L || |
5046 | | category == ucp_N || |
5047 | | c == CHAR_UNDERSCORE) |
5048 | | == prop_fail_result) |
5049 | | RRETURN(MATCH_NOMATCH); |
5050 | | } |
5051 | | /* Control never gets here */ |
5052 | | |
5053 | | case PT_CLIST: |
5054 | | for (fi = min;; fi++) |
5055 | | { |
5056 | | const pcre_uint32 *cp; |
5057 | | RMATCH(eptr, ecode, offset_top, md, eptrb, RM67); |
5058 | | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
5059 | | if (fi >= max) RRETURN(MATCH_NOMATCH); |
5060 | | if (eptr >= md->end_subject) |
5061 | | { |
5062 | | SCHECK_PARTIAL(); |
5063 | | RRETURN(MATCH_NOMATCH); |
5064 | | } |
5065 | | GETCHARINCTEST(c, eptr); |
5066 | | cp = PRIV(ucd_caseless_sets) + prop_value; |
5067 | | for (;;) |
5068 | | { |
5069 | | if (c < *cp) |
5070 | | { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } } |
5071 | | if (c == *cp++) |
5072 | | { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; } |
5073 | | } |
5074 | | } |
5075 | | /* Control never gets here */ |
5076 | | |
5077 | | case PT_UCNC: |
5078 | | for (fi = min;; fi++) |
5079 | | { |
5080 | | RMATCH(eptr, ecode, offset_top, md, eptrb, RM60); |
5081 | | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
5082 | | if (fi >= max) RRETURN(MATCH_NOMATCH); |
5083 | | if (eptr >= md->end_subject) |
5084 | | { |
5085 | | SCHECK_PARTIAL(); |
5086 | | RRETURN(MATCH_NOMATCH); |
5087 | | } |
5088 | | GETCHARINCTEST(c, eptr); |
5089 | | if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || |
5090 | | c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || |
5091 | | c >= 0xe000) == prop_fail_result) |
5092 | | RRETURN(MATCH_NOMATCH); |
5093 | | } |
5094 | | /* Control never gets here */ |
5095 | | |
5096 | | /* This should never occur */ |
5097 | | default: |
5098 | | RRETURN(PCRE_ERROR_INTERNAL); |
5099 | | } |
5100 | | } |
5101 | | |
5102 | | /* Match extended Unicode sequences. We will get here only if the |
5103 | | support is in the binary; otherwise a compile-time error occurs. */ |
5104 | | |
5105 | | else if (ctype == OP_EXTUNI) |
5106 | | { |
5107 | | for (fi = min;; fi++) |
5108 | | { |
5109 | | RMATCH(eptr, ecode, offset_top, md, eptrb, RM41); |
5110 | | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
5111 | | if (fi >= max) RRETURN(MATCH_NOMATCH); |
5112 | | if (eptr >= md->end_subject) |
5113 | | { |
5114 | | SCHECK_PARTIAL(); |
5115 | | RRETURN(MATCH_NOMATCH); |
5116 | | } |
5117 | | else |
5118 | | { |
5119 | | int lgb, rgb; |
5120 | | GETCHARINCTEST(c, eptr); |
5121 | | lgb = UCD_GRAPHBREAK(c); |
5122 | | while (eptr < md->end_subject) |
5123 | | { |
5124 | | int len = 1; |
5125 | | if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } |
5126 | | rgb = UCD_GRAPHBREAK(c); |
5127 | | if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; |
5128 | | lgb = rgb; |
5129 | | eptr += len; |
5130 | | } |
5131 | | } |
5132 | | CHECK_PARTIAL(); |
5133 | | } |
5134 | | } |
5135 | | else |
5136 | | #endif /* SUPPORT_UCP */ |
5137 | |
|
5138 | | #ifdef SUPPORT_UTF |
5139 | | if (utf) |
5140 | | { |
5141 | | for (fi = min;; fi++) |
5142 | | { |
5143 | | RMATCH(eptr, ecode, offset_top, md, eptrb, RM42); |
5144 | | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
5145 | | if (fi >= max) RRETURN(MATCH_NOMATCH); |
5146 | | if (eptr >= md->end_subject) |
5147 | | { |
5148 | | SCHECK_PARTIAL(); |
5149 | | RRETURN(MATCH_NOMATCH); |
5150 | | } |
5151 | | if (ctype == OP_ANY && IS_NEWLINE(eptr)) |
5152 | | RRETURN(MATCH_NOMATCH); |
5153 | | GETCHARINC(c, eptr); |
5154 | | switch(ctype) |
5155 | | { |
5156 | | case OP_ANY: /* This is the non-NL case */ |
5157 | | if (md->partial != 0 && /* Take care with CRLF partial */ |
5158 | | eptr >= md->end_subject && |
5159 | | NLBLOCK->nltype == NLTYPE_FIXED && |
5160 | | NLBLOCK->nllen == 2 && |
5161 | | c == NLBLOCK->nl[0]) |
5162 | | { |
5163 | | md->hitend = TRUE; |
5164 | | if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); |
5165 | | } |
5166 | | break; |
5167 | | |
5168 | | case OP_ALLANY: |
5169 | | case OP_ANYBYTE: |
5170 | | break; |
5171 | | |
5172 | | case OP_ANYNL: |
5173 | | switch(c) |
5174 | | { |
5175 | | default: RRETURN(MATCH_NOMATCH); |
5176 | | case CHAR_CR: |
5177 | | if (eptr < md->end_subject && UCHAR21(eptr) == CHAR_LF) eptr++; |
5178 | | break; |
5179 | | |
5180 | | case CHAR_LF: |
5181 | | break; |
5182 | | |
5183 | | case CHAR_VT: |
5184 | | case CHAR_FF: |
5185 | | case CHAR_NEL: |
5186 | | #ifndef EBCDIC |
5187 | | case 0x2028: |
5188 | | case 0x2029: |
5189 | | #endif /* Not EBCDIC */ |
5190 | | if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); |
5191 | | break; |
5192 | | } |
5193 | | break; |
5194 | | |
5195 | | case OP_NOT_HSPACE: |
5196 | | switch(c) |
5197 | | { |
5198 | | HSPACE_CASES: RRETURN(MATCH_NOMATCH); |
5199 | | default: break; |
5200 | | } |
5201 | | break; |
5202 | | |
5203 | | case OP_HSPACE: |
5204 | | switch(c) |
5205 | | { |
5206 | | HSPACE_CASES: break; |
5207 | | default: RRETURN(MATCH_NOMATCH); |
5208 | | } |
5209 | | break; |
5210 | | |
5211 | | case OP_NOT_VSPACE: |
5212 | | switch(c) |
5213 | | { |
5214 | | VSPACE_CASES: RRETURN(MATCH_NOMATCH); |
5215 | | default: break; |
5216 | | } |
5217 | | break; |
5218 | | |
5219 | | case OP_VSPACE: |
5220 | | switch(c) |
5221 | | { |
5222 | | VSPACE_CASES: break; |
5223 | | default: RRETURN(MATCH_NOMATCH); |
5224 | | } |
5225 | | break; |
5226 | | |
5227 | | case OP_NOT_DIGIT: |
5228 | | if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) |
5229 | | RRETURN(MATCH_NOMATCH); |
5230 | | break; |
5231 | | |
5232 | | case OP_DIGIT: |
5233 | | if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0) |
5234 | | RRETURN(MATCH_NOMATCH); |
5235 | | break; |
5236 | | |
5237 | | case OP_NOT_WHITESPACE: |
5238 | | if (c < 256 && (md->ctypes[c] & ctype_space) != 0) |
5239 | | RRETURN(MATCH_NOMATCH); |
5240 | | break; |
5241 | | |
5242 | | case OP_WHITESPACE: |
5243 | | if (c >= 256 || (md->ctypes[c] & ctype_space) == 0) |
5244 | | RRETURN(MATCH_NOMATCH); |
5245 | | break; |
5246 | | |
5247 | | case OP_NOT_WORDCHAR: |
5248 | | if (c < 256 && (md->ctypes[c] & ctype_word) != 0) |
5249 | | RRETURN(MATCH_NOMATCH); |
5250 | | break; |
5251 | | |
5252 | | case OP_WORDCHAR: |
5253 | | if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) |
5254 | | RRETURN(MATCH_NOMATCH); |
5255 | | break; |
5256 | | |
5257 | | default: |
5258 | | RRETURN(PCRE_ERROR_INTERNAL); |
5259 | | } |
5260 | | } |
5261 | | } |
5262 | | else |
5263 | | #endif |
5264 | | /* Not UTF mode */ |
5265 | 0 | { |
5266 | 0 | for (fi = min;; fi++) |
5267 | 0 | { |
5268 | 0 | RMATCH(eptr, ecode, offset_top, md, eptrb, RM43); |
5269 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
5270 | 0 | if (fi >= max) RRETURN(MATCH_NOMATCH); |
5271 | 0 | if (eptr >= md->end_subject) |
5272 | 0 | { |
5273 | 0 | SCHECK_PARTIAL(); |
5274 | 0 | RRETURN(MATCH_NOMATCH); |
5275 | 0 | } |
5276 | 0 | if (ctype == OP_ANY && IS_NEWLINE(eptr)) |
5277 | 0 | RRETURN(MATCH_NOMATCH); |
5278 | 0 | c = *eptr++; |
5279 | 0 | switch(ctype) |
5280 | 0 | { |
5281 | 0 | case OP_ANY: /* This is the non-NL case */ |
5282 | 0 | if (md->partial != 0 && /* Take care with CRLF partial */ |
5283 | 0 | eptr >= md->end_subject && |
5284 | 0 | NLBLOCK->nltype == NLTYPE_FIXED && |
5285 | 0 | NLBLOCK->nllen == 2 && |
5286 | 0 | c == NLBLOCK->nl[0]) |
5287 | 0 | { |
5288 | 0 | md->hitend = TRUE; |
5289 | 0 | if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); |
5290 | 0 | } |
5291 | 0 | break; |
5292 | | |
5293 | 0 | case OP_ALLANY: |
5294 | 0 | case OP_ANYBYTE: |
5295 | 0 | break; |
5296 | | |
5297 | 0 | case OP_ANYNL: |
5298 | 0 | switch(c) |
5299 | 0 | { |
5300 | 0 | default: RRETURN(MATCH_NOMATCH); |
5301 | 0 | case CHAR_CR: |
5302 | 0 | if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++; |
5303 | 0 | break; |
5304 | | |
5305 | 0 | case CHAR_LF: |
5306 | 0 | break; |
5307 | | |
5308 | 0 | case CHAR_VT: |
5309 | 0 | case CHAR_FF: |
5310 | 0 | case CHAR_NEL: |
5311 | | #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 |
5312 | | case 0x2028: |
5313 | | case 0x2029: |
5314 | | #endif |
5315 | 0 | if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); |
5316 | 0 | break; |
5317 | 0 | } |
5318 | 0 | break; |
5319 | | |
5320 | 0 | case OP_NOT_HSPACE: |
5321 | 0 | switch(c) |
5322 | 0 | { |
5323 | 0 | default: break; |
5324 | 0 | HSPACE_BYTE_CASES: |
5325 | | #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 |
5326 | | HSPACE_MULTIBYTE_CASES: |
5327 | | #endif |
5328 | 0 | RRETURN(MATCH_NOMATCH); |
5329 | 0 | } |
5330 | 0 | break; |
5331 | | |
5332 | 0 | case OP_HSPACE: |
5333 | 0 | switch(c) |
5334 | 0 | { |
5335 | 0 | default: RRETURN(MATCH_NOMATCH); |
5336 | 0 | HSPACE_BYTE_CASES: |
5337 | | #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 |
5338 | | HSPACE_MULTIBYTE_CASES: |
5339 | | #endif |
5340 | 0 | break; |
5341 | 0 | } |
5342 | 0 | break; |
5343 | | |
5344 | 0 | case OP_NOT_VSPACE: |
5345 | 0 | switch(c) |
5346 | 0 | { |
5347 | 0 | default: break; |
5348 | 0 | VSPACE_BYTE_CASES: |
5349 | | #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 |
5350 | | VSPACE_MULTIBYTE_CASES: |
5351 | | #endif |
5352 | 0 | RRETURN(MATCH_NOMATCH); |
5353 | 0 | } |
5354 | 0 | break; |
5355 | | |
5356 | 0 | case OP_VSPACE: |
5357 | 0 | switch(c) |
5358 | 0 | { |
5359 | 0 | default: RRETURN(MATCH_NOMATCH); |
5360 | 0 | VSPACE_BYTE_CASES: |
5361 | | #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 |
5362 | | VSPACE_MULTIBYTE_CASES: |
5363 | | #endif |
5364 | 0 | break; |
5365 | 0 | } |
5366 | 0 | break; |
5367 | | |
5368 | 0 | case OP_NOT_DIGIT: |
5369 | 0 | if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH); |
5370 | 0 | break; |
5371 | | |
5372 | 0 | case OP_DIGIT: |
5373 | 0 | if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH); |
5374 | 0 | break; |
5375 | | |
5376 | 0 | case OP_NOT_WHITESPACE: |
5377 | 0 | if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH); |
5378 | 0 | break; |
5379 | | |
5380 | 0 | case OP_WHITESPACE: |
5381 | 0 | if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH); |
5382 | 0 | break; |
5383 | | |
5384 | 0 | case OP_NOT_WORDCHAR: |
5385 | 0 | if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH); |
5386 | 0 | break; |
5387 | | |
5388 | 0 | case OP_WORDCHAR: |
5389 | 0 | if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH); |
5390 | 0 | break; |
5391 | | |
5392 | 0 | default: |
5393 | 0 | RRETURN(PCRE_ERROR_INTERNAL); |
5394 | 0 | } |
5395 | 0 | } |
5396 | 0 | } |
5397 | | /* Control never gets here */ |
5398 | 0 | } |
5399 | | |
5400 | | /* If maximizing, it is worth using inline code for speed, doing the type |
5401 | | test once at the start (i.e. keep it out of the loop). Again, keep the |
5402 | | UTF-8 and UCP stuff separate. */ |
5403 | | |
5404 | 0 | else |
5405 | 0 | { |
5406 | 0 | pp = eptr; /* Remember where we started */ |
5407 | |
|
5408 | | #ifdef SUPPORT_UCP |
5409 | | if (prop_type >= 0) |
5410 | | { |
5411 | | switch(prop_type) |
5412 | | { |
5413 | | case PT_ANY: |
5414 | | for (i = min; i < max; i++) |
5415 | | { |
5416 | | int len = 1; |
5417 | | if (eptr >= md->end_subject) |
5418 | | { |
5419 | | SCHECK_PARTIAL(); |
5420 | | break; |
5421 | | } |
5422 | | GETCHARLENTEST(c, eptr, len); |
5423 | | if (prop_fail_result) break; |
5424 | | eptr+= len; |
5425 | | } |
5426 | | break; |
5427 | | |
5428 | | case PT_LAMP: |
5429 | | for (i = min; i < max; i++) |
5430 | | { |
5431 | | int chartype; |
5432 | | int len = 1; |
5433 | | if (eptr >= md->end_subject) |
5434 | | { |
5435 | | SCHECK_PARTIAL(); |
5436 | | break; |
5437 | | } |
5438 | | GETCHARLENTEST(c, eptr, len); |
5439 | | chartype = UCD_CHARTYPE(c); |
5440 | | if ((chartype == ucp_Lu || |
5441 | | chartype == ucp_Ll || |
5442 | | chartype == ucp_Lt) == prop_fail_result) |
5443 | | break; |
5444 | | eptr+= len; |
5445 | | } |
5446 | | break; |
5447 | | |
5448 | | case PT_GC: |
5449 | | for (i = min; i < max; i++) |
5450 | | { |
5451 | | int len = 1; |
5452 | | if (eptr >= md->end_subject) |
5453 | | { |
5454 | | SCHECK_PARTIAL(); |
5455 | | break; |
5456 | | } |
5457 | | GETCHARLENTEST(c, eptr, len); |
5458 | | if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break; |
5459 | | eptr+= len; |
5460 | | } |
5461 | | break; |
5462 | | |
5463 | | case PT_PC: |
5464 | | for (i = min; i < max; i++) |
5465 | | { |
5466 | | int len = 1; |
5467 | | if (eptr >= md->end_subject) |
5468 | | { |
5469 | | SCHECK_PARTIAL(); |
5470 | | break; |
5471 | | } |
5472 | | GETCHARLENTEST(c, eptr, len); |
5473 | | if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break; |
5474 | | eptr+= len; |
5475 | | } |
5476 | | break; |
5477 | | |
5478 | | case PT_SC: |
5479 | | for (i = min; i < max; i++) |
5480 | | { |
5481 | | int len = 1; |
5482 | | if (eptr >= md->end_subject) |
5483 | | { |
5484 | | SCHECK_PARTIAL(); |
5485 | | break; |
5486 | | } |
5487 | | GETCHARLENTEST(c, eptr, len); |
5488 | | if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break; |
5489 | | eptr+= len; |
5490 | | } |
5491 | | break; |
5492 | | |
5493 | | case PT_ALNUM: |
5494 | | for (i = min; i < max; i++) |
5495 | | { |
5496 | | int category; |
5497 | | int len = 1; |
5498 | | if (eptr >= md->end_subject) |
5499 | | { |
5500 | | SCHECK_PARTIAL(); |
5501 | | break; |
5502 | | } |
5503 | | GETCHARLENTEST(c, eptr, len); |
5504 | | category = UCD_CATEGORY(c); |
5505 | | if ((category == ucp_L || category == ucp_N) == prop_fail_result) |
5506 | | break; |
5507 | | eptr+= len; |
5508 | | } |
5509 | | break; |
5510 | | |
5511 | | /* Perl space used to exclude VT, but from Perl 5.18 it is included, |
5512 | | which means that Perl space and POSIX space are now identical. PCRE |
5513 | | was changed at release 8.34. */ |
5514 | | |
5515 | | case PT_SPACE: /* Perl space */ |
5516 | | case PT_PXSPACE: /* POSIX space */ |
5517 | | for (i = min; i < max; i++) |
5518 | | { |
5519 | | int len = 1; |
5520 | | if (eptr >= md->end_subject) |
5521 | | { |
5522 | | SCHECK_PARTIAL(); |
5523 | | break; |
5524 | | } |
5525 | | GETCHARLENTEST(c, eptr, len); |
5526 | | switch(c) |
5527 | | { |
5528 | | HSPACE_CASES: |
5529 | | VSPACE_CASES: |
5530 | | if (prop_fail_result) goto ENDLOOP99; /* Break the loop */ |
5531 | | break; |
5532 | | |
5533 | | default: |
5534 | | if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result) |
5535 | | goto ENDLOOP99; /* Break the loop */ |
5536 | | break; |
5537 | | } |
5538 | | eptr+= len; |
5539 | | } |
5540 | | ENDLOOP99: |
5541 | | break; |
5542 | | |
5543 | | case PT_WORD: |
5544 | | for (i = min; i < max; i++) |
5545 | | { |
5546 | | int category; |
5547 | | int len = 1; |
5548 | | if (eptr >= md->end_subject) |
5549 | | { |
5550 | | SCHECK_PARTIAL(); |
5551 | | break; |
5552 | | } |
5553 | | GETCHARLENTEST(c, eptr, len); |
5554 | | category = UCD_CATEGORY(c); |
5555 | | if ((category == ucp_L || category == ucp_N || |
5556 | | c == CHAR_UNDERSCORE) == prop_fail_result) |
5557 | | break; |
5558 | | eptr+= len; |
5559 | | } |
5560 | | break; |
5561 | | |
5562 | | case PT_CLIST: |
5563 | | for (i = min; i < max; i++) |
5564 | | { |
5565 | | const pcre_uint32 *cp; |
5566 | | int len = 1; |
5567 | | if (eptr >= md->end_subject) |
5568 | | { |
5569 | | SCHECK_PARTIAL(); |
5570 | | break; |
5571 | | } |
5572 | | GETCHARLENTEST(c, eptr, len); |
5573 | | cp = PRIV(ucd_caseless_sets) + prop_value; |
5574 | | for (;;) |
5575 | | { |
5576 | | if (c < *cp) |
5577 | | { if (prop_fail_result) break; else goto GOT_MAX; } |
5578 | | if (c == *cp++) |
5579 | | { if (prop_fail_result) goto GOT_MAX; else break; } |
5580 | | } |
5581 | | eptr += len; |
5582 | | } |
5583 | | GOT_MAX: |
5584 | | break; |
5585 | | |
5586 | | case PT_UCNC: |
5587 | | for (i = min; i < max; i++) |
5588 | | { |
5589 | | int len = 1; |
5590 | | if (eptr >= md->end_subject) |
5591 | | { |
5592 | | SCHECK_PARTIAL(); |
5593 | | break; |
5594 | | } |
5595 | | GETCHARLENTEST(c, eptr, len); |
5596 | | if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || |
5597 | | c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || |
5598 | | c >= 0xe000) == prop_fail_result) |
5599 | | break; |
5600 | | eptr += len; |
5601 | | } |
5602 | | break; |
5603 | | |
5604 | | default: |
5605 | | RRETURN(PCRE_ERROR_INTERNAL); |
5606 | | } |
5607 | | |
5608 | | /* eptr is now past the end of the maximum run */ |
5609 | | |
5610 | | if (possessive) continue; /* No backtracking */ |
5611 | | for(;;) |
5612 | | { |
5613 | | if (eptr <= pp) goto TAIL_RECURSE; |
5614 | | RMATCH(eptr, ecode, offset_top, md, eptrb, RM44); |
5615 | | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
5616 | | eptr--; |
5617 | | if (utf) BACKCHAR(eptr); |
5618 | | } |
5619 | | } |
5620 | | |
5621 | | /* Match extended Unicode grapheme clusters. We will get here only if the |
5622 | | support is in the binary; otherwise a compile-time error occurs. */ |
5623 | | |
5624 | | else if (ctype == OP_EXTUNI) |
5625 | | { |
5626 | | for (i = min; i < max; i++) |
5627 | | { |
5628 | | if (eptr >= md->end_subject) |
5629 | | { |
5630 | | SCHECK_PARTIAL(); |
5631 | | break; |
5632 | | } |
5633 | | else |
5634 | | { |
5635 | | int lgb, rgb; |
5636 | | GETCHARINCTEST(c, eptr); |
5637 | | lgb = UCD_GRAPHBREAK(c); |
5638 | | while (eptr < md->end_subject) |
5639 | | { |
5640 | | int len = 1; |
5641 | | if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } |
5642 | | rgb = UCD_GRAPHBREAK(c); |
5643 | | if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; |
5644 | | lgb = rgb; |
5645 | | eptr += len; |
5646 | | } |
5647 | | } |
5648 | | CHECK_PARTIAL(); |
5649 | | } |
5650 | | |
5651 | | /* eptr is now past the end of the maximum run */ |
5652 | | |
5653 | | if (possessive) continue; /* No backtracking */ |
5654 | | |
5655 | | /* We use <= pp rather than == pp to detect the start of the run while |
5656 | | backtracking because the use of \C in UTF mode can cause BACKCHAR to |
5657 | | move back past pp. This is just palliative; the use of \C in UTF mode |
5658 | | is fraught with danger. */ |
5659 | | |
5660 | | for(;;) |
5661 | | { |
5662 | | int lgb, rgb; |
5663 | | PCRE_PUCHAR fptr; |
5664 | | |
5665 | | if (eptr <= pp) goto TAIL_RECURSE; /* At start of char run */ |
5666 | | RMATCH(eptr, ecode, offset_top, md, eptrb, RM45); |
5667 | | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
5668 | | |
5669 | | /* Backtracking over an extended grapheme cluster involves inspecting |
5670 | | the previous two characters (if present) to see if a break is |
5671 | | permitted between them. */ |
5672 | | |
5673 | | eptr--; |
5674 | | if (!utf) c = *eptr; else |
5675 | | { |
5676 | | BACKCHAR(eptr); |
5677 | | GETCHAR(c, eptr); |
5678 | | } |
5679 | | rgb = UCD_GRAPHBREAK(c); |
5680 | | |
5681 | | for (;;) |
5682 | | { |
5683 | | if (eptr <= pp) goto TAIL_RECURSE; /* At start of char run */ |
5684 | | fptr = eptr - 1; |
5685 | | if (!utf) c = *fptr; else |
5686 | | { |
5687 | | BACKCHAR(fptr); |
5688 | | GETCHAR(c, fptr); |
5689 | | } |
5690 | | lgb = UCD_GRAPHBREAK(c); |
5691 | | if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; |
5692 | | eptr = fptr; |
5693 | | rgb = lgb; |
5694 | | } |
5695 | | } |
5696 | | } |
5697 | | |
5698 | | else |
5699 | | #endif /* SUPPORT_UCP */ |
5700 | |
|
5701 | | #ifdef SUPPORT_UTF |
5702 | | if (utf) |
5703 | | { |
5704 | | switch(ctype) |
5705 | | { |
5706 | | case OP_ANY: |
5707 | | for (i = min; i < max; i++) |
5708 | | { |
5709 | | if (eptr >= md->end_subject) |
5710 | | { |
5711 | | SCHECK_PARTIAL(); |
5712 | | break; |
5713 | | } |
5714 | | if (IS_NEWLINE(eptr)) break; |
5715 | | if (md->partial != 0 && /* Take care with CRLF partial */ |
5716 | | eptr + 1 >= md->end_subject && |
5717 | | NLBLOCK->nltype == NLTYPE_FIXED && |
5718 | | NLBLOCK->nllen == 2 && |
5719 | | UCHAR21(eptr) == NLBLOCK->nl[0]) |
5720 | | { |
5721 | | md->hitend = TRUE; |
5722 | | if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); |
5723 | | } |
5724 | | eptr++; |
5725 | | ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); |
5726 | | } |
5727 | | break; |
5728 | | |
5729 | | case OP_ALLANY: |
5730 | | if (max < INT_MAX) |
5731 | | { |
5732 | | for (i = min; i < max; i++) |
5733 | | { |
5734 | | if (eptr >= md->end_subject) |
5735 | | { |
5736 | | SCHECK_PARTIAL(); |
5737 | | break; |
5738 | | } |
5739 | | eptr++; |
5740 | | ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); |
5741 | | } |
5742 | | } |
5743 | | else |
5744 | | { |
5745 | | eptr = md->end_subject; /* Unlimited UTF-8 repeat */ |
5746 | | SCHECK_PARTIAL(); |
5747 | | } |
5748 | | break; |
5749 | | |
5750 | | /* The byte case is the same as non-UTF8 */ |
5751 | | |
5752 | | case OP_ANYBYTE: |
5753 | | c = max - min; |
5754 | | if (c > (unsigned int)(md->end_subject - eptr)) |
5755 | | { |
5756 | | eptr = md->end_subject; |
5757 | | SCHECK_PARTIAL(); |
5758 | | } |
5759 | | else eptr += c; |
5760 | | break; |
5761 | | |
5762 | | case OP_ANYNL: |
5763 | | for (i = min; i < max; i++) |
5764 | | { |
5765 | | int len = 1; |
5766 | | if (eptr >= md->end_subject) |
5767 | | { |
5768 | | SCHECK_PARTIAL(); |
5769 | | break; |
5770 | | } |
5771 | | GETCHARLEN(c, eptr, len); |
5772 | | if (c == CHAR_CR) |
5773 | | { |
5774 | | if (++eptr >= md->end_subject) break; |
5775 | | if (UCHAR21(eptr) == CHAR_LF) eptr++; |
5776 | | } |
5777 | | else |
5778 | | { |
5779 | | if (c != CHAR_LF && |
5780 | | (md->bsr_anycrlf || |
5781 | | (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL |
5782 | | #ifndef EBCDIC |
5783 | | && c != 0x2028 && c != 0x2029 |
5784 | | #endif /* Not EBCDIC */ |
5785 | | ))) |
5786 | | break; |
5787 | | eptr += len; |
5788 | | } |
5789 | | } |
5790 | | break; |
5791 | | |
5792 | | case OP_NOT_HSPACE: |
5793 | | case OP_HSPACE: |
5794 | | for (i = min; i < max; i++) |
5795 | | { |
5796 | | BOOL gotspace; |
5797 | | int len = 1; |
5798 | | if (eptr >= md->end_subject) |
5799 | | { |
5800 | | SCHECK_PARTIAL(); |
5801 | | break; |
5802 | | } |
5803 | | GETCHARLEN(c, eptr, len); |
5804 | | switch(c) |
5805 | | { |
5806 | | HSPACE_CASES: gotspace = TRUE; break; |
5807 | | default: gotspace = FALSE; break; |
5808 | | } |
5809 | | if (gotspace == (ctype == OP_NOT_HSPACE)) break; |
5810 | | eptr += len; |
5811 | | } |
5812 | | break; |
5813 | | |
5814 | | case OP_NOT_VSPACE: |
5815 | | case OP_VSPACE: |
5816 | | for (i = min; i < max; i++) |
5817 | | { |
5818 | | BOOL gotspace; |
5819 | | int len = 1; |
5820 | | if (eptr >= md->end_subject) |
5821 | | { |
5822 | | SCHECK_PARTIAL(); |
5823 | | break; |
5824 | | } |
5825 | | GETCHARLEN(c, eptr, len); |
5826 | | switch(c) |
5827 | | { |
5828 | | VSPACE_CASES: gotspace = TRUE; break; |
5829 | | default: gotspace = FALSE; break; |
5830 | | } |
5831 | | if (gotspace == (ctype == OP_NOT_VSPACE)) break; |
5832 | | eptr += len; |
5833 | | } |
5834 | | break; |
5835 | | |
5836 | | case OP_NOT_DIGIT: |
5837 | | for (i = min; i < max; i++) |
5838 | | { |
5839 | | int len = 1; |
5840 | | if (eptr >= md->end_subject) |
5841 | | { |
5842 | | SCHECK_PARTIAL(); |
5843 | | break; |
5844 | | } |
5845 | | GETCHARLEN(c, eptr, len); |
5846 | | if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break; |
5847 | | eptr+= len; |
5848 | | } |
5849 | | break; |
5850 | | |
5851 | | case OP_DIGIT: |
5852 | | for (i = min; i < max; i++) |
5853 | | { |
5854 | | int len = 1; |
5855 | | if (eptr >= md->end_subject) |
5856 | | { |
5857 | | SCHECK_PARTIAL(); |
5858 | | break; |
5859 | | } |
5860 | | GETCHARLEN(c, eptr, len); |
5861 | | if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break; |
5862 | | eptr+= len; |
5863 | | } |
5864 | | break; |
5865 | | |
5866 | | case OP_NOT_WHITESPACE: |
5867 | | for (i = min; i < max; i++) |
5868 | | { |
5869 | | int len = 1; |
5870 | | if (eptr >= md->end_subject) |
5871 | | { |
5872 | | SCHECK_PARTIAL(); |
5873 | | break; |
5874 | | } |
5875 | | GETCHARLEN(c, eptr, len); |
5876 | | if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break; |
5877 | | eptr+= len; |
5878 | | } |
5879 | | break; |
5880 | | |
5881 | | case OP_WHITESPACE: |
5882 | | for (i = min; i < max; i++) |
5883 | | { |
5884 | | int len = 1; |
5885 | | if (eptr >= md->end_subject) |
5886 | | { |
5887 | | SCHECK_PARTIAL(); |
5888 | | break; |
5889 | | } |
5890 | | GETCHARLEN(c, eptr, len); |
5891 | | if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break; |
5892 | | eptr+= len; |
5893 | | } |
5894 | | break; |
5895 | | |
5896 | | case OP_NOT_WORDCHAR: |
5897 | | for (i = min; i < max; i++) |
5898 | | { |
5899 | | int len = 1; |
5900 | | if (eptr >= md->end_subject) |
5901 | | { |
5902 | | SCHECK_PARTIAL(); |
5903 | | break; |
5904 | | } |
5905 | | GETCHARLEN(c, eptr, len); |
5906 | | if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break; |
5907 | | eptr+= len; |
5908 | | } |
5909 | | break; |
5910 | | |
5911 | | case OP_WORDCHAR: |
5912 | | for (i = min; i < max; i++) |
5913 | | { |
5914 | | int len = 1; |
5915 | | if (eptr >= md->end_subject) |
5916 | | { |
5917 | | SCHECK_PARTIAL(); |
5918 | | break; |
5919 | | } |
5920 | | GETCHARLEN(c, eptr, len); |
5921 | | if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break; |
5922 | | eptr+= len; |
5923 | | } |
5924 | | break; |
5925 | | |
5926 | | default: |
5927 | | RRETURN(PCRE_ERROR_INTERNAL); |
5928 | | } |
5929 | | |
5930 | | if (possessive) continue; /* No backtracking */ |
5931 | | for(;;) |
5932 | | { |
5933 | | if (eptr <= pp) goto TAIL_RECURSE; |
5934 | | RMATCH(eptr, ecode, offset_top, md, eptrb, RM46); |
5935 | | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
5936 | | eptr--; |
5937 | | BACKCHAR(eptr); |
5938 | | if (ctype == OP_ANYNL && eptr > pp && UCHAR21(eptr) == CHAR_NL && |
5939 | | UCHAR21(eptr - 1) == CHAR_CR) eptr--; |
5940 | | } |
5941 | | } |
5942 | | else |
5943 | | #endif /* SUPPORT_UTF */ |
5944 | | /* Not UTF mode */ |
5945 | 0 | { |
5946 | 0 | switch(ctype) |
5947 | 0 | { |
5948 | 0 | case OP_ANY: |
5949 | 0 | for (i = min; i < max; i++) |
5950 | 0 | { |
5951 | 0 | if (eptr >= md->end_subject) |
5952 | 0 | { |
5953 | 0 | SCHECK_PARTIAL(); |
5954 | 0 | break; |
5955 | 0 | } |
5956 | 0 | if (IS_NEWLINE(eptr)) break; |
5957 | 0 | if (md->partial != 0 && /* Take care with CRLF partial */ |
5958 | 0 | eptr + 1 >= md->end_subject && |
5959 | 0 | NLBLOCK->nltype == NLTYPE_FIXED && |
5960 | 0 | NLBLOCK->nllen == 2 && |
5961 | 0 | *eptr == NLBLOCK->nl[0]) |
5962 | 0 | { |
5963 | 0 | md->hitend = TRUE; |
5964 | 0 | if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); |
5965 | 0 | } |
5966 | 0 | eptr++; |
5967 | 0 | } |
5968 | 0 | break; |
5969 | | |
5970 | 0 | case OP_ALLANY: |
5971 | 0 | case OP_ANYBYTE: |
5972 | 0 | c = max - min; |
5973 | 0 | if (c > (unsigned int)(md->end_subject - eptr)) |
5974 | 0 | { |
5975 | 0 | eptr = md->end_subject; |
5976 | 0 | SCHECK_PARTIAL(); |
5977 | 0 | } |
5978 | 0 | else eptr += c; |
5979 | 0 | break; |
5980 | | |
5981 | 0 | case OP_ANYNL: |
5982 | 0 | for (i = min; i < max; i++) |
5983 | 0 | { |
5984 | 0 | if (eptr >= md->end_subject) |
5985 | 0 | { |
5986 | 0 | SCHECK_PARTIAL(); |
5987 | 0 | break; |
5988 | 0 | } |
5989 | 0 | c = *eptr; |
5990 | 0 | if (c == CHAR_CR) |
5991 | 0 | { |
5992 | 0 | if (++eptr >= md->end_subject) break; |
5993 | 0 | if (*eptr == CHAR_LF) eptr++; |
5994 | 0 | } |
5995 | 0 | else |
5996 | 0 | { |
5997 | 0 | if (c != CHAR_LF && (md->bsr_anycrlf || |
5998 | 0 | (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL |
5999 | | #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 |
6000 | | && c != 0x2028 && c != 0x2029 |
6001 | | #endif |
6002 | 0 | ))) break; |
6003 | 0 | eptr++; |
6004 | 0 | } |
6005 | 0 | } |
6006 | 0 | break; |
6007 | | |
6008 | 0 | case OP_NOT_HSPACE: |
6009 | 0 | for (i = min; i < max; i++) |
6010 | 0 | { |
6011 | 0 | if (eptr >= md->end_subject) |
6012 | 0 | { |
6013 | 0 | SCHECK_PARTIAL(); |
6014 | 0 | break; |
6015 | 0 | } |
6016 | 0 | switch(*eptr) |
6017 | 0 | { |
6018 | 0 | default: eptr++; break; |
6019 | 0 | HSPACE_BYTE_CASES: |
6020 | | #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 |
6021 | | HSPACE_MULTIBYTE_CASES: |
6022 | | #endif |
6023 | 0 | goto ENDLOOP00; |
6024 | 0 | } |
6025 | 0 | } |
6026 | 0 | ENDLOOP00: |
6027 | 0 | break; |
6028 | | |
6029 | 0 | case OP_HSPACE: |
6030 | 0 | for (i = min; i < max; i++) |
6031 | 0 | { |
6032 | 0 | if (eptr >= md->end_subject) |
6033 | 0 | { |
6034 | 0 | SCHECK_PARTIAL(); |
6035 | 0 | break; |
6036 | 0 | } |
6037 | 0 | switch(*eptr) |
6038 | 0 | { |
6039 | 0 | default: goto ENDLOOP01; |
6040 | 0 | HSPACE_BYTE_CASES: |
6041 | | #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 |
6042 | | HSPACE_MULTIBYTE_CASES: |
6043 | | #endif |
6044 | 0 | eptr++; break; |
6045 | 0 | } |
6046 | 0 | } |
6047 | 0 | ENDLOOP01: |
6048 | 0 | break; |
6049 | | |
6050 | 0 | case OP_NOT_VSPACE: |
6051 | 0 | for (i = min; i < max; i++) |
6052 | 0 | { |
6053 | 0 | if (eptr >= md->end_subject) |
6054 | 0 | { |
6055 | 0 | SCHECK_PARTIAL(); |
6056 | 0 | break; |
6057 | 0 | } |
6058 | 0 | switch(*eptr) |
6059 | 0 | { |
6060 | 0 | default: eptr++; break; |
6061 | 0 | VSPACE_BYTE_CASES: |
6062 | | #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 |
6063 | | VSPACE_MULTIBYTE_CASES: |
6064 | | #endif |
6065 | 0 | goto ENDLOOP02; |
6066 | 0 | } |
6067 | 0 | } |
6068 | 0 | ENDLOOP02: |
6069 | 0 | break; |
6070 | | |
6071 | 0 | case OP_VSPACE: |
6072 | 0 | for (i = min; i < max; i++) |
6073 | 0 | { |
6074 | 0 | if (eptr >= md->end_subject) |
6075 | 0 | { |
6076 | 0 | SCHECK_PARTIAL(); |
6077 | 0 | break; |
6078 | 0 | } |
6079 | 0 | switch(*eptr) |
6080 | 0 | { |
6081 | 0 | default: goto ENDLOOP03; |
6082 | 0 | VSPACE_BYTE_CASES: |
6083 | | #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 |
6084 | | VSPACE_MULTIBYTE_CASES: |
6085 | | #endif |
6086 | 0 | eptr++; break; |
6087 | 0 | } |
6088 | 0 | } |
6089 | 0 | ENDLOOP03: |
6090 | 0 | break; |
6091 | | |
6092 | 0 | case OP_NOT_DIGIT: |
6093 | 0 | for (i = min; i < max; i++) |
6094 | 0 | { |
6095 | 0 | if (eptr >= md->end_subject) |
6096 | 0 | { |
6097 | 0 | SCHECK_PARTIAL(); |
6098 | 0 | break; |
6099 | 0 | } |
6100 | 0 | if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break; |
6101 | 0 | eptr++; |
6102 | 0 | } |
6103 | 0 | break; |
6104 | | |
6105 | 0 | case OP_DIGIT: |
6106 | 0 | for (i = min; i < max; i++) |
6107 | 0 | { |
6108 | 0 | if (eptr >= md->end_subject) |
6109 | 0 | { |
6110 | 0 | SCHECK_PARTIAL(); |
6111 | 0 | break; |
6112 | 0 | } |
6113 | 0 | if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break; |
6114 | 0 | eptr++; |
6115 | 0 | } |
6116 | 0 | break; |
6117 | | |
6118 | 0 | case OP_NOT_WHITESPACE: |
6119 | 0 | for (i = min; i < max; i++) |
6120 | 0 | { |
6121 | 0 | if (eptr >= md->end_subject) |
6122 | 0 | { |
6123 | 0 | SCHECK_PARTIAL(); |
6124 | 0 | break; |
6125 | 0 | } |
6126 | 0 | if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break; |
6127 | 0 | eptr++; |
6128 | 0 | } |
6129 | 0 | break; |
6130 | | |
6131 | 0 | case OP_WHITESPACE: |
6132 | 0 | for (i = min; i < max; i++) |
6133 | 0 | { |
6134 | 0 | if (eptr >= md->end_subject) |
6135 | 0 | { |
6136 | 0 | SCHECK_PARTIAL(); |
6137 | 0 | break; |
6138 | 0 | } |
6139 | 0 | if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break; |
6140 | 0 | eptr++; |
6141 | 0 | } |
6142 | 0 | break; |
6143 | | |
6144 | 0 | case OP_NOT_WORDCHAR: |
6145 | 0 | for (i = min; i < max; i++) |
6146 | 0 | { |
6147 | 0 | if (eptr >= md->end_subject) |
6148 | 0 | { |
6149 | 0 | SCHECK_PARTIAL(); |
6150 | 0 | break; |
6151 | 0 | } |
6152 | 0 | if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break; |
6153 | 0 | eptr++; |
6154 | 0 | } |
6155 | 0 | break; |
6156 | | |
6157 | 0 | case OP_WORDCHAR: |
6158 | 0 | for (i = min; i < max; i++) |
6159 | 0 | { |
6160 | 0 | if (eptr >= md->end_subject) |
6161 | 0 | { |
6162 | 0 | SCHECK_PARTIAL(); |
6163 | 0 | break; |
6164 | 0 | } |
6165 | 0 | if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break; |
6166 | 0 | eptr++; |
6167 | 0 | } |
6168 | 0 | break; |
6169 | | |
6170 | 0 | default: |
6171 | 0 | RRETURN(PCRE_ERROR_INTERNAL); |
6172 | 0 | } |
6173 | | |
6174 | 0 | if (possessive) continue; /* No backtracking */ |
6175 | 0 | for (;;) |
6176 | 0 | { |
6177 | 0 | if (eptr == pp) goto TAIL_RECURSE; |
6178 | 0 | RMATCH(eptr, ecode, offset_top, md, eptrb, RM47); |
6179 | 0 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
6180 | 0 | eptr--; |
6181 | 0 | if (ctype == OP_ANYNL && eptr > pp && *eptr == CHAR_LF && |
6182 | 0 | eptr[-1] == CHAR_CR) eptr--; |
6183 | 0 | } |
6184 | 0 | } |
6185 | | |
6186 | | /* Control never gets here */ |
6187 | 0 | } |
6188 | | |
6189 | | /* There's been some horrible disaster. Arrival here can only mean there is |
6190 | | something seriously wrong in the code above or the OP_xxx definitions. */ |
6191 | | |
6192 | 0 | default: |
6193 | 0 | DPRINTF(("Unknown opcode %d\n", *ecode)); |
6194 | 0 | RRETURN(PCRE_ERROR_UNKNOWN_OPCODE); |
6195 | 0 | } |
6196 | | |
6197 | | /* Do not stick any code in here without much thought; it is assumed |
6198 | | that "continue" in the code above comes out to here to repeat the main |
6199 | | loop. */ |
6200 | |
|
6201 | 0 | } /* End of main loop */ |
6202 | | /* Control never reaches here */ |
6203 | | |
6204 | | |
6205 | | /* When compiling to use the heap rather than the stack for recursive calls to |
6206 | | match(), the RRETURN() macro jumps here. The number that is saved in |
6207 | | frame->Xwhere indicates which label we actually want to return to. */ |
6208 | | |
6209 | 0 | #ifdef NO_RECURSE |
6210 | 0 | #define LBL(val) case val: goto L_RM##val; |
6211 | 0 | HEAP_RETURN: |
6212 | 0 | switch (frame->Xwhere) |
6213 | 0 | { |
6214 | 0 | LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8) |
6215 | 0 | LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17) |
6216 | 0 | LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33) |
6217 | 0 | LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52) |
6218 | 0 | LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64) |
6219 | 0 | LBL(65) LBL(66) |
6220 | | #if defined SUPPORT_UTF || !defined COMPILE_PCRE8 |
6221 | | LBL(20) LBL(21) |
6222 | | #endif |
6223 | | #ifdef SUPPORT_UTF |
6224 | | LBL(16) LBL(18) |
6225 | | LBL(22) LBL(23) LBL(28) LBL(30) |
6226 | | LBL(32) LBL(34) LBL(42) LBL(46) |
6227 | | #ifdef SUPPORT_UCP |
6228 | | LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45) |
6229 | | LBL(59) LBL(60) LBL(61) LBL(62) LBL(67) |
6230 | | #endif /* SUPPORT_UCP */ |
6231 | | #endif /* SUPPORT_UTF */ |
6232 | 0 | default: |
6233 | 0 | DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere)); |
6234 | 0 | return PCRE_ERROR_INTERNAL; |
6235 | 0 | } |
6236 | 0 | #undef LBL |
6237 | 0 | #endif /* NO_RECURSE */ |
6238 | 0 | } |
6239 | | |
6240 | | |
6241 | | /*************************************************************************** |
6242 | | **************************************************************************** |
6243 | | RECURSION IN THE match() FUNCTION |
6244 | | |
6245 | | Undefine all the macros that were defined above to handle this. */ |
6246 | | |
6247 | | #ifdef NO_RECURSE /* Build that uses the heap rather than the stack for recursive match() calls */ |
6248 | | #undef eptr |
6249 | | #undef ecode |
6250 | | #undef mstart |
6251 | | #undef offset_top |
6252 | | #undef eptrb |
6253 | | #undef flags |
6254 | | |
6255 | | #undef callpat |
6256 | | #undef charptr |
6257 | | #undef data |
6258 | | #undef next |
6259 | | #undef pp |
6260 | | #undef prev |
6261 | | #undef saved_eptr |
6262 | | |
6263 | | #undef new_recursive |
6264 | | |
6265 | | #undef cur_is_word |
6266 | | #undef condition |
6267 | | #undef prev_is_word |
6268 | | |
6269 | | #undef ctype |
6270 | | #undef length |
6271 | | #undef max |
6272 | | #undef min |
6273 | | #undef number |
6274 | | #undef offset |
6275 | | #undef op |
6276 | | #undef save_capture_last |
6277 | | #undef save_offset1 |
6278 | | #undef save_offset2 |
6279 | | #undef save_offset3 |
6280 | | #undef stacksave |
6281 | | |
6282 | | #undef newptrb |
6283 | | |
6284 | | #endif /* NO_RECURSE */ |
6285 | | |
6286 | | /* These two are defined as macros in both cases */ |
6287 | | |
6288 | | #undef fc |
6289 | | #undef fi |
6290 | | |
6291 | | /*************************************************************************** |
6292 | | ***************************************************************************/ |
6293 | | |
6294 | | |
6295 | | #ifdef NO_RECURSE |
6296 | | /************************************************* |
6297 | | * Release allocated heap frames * |
6298 | | *************************************************/ |
6299 | | |
6300 | | /* This function releases all the allocated frames. The base frame is on the |
6301 | | machine stack, and so must not be freed. |
6302 | | |
6303 | | Argument: the address of the base frame |
6304 | | Returns: nothing |
6305 | | */ |
6306 | | |
6307 | | static void /* Release every heap frame chained after frame_base; returns nothing. */ |
6308 | | release_match_heapframes (heapframe *frame_base) /* frame_base itself is never passed to the free function */ |
6309 | 28.4k | { |
6310 | 28.4k | heapframe *nextframe = frame_base->Xnextframe; /* first frame linked after the base, or NULL if there is none */ |
6311 | 28.4k | while (nextframe != NULL) /* follow the Xnextframe links until the NULL terminator */ |
6312 | 0 | { |
6313 | 0 | heapframe *oldframe = nextframe; |
6314 | 0 | nextframe = nextframe->Xnextframe; /* read the link before the frame is released */ |
6315 | 0 | (PUBL(stack_free))(oldframe); /* release the frame through PUBL(stack_free) */ |
6316 | 0 | } |
6317 | 28.4k | } |
6318 | | #endif |
6319 | | |
6320 | | |
6321 | | /************************************************* |
6322 | | * Execute a Regular Expression * |
6323 | | *************************************************/ |
6324 | | |
6325 | | /* This function applies a compiled re to a subject string and picks out |
6326 | | portions of the string if it matches. Two elements in the vector are set for |
6327 | | each substring: the offsets to the start and end of the substring. |
6328 | | |
6329 | | Arguments: |
6330 | | argument_re points to the compiled expression |
6331 | | extra_data points to extra data or is NULL |
6332 | | subject points to the subject string |
6333 | | length length of subject string (may contain binary zeros) |
6334 | | start_offset where to start in the subject string |
6335 | | options option bits |
6336 | | offsets points to a vector of ints to be filled in with offsets |
6337 | | offsetcount the number of elements in the vector |
6338 | | |
6339 | | Returns: > 0 => success; value is the number of elements filled in |
6340 | | = 0 => success, but offsets is not big enough |
6341 | | -1 => failed to match |
6342 | | < -1 => some kind of unexpected problem |
6343 | | */ |
6344 | | |
6345 | | #if defined COMPILE_PCRE8 |
6346 | | PCRE_EXP_DEFN int PCRE_CALL_CONVENTION |
6347 | | pcre_exec(const pcre *argument_re, const pcre_extra *extra_data, |
6348 | | PCRE_SPTR subject, int length, int start_offset, int options, int *offsets, |
6349 | | int offsetcount) |
6350 | | #elif defined COMPILE_PCRE16 |
6351 | | PCRE_EXP_DEFN int PCRE_CALL_CONVENTION |
6352 | | pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data, |
6353 | | PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets, |
6354 | | int offsetcount) |
6355 | | #elif defined COMPILE_PCRE32 |
6356 | | PCRE_EXP_DEFN int PCRE_CALL_CONVENTION |
6357 | | pcre32_exec(const pcre32 *argument_re, const pcre32_extra *extra_data, |
6358 | | PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets, |
6359 | | int offsetcount) |
6360 | | #endif |
6361 | 28.4k | { |
6362 | 28.4k | int rc, ocount, arg_offset_max; |
6363 | 28.4k | int newline; |
6364 | 28.4k | BOOL using_temporary_offsets = FALSE; |
6365 | 28.4k | BOOL anchored; |
6366 | 28.4k | BOOL startline; |
6367 | 28.4k | BOOL firstline; |
6368 | 28.4k | BOOL utf; |
6369 | 28.4k | BOOL has_first_char = FALSE; |
6370 | 28.4k | BOOL has_req_char = FALSE; |
6371 | 28.4k | pcre_uchar first_char = 0; |
6372 | 28.4k | pcre_uchar first_char2 = 0; |
6373 | 28.4k | pcre_uchar req_char = 0; |
6374 | 28.4k | pcre_uchar req_char2 = 0; |
6375 | 28.4k | match_data match_block; |
6376 | 28.4k | match_data *md = &match_block; |
6377 | 28.4k | const pcre_uint8 *tables; |
6378 | 28.4k | const pcre_uint8 *start_bits = NULL; |
6379 | 28.4k | PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset; |
6380 | 28.4k | PCRE_PUCHAR end_subject; |
6381 | 28.4k | PCRE_PUCHAR start_partial = NULL; |
6382 | 28.4k | PCRE_PUCHAR match_partial = NULL; |
6383 | 28.4k | PCRE_PUCHAR req_char_ptr = start_match - 1; |
6384 | | |
6385 | 28.4k | const pcre_study_data *study; |
6386 | 28.4k | const REAL_PCRE *re = (const REAL_PCRE *)argument_re; |
6387 | | |
6388 | 28.4k | #ifdef NO_RECURSE |
6389 | 28.4k | heapframe frame_zero; |
6390 | 28.4k | frame_zero.Xprevframe = NULL; /* Marks the top level */ |
6391 | 28.4k | frame_zero.Xnextframe = NULL; /* None are allocated yet */ |
6392 | 28.4k | md->match_frames_base = &frame_zero; |
6393 | 28.4k | #endif |
6394 | | |
6395 | | /* Check for the special magic call that measures the size of the stack used |
6396 | | per recursive call of match(). Without the funny casting for sizeof, a Windows |
6397 | | compiler gave this error: "unary minus operator applied to unsigned type, |
6398 | | result still unsigned". Hopefully the cast fixes that. */ |
6399 | | |
6400 | 28.4k | if (re == NULL && extra_data == NULL && subject == NULL && length == -999 && |
6401 | 28.4k | start_offset == -999) |
6402 | 0 | #ifdef NO_RECURSE |
6403 | 0 | return -((int)sizeof(heapframe)); |
6404 | | #else |
6405 | | return match(NULL, NULL, NULL, 0, NULL, NULL, 0); |
6406 | | #endif |
6407 | | |
6408 | | /* Plausibility checks */ |
6409 | | |
6410 | 28.4k | if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION; |
6411 | 28.4k | if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0)) |
6412 | 0 | return PCRE_ERROR_NULL; |
6413 | 28.4k | if (offsetcount < 0) return PCRE_ERROR_BADCOUNT; |
6414 | 28.4k | if (length < 0) return PCRE_ERROR_BADLENGTH; |
6415 | 28.4k | if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET; |
6416 | | |
6417 | | /* Check that the first field in the block is the magic number. If it is not, |
6418 | | return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to |
6419 | | REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which |
6420 | | means that the pattern is likely compiled with different endianness. */ |
6421 | | |
6422 | 28.4k | if (re->magic_number != MAGIC_NUMBER) |
6423 | 0 | return re->magic_number == REVERSED_MAGIC_NUMBER? |
6424 | 0 | PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC; |
6425 | 28.4k | if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE; |
6426 | | |
6427 | | /* These two settings are used in the code for checking a UTF-8 string that |
6428 | | follows immediately afterwards. Other values in the md block are used only |
6429 | | during "normal" pcre_exec() processing, not when the JIT support is in use, |
6430 | | so they are set up later. */ |
6431 | | |
6432 | | /* PCRE_UTF16 has the same value as PCRE_UTF8. */ |
6433 | 28.4k | utf = md->utf = (re->options & PCRE_UTF8) != 0; |
6434 | 28.4k | md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 : |
6435 | 28.4k | ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0; |
6436 | | |
6437 | | /* Check a UTF-8 string if required. Pass back the character offset and error |
6438 | | code for an invalid string if a results vector is available. */ |
6439 | | |
6440 | | #ifdef SUPPORT_UTF |
6441 | | if (utf && (options & PCRE_NO_UTF8_CHECK) == 0) |
6442 | | { |
6443 | | int erroroffset; |
6444 | | int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset); |
6445 | | if (errorcode != 0) |
6446 | | { |
6447 | | if (offsetcount >= 2) |
6448 | | { |
6449 | | offsets[0] = erroroffset; |
6450 | | offsets[1] = errorcode; |
6451 | | } |
6452 | | #if defined COMPILE_PCRE8 |
6453 | | return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)? |
6454 | | PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8; |
6455 | | #elif defined COMPILE_PCRE16 |
6456 | | return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)? |
6457 | | PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16; |
6458 | | #elif defined COMPILE_PCRE32 |
6459 | | return PCRE_ERROR_BADUTF32; |
6460 | | #endif |
6461 | | } |
6462 | | #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16 |
6463 | | /* Check that a start_offset points to the start of a UTF character. */ |
6464 | | if (start_offset > 0 && start_offset < length && |
6465 | | NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset])) |
6466 | | return PCRE_ERROR_BADUTF8_OFFSET; |
6467 | | #endif |
6468 | | } |
6469 | | #endif |
6470 | | |
6471 | | /* If the pattern was successfully studied with JIT support, run the JIT |
6472 | | executable instead of the rest of this function. Most options must be set at |
6473 | | compile time for the JIT code to be usable. Fall back to the normal code path if |
6474 | | an unsupported flag is set. */ |
6475 | | |
6476 | | #ifdef SUPPORT_JIT |
6477 | | if (extra_data != NULL |
6478 | | && (extra_data->flags & (PCRE_EXTRA_EXECUTABLE_JIT | |
6479 | | PCRE_EXTRA_TABLES)) == PCRE_EXTRA_EXECUTABLE_JIT |
6480 | | && extra_data->executable_jit != NULL |
6481 | | && (options & ~PUBLIC_JIT_EXEC_OPTIONS) == 0) |
6482 | | { |
6483 | | rc = PRIV(jit_exec)(extra_data, (const pcre_uchar *)subject, length, |
6484 | | start_offset, options, offsets, offsetcount); |
6485 | | |
6486 | | /* PCRE_ERROR_JIT_BADOPTION means that the selected normal or partial matching |
6487 | | mode is not compiled. In this case we simply fall back to the interpreter. */ |
6488 | | |
6489 | | if (rc != PCRE_ERROR_JIT_BADOPTION) return rc; |
6490 | | } |
6491 | | #endif |
6492 | | |
6493 | | /* Carry on with non-JIT matching. This information is for finding all the |
6494 | | numbers associated with a given name, for condition testing. */ |
6495 | | |
6496 | 28.4k | md->name_table = (pcre_uchar *)re + re->name_table_offset; |
6497 | 28.4k | md->name_count = re->name_count; |
6498 | 28.4k | md->name_entry_size = re->name_entry_size; |
6499 | | |
6500 | | /* Fish out the optional data from the extra_data structure, first setting |
6501 | | the default values. */ |
6502 | | |
6503 | 28.4k | study = NULL; |
6504 | 28.4k | md->match_limit = MATCH_LIMIT; |
6505 | 28.4k | md->match_limit_recursion = MATCH_LIMIT_RECURSION; |
6506 | 28.4k | md->callout_data = NULL; |
6507 | | |
6508 | | /* The table pointer is always in native byte order. */ |
6509 | | |
6510 | 28.4k | tables = re->tables; |
6511 | | |
6512 | | /* The two limit values override the defaults, whatever their value. */ |
6513 | | |
6514 | 28.4k | if (extra_data != NULL) |
6515 | 0 | { |
6516 | 0 | unsigned long int flags = extra_data->flags; |
6517 | 0 | if ((flags & PCRE_EXTRA_STUDY_DATA) != 0) |
6518 | 0 | study = (const pcre_study_data *)extra_data->study_data; |
6519 | 0 | if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) |
6520 | 0 | md->match_limit = extra_data->match_limit; |
6521 | 0 | if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0) |
6522 | 0 | md->match_limit_recursion = extra_data->match_limit_recursion; |
6523 | 0 | if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0) |
6524 | 0 | md->callout_data = extra_data->callout_data; |
6525 | 0 | if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables; |
6526 | 0 | } |
6527 | | |
6528 | | /* Limits in the regex override only if they are smaller. */ |
6529 | | |
6530 | 28.4k | if ((re->flags & PCRE_MLSET) != 0 && re->limit_match < md->match_limit) |
6531 | 0 | md->match_limit = re->limit_match; |
6532 | | |
6533 | 28.4k | if ((re->flags & PCRE_RLSET) != 0 && |
6534 | 28.4k | re->limit_recursion < md->match_limit_recursion) |
6535 | 0 | md->match_limit_recursion = re->limit_recursion; |
6536 | | |
6537 | | /* If the exec call supplied NULL for tables, use the inbuilt ones. This |
6538 | | is a feature that makes it possible to save compiled regex and re-use them |
6539 | | in other programs later. */ |
6540 | | |
6541 | 28.4k | if (tables == NULL) tables = PRIV(default_tables); |
6542 | | |
6543 | | /* Set up other data */ |
6544 | | |
6545 | 28.4k | anchored = ((re->options | options) & PCRE_ANCHORED) != 0; |
6546 | 28.4k | startline = (re->flags & PCRE_STARTLINE) != 0; |
6547 | 28.4k | firstline = (re->options & PCRE_FIRSTLINE) != 0; |
6548 | | |
6549 | | /* The code starts after the real_pcre block and the capture name table. */ |
6550 | | |
6551 | 28.4k | md->start_code = (const pcre_uchar *)re + re->name_table_offset + |
6552 | 28.4k | re->name_count * re->name_entry_size; |
6553 | | |
6554 | 28.4k | md->start_subject = (PCRE_PUCHAR)subject; |
6555 | 28.4k | md->start_offset = start_offset; |
6556 | 28.4k | md->end_subject = md->start_subject + length; |
6557 | 28.4k | end_subject = md->end_subject; |
6558 | | |
6559 | 28.4k | md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0; |
6560 | 28.4k | md->use_ucp = (re->options & PCRE_UCP) != 0; |
6561 | 28.4k | md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0; |
6562 | 28.4k | md->ignore_skip_arg = 0; |
6563 | | |
6564 | | /* Some options are unpacked into BOOL variables in the hope that testing |
6565 | | them will be faster than individual option bits. */ |
6566 | | |
6567 | 28.4k | md->notbol = (options & PCRE_NOTBOL) != 0; |
6568 | 28.4k | md->noteol = (options & PCRE_NOTEOL) != 0; |
6569 | 28.4k | md->notempty = (options & PCRE_NOTEMPTY) != 0; |
6570 | 28.4k | md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0; |
6571 | | |
6572 | 28.4k | md->hitend = FALSE; |
6573 | 28.4k | md->mark = md->nomatch_mark = NULL; /* In case never set */ |
6574 | | |
6575 | 28.4k | md->recursive = NULL; /* No recursion at top level */ |
6576 | 28.4k | md->hasthen = (re->flags & PCRE_HASTHEN) != 0; |
6577 | | |
6578 | 28.4k | md->lcc = tables + lcc_offset; |
6579 | 28.4k | md->fcc = tables + fcc_offset; |
6580 | 28.4k | md->ctypes = tables + ctypes_offset; |
6581 | | |
6582 | | /* Handle different \R options. */ |
6583 | | |
6584 | 28.4k | switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) |
6585 | 28.4k | { |
6586 | 28.4k | case 0: |
6587 | 28.4k | if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0) |
6588 | 0 | md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0; |
6589 | 28.4k | else |
6590 | | #ifdef BSR_ANYCRLF |
6591 | | md->bsr_anycrlf = TRUE; |
6592 | | #else |
6593 | 28.4k | md->bsr_anycrlf = FALSE; |
6594 | 28.4k | #endif |
6595 | 28.4k | break; |
6596 | | |
6597 | 0 | case PCRE_BSR_ANYCRLF: |
6598 | 0 | md->bsr_anycrlf = TRUE; |
6599 | 0 | break; |
6600 | | |
6601 | 0 | case PCRE_BSR_UNICODE: |
6602 | 0 | md->bsr_anycrlf = FALSE; |
6603 | 0 | break; |
6604 | | |
6605 | 0 | default: return PCRE_ERROR_BADNEWLINE; |
6606 | 28.4k | } |
6607 | | |
6608 | | /* Handle different types of newline. The three bits give eight cases. If |
6609 | | nothing is set at run time, whatever was used at compile time applies. */ |
6610 | | |
6611 | 28.4k | switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : |
6612 | 28.4k | (pcre_uint32)options) & PCRE_NEWLINE_BITS) |
6613 | 28.4k | { |
6614 | 28.4k | case 0: newline = NEWLINE; break; /* Compile-time default */ |
6615 | 0 | case PCRE_NEWLINE_CR: newline = CHAR_CR; break; |
6616 | 0 | case PCRE_NEWLINE_LF: newline = CHAR_NL; break; |
6617 | 0 | case PCRE_NEWLINE_CR+ |
6618 | 0 | PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break; |
6619 | 0 | case PCRE_NEWLINE_ANY: newline = -1; break; |
6620 | 0 | case PCRE_NEWLINE_ANYCRLF: newline = -2; break; |
6621 | 0 | default: return PCRE_ERROR_BADNEWLINE; |
6622 | 28.4k | } |
6623 | | |
6624 | 28.4k | if (newline == -2) |
6625 | 0 | { |
6626 | 0 | md->nltype = NLTYPE_ANYCRLF; |
6627 | 0 | } |
6628 | 28.4k | else if (newline < 0) |
6629 | 0 | { |
6630 | 0 | md->nltype = NLTYPE_ANY; |
6631 | 0 | } |
6632 | 28.4k | else |
6633 | 28.4k | { |
6634 | 28.4k | md->nltype = NLTYPE_FIXED; |
6635 | 28.4k | if (newline > 255) |
6636 | 0 | { |
6637 | 0 | md->nllen = 2; |
6638 | 0 | md->nl[0] = (newline >> 8) & 255; |
6639 | 0 | md->nl[1] = newline & 255; |
6640 | 0 | } |
6641 | 28.4k | else |
6642 | 28.4k | { |
6643 | 28.4k | md->nllen = 1; |
6644 | 28.4k | md->nl[0] = newline; |
6645 | 28.4k | } |
6646 | 28.4k | } |
6647 | | |
6648 | | /* Partial matching was originally supported only for a restricted set of |
6649 | | regexes; from release 8.00 there are no restrictions, but the bits are still |
6650 | | defined (though never set). So there's no harm in leaving this code. */ |
6651 | | |
6652 | 28.4k | if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0) |
6653 | 0 | return PCRE_ERROR_BADPARTIAL; |
6654 | | |
6655 | | /* If the expression has got more back references than the offsets supplied can |
6656 | | hold, we get a temporary chunk of working store to use during the matching. |
6657 | | Otherwise, we can use the vector supplied, rounding down its size to a multiple |
6658 | | of 3. */ |
6659 | | |
6660 | 28.4k | ocount = offsetcount - (offsetcount % 3); |
6661 | 28.4k | arg_offset_max = (2*ocount)/3; |
6662 | | |
6663 | 28.4k | if (re->top_backref > 0 && re->top_backref >= ocount/3) |
6664 | 0 | { |
6665 | 0 | ocount = re->top_backref * 3 + 3; |
6666 | 0 | md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int)); |
6667 | 0 | if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY; |
6668 | 0 | using_temporary_offsets = TRUE; |
6669 | 0 | DPRINTF(("Got memory to hold back references\n")); |
6670 | 0 | } |
6671 | 28.4k | else md->offset_vector = offsets; |
6672 | 28.4k | md->offset_end = ocount; |
6673 | 28.4k | md->offset_max = (2*ocount)/3; |
6674 | 28.4k | md->capture_last = 0; |
6675 | | |
6676 | | /* Reset the working variable associated with each extraction. These should |
6677 | | never be used unless previously set, but they get saved and restored, and so we |
6678 | | initialize them to avoid reading uninitialized locations. Also, unset the |
6679 | | offsets for the matched string. This is really just for tidiness with callouts, |
6680 | | in case they inspect these fields. */ |
6681 | | |
6682 | 28.4k | if (md->offset_vector != NULL) |
6683 | 0 | { |
6684 | 0 | register int *iptr = md->offset_vector + ocount; |
6685 | 0 | register int *iend = iptr - re->top_bracket; |
6686 | 0 | if (iend < md->offset_vector + 2) iend = md->offset_vector + 2; |
6687 | 0 | while (--iptr >= iend) *iptr = -1; |
6688 | 0 | if (offsetcount > 0) md->offset_vector[0] = -1; |
6689 | 0 | if (offsetcount > 1) md->offset_vector[1] = -1; |
6690 | 0 | } |
6691 | | |
6692 | | /* Set up the first character to match, if available. The first_char value is |
6693 | | never set for an anchored regular expression, but the anchoring may be forced |
6694 | | at run time, so we have to test for anchoring. The first char may be unset for |
6695 | | an unanchored pattern, of course. If there's no first char and the pattern was |
6696 | | studied, there may be a bitmap of possible first characters. */ |
6697 | | |
6698 | 28.4k | if (!anchored) |
6699 | 28.4k | { |
6700 | 28.4k | if ((re->flags & PCRE_FIRSTSET) != 0) |
6701 | 28.4k | { |
6702 | 28.4k | has_first_char = TRUE; |
6703 | 28.4k | first_char = first_char2 = (pcre_uchar)(re->first_char); |
6704 | 28.4k | if ((re->flags & PCRE_FCH_CASELESS) != 0) |
6705 | 0 | { |
6706 | 0 | first_char2 = TABLE_GET(first_char, md->fcc, first_char); |
6707 | | #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8) |
6708 | | if (utf && first_char > 127) |
6709 | | first_char2 = UCD_OTHERCASE(first_char); |
6710 | | #endif |
6711 | 0 | } |
6712 | 28.4k | } |
6713 | 0 | else |
6714 | 0 | if (!startline && study != NULL && |
6715 | 0 | (study->flags & PCRE_STUDY_MAPPED) != 0) |
6716 | 0 | start_bits = study->start_bits; |
6717 | 28.4k | } |
6718 | | |
6719 | | /* For anchored or unanchored matches, there may be a "last known required |
6720 | | character" set. */ |
6721 | | |
6722 | 28.4k | if ((re->flags & PCRE_REQCHSET) != 0) |
6723 | 28.4k | { |
6724 | 28.4k | has_req_char = TRUE; |
6725 | 28.4k | req_char = req_char2 = (pcre_uchar)(re->req_char); |
6726 | 28.4k | if ((re->flags & PCRE_RCH_CASELESS) != 0) |
6727 | 0 | { |
6728 | 0 | req_char2 = TABLE_GET(req_char, md->fcc, req_char); |
6729 | | #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8) |
6730 | | if (utf && req_char > 127) |
6731 | | req_char2 = UCD_OTHERCASE(req_char); |
6732 | | #endif |
6733 | 0 | } |
6734 | 28.4k | } |
6735 | | |
6736 | | |
6737 | | /* ==========================================================================*/ |
6738 | | |
6739 | | /* Loop for handling unanchored repeated matching attempts; for anchored regexes |
6740 | | the loop runs just once. */ |
6741 | | |
6742 | 28.4k | for(;;) |
6743 | 28.4k | { |
6744 | 28.4k | PCRE_PUCHAR save_end_subject = end_subject; |
6745 | 28.4k | PCRE_PUCHAR new_start_match; |
6746 | | |
6747 | | /* If firstline is TRUE, the start of the match is constrained to the first |
6748 | | line of a multiline string. That is, the match must be before or at the first |
6749 | | newline. Implement this by temporarily adjusting end_subject so that we stop |
6750 | | scanning at a newline. If the match fails at the newline, later code breaks |
6751 | | this loop. */ |
6752 | | |
6753 | 28.4k | if (firstline) |
6754 | 0 | { |
6755 | 0 | PCRE_PUCHAR t = start_match; |
6756 | | #ifdef SUPPORT_UTF |
6757 | | if (utf) |
6758 | | { |
6759 | | while (t < md->end_subject && !IS_NEWLINE(t)) |
6760 | | { |
6761 | | t++; |
6762 | | ACROSSCHAR(t < end_subject, *t, t++); |
6763 | | } |
6764 | | } |
6765 | | else |
6766 | | #endif |
6767 | 0 | while (t < md->end_subject && !IS_NEWLINE(t)) t++; |
6768 | 0 | end_subject = t; |
6769 | 0 | } |
6770 | | |
6771 | | /* There are some optimizations that avoid running the match if a known |
6772 | | starting point is not found, or if a known later character is not present. |
6773 | | However, there is an option that disables these, for testing and for ensuring |
6774 | | that all callouts do actually occur. The option can be set in the regex by |
6775 | | (*NO_START_OPT) or passed in match-time options. */ |
6776 | | |
6777 | 28.4k | if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0) |
6778 | 28.4k | { |
6779 | | /* Advance to a unique first char if there is one. */ |
6780 | | |
6781 | 28.4k | if (has_first_char) |
6782 | 28.4k | { |
6783 | 28.4k | pcre_uchar smc; |
6784 | | |
6785 | 28.4k | if (first_char != first_char2) |
6786 | 0 | while (start_match < end_subject && |
6787 | 0 | (smc = UCHAR21TEST(start_match)) != first_char && smc != first_char2) |
6788 | 0 | start_match++; |
6789 | 28.4k | else |
6790 | 502k | while (start_match < end_subject && UCHAR21TEST(start_match) != first_char) |
6791 | 473k | start_match++; |
6792 | 28.4k | } |
6793 | | |
6794 | | /* Or to just after a linebreak for a multiline match */ |
6795 | | |
6796 | 0 | else if (startline) |
6797 | 0 | { |
6798 | 0 | if (start_match > md->start_subject + start_offset) |
6799 | 0 | { |
6800 | | #ifdef SUPPORT_UTF |
6801 | | if (utf) |
6802 | | { |
6803 | | while (start_match < end_subject && !WAS_NEWLINE(start_match)) |
6804 | | { |
6805 | | start_match++; |
6806 | | ACROSSCHAR(start_match < end_subject, *start_match, |
6807 | | start_match++); |
6808 | | } |
6809 | | } |
6810 | | else |
6811 | | #endif |
6812 | 0 | while (start_match < end_subject && !WAS_NEWLINE(start_match)) |
6813 | 0 | start_match++; |
6814 | | |
6815 | | /* If we have just passed a CR and the newline option is ANY or ANYCRLF, |
6816 | | and we are now at a LF, advance the match position by one more character. |
6817 | | */ |
6818 | |
|
6819 | 0 | if (start_match[-1] == CHAR_CR && |
6820 | 0 | (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) && |
6821 | 0 | start_match < end_subject && |
6822 | 0 | UCHAR21TEST(start_match) == CHAR_NL) |
6823 | 0 | start_match++; |
6824 | 0 | } |
6825 | 0 | } |
6826 | | |
6827 | | /* Or to a non-unique first byte after study */ |
6828 | | |
6829 | 0 | else if (start_bits != NULL) |
6830 | 0 | { |
6831 | 0 | while (start_match < end_subject) |
6832 | 0 | { |
6833 | 0 | register pcre_uint32 c = UCHAR21TEST(start_match); |
6834 | | #ifndef COMPILE_PCRE8 |
6835 | | if (c > 255) c = 255; |
6836 | | #endif |
6837 | 0 | if ((start_bits[c/8] & (1 << (c&7))) != 0) break; |
6838 | 0 | start_match++; |
6839 | 0 | } |
6840 | 0 | } |
6841 | 28.4k | } /* Starting optimizations */ |
6842 | | |
6843 | | /* Restore fudged end_subject */ |
6844 | | |
6845 | 28.4k | end_subject = save_end_subject; |
6846 | | |
6847 | | /* The following two optimizations are disabled for partial matching or if |
6848 | | disabling is explicitly requested. */ |
6849 | | |
6850 | 28.4k | if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial) |
6851 | 28.4k | { |
6852 | | /* If the pattern was studied, a minimum subject length may be set. This is |
6853 | | a lower bound; no actual string of that length may actually match the |
6854 | | pattern. Although the value is, strictly, in characters, we treat it as |
6855 | | bytes to avoid spending too much time in this optimization. */ |
6856 | | |
6857 | 28.4k | if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 && |
6858 | 28.4k | (pcre_uint32)(end_subject - start_match) < study->minlength) |
6859 | 0 | { |
6860 | 0 | rc = MATCH_NOMATCH; |
6861 | 0 | break; |
6862 | 0 | } |
6863 | | |
6864 | | /* If req_char is set, we know that that character must appear in the |
6865 | | subject for the match to succeed. If the first character is set, req_char |
6866 | | must be later in the subject; otherwise the test starts at the match point. |
6867 | | This optimization can save a huge amount of backtracking in patterns with |
6868 | | nested unlimited repeats that aren't going to match. Writing separate code |
6869 | | for cased/caseless versions makes it go faster, as does using an |
6870 | | autoincrement and backing off on a match. |
6871 | | |
6872 | | HOWEVER: when the subject string is very, very long, searching to its end |
6873 | | can take a long time, and give bad performance on quite ordinary patterns. |
6874 | | This showed up when somebody was matching something like /^\d+C/ on a |
6875 | | 32-megabyte string... so we don't do this when the string is sufficiently |
6876 | | long. */ |
6877 | | |
6878 | 28.4k | if (has_req_char && end_subject - start_match < REQ_BYTE_MAX) |
6879 | 28.4k | { |
6880 | 28.4k | register PCRE_PUCHAR p = start_match + (has_first_char? 1:0); |
6881 | | |
6882 | | /* We don't need to repeat the search if we haven't yet reached the |
6883 | | place we found it at last time. */ |
6884 | | |
6885 | 28.4k | if (p > req_char_ptr) |
6886 | 28.4k | { |
6887 | 28.4k | if (req_char != req_char2) |
6888 | 0 | { |
6889 | 0 | while (p < end_subject) |
6890 | 0 | { |
6891 | 0 | register pcre_uint32 pp = UCHAR21INCTEST(p); |
6892 | 0 | if (pp == req_char || pp == req_char2) { p--; break; } |
6893 | 0 | } |
6894 | 0 | } |
6895 | 28.4k | else |
6896 | 28.4k | { |
6897 | 28.4k | while (p < end_subject) |
6898 | 0 | { |
6899 | 0 | if (UCHAR21INCTEST(p) == req_char) { p--; break; } |
6900 | 0 | } |
6901 | 28.4k | } |
6902 | | |
6903 | | /* If we can't find the required character, break the matching loop, |
6904 | | forcing a match failure. */ |
6905 | | |
6906 | 28.4k | if (p >= end_subject) |
6907 | 28.4k | { |
6908 | 28.4k | rc = MATCH_NOMATCH; |
6909 | 28.4k | break; |
6910 | 28.4k | } |
6911 | | |
6912 | | /* If we have found the required character, save the point where we |
6913 | | found it, so that we don't search again next time round the loop if |
6914 | | the start hasn't passed this character yet. */ |
6915 | | |
6916 | 0 | req_char_ptr = p; |
6917 | 0 | } |
6918 | 28.4k | } |
6919 | 28.4k | } |
6920 | | |
6921 | | #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */ |
6922 | | printf(">>>> Match against: "); |
6923 | | pchars(start_match, end_subject - start_match, TRUE, md); |
6924 | | printf("\n"); |
6925 | | #endif |
6926 | | |
6927 | | /* OK, we can now run the match. If "hitend" is set afterwards, remember the |
6928 | | first starting point for which a partial match was found. */ |
6929 | | |
6930 | 0 | md->start_match_ptr = start_match; |
6931 | 0 | md->start_used_ptr = start_match; |
6932 | 0 | md->match_call_count = 0; |
6933 | 0 | md->match_function_type = 0; |
6934 | 0 | md->end_offset_top = 0; |
6935 | 0 | md->skip_arg_count = 0; |
6936 | 0 | rc = match(start_match, md->start_code, start_match, 2, md, NULL, 0); |
6937 | 0 | if (md->hitend && start_partial == NULL) |
6938 | 0 | { |
6939 | 0 | start_partial = md->start_used_ptr; |
6940 | 0 | match_partial = start_match; |
6941 | 0 | } |
6942 | |
|
6943 | 0 | switch(rc) |
6944 | 0 | { |
6945 | | /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched |
6946 | | the SKIP's arg was not found. In this circumstance, Perl ignores the SKIP |
6947 | | entirely. The only way we can do that is to re-do the match at the same |
6948 | | point, with a flag to force SKIP with an argument to be ignored. Just |
6949 | | treating this case as NOMATCH does not work because it does not check other |
6950 | | alternatives in patterns such as A(*SKIP:A)B|AC when the subject is AC. */ |
6951 | | |
6952 | 0 | case MATCH_SKIP_ARG: |
6953 | 0 | new_start_match = start_match; |
6954 | 0 | md->ignore_skip_arg = md->skip_arg_count; |
6955 | 0 | break; |
6956 | | |
6957 | | /* SKIP passes back the next starting point explicitly, but if it is no |
6958 | | greater than the match we have just done, treat it as NOMATCH. */ |
6959 | | |
6960 | 0 | case MATCH_SKIP: |
6961 | 0 | if (md->start_match_ptr > start_match) |
6962 | 0 | { |
6963 | 0 | new_start_match = md->start_match_ptr; |
6964 | 0 | break; |
6965 | 0 | } |
6966 | | /* Fall through */ |
6967 | | |
6968 | | /* NOMATCH and PRUNE advance by one character. THEN at this level acts |
6969 | | exactly like PRUNE. Unset ignore SKIP-with-argument. */ |
6970 | | |
6971 | 0 | case MATCH_NOMATCH: |
6972 | 0 | case MATCH_PRUNE: |
6973 | 0 | case MATCH_THEN: |
6974 | 0 | md->ignore_skip_arg = 0; |
6975 | 0 | new_start_match = start_match + 1; |
6976 | | #ifdef SUPPORT_UTF |
6977 | | if (utf) |
6978 | | ACROSSCHAR(new_start_match < end_subject, *new_start_match, |
6979 | | new_start_match++); |
6980 | | #endif |
6981 | 0 | break; |
6982 | | |
6983 | | /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */ |
6984 | | |
6985 | 0 | case MATCH_COMMIT: |
6986 | 0 | rc = MATCH_NOMATCH; |
6987 | 0 | goto ENDLOOP; |
6988 | | |
6989 | | /* Any other return is either a match, or some kind of error. */ |
6990 | | |
6991 | 0 | default: |
6992 | 0 | goto ENDLOOP; |
6993 | 0 | } |
6994 | | |
6995 | | /* Control reaches here for the various types of "no match at this point" |
6996 | | result. Reset the code to MATCH_NOMATCH for subsequent checking. */ |
6997 | | |
6998 | 0 | rc = MATCH_NOMATCH; |
6999 | | |
7000 | | /* If PCRE_FIRSTLINE is set, the match must happen before or at the first |
7001 | | newline in the subject (though it may continue over the newline). Therefore, |
7002 | | if we have just failed to match, starting at a newline, do not continue. */ |
7003 | |
|
7004 | 0 | if (firstline && IS_NEWLINE(start_match)) break; |
7005 | | |
7006 | | /* Advance to new matching position */ |
7007 | | |
7008 | 0 | start_match = new_start_match; |
7009 | | |
7010 | | /* Break the loop if the pattern is anchored or if we have passed the end of |
7011 | | the subject. */ |
7012 | |
|
7013 | 0 | if (anchored || start_match > end_subject) break; |
7014 | | |
7015 | | /* If we have just passed a CR and we are now at a LF, and the pattern does |
7016 | | not contain any explicit matches for \r or \n, and the newline option is CRLF |
7017 | | or ANY or ANYCRLF, advance the match position by one more character. In |
7018 | | normal matching start_match will always be greater than the first position at |
7019 | | this stage, but a failed *SKIP can cause a return at the same point, which is |
7020 | | why the first test exists. */ |
7021 | | |
7022 | 0 | if (start_match > (PCRE_PUCHAR)subject + start_offset && |
7023 | 0 | start_match[-1] == CHAR_CR && |
7024 | 0 | start_match < end_subject && |
7025 | 0 | *start_match == CHAR_NL && |
7026 | 0 | (re->flags & PCRE_HASCRORLF) == 0 && |
7027 | 0 | (md->nltype == NLTYPE_ANY || |
7028 | 0 | md->nltype == NLTYPE_ANYCRLF || |
7029 | 0 | md->nllen == 2)) |
7030 | 0 | start_match++; |
7031 | |
|
7032 | 0 | md->mark = NULL; /* Reset for start of next match attempt */ |
7033 | 0 | } /* End of for(;;) "bumpalong" loop */ |
7034 | | |
7035 | | /* ==========================================================================*/ |
7036 | | |
7037 | | /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping |
7038 | | conditions is true: |
7039 | | |
7040 | | (1) The pattern is anchored or the match was failed by (*COMMIT); |
7041 | | |
7042 | | (2) We are past the end of the subject; |
7043 | | |
7044 | | (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because |
7045 | | this option requests that a match occur at or before the first newline in |
7046 | | the subject. |
7047 | | |
7048 | | When we have a match and the offset vector is big enough to deal with any |
7049 | | backreferences, captured substring offsets will already be set up. In the case |
7050 | | where we had to get some local store to hold offsets for backreference |
7051 | | processing, copy those that we can. In this case there need not be overflow if |
7052 | | certain parts of the pattern were not used, even though there are more |
7053 | | capturing parentheses than vector slots. */ |
7054 | | |
7055 | 28.4k | ENDLOOP: |
7056 | | |
7057 | 28.4k | if (rc == MATCH_MATCH || rc == MATCH_ACCEPT) |
7058 | 0 | { |
7059 | 0 | if (using_temporary_offsets) |
7060 | 0 | { |
7061 | 0 | if (arg_offset_max >= 4) |
7062 | 0 | { |
7063 | 0 | memcpy(offsets + 2, md->offset_vector + 2, |
7064 | 0 | (arg_offset_max - 2) * sizeof(int)); |
7065 | 0 | DPRINTF(("Copied offsets from temporary memory\n")); |
7066 | 0 | } |
7067 | 0 | if (md->end_offset_top > arg_offset_max) md->capture_last |= OVFLBIT; |
7068 | 0 | DPRINTF(("Freeing temporary memory\n")); |
7069 | 0 | (PUBL(free))(md->offset_vector); |
7070 | 0 | } |
7071 | | |
7072 | | /* Set the return code to the number of captured strings, or 0 if there were |
7073 | | too many to fit into the vector. */ |
7074 | |
|
7075 | 0 | rc = ((md->capture_last & OVFLBIT) != 0 && |
7076 | 0 | md->end_offset_top >= arg_offset_max)? |
7077 | 0 | 0 : md->end_offset_top/2; |
7078 | | |
7079 | | /* If there is space in the offset vector, set any unused pairs at the end of |
7080 | | the pattern to -1 for backwards compatibility. It is documented that this |
7081 | | happens. In earlier versions, the whole set of potential capturing offsets |
7082 | | was set to -1 each time round the loop, but this is handled differently now. |
7083 | | "Gaps" are set to -1 dynamically instead (this fixes a bug). Thus, it is only |
7084 | | those at the end that need unsetting here. We can't just unset them all at |
7085 | | the start of the whole thing because they may get set in one branch that is |
7086 | | not the final matching branch. */ |
7087 | |
|
7088 | 0 | if (md->end_offset_top/2 <= re->top_bracket && offsets != NULL) |
7089 | 0 | { |
7090 | 0 | register int *iptr, *iend; |
7091 | 0 | int resetcount = 2 + re->top_bracket * 2; |
7092 | 0 | if (resetcount > offsetcount) resetcount = offsetcount; |
7093 | 0 | iptr = offsets + md->end_offset_top; |
7094 | 0 | iend = offsets + resetcount; |
7095 | 0 | while (iptr < iend) *iptr++ = -1; |
7096 | 0 | } |
7097 | | |
7098 | | /* If there is space, set up the whole thing as substring 0. The value of |
7099 | | md->start_match_ptr might be modified if \K was encountered on the success |
7100 | | matching path. */ |
7101 | |
|
7102 | 0 | if (offsetcount < 2) rc = 0; else |
7103 | 0 | { |
7104 | 0 | offsets[0] = (int)(md->start_match_ptr - md->start_subject); |
7105 | 0 | offsets[1] = (int)(md->end_match_ptr - md->start_subject); |
7106 | 0 | } |
7107 | | |
7108 | | /* Return MARK data if requested */ |
7109 | |
|
7110 | 0 | if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0) |
7111 | 0 | *(extra_data->mark) = (pcre_uchar *)md->mark; |
7112 | 0 | DPRINTF((">>>> returning %d\n", rc)); |
7113 | 0 | #ifdef NO_RECURSE |
7114 | 0 | release_match_heapframes(&frame_zero); |
7115 | 0 | #endif |
7116 | 0 | return rc; |
7117 | 0 | } |
7118 | | |
7119 | | /* Control gets here if there has been an error, or if the overall match |
7120 | | attempt has failed at all permitted starting positions. */ |
7121 | | |
7122 | 28.4k | if (using_temporary_offsets) |
7123 | 0 | { |
7124 | 0 | DPRINTF(("Freeing temporary memory\n")); |
7125 | 0 | (PUBL(free))(md->offset_vector); |
7126 | 0 | } |
7127 | | |
7128 | | /* For anything other than nomatch or partial match, just return the code. */ |
7129 | | |
7130 | 28.4k | if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL) |
7131 | 0 | { |
7132 | 0 | DPRINTF((">>>> error: returning %d\n", rc)); |
7133 | 0 | #ifdef NO_RECURSE |
7134 | 0 | release_match_heapframes(&frame_zero); |
7135 | 0 | #endif |
7136 | 0 | return rc; |
7137 | 0 | } |
7138 | | |
7139 | | /* Handle partial matches - disable any mark data */ |
7140 | | |
7141 | 28.4k | if (match_partial != NULL) |
7142 | 0 | { |
7143 | 0 | DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n")); |
7144 | 0 | md->mark = NULL; |
7145 | 0 | if (offsetcount > 1) |
7146 | 0 | { |
7147 | 0 | offsets[0] = (int)(start_partial - (PCRE_PUCHAR)subject); |
7148 | 0 | offsets[1] = (int)(end_subject - (PCRE_PUCHAR)subject); |
7149 | 0 | if (offsetcount > 2) |
7150 | 0 | offsets[2] = (int)(match_partial - (PCRE_PUCHAR)subject); |
7151 | 0 | } |
7152 | 0 | rc = PCRE_ERROR_PARTIAL; |
7153 | 0 | } |
7154 | | |
7155 | | /* This is the classic nomatch case */ |
7156 | | |
7157 | 28.4k | else |
7158 | 28.4k | { |
7159 | 28.4k | DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n")); |
7160 | 28.4k | rc = PCRE_ERROR_NOMATCH; |
7161 | 28.4k | } |
7162 | | |
7163 | | /* Return the MARK data if it has been requested. */ |
7164 | | |
7165 | 28.4k | if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0) |
7166 | 0 | *(extra_data->mark) = (pcre_uchar *)md->nomatch_mark; |
7167 | 28.4k | #ifdef NO_RECURSE |
7168 | 28.4k | release_match_heapframes(&frame_zero); |
7169 | 28.4k | #endif |
7170 | 28.4k | return rc; |
7171 | 28.4k | } |
7172 | | |
7173 | | /* End of pcre_exec.c */ |