Coverage Report

Created: 2025-11-16 06:23

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/php-src/ext/pcre/pcre2lib/pcre2_dfa_match.c
Line
Count
Source
1
/*************************************************
2
*      Perl-Compatible Regular Expressions       *
3
*************************************************/
4
5
/* PCRE is a library of functions to support regular expressions whose syntax
6
and semantics are as close as possible to those of the Perl 5 language.
7
8
                       Written by Philip Hazel
9
     Original API code Copyright (c) 1997-2012 University of Cambridge
10
          New API code Copyright (c) 2016-2023 University of Cambridge
11
12
-----------------------------------------------------------------------------
13
Redistribution and use in source and binary forms, with or without
14
modification, are permitted provided that the following conditions are met:
15
16
    * Redistributions of source code must retain the above copyright notice,
17
      this list of conditions and the following disclaimer.
18
19
    * Redistributions in binary form must reproduce the above copyright
20
      notice, this list of conditions and the following disclaimer in the
21
      documentation and/or other materials provided with the distribution.
22
23
    * Neither the name of the University of Cambridge nor the names of its
24
      contributors may be used to endorse or promote products derived from
25
      this software without specific prior written permission.
26
27
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37
POSSIBILITY OF SUCH DAMAGE.
38
-----------------------------------------------------------------------------
39
*/
40
41
42
/* This module contains the external function pcre2_dfa_match(), which is an
43
alternative matching function that uses a sort of DFA algorithm (not a true
44
FSM). This is NOT Perl-compatible, but it has advantages in certain
45
applications. */
46
47
48
/* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49
the performance of his patterns greatly. I could not use it as it stood, as it
50
was not thread safe, and made assumptions about pattern sizes. Also, it caused
51
test 7 to loop, and test 9 to crash with a segfault.
52
53
The issue is the check for duplicate states, which is done by a simple linear
54
search up the state list. (Grep for "duplicate" below to find the code.) For
55
many patterns, there will never be many states active at one time, so a simple
56
linear search is fine. In patterns that have many active states, it might be a
57
bottleneck. The suggested code used an indexing scheme to remember which states
58
had previously been used for each character, and avoided the linear search when
59
it knew there was no chance of a duplicate. This was implemented when adding
60
states to the state lists.
61
62
I wrote some thread-safe, not-limited code to try something similar at the time
63
of checking for duplicates (instead of when adding states), using index vectors
64
on the stack. It did give a 13% improvement with one specially constructed
65
pattern for certain subject strings, but on other strings and on many of the
66
simpler patterns in the test suite it did worse. The major problem, I think,
67
was the extra time to initialize the index. This had to be done for each call
68
of internal_dfa_match(). (The supplied patch used a static vector, initialized
69
only once - I suspect this was the cause of the problems with the tests.)
70
71
Overall, I concluded that the gains in some cases did not outweigh the losses
72
in others, so I abandoned this code. */
73
74
75
#ifdef HAVE_CONFIG_H
76
#include "config.h"
77
#endif
78
79
0
#define NLBLOCK mb             /* Block containing newline information */
80
0
#define PSSTART start_subject  /* Field containing processed string start */
81
0
#define PSEND   end_subject    /* Field containing processed string end */
82
83
#include "pcre2_internal.h"
84
85
#define PUBLIC_DFA_MATCH_OPTIONS \
86
0
  (PCRE2_ANCHORED|PCRE2_ENDANCHORED|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY| \
87
0
   PCRE2_NOTEMPTY_ATSTART|PCRE2_NO_UTF_CHECK|PCRE2_PARTIAL_HARD| \
88
0
   PCRE2_PARTIAL_SOFT|PCRE2_DFA_SHORTEST|PCRE2_DFA_RESTART| \
89
0
   PCRE2_COPY_MATCHED_SUBJECT)
90
91
92
/*************************************************
93
*      Code parameters and static tables         *
94
*************************************************/
95
96
/* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
97
into others, under special conditions. A gap of 20 between the blocks should be
98
enough. The resulting opcodes don't have to be less than 256 because they are
99
never stored, so we push them well clear of the normal opcodes. */
100
101
0
#define OP_PROP_EXTRA       300
102
0
#define OP_EXTUNI_EXTRA     320
103
0
#define OP_ANYNL_EXTRA      340
104
0
#define OP_HSPACE_EXTRA     360
105
0
#define OP_VSPACE_EXTRA     380
106
107
108
/* This table identifies those opcodes that are followed immediately by a
109
character that is to be tested in some way. This makes it possible to
110
centralize the loading of these characters. In the case of Type * etc, the
111
"character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
112
small value. Non-zero values in the table are the offsets from the opcode where
113
the character is to be found. ***NOTE*** If the start of this table is
114
modified, the three tables that follow must also be modified. */
115
116
static const uint8_t coptable[] = {
117
  0,                             /* End                                    */
118
  0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
119
  0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
120
  0, 0, 0,                       /* Any, AllAny, Anybyte                   */
121
  0, 0,                          /* \P, \p                                 */
122
  0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
123
  0,                             /* \X                                     */
124
  0, 0, 0, 0, 0, 0,              /* \Z, \z, $, $M, ^, ^M                   */
125
  1,                             /* Char                                   */
126
  1,                             /* Chari                                  */
127
  1,                             /* not                                    */
128
  1,                             /* noti                                   */
129
  /* Positive single-char repeats                                          */
130
  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
131
  1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto, minupto                          */
132
  1+IMM2_SIZE,                   /* exact                                  */
133
  1, 1, 1, 1+IMM2_SIZE,          /* *+, ++, ?+, upto+                      */
134
  1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
135
  1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto I, minupto I                      */
136
  1+IMM2_SIZE,                   /* exact I                                */
137
  1, 1, 1, 1+IMM2_SIZE,          /* *+I, ++I, ?+I, upto+I                  */
138
  /* Negative single-char repeats - only for chars < 256                   */
139
  1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
140
  1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto, minupto                      */
141
  1+IMM2_SIZE,                   /* NOT exact                              */
142
  1, 1, 1, 1+IMM2_SIZE,          /* NOT *+, ++, ?+, upto+                  */
143
  1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
144
  1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto I, minupto I                  */
145
  1+IMM2_SIZE,                   /* NOT exact I                            */
146
  1, 1, 1, 1+IMM2_SIZE,          /* NOT *+I, ++I, ?+I, upto+I              */
147
  /* Positive type repeats                                                 */
148
  1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
149
  1+IMM2_SIZE, 1+IMM2_SIZE,      /* Type upto, minupto                     */
150
  1+IMM2_SIZE,                   /* Type exact                             */
151
  1, 1, 1, 1+IMM2_SIZE,          /* Type *+, ++, ?+, upto+                 */
152
  /* Character class & ref repeats                                         */
153
  0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
154
  0, 0,                          /* CRRANGE, CRMINRANGE                    */
155
  0, 0, 0, 0,                    /* Possessive *+, ++, ?+, CRPOSRANGE      */
156
  0,                             /* CLASS                                  */
157
  0,                             /* NCLASS                                 */
158
  0,                             /* XCLASS - variable length               */
159
  0,                             /* REF                                    */
160
  0,                             /* REFI                                   */
161
  0,                             /* DNREF                                  */
162
  0,                             /* DNREFI                                 */
163
  0,                             /* RECURSE                                */
164
  0,                             /* CALLOUT                                */
165
  0,                             /* CALLOUT_STR                            */
166
  0,                             /* Alt                                    */
167
  0,                             /* Ket                                    */
168
  0,                             /* KetRmax                                */
169
  0,                             /* KetRmin                                */
170
  0,                             /* KetRpos                                */
171
  0, 0,                          /* Reverse, Vreverse                      */
172
  0,                             /* Assert                                 */
173
  0,                             /* Assert not                             */
174
  0,                             /* Assert behind                          */
175
  0,                             /* Assert behind not                      */
176
  0,                             /* NA assert                              */
177
  0,                             /* NA assert behind                       */
178
  0,                             /* ONCE                                   */
179
  0,                             /* SCRIPT_RUN                             */
180
  0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
181
  0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
182
  0, 0,                          /* CREF, DNCREF                           */
183
  0, 0,                          /* RREF, DNRREF                           */
184
  0, 0,                          /* FALSE, TRUE                            */
185
  0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
186
  0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
187
  0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
188
  0, 0,                          /* COMMIT, COMMIT_ARG                     */
189
  0, 0, 0,                       /* FAIL, ACCEPT, ASSERT_ACCEPT            */
190
  0, 0, 0,                       /* CLOSE, SKIPZERO, DEFINE                */
191
  0, 0                           /* \B and \b in UCP mode                  */
192
};
193
194
/* This table identifies those opcodes that inspect a character. It is used to
195
remember the fact that a character could have been inspected when the end of
196
the subject is reached. ***NOTE*** If the start of this table is modified, the
197
two tables that follow must also be modified. */
198
199
static const uint8_t poptable[] = {
200
  0,                             /* End                                    */
201
  0, 0, 0, 1, 1,                 /* \A, \G, \K, \B, \b                     */
202
  1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w                 */
203
  1, 1, 1,                       /* Any, AllAny, Anybyte                   */
204
  1, 1,                          /* \P, \p                                 */
205
  1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v                     */
206
  1,                             /* \X                                     */
207
  0, 0, 0, 0, 0, 0,              /* \Z, \z, $, $M, ^, ^M                   */
208
  1,                             /* Char                                   */
209
  1,                             /* Chari                                  */
210
  1,                             /* not                                    */
211
  1,                             /* noti                                   */
212
  /* Positive single-char repeats                                          */
213
  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
214
  1, 1, 1,                       /* upto, minupto, exact                   */
215
  1, 1, 1, 1,                    /* *+, ++, ?+, upto+                      */
216
  1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
217
  1, 1, 1,                       /* upto I, minupto I, exact I             */
218
  1, 1, 1, 1,                    /* *+I, ++I, ?+I, upto+I                  */
219
  /* Negative single-char repeats - only for chars < 256                   */
220
  1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
221
  1, 1, 1,                       /* NOT upto, minupto, exact               */
222
  1, 1, 1, 1,                    /* NOT *+, ++, ?+, upto+                  */
223
  1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
224
  1, 1, 1,                       /* NOT upto I, minupto I, exact I         */
225
  1, 1, 1, 1,                    /* NOT *+I, ++I, ?+I, upto+I              */
226
  /* Positive type repeats                                                 */
227
  1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
228
  1, 1, 1,                       /* Type upto, minupto, exact              */
229
  1, 1, 1, 1,                    /* Type *+, ++, ?+, upto+                 */
230
  /* Character class & ref repeats                                         */
231
  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
232
  1, 1,                          /* CRRANGE, CRMINRANGE                    */
233
  1, 1, 1, 1,                    /* Possessive *+, ++, ?+, CRPOSRANGE      */
234
  1,                             /* CLASS                                  */
235
  1,                             /* NCLASS                                 */
236
  1,                             /* XCLASS - variable length               */
237
  0,                             /* REF                                    */
238
  0,                             /* REFI                                   */
239
  0,                             /* DNREF                                  */
240
  0,                             /* DNREFI                                 */
241
  0,                             /* RECURSE                                */
242
  0,                             /* CALLOUT                                */
243
  0,                             /* CALLOUT_STR                            */
244
  0,                             /* Alt                                    */
245
  0,                             /* Ket                                    */
246
  0,                             /* KetRmax                                */
247
  0,                             /* KetRmin                                */
248
  0,                             /* KetRpos                                */
249
  0, 0,                          /* Reverse, Vreverse                      */
250
  0,                             /* Assert                                 */
251
  0,                             /* Assert not                             */
252
  0,                             /* Assert behind                          */
253
  0,                             /* Assert behind not                      */
254
  0,                             /* NA assert                              */
255
  0,                             /* NA assert behind                       */
256
  0,                             /* ONCE                                   */
257
  0,                             /* SCRIPT_RUN                             */
258
  0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
259
  0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
260
  0, 0,                          /* CREF, DNCREF                           */
261
  0, 0,                          /* RREF, DNRREF                           */
262
  0, 0,                          /* FALSE, TRUE                            */
263
  0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
264
  0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
265
  0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
266
  0, 0,                          /* COMMIT, COMMIT_ARG                     */
267
  0, 0, 0,                       /* FAIL, ACCEPT, ASSERT_ACCEPT            */
268
  0, 0, 0,                       /* CLOSE, SKIPZERO, DEFINE                */
269
  1, 1                           /* \B and \b in UCP mode                  */
270
};
271
272
/* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
273
and \w */
274
275
/* First of the two compact type-test tables. The row labels below follow the
opcode order at the start of coptable (which the tables after it must mirror).
Both members of each \D/\d, \S/\s and \W/\w pair carry the same ctype bit here;
the entries before them and the OP_ANY/OP_ALLANY entries are zero. */
static const uint8_t toptable1[] = {
276
  0, 0, 0, 0, 0, 0,               /* End, \A, \G, \K, \B, \b */
277
  ctype_digit, ctype_digit,       /* \D, \d */
278
  ctype_space, ctype_space,       /* \S, \s */
279
  ctype_word,  ctype_word,        /* \W, \w */
280
  0, 0                            /* OP_ANY, OP_ALLANY */
281
};
282
283
/* Second of the two compact type-test tables, laid out in the same opcode
order as toptable1. Only the first (negated) member of each \D/\d, \S/\s and
\W/\w pair carries the ctype bit here; the positive member is zero, and the
OP_ANY/OP_ALLANY entries are 1. */
static const uint8_t toptable2[] = {
284
  0, 0, 0, 0, 0, 0,               /* End, \A, \G, \K, \B, \b */
285
  ctype_digit, 0,                 /* \D, \d */
286
  ctype_space, 0,                 /* \S, \s */
287
  ctype_word,  0,                 /* \W, \w */
288
  1, 1                            /* OP_ANY, OP_ALLANY */
289
};
290
291
292
/* Structure for holding data about a particular state, which is in effect the
293
current data for an active path through the match tree. It must consist
294
entirely of ints because the working vector we are passed, and which we put
295
these structures in, is a vector of ints. */
296
297
/* One entry in a state vector. The offset is relative to the start of the
compiled code. When a lookbehind is set up, each viable branch is queued with
its offset negated and with data set to the number of characters gone back
beyond what that branch needs. */
typedef struct stateblock {
298
  int offset;                     /* Offset to opcode (-ve has meaning) */
299
  int count;                      /* Count for repeats */
300
  int data;                       /* Extra data, used only by some states */
301
} stateblock;
302
303
0
#define INTS_PER_STATEBLOCK  (int)(sizeof(stateblock)/sizeof(int))
304
305
306
/* Before version 10.32 the recursive calls of internal_dfa_match() were passed
307
local working space and output vectors that were created on the stack. This has
308
caused issues for some patterns, especially in small-stack environments such as
309
Windows. A new scheme is now in use which sets up a vector on the stack, but if
310
this is too small, heap memory is used, up to the heap_limit. The main
311
parameters are all numbers of ints because the workspace is a vector of ints.
312
313
The size of the starting stack vector, DFA_START_RWS_SIZE, is in bytes, and is
314
defined in pcre2_internal.h so as to be available to pcre2test when it is
315
finding the minimum heap requirement for a match. */
316
317
0
#define OVEC_UNIT  (sizeof(PCRE2_SIZE)/sizeof(int))
318
319
0
#define RWS_BASE_SIZE   (DFA_START_RWS_SIZE/sizeof(int))  /* Stack vector */
320
0
#define RWS_RSIZE       1000                    /* Work size for recursion */
321
0
#define RWS_OVEC_RSIZE  (1000*OVEC_UNIT)        /* Ovector for recursion */
322
0
#define RWS_OVEC_OSIZE  (2*OVEC_UNIT)           /* Ovector in other cases */
323
324
/* This structure is at the start of each workspace block. */
325
326
/* Header of a workspace block. Blocks form a singly linked chain that
more_workspace() extends on demand. */
typedef struct RWS_anchor {
327
  struct RWS_anchor *next;  /* Next block in the chain, NULL if none yet */
328
  uint32_t size;  /* Whole block, anchor included, in ints */
329
  uint32_t free;  /* Ints available; set to size less the anchor on (re)use */
330
} RWS_anchor;
331
332
0
#define RWS_ANCHOR_SIZE (sizeof(RWS_anchor)/sizeof(int))
333
334
335
336
/*************************************************
337
*               Process a callout                *
338
*************************************************/
339
340
/* This function is called to perform a callout.
341
342
Arguments:
343
  code              current code pointer
344
  offsets           points to current capture offsets
345
  current_subject   start of current subject match
346
  ptr               current position in subject
347
  mb                the match block
348
  extracode         extra code offset when called from condition
349
  lengthptr         where to return the callout length
350
351
Returns:            the return from the callout
352
*/
353
354
static int
355
do_callout_dfa(PCRE2_SPTR code, PCRE2_SIZE *offsets, PCRE2_SPTR current_subject,
356
  PCRE2_SPTR ptr, dfa_match_block *mb, PCRE2_SIZE extracode,
357
  PCRE2_SIZE *lengthptr)
358
0
{
359
0
pcre2_callout_block *cb = mb->cb;
360
361
0
*lengthptr = (code[extracode] == OP_CALLOUT)?
362
0
  (PCRE2_SIZE)PRIV(OP_lengths)[OP_CALLOUT] :
363
0
  (PCRE2_SIZE)GET(code, 1 + 2*LINK_SIZE + extracode);
364
365
0
if (mb->callout == NULL) return 0;    /* No callout provided */
366
367
/* Fixed fields in the callout block are set once and for all at the start of
368
matching. */
369
370
0
cb->offset_vector    = offsets;
371
0
cb->start_match      = (PCRE2_SIZE)(current_subject - mb->start_subject);
372
0
cb->current_position = (PCRE2_SIZE)(ptr - mb->start_subject);
373
0
cb->pattern_position = GET(code, 1 + extracode);
374
0
cb->next_item_length = GET(code, 1 + LINK_SIZE + extracode);
375
376
0
if (code[extracode] == OP_CALLOUT)
377
0
  {
378
0
  cb->callout_number = code[1 + 2*LINK_SIZE + extracode];
379
0
  cb->callout_string_offset = 0;
380
0
  cb->callout_string = NULL;
381
0
  cb->callout_string_length = 0;
382
0
  }
383
0
else
384
0
  {
385
0
  cb->callout_number = 0;
386
0
  cb->callout_string_offset = GET(code, 1 + 3*LINK_SIZE + extracode);
387
0
  cb->callout_string = code + (1 + 4*LINK_SIZE + extracode) + 1;
388
0
  cb->callout_string_length = *lengthptr - (1 + 4*LINK_SIZE) - 2;
389
0
  }
390
391
0
return (mb->callout)(cb, mb->callout_data);
392
0
}
393
394
395
396
/*************************************************
397
*         Expand local workspace memory          *
398
*************************************************/
399
400
/* This function is called when internal_dfa_match() is about to be called
401
recursively and there is insufficient working space left in the current
402
workspace block. If there's an existing next block, use it; otherwise get a new
403
block unless the heap limit is reached.
404
405
Arguments:
406
  rwsptr     pointer to block pointer (updated)
407
  ovecsize   space needed for an ovector
408
  mb         the match block
409
410
Returns:     0 rwsptr has been updated
411
            !0 an error code
412
*/
413
414
static int
415
more_workspace(RWS_anchor **rwsptr, unsigned int ovecsize, dfa_match_block *mb)
416
0
{
417
0
RWS_anchor *rws = *rwsptr;
418
0
RWS_anchor *new;
419
420
0
if (rws->next != NULL)
421
0
  {
422
0
  new = rws->next;
423
0
  }
424
425
/* Sizes in the RWS_anchor blocks are in units of sizeof(int), but
426
mb->heap_limit and mb->heap_used are in kibibytes. Play carefully, to avoid
427
overflow. */
428
429
0
else
430
0
  {
431
0
  uint32_t newsize = (rws->size >= UINT32_MAX/(sizeof(int)*2))? UINT32_MAX/sizeof(int) : rws->size * 2;
432
0
  uint32_t newsizeK = newsize/(1024/sizeof(int));
433
434
0
  if (newsizeK + mb->heap_used > mb->heap_limit)
435
0
    newsizeK = (uint32_t)(mb->heap_limit - mb->heap_used);
436
0
  newsize = newsizeK*(1024/sizeof(int));
437
438
0
  if (newsize < RWS_RSIZE + ovecsize + RWS_ANCHOR_SIZE)
439
0
    return PCRE2_ERROR_HEAPLIMIT;
440
0
  new = mb->memctl.malloc(newsize*sizeof(int), mb->memctl.memory_data);
441
0
  if (new == NULL) return PCRE2_ERROR_NOMEMORY;
442
0
  mb->heap_used += newsizeK;
443
0
  new->next = NULL;
444
0
  new->size = newsize;
445
0
  rws->next = new;
446
0
  }
447
448
0
new->free = new->size - RWS_ANCHOR_SIZE;
449
0
*rwsptr = new;
450
0
return 0;
451
0
}
452
453
454
455
/*************************************************
456
*     Match a Regular Expression - DFA engine    *
457
*************************************************/
458
459
/* This internal function applies a compiled pattern to a subject string,
460
starting at a given point, using a DFA engine. This function is called from the
461
external one, possibly multiple times if the pattern is not anchored. The
462
function calls itself recursively for some kinds of subpattern.
463
464
Arguments:
465
  mb                the match_data block with fixed information
466
  this_start_code   the opening bracket of this subexpression's code
467
  current_subject   where we currently are in the subject string
468
  start_offset      start offset in the subject string
469
  offsets           vector to contain the matching string offsets
470
  offsetcount       size of same
471
  workspace         vector of workspace
472
  wscount           size of same
473
  rlevel            function call recursion level
474
475
Returns:            > 0 => number of match offset pairs placed in offsets
476
                    = 0 => offsets overflowed; longest matches are present
477
                     -1 => failed to match
478
                   < -1 => some kind of unexpected problem
479
480
The following macros are used for adding states to the two state vectors (one
481
for the current character, one for the following character). */
482
483
/* Append a state with offset x and count y through next_active_state; the
data field is not written. The expansion is an if/else with no final
semicolon, so the semicolon at the point of use completes the "else return":
when active_count has reached wscount the enclosing function returns
PCRE2_ERROR_DFA_WSSIZE. Note that active_count is incremented either way. */
#define ADD_ACTIVE(x,y) \
484
0
  if (active_count++ < wscount) \
485
0
    { \
486
0
    next_active_state->offset = (x); \
487
0
    next_active_state->count  = (y); \
488
0
    next_active_state++; \
489
0
    } \
490
0
  else return PCRE2_ERROR_DFA_WSSIZE
491
492
/* Append a state with offset x, count y and data z through next_active_state.
The expansion is an if/else with no final semicolon: when active_count has
reached wscount the enclosing function returns PCRE2_ERROR_DFA_WSSIZE.
active_count is incremented either way. */
#define ADD_ACTIVE_DATA(x,y,z) \
493
  if (active_count++ < wscount) \
494
    { \
495
    next_active_state->offset = (x); \
496
    next_active_state->count  = (y); \
497
    next_active_state->data   = (z); \
498
    next_active_state++; \
499
    } \
500
  else return PCRE2_ERROR_DFA_WSSIZE
501
502
/* Append a state with offset x and count y through next_new_state; the data
field is not written. The expansion is an if/else with no final semicolon:
when new_count has reached wscount the enclosing function returns
PCRE2_ERROR_DFA_WSSIZE. new_count is incremented either way. */
#define ADD_NEW(x,y) \
503
0
  if (new_count++ < wscount) \
504
0
    { \
505
0
    next_new_state->offset = (x); \
506
0
    next_new_state->count  = (y); \
507
0
    next_new_state++; \
508
0
    } \
509
0
  else return PCRE2_ERROR_DFA_WSSIZE
510
511
/* Append a state with offset x, count y and data z through next_new_state.
The expansion is an if/else with no final semicolon: when new_count has
reached wscount the enclosing function returns PCRE2_ERROR_DFA_WSSIZE.
new_count is incremented either way. */
#define ADD_NEW_DATA(x,y,z) \
512
0
  if (new_count++ < wscount) \
513
0
    { \
514
0
    next_new_state->offset = (x); \
515
0
    next_new_state->count  = (y); \
516
0
    next_new_state->data   = (z); \
517
0
    next_new_state++; \
518
0
    } \
519
0
  else return PCRE2_ERROR_DFA_WSSIZE
520
521
/* And now, here is the code */
522
523
static int
524
internal_dfa_match(
525
  dfa_match_block *mb,
526
  PCRE2_SPTR this_start_code,
527
  PCRE2_SPTR current_subject,
528
  PCRE2_SIZE start_offset,
529
  PCRE2_SIZE *offsets,
530
  uint32_t offsetcount,
531
  int *workspace,
532
  int wscount,
533
  uint32_t rlevel,
534
  int *RWS)
535
0
{
536
0
stateblock *active_states, *new_states, *temp_states;
537
0
stateblock *next_active_state, *next_new_state;
538
0
const uint8_t *ctypes, *lcc, *fcc;
539
0
PCRE2_SPTR ptr;
540
0
PCRE2_SPTR end_code;
541
0
dfa_recursion_info new_recursive;
542
0
int active_count, new_count, match_count;
543
544
/* Some fields in the mb block are frequently referenced, so we load them into
545
independent variables in the hope that this will perform better. */
546
547
0
PCRE2_SPTR start_subject = mb->start_subject;
548
0
PCRE2_SPTR end_subject = mb->end_subject;
549
0
PCRE2_SPTR start_code = mb->start_code;
550
551
0
#ifdef SUPPORT_UNICODE
552
0
BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
553
0
BOOL utf_or_ucp = utf || (mb->poptions & PCRE2_UCP) != 0;
554
#else
555
BOOL utf = FALSE;
556
#endif
557
558
0
BOOL reset_could_continue = FALSE;
559
560
0
if (mb->match_call_count++ >= mb->match_limit) return PCRE2_ERROR_MATCHLIMIT;
561
0
if (rlevel++ > mb->match_limit_depth) return PCRE2_ERROR_DEPTHLIMIT;
562
0
offsetcount &= (uint32_t)(-2);  /* Round down */
563
564
0
wscount -= 2;
565
0
wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
566
0
          (2 * INTS_PER_STATEBLOCK);
567
568
0
ctypes = mb->tables + ctypes_offset;
569
0
lcc = mb->tables + lcc_offset;
570
0
fcc = mb->tables + fcc_offset;
571
572
0
match_count = PCRE2_ERROR_NOMATCH;   /* A negative number */
573
574
0
active_states = (stateblock *)(workspace + 2);
575
0
next_new_state = new_states = active_states + wscount;
576
0
new_count = 0;
577
578
/* The first thing in any (sub) pattern is a bracket of some sort. Push all
579
the alternative states onto the list, and find out where the end is. This
580
makes it possible to use this function recursively, when we want to stop at a
581
matching internal ket rather than at the end.
582
583
If we are dealing with a backward assertion we have to find out the maximum
584
amount to move back, and set up each alternative appropriately. */
585
586
0
if (*this_start_code == OP_ASSERTBACK || *this_start_code == OP_ASSERTBACK_NOT)
587
0
  {
588
0
  size_t max_back = 0;
589
0
  size_t gone_back;
590
591
0
  end_code = this_start_code;
592
0
  do
593
0
    {
594
0
    size_t back = (size_t)GET2(end_code, 2+LINK_SIZE);
595
0
    if (back > max_back) max_back = back;
596
0
    end_code += GET(end_code, 1);
597
0
    }
598
0
  while (*end_code == OP_ALT);
599
600
  /* If we can't go back the amount required for the longest lookbehind
601
  pattern, go back as far as we can; some alternatives may still be viable. */
602
603
0
#ifdef SUPPORT_UNICODE
604
  /* In character mode we have to step back character by character */
605
606
0
  if (utf)
607
0
    {
608
0
    for (gone_back = 0; gone_back < max_back; gone_back++)
609
0
      {
610
0
      if (current_subject <= start_subject) break;
611
0
      current_subject--;
612
0
      ACROSSCHAR(current_subject > start_subject, current_subject,
613
0
        current_subject--);
614
0
      }
615
0
    }
616
0
  else
617
0
#endif
618
619
  /* In byte-mode we can do this quickly. */
620
621
0
    {
622
0
    size_t current_offset = (size_t)(current_subject - start_subject);
623
0
    gone_back = (current_offset < max_back)? current_offset : max_back;
624
0
    current_subject -= gone_back;
625
0
    }
626
627
  /* Save the earliest consulted character */
628
629
0
  if (current_subject < mb->start_used_ptr)
630
0
    mb->start_used_ptr = current_subject;
631
632
  /* Now we can process the individual branches. There will be an OP_REVERSE at
633
  the start of each branch, except when the length of the branch is zero. */
634
635
0
  end_code = this_start_code;
636
0
  do
637
0
    {
638
0
    uint32_t revlen = (end_code[1+LINK_SIZE] == OP_REVERSE)? 1 + IMM2_SIZE : 0;
639
0
    size_t back = (revlen == 0)? 0 : (size_t)GET2(end_code, 2+LINK_SIZE);
640
0
    if (back <= gone_back)
641
0
      {
642
0
      int bstate = (int)(end_code - start_code + 1 + LINK_SIZE + revlen);
643
0
      ADD_NEW_DATA(-bstate, 0, (int)(gone_back - back));
644
0
      }
645
0
    end_code += GET(end_code, 1);
646
0
    }
647
0
  while (*end_code == OP_ALT);
648
0
 }
649
650
/* This is the code for a "normal" subpattern (not a backward assertion). The
651
start of a whole pattern is always one of these. If we are at the top level,
652
we may be asked to restart matching from the same point that we reached for a
653
previous partial match. We still have to scan through the top-level branches to
654
find the end state. */
655
656
0
else
657
0
  {
658
0
  end_code = this_start_code;
659
660
  /* Restarting */
661
662
0
  if (rlevel == 1 && (mb->moptions & PCRE2_DFA_RESTART) != 0)
663
0
    {
664
0
    do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
665
0
    new_count = workspace[1];
666
0
    if (!workspace[0])
667
0
      memcpy(new_states, active_states, (size_t)new_count * sizeof(stateblock));
668
0
    }
669
670
  /* Not restarting */
671
672
0
  else
673
0
    {
674
0
    int length = 1 + LINK_SIZE +
675
0
      ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
676
0
        *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
677
0
        ? IMM2_SIZE:0);
678
0
    do
679
0
      {
680
0
      ADD_NEW((int)(end_code - start_code + length), 0);
681
0
      end_code += GET(end_code, 1);
682
0
      length = 1 + LINK_SIZE;
683
0
      }
684
0
    while (*end_code == OP_ALT);
685
0
    }
686
0
  }
687
688
0
workspace[0] = 0;    /* Bit indicating which vector is current */
689
690
/* Loop for scanning the subject */
691
692
0
ptr = current_subject;
693
0
for (;;)
694
0
  {
695
0
  int i, j;
696
0
  int clen, dlen;
697
0
  uint32_t c, d;
698
0
  int forced_fail = 0;
699
0
  BOOL partial_newline = FALSE;
700
0
  BOOL could_continue = reset_could_continue;
701
0
  reset_could_continue = FALSE;
702
703
0
  if (ptr > mb->last_used_ptr) mb->last_used_ptr = ptr;
704
705
  /* Make the new state list into the active state list and empty the
706
  new state list. */
707
708
0
  temp_states = active_states;
709
0
  active_states = new_states;
710
0
  new_states = temp_states;
711
0
  active_count = new_count;
712
0
  new_count = 0;
713
714
0
  workspace[0] ^= 1;              /* Remember for the restarting feature */
715
0
  workspace[1] = active_count;
716
717
  /* Set the pointers for adding new states */
718
719
0
  next_active_state = active_states + active_count;
720
0
  next_new_state = new_states;
721
722
  /* Load the current character from the subject outside the loop, as many
723
  different states may want to look at it, and we assume that at least one
724
  will. */
725
726
0
  if (ptr < end_subject)
727
0
    {
728
0
    clen = 1;        /* Number of data items in the character */
729
0
#ifdef SUPPORT_UNICODE
730
0
    GETCHARLENTEST(c, ptr, clen);
731
#else
732
    c = *ptr;
733
#endif  /* SUPPORT_UNICODE */
734
0
    }
735
0
  else
736
0
    {
737
0
    clen = 0;        /* This indicates the end of the subject */
738
0
    c = NOTACHAR;    /* This value should never actually be used */
739
0
    }
740
741
  /* Scan up the active states and act on each one. The result of an action
742
  may be to add more states to the currently active list (e.g. on hitting a
743
  parenthesis) or it may be to put states on the new list, for considering
744
  when we move the character pointer on. */
745
746
0
  for (i = 0; i < active_count; i++)
747
0
    {
748
0
    stateblock *current_state = active_states + i;
749
0
    BOOL caseless = FALSE;
750
0
    PCRE2_SPTR code;
751
0
    uint32_t codevalue;
752
0
    int state_offset = current_state->offset;
753
0
    int rrc;
754
0
    int count;
755
756
    /* A negative offset is a special case meaning "hold off going to this
757
    (negated) state until the number of characters in the data field have
758
    been skipped". If the could_continue flag was passed over from a previous
759
    state, arrange for it to be passed on. */
760
761
0
    if (state_offset < 0)
762
0
      {
763
0
      if (current_state->data > 0)
764
0
        {
765
0
        ADD_NEW_DATA(state_offset, current_state->count,
766
0
          current_state->data - 1);
767
0
        if (could_continue) reset_could_continue = TRUE;
768
0
        continue;
769
0
        }
770
0
      else
771
0
        {
772
0
        current_state->offset = state_offset = -state_offset;
773
0
        }
774
0
      }
775
776
    /* Check for a duplicate state with the same count, and skip if found.
777
    See the note at the head of this module about the possibility of improving
778
    performance here. */
779
780
0
    for (j = 0; j < i; j++)
781
0
      {
782
0
      if (active_states[j].offset == state_offset &&
783
0
          active_states[j].count == current_state->count)
784
0
        goto NEXT_ACTIVE_STATE;
785
0
      }
786
787
    /* The state offset is the offset to the opcode */
788
789
0
    code = start_code + state_offset;
790
0
    codevalue = *code;
791
792
    /* If this opcode inspects a character, but we are at the end of the
793
    subject, remember the fact for use when testing for a partial match. */
794
795
0
    if (clen == 0 && poptable[codevalue] != 0)
796
0
      could_continue = TRUE;
797
798
    /* If this opcode is followed by an inline character, load it. It is
799
    tempting to test for the presence of a subject character here, but that
800
    is wrong, because sometimes zero repetitions of the subject are
801
    permitted.
802
803
    We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
804
    argument that is not a data character - but is always one byte long because
805
    the values are small. We have to take special action to deal with \P, \p,
806
    \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
807
    these ones to new opcodes. */
808
809
0
    if (coptable[codevalue] > 0)
810
0
      {
811
0
      dlen = 1;
812
0
#ifdef SUPPORT_UNICODE
813
0
      if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
814
0
#endif  /* SUPPORT_UNICODE */
815
0
      d = code[coptable[codevalue]];
816
0
      if (codevalue >= OP_TYPESTAR)
817
0
        {
818
0
        switch(d)
819
0
          {
820
0
          case OP_ANYBYTE: return PCRE2_ERROR_DFA_UITEM;
821
0
          case OP_NOTPROP:
822
0
          case OP_PROP: codevalue += OP_PROP_EXTRA; break;
823
0
          case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
824
0
          case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
825
0
          case OP_NOT_HSPACE:
826
0
          case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
827
0
          case OP_NOT_VSPACE:
828
0
          case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
829
0
          default: break;
830
0
          }
831
0
        }
832
0
      }
833
0
    else
834
0
      {
835
0
      dlen = 0;         /* Not strictly necessary, but compilers moan */
836
0
      d = NOTACHAR;     /* if these variables are not set. */
837
0
      }
838
839
840
    /* Now process the individual opcodes */
841
842
0
    switch (codevalue)
843
0
      {
844
/* ========================================================================== */
845
      /* These cases are never obeyed. This is a fudge that causes a compile-
846
      time error if the vectors coptable or poptable, which are indexed by
847
      opcode, are not the correct length. It seems to be the only way to do
848
      such a check at compile time, as the sizeof() operator does not work
849
      in the C preprocessor. */
850
851
0
      case OP_TABLE_LENGTH:
852
0
      case OP_TABLE_LENGTH +
853
0
        ((sizeof(coptable) == OP_TABLE_LENGTH) &&
854
0
         (sizeof(poptable) == OP_TABLE_LENGTH)):
855
0
      return 0;
856
857
/* ========================================================================== */
858
      /* Reached a closing bracket. If not at the end of the pattern, carry
859
      on with the next opcode. For repeating opcodes, also add the repeat
860
      state. Note that KETRPOS will always be encountered at the end of the
861
      subpattern, because the possessive subpattern repeats are always handled
862
      using recursive calls. Thus, it never adds any new states.
863
864
      At the end of the (sub)pattern, unless we have an empty string and
865
      PCRE2_NOTEMPTY is set, or PCRE2_NOTEMPTY_ATSTART is set and we are at the
866
      start of the subject, save the match data, shifting up all previous
867
      matches so we always have the longest first. */
868
869
0
      case OP_KET:
870
0
      case OP_KETRMIN:
871
0
      case OP_KETRMAX:
872
0
      case OP_KETRPOS:
873
0
      if (code != end_code)
874
0
        {
875
0
        ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
876
0
        if (codevalue != OP_KET)
877
0
          {
878
0
          ADD_ACTIVE(state_offset - (int)GET(code, 1), 0);
879
0
          }
880
0
        }
881
0
      else
882
0
        {
883
0
        if (ptr > current_subject ||
884
0
            ((mb->moptions & PCRE2_NOTEMPTY) == 0 &&
885
0
              ((mb->moptions & PCRE2_NOTEMPTY_ATSTART) == 0 ||
886
0
                current_subject > start_subject + mb->start_offset)))
887
0
          {
888
0
          if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
889
0
            else if (match_count > 0 && ++match_count * 2 > (int)offsetcount)
890
0
              match_count = 0;
891
0
          count = ((match_count == 0)? (int)offsetcount : match_count * 2) - 2;
892
0
          if (count > 0) (void)memmove(offsets + 2, offsets,
893
0
            (size_t)count * sizeof(PCRE2_SIZE));
894
0
          if (offsetcount >= 2)
895
0
            {
896
0
            offsets[0] = (PCRE2_SIZE)(current_subject - start_subject);
897
0
            offsets[1] = (PCRE2_SIZE)(ptr - start_subject);
898
0
            }
899
0
          if ((mb->moptions & PCRE2_DFA_SHORTEST) != 0) return match_count;
900
0
          }
901
0
        }
902
0
      break;
903
904
/* ========================================================================== */
905
      /* These opcodes add to the current list of states without looking
906
      at the current character. */
907
908
      /*-----------------------------------------------------------------*/
909
0
      case OP_ALT:
910
0
      do { code += GET(code, 1); } while (*code == OP_ALT);
911
0
      ADD_ACTIVE((int)(code - start_code), 0);
912
0
      break;
913
914
      /*-----------------------------------------------------------------*/
915
0
      case OP_BRA:
916
0
      case OP_SBRA:
917
0
      do
918
0
        {
919
0
        ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
920
0
        code += GET(code, 1);
921
0
        }
922
0
      while (*code == OP_ALT);
923
0
      break;
924
925
      /*-----------------------------------------------------------------*/
926
0
      case OP_CBRA:
927
0
      case OP_SCBRA:
928
0
      ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE),  0);
929
0
      code += GET(code, 1);
930
0
      while (*code == OP_ALT)
931
0
        {
932
0
        ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE),  0);
933
0
        code += GET(code, 1);
934
0
        }
935
0
      break;
936
937
      /*-----------------------------------------------------------------*/
938
0
      case OP_BRAZERO:
939
0
      case OP_BRAMINZERO:
940
0
      ADD_ACTIVE(state_offset + 1, 0);
941
0
      code += 1 + GET(code, 2);
942
0
      while (*code == OP_ALT) code += GET(code, 1);
943
0
      ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
944
0
      break;
945
946
      /*-----------------------------------------------------------------*/
947
0
      case OP_SKIPZERO:
948
0
      code += 1 + GET(code, 2);
949
0
      while (*code == OP_ALT) code += GET(code, 1);
950
0
      ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
951
0
      break;
952
953
      /*-----------------------------------------------------------------*/
954
0
      case OP_CIRC:
955
0
      if (ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0)
956
0
        { ADD_ACTIVE(state_offset + 1, 0); }
957
0
      break;
958
959
      /*-----------------------------------------------------------------*/
960
0
      case OP_CIRCM:
961
0
      if ((ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0) ||
962
0
          ((ptr != end_subject || (mb->poptions & PCRE2_ALT_CIRCUMFLEX) != 0 )
963
0
            && WAS_NEWLINE(ptr)))
964
0
        { ADD_ACTIVE(state_offset + 1, 0); }
965
0
      break;
966
967
      /*-----------------------------------------------------------------*/
968
0
      case OP_EOD:
969
0
      if (ptr >= end_subject)
970
0
        {
971
0
        if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
972
0
          return PCRE2_ERROR_PARTIAL;
973
0
        else { ADD_ACTIVE(state_offset + 1, 0); }
974
0
        }
975
0
      break;
976
977
      /*-----------------------------------------------------------------*/
978
0
      case OP_SOD:
979
0
      if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
980
0
      break;
981
982
      /*-----------------------------------------------------------------*/
983
0
      case OP_SOM:
984
0
      if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
985
0
      break;
986
987
988
/* ========================================================================== */
989
      /* These opcodes inspect the next subject character, and sometimes
990
      the previous one as well, but do not have an argument. The variable
991
      clen contains the length of the current character and is zero if we are
992
      at the end of the subject. */
993
994
      /*-----------------------------------------------------------------*/
995
0
      case OP_ANY:
996
0
      if (clen > 0 && !IS_NEWLINE(ptr))
997
0
        {
998
0
        if (ptr + 1 >= mb->end_subject &&
999
0
            (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1000
0
            NLBLOCK->nltype == NLTYPE_FIXED &&
1001
0
            NLBLOCK->nllen == 2 &&
1002
0
            c == NLBLOCK->nl[0])
1003
0
          {
1004
0
          could_continue = partial_newline = TRUE;
1005
0
          }
1006
0
        else
1007
0
          {
1008
0
          ADD_NEW(state_offset + 1, 0);
1009
0
          }
1010
0
        }
1011
0
      break;
1012
1013
      /*-----------------------------------------------------------------*/
1014
0
      case OP_ALLANY:
1015
0
      if (clen > 0)
1016
0
        { ADD_NEW(state_offset + 1, 0); }
1017
0
      break;
1018
1019
      /*-----------------------------------------------------------------*/
1020
0
      case OP_EODN:
1021
0
      if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - mb->nllen))
1022
0
        {
1023
0
        if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1024
0
          return PCRE2_ERROR_PARTIAL;
1025
0
        ADD_ACTIVE(state_offset + 1, 0);
1026
0
        }
1027
0
      break;
1028
1029
      /*-----------------------------------------------------------------*/
1030
0
      case OP_DOLL:
1031
0
      if ((mb->moptions & PCRE2_NOTEOL) == 0)
1032
0
        {
1033
0
        if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1034
0
          could_continue = TRUE;
1035
0
        else if (clen == 0 ||
1036
0
            ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
1037
0
               (ptr == end_subject - mb->nllen)
1038
0
            ))
1039
0
          { ADD_ACTIVE(state_offset + 1, 0); }
1040
0
        else if (ptr + 1 >= mb->end_subject &&
1041
0
                 (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
1042
0
                 NLBLOCK->nltype == NLTYPE_FIXED &&
1043
0
                 NLBLOCK->nllen == 2 &&
1044
0
                 c == NLBLOCK->nl[0])
1045
0
          {
1046
0
          if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1047
0
            {
1048
0
            reset_could_continue = TRUE;
1049
0
            ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1050
0
            }
1051
0
          else could_continue = partial_newline = TRUE;
1052
0
          }
1053
0
        }
1054
0
      break;
1055
1056
      /*-----------------------------------------------------------------*/
1057
0
      case OP_DOLLM:
1058
0
      if ((mb->moptions & PCRE2_NOTEOL) == 0)
1059
0
        {
1060
0
        if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1061
0
          could_continue = TRUE;
1062
0
        else if (clen == 0 ||
1063
0
            ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
1064
0
          { ADD_ACTIVE(state_offset + 1, 0); }
1065
0
        else if (ptr + 1 >= mb->end_subject &&
1066
0
                 (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
1067
0
                 NLBLOCK->nltype == NLTYPE_FIXED &&
1068
0
                 NLBLOCK->nllen == 2 &&
1069
0
                 c == NLBLOCK->nl[0])
1070
0
          {
1071
0
          if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1072
0
            {
1073
0
            reset_could_continue = TRUE;
1074
0
            ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1075
0
            }
1076
0
          else could_continue = partial_newline = TRUE;
1077
0
          }
1078
0
        }
1079
0
      else if (IS_NEWLINE(ptr))
1080
0
        { ADD_ACTIVE(state_offset + 1, 0); }
1081
0
      break;
1082
1083
      /*-----------------------------------------------------------------*/
1084
1085
0
      case OP_DIGIT:
1086
0
      case OP_WHITESPACE:
1087
0
      case OP_WORDCHAR:
1088
0
      if (clen > 0 && c < 256 &&
1089
0
            ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
1090
0
        { ADD_NEW(state_offset + 1, 0); }
1091
0
      break;
1092
1093
      /*-----------------------------------------------------------------*/
1094
0
      case OP_NOT_DIGIT:
1095
0
      case OP_NOT_WHITESPACE:
1096
0
      case OP_NOT_WORDCHAR:
1097
0
      if (clen > 0 && (c >= 256 ||
1098
0
            ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
1099
0
        { ADD_NEW(state_offset + 1, 0); }
1100
0
      break;
1101
1102
      /*-----------------------------------------------------------------*/
1103
0
      case OP_WORD_BOUNDARY:
1104
0
      case OP_NOT_WORD_BOUNDARY:
1105
0
      case OP_NOT_UCP_WORD_BOUNDARY:
1106
0
      case OP_UCP_WORD_BOUNDARY:
1107
0
        {
1108
0
        int left_word, right_word;
1109
1110
0
        if (ptr > start_subject)
1111
0
          {
1112
0
          PCRE2_SPTR temp = ptr - 1;
1113
0
          if (temp < mb->start_used_ptr) mb->start_used_ptr = temp;
1114
0
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1115
0
          if (utf) { BACKCHAR(temp); }
1116
0
#endif
1117
0
          GETCHARTEST(d, temp);
1118
0
#ifdef SUPPORT_UNICODE
1119
0
          if (codevalue == OP_UCP_WORD_BOUNDARY ||
1120
0
              codevalue == OP_NOT_UCP_WORD_BOUNDARY)
1121
0
            {
1122
0
            int chartype = UCD_CHARTYPE(d);
1123
0
            int category = PRIV(ucp_gentype)[chartype];
1124
0
            left_word = (category == ucp_L || category == ucp_N ||
1125
0
              chartype == ucp_Mn || chartype == ucp_Pc);
1126
0
            }
1127
0
          else
1128
0
#endif
1129
0
          left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
1130
0
          }
1131
0
        else left_word = FALSE;
1132
1133
0
        if (clen > 0)
1134
0
          {
1135
0
          if (ptr >= mb->last_used_ptr)
1136
0
            {
1137
0
            PCRE2_SPTR temp = ptr + 1;
1138
0
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1139
0
            if (utf) { FORWARDCHARTEST(temp, mb->end_subject); }
1140
0
#endif
1141
0
            mb->last_used_ptr = temp;
1142
0
            }
1143
0
#ifdef SUPPORT_UNICODE
1144
0
          if (codevalue == OP_UCP_WORD_BOUNDARY ||
1145
0
              codevalue == OP_NOT_UCP_WORD_BOUNDARY)
1146
0
            {
1147
0
            int chartype = UCD_CHARTYPE(c);
1148
0
            int category = PRIV(ucp_gentype)[chartype];
1149
0
            right_word = (category == ucp_L || category == ucp_N ||
1150
0
              chartype == ucp_Mn || chartype == ucp_Pc);
1151
0
            }
1152
0
          else
1153
0
#endif
1154
0
          right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
1155
0
          }
1156
0
        else right_word = FALSE;
1157
1158
0
        if ((left_word == right_word) ==
1159
0
            (codevalue == OP_NOT_WORD_BOUNDARY ||
1160
0
             codevalue == OP_NOT_UCP_WORD_BOUNDARY))
1161
0
          { ADD_ACTIVE(state_offset + 1, 0); }
1162
0
        }
1163
0
      break;
1164
1165
1166
      /*-----------------------------------------------------------------*/
1167
      /* Check the next character by Unicode property. We will get here only
1168
      if the support is in the binary; otherwise a compile-time error occurs.
1169
      */
1170
1171
0
#ifdef SUPPORT_UNICODE
1172
0
      case OP_PROP:
1173
0
      case OP_NOTPROP:
1174
0
      if (clen > 0)
1175
0
        {
1176
0
        BOOL OK;
1177
0
        int chartype;
1178
0
        const uint32_t *cp;
1179
0
        const ucd_record * prop = GET_UCD(c);
1180
0
        switch(code[1])
1181
0
          {
1182
0
          case PT_ANY:
1183
0
          OK = TRUE;
1184
0
          break;
1185
1186
0
          case PT_LAMP:
1187
0
          chartype = prop->chartype;
1188
0
          OK = chartype == ucp_Lu || chartype == ucp_Ll ||
1189
0
               chartype == ucp_Lt;
1190
0
          break;
1191
1192
0
          case PT_GC:
1193
0
          OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1194
0
          break;
1195
1196
0
          case PT_PC:
1197
0
          OK = prop->chartype == code[2];
1198
0
          break;
1199
1200
0
          case PT_SC:
1201
0
          OK = prop->script == code[2];
1202
0
          break;
1203
1204
0
          case PT_SCX:
1205
0
          OK = (prop->script == code[2] ||
1206
0
                MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), code[2]) != 0);
1207
0
          break;
1208
1209
          /* These are specials for combination cases. */
1210
1211
0
          case PT_ALNUM:
1212
0
          chartype = prop->chartype;
1213
0
          OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1214
0
               PRIV(ucp_gentype)[chartype] == ucp_N;
1215
0
          break;
1216
1217
          /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1218
          which means that Perl space and POSIX space are now identical. PCRE
1219
          was changed at release 8.34. */
1220
1221
0
          case PT_SPACE:    /* Perl space */
1222
0
          case PT_PXSPACE:  /* POSIX space */
1223
0
          switch(c)
1224
0
            {
1225
0
            HSPACE_CASES:
1226
0
            VSPACE_CASES:
1227
0
            OK = TRUE;
1228
0
            break;
1229
1230
0
            default:
1231
0
            OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1232
0
            break;
1233
0
            }
1234
0
          break;
1235
1236
0
          case PT_WORD:
1237
0
          chartype = prop->chartype;
1238
0
          OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1239
0
               PRIV(ucp_gentype)[chartype] == ucp_N ||
1240
0
               chartype == ucp_Mn || chartype == ucp_Pc;
1241
0
          break;
1242
1243
0
          case PT_CLIST:
1244
#if PCRE2_CODE_UNIT_WIDTH == 32
1245
          if (c > MAX_UTF_CODE_POINT)
1246
            {
1247
            OK = FALSE;
1248
            break;
1249
            }
1250
#endif
1251
0
          cp = PRIV(ucd_caseless_sets) + code[2];
1252
0
          for (;;)
1253
0
            {
1254
0
            if (c < *cp) { OK = FALSE; break; }
1255
0
            if (c == *cp++) { OK = TRUE; break; }
1256
0
            }
1257
0
          break;
1258
1259
0
          case PT_UCNC:
1260
0
          OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1261
0
               c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1262
0
               c >= 0xe000;
1263
0
          break;
1264
1265
0
          case PT_BIDICL:
1266
0
          OK = UCD_BIDICLASS(c) == code[2];
1267
0
          break;
1268
1269
0
          case PT_BOOL:
1270
0
          OK = MAPBIT(PRIV(ucd_boolprop_sets) +
1271
0
            UCD_BPROPS_PROP(prop), code[2]) != 0;
1272
0
          break;
1273
1274
          /* Should never occur, but keep compilers from grumbling. */
1275
1276
0
          default:
1277
0
          OK = codevalue != OP_PROP;
1278
0
          break;
1279
0
          }
1280
1281
0
        if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1282
0
        }
1283
0
      break;
1284
0
#endif
1285
1286
1287
1288
/* ========================================================================== */
1289
      /* These opcodes likewise inspect the subject character, but have an
1290
      argument that is not a data character. It is one of these opcodes:
1291
      OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
1292
      OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1293
1294
0
      case OP_TYPEPLUS:
1295
0
      case OP_TYPEMINPLUS:
1296
0
      case OP_TYPEPOSPLUS:
1297
0
      count = current_state->count;  /* Already matched */
1298
0
      if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1299
0
      if (clen > 0)
1300
0
        {
1301
0
        if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1302
0
            (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1303
0
            NLBLOCK->nltype == NLTYPE_FIXED &&
1304
0
            NLBLOCK->nllen == 2 &&
1305
0
            c == NLBLOCK->nl[0])
1306
0
          {
1307
0
          could_continue = partial_newline = TRUE;
1308
0
          }
1309
0
        else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1310
0
            (c < 256 &&
1311
0
              (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1312
0
              ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1313
0
          {
1314
0
          if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1315
0
            {
1316
0
            active_count--;            /* Remove non-match possibility */
1317
0
            next_active_state--;
1318
0
            }
1319
0
          count++;
1320
0
          ADD_NEW(state_offset, count);
1321
0
          }
1322
0
        }
1323
0
      break;
1324
1325
      /*-----------------------------------------------------------------*/
1326
0
      case OP_TYPEQUERY:
1327
0
      case OP_TYPEMINQUERY:
1328
0
      case OP_TYPEPOSQUERY:
1329
0
      ADD_ACTIVE(state_offset + 2, 0);
1330
0
      if (clen > 0)
1331
0
        {
1332
0
        if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1333
0
            (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1334
0
            NLBLOCK->nltype == NLTYPE_FIXED &&
1335
0
            NLBLOCK->nllen == 2 &&
1336
0
            c == NLBLOCK->nl[0])
1337
0
          {
1338
0
          could_continue = partial_newline = TRUE;
1339
0
          }
1340
0
        else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1341
0
            (c < 256 &&
1342
0
              (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1343
0
              ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1344
0
          {
1345
0
          if (codevalue == OP_TYPEPOSQUERY)
1346
0
            {
1347
0
            active_count--;            /* Remove non-match possibility */
1348
0
            next_active_state--;
1349
0
            }
1350
0
          ADD_NEW(state_offset + 2, 0);
1351
0
          }
1352
0
        }
1353
0
      break;
1354
1355
      /*-----------------------------------------------------------------*/
1356
0
      case OP_TYPESTAR:
1357
0
      case OP_TYPEMINSTAR:
1358
0
      case OP_TYPEPOSSTAR:
1359
0
      ADD_ACTIVE(state_offset + 2, 0);
1360
0
      if (clen > 0)
1361
0
        {
1362
0
        if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1363
0
            (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1364
0
            NLBLOCK->nltype == NLTYPE_FIXED &&
1365
0
            NLBLOCK->nllen == 2 &&
1366
0
            c == NLBLOCK->nl[0])
1367
0
          {
1368
0
          could_continue = partial_newline = TRUE;
1369
0
          }
1370
0
        else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1371
0
            (c < 256 &&
1372
0
              (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1373
0
              ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1374
0
          {
1375
0
          if (codevalue == OP_TYPEPOSSTAR)
1376
0
            {
1377
0
            active_count--;            /* Remove non-match possibility */
1378
0
            next_active_state--;
1379
0
            }
1380
0
          ADD_NEW(state_offset, 0);
1381
0
          }
1382
0
        }
1383
0
      break;
1384
1385
      /*-----------------------------------------------------------------*/
1386
0
      case OP_TYPEEXACT:
1387
0
      count = current_state->count;  /* Number already matched */
1388
0
      if (clen > 0)
1389
0
        {
1390
0
        if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1391
0
            (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1392
0
            NLBLOCK->nltype == NLTYPE_FIXED &&
1393
0
            NLBLOCK->nllen == 2 &&
1394
0
            c == NLBLOCK->nl[0])
1395
0
          {
1396
0
          could_continue = partial_newline = TRUE;
1397
0
          }
1398
0
        else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1399
0
            (c < 256 &&
1400
0
              (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1401
0
              ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1402
0
          {
1403
0
          if (++count >= (int)GET2(code, 1))
1404
0
            { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1405
0
          else
1406
0
            { ADD_NEW(state_offset, count); }
1407
0
          }
1408
0
        }
1409
0
      break;
1410
1411
      /*-----------------------------------------------------------------*/
1412
0
      case OP_TYPEUPTO:
1413
0
      case OP_TYPEMINUPTO:
1414
0
      case OP_TYPEPOSUPTO:
1415
0
      ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1416
0
      count = current_state->count;  /* Number already matched */
1417
0
      if (clen > 0)
1418
0
        {
1419
0
        if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1420
0
            (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1421
0
            NLBLOCK->nltype == NLTYPE_FIXED &&
1422
0
            NLBLOCK->nllen == 2 &&
1423
0
            c == NLBLOCK->nl[0])
1424
0
          {
1425
0
          could_continue = partial_newline = TRUE;
1426
0
          }
1427
0
        else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1428
0
            (c < 256 &&
1429
0
              (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1430
0
              ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1431
0
          {
1432
0
          if (codevalue == OP_TYPEPOSUPTO)
1433
0
            {
1434
0
            active_count--;           /* Remove non-match possibility */
1435
0
            next_active_state--;
1436
0
            }
1437
0
          if (++count >= (int)GET2(code, 1))
1438
0
            { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1439
0
          else
1440
0
            { ADD_NEW(state_offset, count); }
1441
0
          }
1442
0
        }
1443
0
      break;
1444
1445
/* ========================================================================== */
1446
      /* These are virtual opcodes that are used when something like
1447
      OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its
1448
      argument. It keeps the code above fast for the other cases. The argument
1449
      is in the d variable. */
1450
1451
0
#ifdef SUPPORT_UNICODE
1452
0
      case OP_PROP_EXTRA + OP_TYPEPLUS:
1453
0
      case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1454
0
      case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1455
0
      count = current_state->count;           /* Already matched */
1456
0
      if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1457
0
      if (clen > 0)
1458
0
        {
1459
0
        BOOL OK;
1460
0
        int chartype;
1461
0
        const uint32_t *cp;
1462
0
        const ucd_record * prop = GET_UCD(c);
1463
0
        switch(code[2])
1464
0
          {
1465
0
          case PT_ANY:
1466
0
          OK = TRUE;
1467
0
          break;
1468
1469
0
          case PT_LAMP:
1470
0
          chartype = prop->chartype;
1471
0
          OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1472
0
          break;
1473
1474
0
          case PT_GC:
1475
0
          OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1476
0
          break;
1477
1478
0
          case PT_PC:
1479
0
          OK = prop->chartype == code[3];
1480
0
          break;
1481
1482
0
          case PT_SC:
1483
0
          OK = prop->script == code[3];
1484
0
          break;
1485
1486
0
          case PT_SCX:
1487
0
          OK = (prop->script == code[3] ||
1488
0
                MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), code[3]) != 0);
1489
0
          break;
1490
1491
          /* These are specials for combination cases. */
1492
1493
0
          case PT_ALNUM:
1494
0
          chartype = prop->chartype;
1495
0
          OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1496
0
               PRIV(ucp_gentype)[chartype] == ucp_N;
1497
0
          break;
1498
1499
          /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1500
          which means that Perl space and POSIX space are now identical. PCRE
1501
          was changed at release 8.34. */
1502
1503
0
          case PT_SPACE:    /* Perl space */
1504
0
          case PT_PXSPACE:  /* POSIX space */
1505
0
          switch(c)
1506
0
            {
1507
0
            HSPACE_CASES:
1508
0
            VSPACE_CASES:
1509
0
            OK = TRUE;
1510
0
            break;
1511
1512
0
            default:
1513
0
            OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1514
0
            break;
1515
0
            }
1516
0
          break;
1517
1518
0
          case PT_WORD:
1519
0
          chartype = prop->chartype;
1520
0
          OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1521
0
               PRIV(ucp_gentype)[chartype] == ucp_N ||
1522
0
               chartype == ucp_Mn || chartype == ucp_Pc;
1523
0
          break;
1524
1525
0
          case PT_CLIST:
1526
#if PCRE2_CODE_UNIT_WIDTH == 32
1527
          if (c > MAX_UTF_CODE_POINT)
1528
            {
1529
            OK = FALSE;
1530
            break;
1531
            }
1532
#endif
1533
0
          cp = PRIV(ucd_caseless_sets) + code[3];
1534
0
          for (;;)
1535
0
            {
1536
0
            if (c < *cp) { OK = FALSE; break; }
1537
0
            if (c == *cp++) { OK = TRUE; break; }
1538
0
            }
1539
0
          break;
1540
1541
0
          case PT_UCNC:
1542
0
          OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1543
0
               c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1544
0
               c >= 0xe000;
1545
0
          break;
1546
1547
0
          case PT_BIDICL:
1548
0
          OK = UCD_BIDICLASS(c) == code[3];
1549
0
          break;
1550
1551
0
          case PT_BOOL:
1552
0
          OK = MAPBIT(PRIV(ucd_boolprop_sets) +
1553
0
            UCD_BPROPS_PROP(prop), code[3]) != 0;
1554
0
          break;
1555
1556
          /* Should never occur, but keep compilers from grumbling. */
1557
1558
0
          default:
1559
0
          OK = codevalue != OP_PROP;
1560
0
          break;
1561
0
          }
1562
1563
0
        if (OK == (d == OP_PROP))
1564
0
          {
1565
0
          if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1566
0
            {
1567
0
            active_count--;           /* Remove non-match possibility */
1568
0
            next_active_state--;
1569
0
            }
1570
0
          count++;
1571
0
          ADD_NEW(state_offset, count);
1572
0
          }
1573
0
        }
1574
0
      break;
1575
1576
      /*-----------------------------------------------------------------*/
1577
0
      case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1578
0
      case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1579
0
      case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1580
0
      count = current_state->count;  /* Already matched */
1581
0
      if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1582
0
      if (clen > 0)
1583
0
        {
1584
0
        int ncount = 0;
1585
0
        if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1586
0
          {
1587
0
          active_count--;           /* Remove non-match possibility */
1588
0
          next_active_state--;
1589
0
          }
1590
0
        (void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
1591
0
          &ncount);
1592
0
        count++;
1593
0
        ADD_NEW_DATA(-state_offset, count, ncount);
1594
0
        }
1595
0
      break;
1596
0
#endif
1597
1598
      /*-----------------------------------------------------------------*/
1599
0
      case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1600
0
      case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1601
0
      case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1602
0
      count = current_state->count;  /* Already matched */
1603
0
      if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1604
0
      if (clen > 0)
1605
0
        {
1606
0
        int ncount = 0;
1607
0
        switch (c)
1608
0
          {
1609
0
          case CHAR_VT:
1610
0
          case CHAR_FF:
1611
0
          case CHAR_NEL:
1612
0
#ifndef EBCDIC
1613
0
          case 0x2028:
1614
0
          case 0x2029:
1615
0
#endif  /* Not EBCDIC */
1616
0
          if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1617
0
          goto ANYNL01;
1618
1619
0
          case CHAR_CR:
1620
0
          if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1621
          /* Fall through */
1622
1623
0
          ANYNL01:
1624
0
          case CHAR_LF:
1625
0
          if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1626
0
            {
1627
0
            active_count--;           /* Remove non-match possibility */
1628
0
            next_active_state--;
1629
0
            }
1630
0
          count++;
1631
0
          ADD_NEW_DATA(-state_offset, count, ncount);
1632
0
          break;
1633
1634
0
          default:
1635
0
          break;
1636
0
          }
1637
0
        }
1638
0
      break;
1639
1640
      /*-----------------------------------------------------------------*/
1641
0
      case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1642
0
      case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1643
0
      case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1644
0
      count = current_state->count;  /* Already matched */
1645
0
      if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1646
0
      if (clen > 0)
1647
0
        {
1648
0
        BOOL OK;
1649
0
        switch (c)
1650
0
          {
1651
0
          VSPACE_CASES:
1652
0
          OK = TRUE;
1653
0
          break;
1654
1655
0
          default:
1656
0
          OK = FALSE;
1657
0
          break;
1658
0
          }
1659
1660
0
        if (OK == (d == OP_VSPACE))
1661
0
          {
1662
0
          if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1663
0
            {
1664
0
            active_count--;           /* Remove non-match possibility */
1665
0
            next_active_state--;
1666
0
            }
1667
0
          count++;
1668
0
          ADD_NEW_DATA(-state_offset, count, 0);
1669
0
          }
1670
0
        }
1671
0
      break;
1672
1673
      /*-----------------------------------------------------------------*/
1674
0
      case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1675
0
      case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1676
0
      case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1677
0
      count = current_state->count;  /* Already matched */
1678
0
      if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1679
0
      if (clen > 0)
1680
0
        {
1681
0
        BOOL OK;
1682
0
        switch (c)
1683
0
          {
1684
0
          HSPACE_CASES:
1685
0
          OK = TRUE;
1686
0
          break;
1687
1688
0
          default:
1689
0
          OK = FALSE;
1690
0
          break;
1691
0
          }
1692
1693
0
        if (OK == (d == OP_HSPACE))
1694
0
          {
1695
0
          if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1696
0
            {
1697
0
            active_count--;           /* Remove non-match possibility */
1698
0
            next_active_state--;
1699
0
            }
1700
0
          count++;
1701
0
          ADD_NEW_DATA(-state_offset, count, 0);
1702
0
          }
1703
0
        }
1704
0
      break;
1705
1706
      /*-----------------------------------------------------------------*/
1707
0
#ifdef SUPPORT_UNICODE
1708
0
      case OP_PROP_EXTRA + OP_TYPEQUERY:
1709
0
      case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1710
0
      case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1711
0
      count = 4;
1712
0
      goto QS1;
1713
1714
0
      case OP_PROP_EXTRA + OP_TYPESTAR:
1715
0
      case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1716
0
      case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1717
0
      count = 0;
1718
1719
0
      QS1:
1720
1721
0
      ADD_ACTIVE(state_offset + 4, 0);
1722
0
      if (clen > 0)
1723
0
        {
1724
0
        BOOL OK;
1725
0
        int chartype;
1726
0
        const uint32_t *cp;
1727
0
        const ucd_record * prop = GET_UCD(c);
1728
0
        switch(code[2])
1729
0
          {
1730
0
          case PT_ANY:
1731
0
          OK = TRUE;
1732
0
          break;
1733
1734
0
          case PT_LAMP:
1735
0
          chartype = prop->chartype;
1736
0
          OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1737
0
          break;
1738
1739
0
          case PT_GC:
1740
0
          OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1741
0
          break;
1742
1743
0
          case PT_PC:
1744
0
          OK = prop->chartype == code[3];
1745
0
          break;
1746
1747
0
          case PT_SC:
1748
0
          OK = prop->script == code[3];
1749
0
          break;
1750
1751
0
          case PT_SCX:
1752
0
          OK = (prop->script == code[3] ||
1753
0
                MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), code[3]) != 0);
1754
0
          break;
1755
1756
          /* These are specials for combination cases. */
1757
1758
0
          case PT_ALNUM:
1759
0
          chartype = prop->chartype;
1760
0
          OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1761
0
               PRIV(ucp_gentype)[chartype] == ucp_N;
1762
0
          break;
1763
1764
          /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1765
          which means that Perl space and POSIX space are now identical. PCRE
1766
          was changed at release 8.34. */
1767
1768
0
          case PT_SPACE:    /* Perl space */
1769
0
          case PT_PXSPACE:  /* POSIX space */
1770
0
          switch(c)
1771
0
            {
1772
0
            HSPACE_CASES:
1773
0
            VSPACE_CASES:
1774
0
            OK = TRUE;
1775
0
            break;
1776
1777
0
            default:
1778
0
            OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1779
0
            break;
1780
0
            }
1781
0
          break;
1782
1783
0
          case PT_WORD:
1784
0
          chartype = prop->chartype;
1785
0
          OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1786
0
               PRIV(ucp_gentype)[chartype] == ucp_N ||
1787
0
               chartype == ucp_Mn || chartype == ucp_Pc;
1788
0
          break;
1789
1790
0
          case PT_CLIST:
1791
#if PCRE2_CODE_UNIT_WIDTH == 32
1792
          if (c > MAX_UTF_CODE_POINT)
1793
            {
1794
            OK = FALSE;
1795
            break;
1796
            }
1797
#endif
1798
0
          cp = PRIV(ucd_caseless_sets) + code[3];
1799
0
          for (;;)
1800
0
            {
1801
0
            if (c < *cp) { OK = FALSE; break; }
1802
0
            if (c == *cp++) { OK = TRUE; break; }
1803
0
            }
1804
0
          break;
1805
1806
0
          case PT_UCNC:
1807
0
          OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1808
0
               c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1809
0
               c >= 0xe000;
1810
0
          break;
1811
1812
0
          case PT_BIDICL:
1813
0
          OK = UCD_BIDICLASS(c) == code[3];
1814
0
          break;
1815
1816
0
          case PT_BOOL:
1817
0
          OK = MAPBIT(PRIV(ucd_boolprop_sets) +
1818
0
            UCD_BPROPS_PROP(prop), code[3]) != 0;
1819
0
          break;
1820
1821
          /* Should never occur, but keep compilers from grumbling. */
1822
1823
0
          default:
1824
0
          OK = codevalue != OP_PROP;
1825
0
          break;
1826
0
          }
1827
1828
0
        if (OK == (d == OP_PROP))
1829
0
          {
1830
0
          if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1831
0
              codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1832
0
            {
1833
0
            active_count--;           /* Remove non-match possibility */
1834
0
            next_active_state--;
1835
0
            }
1836
0
          ADD_NEW(state_offset + count, 0);
1837
0
          }
1838
0
        }
1839
0
      break;
1840
1841
      /*-----------------------------------------------------------------*/
1842
0
      case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1843
0
      case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1844
0
      case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1845
0
      count = 2;
1846
0
      goto QS2;
1847
1848
0
      case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1849
0
      case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1850
0
      case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1851
0
      count = 0;
1852
1853
0
      QS2:
1854
1855
0
      ADD_ACTIVE(state_offset + 2, 0);
1856
0
      if (clen > 0)
1857
0
        {
1858
0
        int ncount = 0;
1859
0
        if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1860
0
            codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1861
0
          {
1862
0
          active_count--;           /* Remove non-match possibility */
1863
0
          next_active_state--;
1864
0
          }
1865
0
        (void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
1866
0
          &ncount);
1867
0
        ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1868
0
        }
1869
0
      break;
1870
0
#endif
1871
1872
      /*-----------------------------------------------------------------*/
1873
0
      case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1874
0
      case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1875
0
      case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1876
0
      count = 2;
1877
0
      goto QS3;
1878
1879
0
      case OP_ANYNL_EXTRA + OP_TYPESTAR:
1880
0
      case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1881
0
      case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1882
0
      count = 0;
1883
1884
0
      QS3:
1885
0
      ADD_ACTIVE(state_offset + 2, 0);
1886
0
      if (clen > 0)
1887
0
        {
1888
0
        int ncount = 0;
1889
0
        switch (c)
1890
0
          {
1891
0
          case CHAR_VT:
1892
0
          case CHAR_FF:
1893
0
          case CHAR_NEL:
1894
0
#ifndef EBCDIC
1895
0
          case 0x2028:
1896
0
          case 0x2029:
1897
0
#endif  /* Not EBCDIC */
1898
0
          if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1899
0
          goto ANYNL02;
1900
1901
0
          case CHAR_CR:
1902
0
          if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1903
          /* Fall through */
1904
1905
0
          ANYNL02:
1906
0
          case CHAR_LF:
1907
0
          if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1908
0
              codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1909
0
            {
1910
0
            active_count--;           /* Remove non-match possibility */
1911
0
            next_active_state--;
1912
0
            }
1913
0
          ADD_NEW_DATA(-(state_offset + (int)count), 0, ncount);
1914
0
          break;
1915
1916
0
          default:
1917
0
          break;
1918
0
          }
1919
0
        }
1920
0
      break;
1921
1922
      /*-----------------------------------------------------------------*/
1923
0
      case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1924
0
      case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1925
0
      case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1926
0
      count = 2;
1927
0
      goto QS4;
1928
1929
0
      case OP_VSPACE_EXTRA + OP_TYPESTAR:
1930
0
      case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1931
0
      case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1932
0
      count = 0;
1933
1934
0
      QS4:
1935
0
      ADD_ACTIVE(state_offset + 2, 0);
1936
0
      if (clen > 0)
1937
0
        {
1938
0
        BOOL OK;
1939
0
        switch (c)
1940
0
          {
1941
0
          VSPACE_CASES:
1942
0
          OK = TRUE;
1943
0
          break;
1944
1945
0
          default:
1946
0
          OK = FALSE;
1947
0
          break;
1948
0
          }
1949
0
        if (OK == (d == OP_VSPACE))
1950
0
          {
1951
0
          if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1952
0
              codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1953
0
            {
1954
0
            active_count--;           /* Remove non-match possibility */
1955
0
            next_active_state--;
1956
0
            }
1957
0
          ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1958
0
          }
1959
0
        }
1960
0
      break;
1961
1962
      /*-----------------------------------------------------------------*/
1963
0
      case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1964
0
      case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1965
0
      case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1966
0
      count = 2;
1967
0
      goto QS5;
1968
1969
0
      case OP_HSPACE_EXTRA + OP_TYPESTAR:
1970
0
      case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1971
0
      case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1972
0
      count = 0;
1973
1974
0
      QS5:
1975
0
      ADD_ACTIVE(state_offset + 2, 0);
1976
0
      if (clen > 0)
1977
0
        {
1978
0
        BOOL OK;
1979
0
        switch (c)
1980
0
          {
1981
0
          HSPACE_CASES:
1982
0
          OK = TRUE;
1983
0
          break;
1984
1985
0
          default:
1986
0
          OK = FALSE;
1987
0
          break;
1988
0
          }
1989
1990
0
        if (OK == (d == OP_HSPACE))
1991
0
          {
1992
0
          if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1993
0
              codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1994
0
            {
1995
0
            active_count--;           /* Remove non-match possibility */
1996
0
            next_active_state--;
1997
0
            }
1998
0
          ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1999
0
          }
2000
0
        }
2001
0
      break;
2002
2003
      /*-----------------------------------------------------------------*/
2004
0
#ifdef SUPPORT_UNICODE
2005
0
      case OP_PROP_EXTRA + OP_TYPEEXACT:
2006
0
      case OP_PROP_EXTRA + OP_TYPEUPTO:
2007
0
      case OP_PROP_EXTRA + OP_TYPEMINUPTO:
2008
0
      case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
2009
0
      if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
2010
0
        { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
2011
0
      count = current_state->count;  /* Number already matched */
2012
0
      if (clen > 0)
2013
0
        {
2014
0
        BOOL OK;
2015
0
        int chartype;
2016
0
        const uint32_t *cp;
2017
0
        const ucd_record * prop = GET_UCD(c);
2018
0
        switch(code[1 + IMM2_SIZE + 1])
2019
0
          {
2020
0
          case PT_ANY:
2021
0
          OK = TRUE;
2022
0
          break;
2023
2024
0
          case PT_LAMP:
2025
0
          chartype = prop->chartype;
2026
0
          OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
2027
0
          break;
2028
2029
0
          case PT_GC:
2030
0
          OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
2031
0
          break;
2032
2033
0
          case PT_PC:
2034
0
          OK = prop->chartype == code[1 + IMM2_SIZE + 2];
2035
0
          break;
2036
2037
0
          case PT_SC:
2038
0
          OK = prop->script == code[1 + IMM2_SIZE + 2];
2039
0
          break;
2040
2041
0
          case PT_SCX:
2042
0
          OK = (prop->script == code[1 + IMM2_SIZE + 2] ||
2043
0
                MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop),
2044
0
                  code[1 + IMM2_SIZE + 2]) != 0);
2045
0
          break;
2046
2047
          /* These are specials for combination cases. */
2048
2049
0
          case PT_ALNUM:
2050
0
          chartype = prop->chartype;
2051
0
          OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
2052
0
               PRIV(ucp_gentype)[chartype] == ucp_N;
2053
0
          break;
2054
2055
          /* Perl space used to exclude VT, but from Perl 5.18 it is included,
2056
          which means that Perl space and POSIX space are now identical. PCRE
2057
          was changed at release 8.34. */
2058
2059
0
          case PT_SPACE:    /* Perl space */
2060
0
          case PT_PXSPACE:  /* POSIX space */
2061
0
          switch(c)
2062
0
            {
2063
0
            HSPACE_CASES:
2064
0
            VSPACE_CASES:
2065
0
            OK = TRUE;
2066
0
            break;
2067
2068
0
            default:
2069
0
            OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
2070
0
            break;
2071
0
            }
2072
0
          break;
2073
2074
0
          case PT_WORD:
2075
0
          chartype = prop->chartype;
2076
0
          OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
2077
0
               PRIV(ucp_gentype)[chartype] == ucp_N ||
2078
0
               chartype == ucp_Mn || chartype == ucp_Pc;
2079
0
          break;
2080
2081
0
          case PT_CLIST:
2082
#if PCRE2_CODE_UNIT_WIDTH == 32
2083
          if (c > MAX_UTF_CODE_POINT)
2084
            {
2085
            OK = FALSE;
2086
            break;
2087
            }
2088
#endif
2089
0
          cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2];
2090
0
          for (;;)
2091
0
            {
2092
0
            if (c < *cp) { OK = FALSE; break; }
2093
0
            if (c == *cp++) { OK = TRUE; break; }
2094
0
            }
2095
0
          break;
2096
2097
0
          case PT_UCNC:
2098
0
          OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
2099
0
               c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
2100
0
               c >= 0xe000;
2101
0
          break;
2102
2103
0
          case PT_BIDICL:
2104
0
          OK = UCD_BIDICLASS(c) == code[1 + IMM2_SIZE + 2];
2105
0
          break;
2106
2107
0
          case PT_BOOL:
2108
0
          OK = MAPBIT(PRIV(ucd_boolprop_sets) +
2109
0
            UCD_BPROPS_PROP(prop), code[1 + IMM2_SIZE + 2]) != 0;
2110
0
          break;
2111
2112
          /* Should never occur, but keep compilers from grumbling. */
2113
2114
0
          default:
2115
0
          OK = codevalue != OP_PROP;
2116
0
          break;
2117
0
          }
2118
2119
0
        if (OK == (d == OP_PROP))
2120
0
          {
2121
0
          if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
2122
0
            {
2123
0
            active_count--;           /* Remove non-match possibility */
2124
0
            next_active_state--;
2125
0
            }
2126
0
          if (++count >= (int)GET2(code, 1))
2127
0
            { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
2128
0
          else
2129
0
            { ADD_NEW(state_offset, count); }
2130
0
          }
2131
0
        }
2132
0
      break;
2133
2134
      /*-----------------------------------------------------------------*/
2135
0
      case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
2136
0
      case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
2137
0
      case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
2138
0
      case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
2139
0
      if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
2140
0
        { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2141
0
      count = current_state->count;  /* Number already matched */
2142
0
      if (clen > 0)
2143
0
        {
2144
0
        PCRE2_SPTR nptr;
2145
0
        int ncount = 0;
2146
0
        if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
2147
0
          {
2148
0
          active_count--;           /* Remove non-match possibility */
2149
0
          next_active_state--;
2150
0
          }
2151
0
        nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
2152
0
          &ncount);
2153
0
        if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2154
0
            reset_could_continue = TRUE;
2155
0
        if (++count >= (int)GET2(code, 1))
2156
0
          { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
2157
0
        else
2158
0
          { ADD_NEW_DATA(-state_offset, count, ncount); }
2159
0
        }
2160
0
      break;
2161
0
#endif
2162
2163
      /*-----------------------------------------------------------------*/
2164
0
      case OP_ANYNL_EXTRA + OP_TYPEEXACT:
2165
0
      case OP_ANYNL_EXTRA + OP_TYPEUPTO:
2166
0
      case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
2167
0
      case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
2168
0
      if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
2169
0
        { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2170
0
      count = current_state->count;  /* Number already matched */
2171
0
      if (clen > 0)
2172
0
        {
2173
0
        int ncount = 0;
2174
0
        switch (c)
2175
0
          {
2176
0
          case CHAR_VT:
2177
0
          case CHAR_FF:
2178
0
          case CHAR_NEL:
2179
0
#ifndef EBCDIC
2180
0
          case 0x2028:
2181
0
          case 0x2029:
2182
0
#endif  /* Not EBCDIC */
2183
0
          if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
2184
0
          goto ANYNL03;
2185
2186
0
          case CHAR_CR:
2187
0
          if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
2188
          /* Fall through */
2189
2190
0
          ANYNL03:
2191
0
          case CHAR_LF:
2192
0
          if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
2193
0
            {
2194
0
            active_count--;           /* Remove non-match possibility */
2195
0
            next_active_state--;
2196
0
            }
2197
0
          if (++count >= (int)GET2(code, 1))
2198
0
            { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
2199
0
          else
2200
0
            { ADD_NEW_DATA(-state_offset, count, ncount); }
2201
0
          break;
2202
2203
0
          default:
2204
0
          break;
2205
0
          }
2206
0
        }
2207
0
      break;
2208
2209
      /*-----------------------------------------------------------------*/
2210
0
      case OP_VSPACE_EXTRA + OP_TYPEEXACT:
2211
0
      case OP_VSPACE_EXTRA + OP_TYPEUPTO:
2212
0
      case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
2213
0
      case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
2214
0
      if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
2215
0
        { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2216
0
      count = current_state->count;  /* Number already matched */
2217
0
      if (clen > 0)
2218
0
        {
2219
0
        BOOL OK;
2220
0
        switch (c)
2221
0
          {
2222
0
          VSPACE_CASES:
2223
0
          OK = TRUE;
2224
0
          break;
2225
2226
0
          default:
2227
0
          OK = FALSE;
2228
0
          }
2229
2230
0
        if (OK == (d == OP_VSPACE))
2231
0
          {
2232
0
          if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
2233
0
            {
2234
0
            active_count--;           /* Remove non-match possibility */
2235
0
            next_active_state--;
2236
0
            }
2237
0
          if (++count >= (int)GET2(code, 1))
2238
0
            { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2239
0
          else
2240
0
            { ADD_NEW_DATA(-state_offset, count, 0); }
2241
0
          }
2242
0
        }
2243
0
      break;
2244
2245
      /*-----------------------------------------------------------------*/
2246
0
      case OP_HSPACE_EXTRA + OP_TYPEEXACT:
2247
0
      case OP_HSPACE_EXTRA + OP_TYPEUPTO:
2248
0
      case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
2249
0
      case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
2250
0
      if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
2251
0
        { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2252
0
      count = current_state->count;  /* Number already matched */
2253
0
      if (clen > 0)
2254
0
        {
2255
0
        BOOL OK;
2256
0
        switch (c)
2257
0
          {
2258
0
          HSPACE_CASES:
2259
0
          OK = TRUE;
2260
0
          break;
2261
2262
0
          default:
2263
0
          OK = FALSE;
2264
0
          break;
2265
0
          }
2266
2267
0
        if (OK == (d == OP_HSPACE))
2268
0
          {
2269
0
          if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
2270
0
            {
2271
0
            active_count--;           /* Remove non-match possibility */
2272
0
            next_active_state--;
2273
0
            }
2274
0
          if (++count >= (int)GET2(code, 1))
2275
0
            { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2276
0
          else
2277
0
            { ADD_NEW_DATA(-state_offset, count, 0); }
2278
0
          }
2279
0
        }
2280
0
      break;
2281
2282
/* ========================================================================== */
2283
      /* These opcodes are followed by a character that is usually compared
2284
      to the current subject character; it is loaded into d. We still get
2285
      here even if there is no subject character, because in some cases zero
2286
      repetitions are permitted. */
2287
2288
      /*-----------------------------------------------------------------*/
2289
0
      case OP_CHAR:
2290
0
      if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
2291
0
      break;
2292
2293
      /*-----------------------------------------------------------------*/
2294
0
      case OP_CHARI:
2295
0
      if (clen == 0) break;
2296
2297
0
#ifdef SUPPORT_UNICODE
2298
0
      if (utf_or_ucp)
2299
0
        {
2300
0
        if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
2301
0
          {
2302
0
          unsigned int othercase;
2303
0
          if (c < 128)
2304
0
            othercase = fcc[c];
2305
0
          else
2306
0
            othercase = UCD_OTHERCASE(c);
2307
0
          if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2308
0
          }
2309
0
        }
2310
0
      else
2311
0
#endif  /* SUPPORT_UNICODE */
2312
      /* Not UTF or UCP mode */
2313
0
        {
2314
0
        if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2315
0
          { ADD_NEW(state_offset + 2, 0); }
2316
0
        }
2317
0
      break;
2318
2319
2320
0
#ifdef SUPPORT_UNICODE
2321
      /*-----------------------------------------------------------------*/
2322
      /* This is a tricky one because it can match more than one character.
2323
      Find out how many characters to skip, and then set up a negative state
2324
      to wait for them to pass before continuing. */
2325
2326
0
      case OP_EXTUNI:
2327
0
      if (clen > 0)
2328
0
        {
2329
0
        int ncount = 0;
2330
0
        PCRE2_SPTR nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject,
2331
0
          end_subject, utf, &ncount);
2332
0
        if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2333
0
            reset_could_continue = TRUE;
2334
0
        ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2335
0
        }
2336
0
      break;
2337
0
#endif
2338
2339
      /*-----------------------------------------------------------------*/
2340
      /* This is tricky, like EXTUNI, because it too can match more than one
2341
      character (when CR is followed by LF). In this case, set up a negative
2342
      state to wait for one character to pass before continuing. */
2343
2344
0
      case OP_ANYNL:
2345
0
      if (clen > 0) switch(c)
2346
0
        {
2347
0
        case CHAR_VT:
2348
0
        case CHAR_FF:
2349
0
        case CHAR_NEL:
2350
0
#ifndef EBCDIC
2351
0
        case 0x2028:
2352
0
        case 0x2029:
2353
0
#endif  /* Not EBCDIC */
2354
0
        if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
2355
        /* Fall through */
2356
2357
0
        case CHAR_LF:
2358
0
        ADD_NEW(state_offset + 1, 0);
2359
0
        break;
2360
2361
0
        case CHAR_CR:
2362
0
        if (ptr + 1 >= end_subject)
2363
0
          {
2364
0
          ADD_NEW(state_offset + 1, 0);
2365
0
          if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2366
0
            reset_could_continue = TRUE;
2367
0
          }
2368
0
        else if (UCHAR21TEST(ptr + 1) == CHAR_LF)
2369
0
          {
2370
0
          ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2371
0
          }
2372
0
        else
2373
0
          {
2374
0
          ADD_NEW(state_offset + 1, 0);
2375
0
          }
2376
0
        break;
2377
0
        }
2378
0
      break;
2379
2380
      /*-----------------------------------------------------------------*/
2381
0
      case OP_NOT_VSPACE:
2382
0
      if (clen > 0) switch(c)
2383
0
        {
2384
0
        VSPACE_CASES:
2385
0
        break;
2386
2387
0
        default:
2388
0
        ADD_NEW(state_offset + 1, 0);
2389
0
        break;
2390
0
        }
2391
0
      break;
2392
2393
      /*-----------------------------------------------------------------*/
2394
0
      case OP_VSPACE:
2395
0
      if (clen > 0) switch(c)
2396
0
        {
2397
0
        VSPACE_CASES:
2398
0
        ADD_NEW(state_offset + 1, 0);
2399
0
        break;
2400
2401
0
        default:
2402
0
        break;
2403
0
        }
2404
0
      break;
2405
2406
      /*-----------------------------------------------------------------*/
2407
0
      case OP_NOT_HSPACE:
2408
0
      if (clen > 0) switch(c)
2409
0
        {
2410
0
        HSPACE_CASES:
2411
0
        break;
2412
2413
0
        default:
2414
0
        ADD_NEW(state_offset + 1, 0);
2415
0
        break;
2416
0
        }
2417
0
      break;
2418
2419
      /*-----------------------------------------------------------------*/
2420
0
      case OP_HSPACE:
2421
0
      if (clen > 0) switch(c)
2422
0
        {
2423
0
        HSPACE_CASES:
2424
0
        ADD_NEW(state_offset + 1, 0);
2425
0
        break;
2426
2427
0
        default:
2428
0
        break;
2429
0
        }
2430
0
      break;
2431
2432
      /*-----------------------------------------------------------------*/
2433
      /* Match a negated single character casefully. */
2434
2435
0
      case OP_NOT:
2436
0
      if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2437
0
      break;
2438
2439
      /*-----------------------------------------------------------------*/
2440
      /* Match a negated single character caselessly. */
2441
2442
0
      case OP_NOTI:
2443
0
      if (clen > 0)
2444
0
        {
2445
0
        uint32_t otherd;
2446
0
#ifdef SUPPORT_UNICODE
2447
0
        if (utf_or_ucp && d >= 128)
2448
0
          otherd = UCD_OTHERCASE(d);
2449
0
        else
2450
0
#endif  /* SUPPORT_UNICODE */
2451
0
        otherd = TABLE_GET(d, fcc, d);
2452
0
        if (c != d && c != otherd)
2453
0
          { ADD_NEW(state_offset + dlen + 1, 0); }
2454
0
        }
2455
0
      break;
2456
2457
      /*-----------------------------------------------------------------*/
2458
0
      case OP_PLUSI:
2459
0
      case OP_MINPLUSI:
2460
0
      case OP_POSPLUSI:
2461
0
      case OP_NOTPLUSI:
2462
0
      case OP_NOTMINPLUSI:
2463
0
      case OP_NOTPOSPLUSI:
2464
0
      caseless = TRUE;
2465
0
      codevalue -= OP_STARI - OP_STAR;
2466
2467
      /* Fall through */
2468
0
      case OP_PLUS:
2469
0
      case OP_MINPLUS:
2470
0
      case OP_POSPLUS:
2471
0
      case OP_NOTPLUS:
2472
0
      case OP_NOTMINPLUS:
2473
0
      case OP_NOTPOSPLUS:
2474
0
      count = current_state->count;  /* Already matched */
2475
0
      if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2476
0
      if (clen > 0)
2477
0
        {
2478
0
        uint32_t otherd = NOTACHAR;
2479
0
        if (caseless)
2480
0
          {
2481
0
#ifdef SUPPORT_UNICODE
2482
0
          if (utf_or_ucp && d >= 128)
2483
0
            otherd = UCD_OTHERCASE(d);
2484
0
          else
2485
0
#endif  /* SUPPORT_UNICODE */
2486
0
          otherd = TABLE_GET(d, fcc, d);
2487
0
          }
2488
0
        if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2489
0
          {
2490
0
          if (count > 0 &&
2491
0
              (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2492
0
            {
2493
0
            active_count--;             /* Remove non-match possibility */
2494
0
            next_active_state--;
2495
0
            }
2496
0
          count++;
2497
0
          ADD_NEW(state_offset, count);
2498
0
          }
2499
0
        }
2500
0
      break;
2501
2502
      /*-----------------------------------------------------------------*/
2503
0
      case OP_QUERYI:
2504
0
      case OP_MINQUERYI:
2505
0
      case OP_POSQUERYI:
2506
0
      case OP_NOTQUERYI:
2507
0
      case OP_NOTMINQUERYI:
2508
0
      case OP_NOTPOSQUERYI:
2509
0
      caseless = TRUE;
2510
0
      codevalue -= OP_STARI - OP_STAR;
2511
      /* Fall through */
2512
0
      case OP_QUERY:
2513
0
      case OP_MINQUERY:
2514
0
      case OP_POSQUERY:
2515
0
      case OP_NOTQUERY:
2516
0
      case OP_NOTMINQUERY:
2517
0
      case OP_NOTPOSQUERY:
2518
0
      ADD_ACTIVE(state_offset + dlen + 1, 0);
2519
0
      if (clen > 0)
2520
0
        {
2521
0
        uint32_t otherd = NOTACHAR;
2522
0
        if (caseless)
2523
0
          {
2524
0
#ifdef SUPPORT_UNICODE
2525
0
          if (utf_or_ucp && d >= 128)
2526
0
            otherd = UCD_OTHERCASE(d);
2527
0
          else
2528
0
#endif  /* SUPPORT_UNICODE */
2529
0
          otherd = TABLE_GET(d, fcc, d);
2530
0
          }
2531
0
        if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2532
0
          {
2533
0
          if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2534
0
            {
2535
0
            active_count--;            /* Remove non-match possibility */
2536
0
            next_active_state--;
2537
0
            }
2538
0
          ADD_NEW(state_offset + dlen + 1, 0);
2539
0
          }
2540
0
        }
2541
0
      break;
2542
2543
      /*-----------------------------------------------------------------*/
2544
0
      case OP_STARI:
2545
0
      case OP_MINSTARI:
2546
0
      case OP_POSSTARI:
2547
0
      case OP_NOTSTARI:
2548
0
      case OP_NOTMINSTARI:
2549
0
      case OP_NOTPOSSTARI:
2550
0
      caseless = TRUE;
2551
0
      codevalue -= OP_STARI - OP_STAR;
2552
      /* Fall through */
2553
0
      case OP_STAR:
2554
0
      case OP_MINSTAR:
2555
0
      case OP_POSSTAR:
2556
0
      case OP_NOTSTAR:
2557
0
      case OP_NOTMINSTAR:
2558
0
      case OP_NOTPOSSTAR:
2559
0
      ADD_ACTIVE(state_offset + dlen + 1, 0);
2560
0
      if (clen > 0)
2561
0
        {
2562
0
        uint32_t otherd = NOTACHAR;
2563
0
        if (caseless)
2564
0
          {
2565
0
#ifdef SUPPORT_UNICODE
2566
0
          if (utf_or_ucp && d >= 128)
2567
0
            otherd = UCD_OTHERCASE(d);
2568
0
          else
2569
0
#endif  /* SUPPORT_UNICODE */
2570
0
          otherd = TABLE_GET(d, fcc, d);
2571
0
          }
2572
0
        if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2573
0
          {
2574
0
          if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2575
0
            {
2576
0
            active_count--;            /* Remove non-match possibility */
2577
0
            next_active_state--;
2578
0
            }
2579
0
          ADD_NEW(state_offset, 0);
2580
0
          }
2581
0
        }
2582
0
      break;
2583
2584
      /*-----------------------------------------------------------------*/
2585
0
      case OP_EXACTI:
2586
0
      case OP_NOTEXACTI:
2587
0
      caseless = TRUE;
2588
0
      codevalue -= OP_STARI - OP_STAR;
2589
      /* Fall through */
2590
0
      case OP_EXACT:
2591
0
      case OP_NOTEXACT:
2592
0
      count = current_state->count;  /* Number already matched */
2593
0
      if (clen > 0)
2594
0
        {
2595
0
        uint32_t otherd = NOTACHAR;
2596
0
        if (caseless)
2597
0
          {
2598
0
#ifdef SUPPORT_UNICODE
2599
0
          if (utf_or_ucp && d >= 128)
2600
0
            otherd = UCD_OTHERCASE(d);
2601
0
          else
2602
0
#endif  /* SUPPORT_UNICODE */
2603
0
          otherd = TABLE_GET(d, fcc, d);
2604
0
          }
2605
0
        if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2606
0
          {
2607
0
          if (++count >= (int)GET2(code, 1))
2608
0
            { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2609
0
          else
2610
0
            { ADD_NEW(state_offset, count); }
2611
0
          }
2612
0
        }
2613
0
      break;
2614
2615
      /*-----------------------------------------------------------------*/
2616
0
      case OP_UPTOI:
2617
0
      case OP_MINUPTOI:
2618
0
      case OP_POSUPTOI:
2619
0
      case OP_NOTUPTOI:
2620
0
      case OP_NOTMINUPTOI:
2621
0
      case OP_NOTPOSUPTOI:
2622
0
      caseless = TRUE;
2623
0
      codevalue -= OP_STARI - OP_STAR;
2624
      /* Fall through */
2625
0
      case OP_UPTO:
2626
0
      case OP_MINUPTO:
2627
0
      case OP_POSUPTO:
2628
0
      case OP_NOTUPTO:
2629
0
      case OP_NOTMINUPTO:
2630
0
      case OP_NOTPOSUPTO:
2631
0
      ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2632
0
      count = current_state->count;  /* Number already matched */
2633
0
      if (clen > 0)
2634
0
        {
2635
0
        uint32_t otherd = NOTACHAR;
2636
0
        if (caseless)
2637
0
          {
2638
0
#ifdef SUPPORT_UNICODE
2639
0
          if (utf_or_ucp && d >= 128)
2640
0
            otherd = UCD_OTHERCASE(d);
2641
0
          else
2642
0
#endif  /* SUPPORT_UNICODE */
2643
0
          otherd = TABLE_GET(d, fcc, d);
2644
0
          }
2645
0
        if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2646
0
          {
2647
0
          if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2648
0
            {
2649
0
            active_count--;             /* Remove non-match possibility */
2650
0
            next_active_state--;
2651
0
            }
2652
0
          if (++count >= (int)GET2(code, 1))
2653
0
            { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2654
0
          else
2655
0
            { ADD_NEW(state_offset, count); }
2656
0
          }
2657
0
        }
2658
0
      break;
2659
2660
2661
/* ========================================================================== */
2662
      /* These are the class-handling opcodes */
2663
2664
0
      case OP_CLASS:
2665
0
      case OP_NCLASS:
2666
0
      case OP_XCLASS:
2667
0
        {
2668
0
        BOOL isinclass = FALSE;
2669
0
        int next_state_offset;
2670
0
        PCRE2_SPTR ecode;
2671
2672
        /* For a simple class, there is always just a 32-byte table, and we
2673
        can set isinclass from it. */
2674
2675
0
        if (codevalue != OP_XCLASS)
2676
0
          {
2677
0
          ecode = code + 1 + (32 / sizeof(PCRE2_UCHAR));
2678
0
          if (clen > 0)
2679
0
            {
2680
0
            isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2681
0
              ((((uint8_t *)(code + 1))[c/8] & (1u << (c&7))) != 0);
2682
0
            }
2683
0
          }
2684
2685
        /* An extended class may have a table or a list of single characters,
2686
        ranges, or both, and it may be positive or negative. There's a
2687
        function that sorts all this out. */
2688
2689
0
        else
2690
0
         {
2691
0
         ecode = code + GET(code, 1);
2692
0
         if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2693
0
         }
2694
2695
        /* At this point, isinclass is set for all kinds of class, and ecode
2696
        points to the byte after the end of the class. If there is a
2697
        quantifier, this is where it will be. */
2698
2699
0
        next_state_offset = (int)(ecode - start_code);
2700
2701
0
        switch (*ecode)
2702
0
          {
2703
0
          case OP_CRSTAR:
2704
0
          case OP_CRMINSTAR:
2705
0
          case OP_CRPOSSTAR:
2706
0
          ADD_ACTIVE(next_state_offset + 1, 0);
2707
0
          if (isinclass)
2708
0
            {
2709
0
            if (*ecode == OP_CRPOSSTAR)
2710
0
              {
2711
0
              active_count--;           /* Remove non-match possibility */
2712
0
              next_active_state--;
2713
0
              }
2714
0
            ADD_NEW(state_offset, 0);
2715
0
            }
2716
0
          break;
2717
2718
0
          case OP_CRPLUS:
2719
0
          case OP_CRMINPLUS:
2720
0
          case OP_CRPOSPLUS:
2721
0
          count = current_state->count;  /* Already matched */
2722
0
          if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2723
0
          if (isinclass)
2724
0
            {
2725
0
            if (count > 0 && *ecode == OP_CRPOSPLUS)
2726
0
              {
2727
0
              active_count--;           /* Remove non-match possibility */
2728
0
              next_active_state--;
2729
0
              }
2730
0
            count++;
2731
0
            ADD_NEW(state_offset, count);
2732
0
            }
2733
0
          break;
2734
2735
0
          case OP_CRQUERY:
2736
0
          case OP_CRMINQUERY:
2737
0
          case OP_CRPOSQUERY:
2738
0
          ADD_ACTIVE(next_state_offset + 1, 0);
2739
0
          if (isinclass)
2740
0
            {
2741
0
            if (*ecode == OP_CRPOSQUERY)
2742
0
              {
2743
0
              active_count--;           /* Remove non-match possibility */
2744
0
              next_active_state--;
2745
0
              }
2746
0
            ADD_NEW(next_state_offset + 1, 0);
2747
0
            }
2748
0
          break;
2749
2750
0
          case OP_CRRANGE:
2751
0
          case OP_CRMINRANGE:
2752
0
          case OP_CRPOSRANGE:
2753
0
          count = current_state->count;  /* Already matched */
2754
0
          if (count >= (int)GET2(ecode, 1))
2755
0
            { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2756
0
          if (isinclass)
2757
0
            {
2758
0
            int max = (int)GET2(ecode, 1 + IMM2_SIZE);
2759
2760
0
            if (*ecode == OP_CRPOSRANGE && count >= (int)GET2(ecode, 1))
2761
0
              {
2762
0
              active_count--;           /* Remove non-match possibility */
2763
0
              next_active_state--;
2764
0
              }
2765
2766
0
            if (++count >= max && max != 0)   /* Max 0 => no limit */
2767
0
              { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2768
0
            else
2769
0
              { ADD_NEW(state_offset, count); }
2770
0
            }
2771
0
          break;
2772
2773
0
          default:
2774
0
          if (isinclass) { ADD_NEW(next_state_offset, 0); }
2775
0
          break;
2776
0
          }
2777
0
        }
2778
0
      break;
2779
2780
/* ========================================================================== */
2781
      /* These are the opcodes for fancy brackets of various kinds. We have
2782
      to use recursion in order to handle them. The "always failing" assertion
2783
      (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2784
      though the other "backtracking verbs" are not supported. */
2785
2786
0
      case OP_FAIL:
2787
0
      forced_fail++;    /* Count FAILs for multiple states */
2788
0
      break;
2789
2790
0
      case OP_ASSERT:
2791
0
      case OP_ASSERT_NOT:
2792
0
      case OP_ASSERTBACK:
2793
0
      case OP_ASSERTBACK_NOT:
2794
0
        {
2795
0
        int rc;
2796
0
        int *local_workspace;
2797
0
        PCRE2_SIZE *local_offsets;
2798
0
        PCRE2_SPTR endasscode = code + GET(code, 1);
2799
0
        RWS_anchor *rws = (RWS_anchor *)RWS;
2800
2801
0
        if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2802
0
          {
2803
0
          rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2804
0
          if (rc != 0) return rc;
2805
0
          RWS = (int *)rws;
2806
0
          }
2807
2808
0
        local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2809
0
        local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
2810
0
        rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
2811
2812
0
        while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2813
2814
0
        rc = internal_dfa_match(
2815
0
          mb,                                   /* static match data */
2816
0
          code,                                 /* this subexpression's code */
2817
0
          ptr,                                  /* where we currently are */
2818
0
          (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
2819
0
          local_offsets,                        /* offset vector */
2820
0
          RWS_OVEC_OSIZE/OVEC_UNIT,             /* size of same */
2821
0
          local_workspace,                      /* workspace vector */
2822
0
          RWS_RSIZE,                            /* size of same */
2823
0
          rlevel,                               /* function recursion level */
2824
0
          RWS);                                 /* recursion workspace */
2825
2826
0
        rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
2827
2828
0
        if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc;
2829
0
        if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2830
0
            { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2831
0
        }
2832
0
      break;
2833
2834
      /*-----------------------------------------------------------------*/
2835
0
      case OP_COND:
2836
0
      case OP_SCOND:
2837
0
        {
2838
0
        int codelink = (int)GET(code, 1);
2839
0
        PCRE2_UCHAR condcode;
2840
2841
        /* Because of the way auto-callout works during compile, a callout item
2842
        is inserted between OP_COND and an assertion condition. This does not
2843
        happen for the other conditions. */
2844
2845
0
        if (code[LINK_SIZE + 1] == OP_CALLOUT
2846
0
            || code[LINK_SIZE + 1] == OP_CALLOUT_STR)
2847
0
          {
2848
0
          PCRE2_SIZE callout_length;
2849
0
          rrc = do_callout_dfa(code, offsets, current_subject, ptr, mb,
2850
0
            1 + LINK_SIZE, &callout_length);
2851
0
          if (rrc < 0) return rrc;                 /* Abandon */
2852
0
          if (rrc > 0) break;                      /* Fail this thread */
2853
0
          code += callout_length;                  /* Skip callout data */
2854
0
          }
2855
2856
0
        condcode = code[LINK_SIZE+1];
2857
2858
        /* Back reference conditions and duplicate named recursion conditions
2859
        are not supported */
2860
2861
0
        if (condcode == OP_CREF || condcode == OP_DNCREF ||
2862
0
            condcode == OP_DNRREF)
2863
0
          return PCRE2_ERROR_DFA_UCOND;
2864
2865
        /* The DEFINE condition is always false, and the assertion (?!) is
2866
        converted to OP_FAIL. */
2867
2868
0
        if (condcode == OP_FALSE || condcode == OP_FAIL)
2869
0
          { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2870
2871
        /* There is also an always-true condition */
2872
2873
0
        else if (condcode == OP_TRUE)
2874
0
          { ADD_ACTIVE(state_offset + LINK_SIZE + 2, 0); }
2875
2876
        /* The only supported version of OP_RREF is for the value RREF_ANY,
2877
        which means "test if in any recursion". We can't test for specifically
2878
        recursed groups. */
2879
2880
0
        else if (condcode == OP_RREF)
2881
0
          {
2882
0
          unsigned int value = GET2(code, LINK_SIZE + 2);
2883
0
          if (value != RREF_ANY) return PCRE2_ERROR_DFA_UCOND;
2884
0
          if (mb->recursive != NULL)
2885
0
            { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2886
0
          else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2887
0
          }
2888
2889
        /* Otherwise, the condition is an assertion */
2890
2891
0
        else
2892
0
          {
2893
0
          int rc;
2894
0
          int *local_workspace;
2895
0
          PCRE2_SIZE *local_offsets;
2896
0
          PCRE2_SPTR asscode = code + LINK_SIZE + 1;
2897
0
          PCRE2_SPTR endasscode = asscode + GET(asscode, 1);
2898
0
          RWS_anchor *rws = (RWS_anchor *)RWS;
2899
2900
0
          if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2901
0
            {
2902
0
            rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2903
0
            if (rc != 0) return rc;
2904
0
            RWS = (int *)rws;
2905
0
            }
2906
2907
0
          local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2908
0
          local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
2909
0
          rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
2910
2911
0
          while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2912
2913
0
          rc = internal_dfa_match(
2914
0
            mb,                                   /* fixed match data */
2915
0
            asscode,                              /* this subexpression's code */
2916
0
            ptr,                                  /* where we currently are */
2917
0
            (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
2918
0
            local_offsets,                        /* offset vector */
2919
0
            RWS_OVEC_OSIZE/OVEC_UNIT,             /* size of same */
2920
0
            local_workspace,                      /* workspace vector */
2921
0
            RWS_RSIZE,                            /* size of same */
2922
0
            rlevel,                               /* function recursion level */
2923
0
            RWS);                                 /* recursion workspace */
2924
2925
0
          rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
2926
2927
0
          if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc;
2928
0
          if ((rc >= 0) ==
2929
0
                (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2930
0
            { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2931
0
          else
2932
0
            { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2933
0
          }
2934
0
        }
2935
0
      break;
2936
2937
      /*-----------------------------------------------------------------*/
2938
0
      case OP_RECURSE:
2939
0
        {
2940
0
        int rc;
2941
0
        int *local_workspace;
2942
0
        PCRE2_SIZE *local_offsets;
2943
0
        RWS_anchor *rws = (RWS_anchor *)RWS;
2944
0
        PCRE2_SPTR callpat = start_code + GET(code, 1);
2945
0
        uint32_t recno = (callpat == mb->start_code)? 0 :
2946
0
          GET2(callpat, 1 + LINK_SIZE);
2947
2948
0
        if (rws->free < RWS_RSIZE + RWS_OVEC_RSIZE)
2949
0
          {
2950
0
          rc = more_workspace(&rws, RWS_OVEC_RSIZE, mb);
2951
0
          if (rc != 0) return rc;
2952
0
          RWS = (int *)rws;
2953
0
          }
2954
2955
0
        local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2956
0
        local_workspace = ((int *)local_offsets) + RWS_OVEC_RSIZE;
2957
0
        rws->free -= RWS_RSIZE + RWS_OVEC_RSIZE;
2958
2959
        /* Check for repeating a recursion without advancing the subject
2960
        pointer or last used character. This should catch convoluted mutual
2961
        recursions. (Some simple cases are caught at compile time.) */
2962
2963
0
        for (dfa_recursion_info *ri = mb->recursive;
2964
0
             ri != NULL;
2965
0
             ri = ri->prevrec)
2966
0
          {
2967
0
          if (recno == ri->group_num && ptr == ri->subject_position &&
2968
0
              mb->last_used_ptr == ri->last_used_ptr)
2969
0
            return PCRE2_ERROR_RECURSELOOP;
2970
0
          }
2971
2972
        /* Remember this recursion and where we started it so as to
2973
        catch infinite loops. */
2974
2975
0
        new_recursive.group_num = recno;
2976
0
        new_recursive.subject_position = ptr;
2977
0
        new_recursive.last_used_ptr = mb->last_used_ptr;
2978
0
        new_recursive.prevrec = mb->recursive;
2979
0
        mb->recursive = &new_recursive;
2980
2981
0
        rc = internal_dfa_match(
2982
0
          mb,                                   /* fixed match data */
2983
0
          callpat,                              /* this subexpression's code */
2984
0
          ptr,                                  /* where we currently are */
2985
0
          (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
2986
0
          local_offsets,                        /* offset vector */
2987
0
          RWS_OVEC_RSIZE/OVEC_UNIT,             /* size of same */
2988
0
          local_workspace,                      /* workspace vector */
2989
0
          RWS_RSIZE,                            /* size of same */
2990
0
          rlevel,                               /* function recursion level */
2991
0
          RWS);                                 /* recursion workspace */
2992
2993
0
        rws->free += RWS_RSIZE + RWS_OVEC_RSIZE;
2994
0
        mb->recursive = new_recursive.prevrec;  /* Done this recursion */
2995
2996
        /* Ran out of internal offsets */
2997
2998
0
        if (rc == 0) return PCRE2_ERROR_DFA_RECURSE;
2999
3000
        /* For each successful matched substring, set up the next state with a
3001
        count of characters to skip before trying it. Note that the count is in
3002
        characters, not bytes. */
3003
3004
0
        if (rc > 0)
3005
0
          {
3006
0
          for (rc = rc*2 - 2; rc >= 0; rc -= 2)
3007
0
            {
3008
0
            PCRE2_SIZE charcount = local_offsets[rc+1] - local_offsets[rc];
3009
0
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3010
0
            if (utf)
3011
0
              {
3012
0
              PCRE2_SPTR p = start_subject + local_offsets[rc];
3013
0
              PCRE2_SPTR pp = start_subject + local_offsets[rc+1];
3014
0
              while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
3015
0
              }
3016
0
#endif
3017
0
            if (charcount > 0)
3018
0
              {
3019
0
              ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0,
3020
0
                (int)(charcount - 1));
3021
0
              }
3022
0
            else
3023
0
              {
3024
0
              ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
3025
0
              }
3026
0
            }
3027
0
          }
3028
0
        else if (rc != PCRE2_ERROR_NOMATCH) return rc;
3029
0
        }
3030
0
      break;
3031
3032
      /*-----------------------------------------------------------------*/
3033
0
      case OP_BRAPOS:
3034
0
      case OP_SBRAPOS:
3035
0
      case OP_CBRAPOS:
3036
0
      case OP_SCBRAPOS:
3037
0
      case OP_BRAPOSZERO:
3038
0
        {
3039
0
        int rc;
3040
0
        int *local_workspace;
3041
0
        PCRE2_SIZE *local_offsets;
3042
0
        PCRE2_SIZE charcount, matched_count;
3043
0
        PCRE2_SPTR local_ptr = ptr;
3044
0
        RWS_anchor *rws = (RWS_anchor *)RWS;
3045
0
        BOOL allow_zero;
3046
3047
0
        if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
3048
0
          {
3049
0
          rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
3050
0
          if (rc != 0) return rc;
3051
0
          RWS = (int *)rws;
3052
0
          }
3053
3054
0
        local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
3055
0
        local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
3056
0
        rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
3057
3058
0
        if (codevalue == OP_BRAPOSZERO)
3059
0
          {
3060
0
          allow_zero = TRUE;
3061
0
          codevalue = *(++code);  /* Codevalue will be one of above BRAs */
3062
0
          }
3063
0
        else allow_zero = FALSE;
3064
3065
        /* Loop to match the subpattern as many times as possible as if it were
3066
        a complete pattern. */
3067
3068
0
        for (matched_count = 0;; matched_count++)
3069
0
          {
3070
0
          rc = internal_dfa_match(
3071
0
            mb,                                   /* fixed match data */
3072
0
            code,                                 /* this subexpression's code */
3073
0
            local_ptr,                            /* where we currently are */
3074
0
            (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
3075
0
            local_offsets,                        /* offset vector */
3076
0
            RWS_OVEC_OSIZE/OVEC_UNIT,             /* size of same */
3077
0
            local_workspace,                      /* workspace vector */
3078
0
            RWS_RSIZE,                            /* size of same */
3079
0
            rlevel,                               /* function recursion level */
3080
0
            RWS);                                 /* recursion workspace */
3081
3082
          /* Failed to match */
3083
3084
0
          if (rc < 0)
3085
0
            {
3086
0
            if (rc != PCRE2_ERROR_NOMATCH) return rc;
3087
0
            break;
3088
0
            }
3089
3090
          /* Matched: break the loop if zero characters matched. */
3091
3092
0
          charcount = local_offsets[1] - local_offsets[0];
3093
0
          if (charcount == 0) break;
3094
0
          local_ptr += charcount;    /* Advance temporary position ptr */
3095
0
          }
3096
3097
0
        rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
3098
3099
        /* At this point we have matched the subpattern matched_count
3100
        times, and local_ptr is pointing to the character after the end of the
3101
        last match. */
3102
3103
0
        if (matched_count > 0 || allow_zero)
3104
0
          {
3105
0
          PCRE2_SPTR end_subpattern = code;
3106
0
          int next_state_offset;
3107
3108
0
          do { end_subpattern += GET(end_subpattern, 1); }
3109
0
            while (*end_subpattern == OP_ALT);
3110
0
          next_state_offset =
3111
0
            (int)(end_subpattern - start_code + LINK_SIZE + 1);
3112
3113
          /* Optimization: if there are no more active states, and there
3114
          are no new states yet set up, then skip over the subject string
3115
          right here, to save looping. Otherwise, set up the new state to swing
3116
          into action when the end of the matched substring is reached. */
3117
3118
0
          if (i + 1 >= active_count && new_count == 0)
3119
0
            {
3120
0
            ptr = local_ptr;
3121
0
            clen = 0;
3122
0
            ADD_NEW(next_state_offset, 0);
3123
0
            }
3124
0
          else
3125
0
            {
3126
0
            PCRE2_SPTR p = ptr;
3127
0
            PCRE2_SPTR pp = local_ptr;
3128
0
            charcount = (PCRE2_SIZE)(pp - p);
3129
0
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3130
0
            if (utf) while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
3131
0
#endif
3132
0
            ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1));
3133
0
            }
3134
0
          }
3135
0
        }
3136
0
      break;
3137
3138
      /*-----------------------------------------------------------------*/
3139
0
      case OP_ONCE:
3140
0
        {
3141
0
        int rc;
3142
0
        int *local_workspace;
3143
0
        PCRE2_SIZE *local_offsets;
3144
0
        RWS_anchor *rws = (RWS_anchor *)RWS;
3145
3146
0
        if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
3147
0
          {
3148
0
          rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
3149
0
          if (rc != 0) return rc;
3150
0
          RWS = (int *)rws;
3151
0
          }
3152
3153
0
        local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
3154
0
        local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
3155
0
        rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
3156
3157
0
        rc = internal_dfa_match(
3158
0
          mb,                                   /* fixed match data */
3159
0
          code,                                 /* this subexpression's code */
3160
0
          ptr,                                  /* where we currently are */
3161
0
          (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
3162
0
          local_offsets,                        /* offset vector */
3163
0
          RWS_OVEC_OSIZE/OVEC_UNIT,             /* size of same */
3164
0
          local_workspace,                      /* workspace vector */
3165
0
          RWS_RSIZE,                            /* size of same */
3166
0
          rlevel,                               /* function recursion level */
3167
0
          RWS);                                 /* recursion workspace */
3168
3169
0
        rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
3170
3171
0
        if (rc >= 0)
3172
0
          {
3173
0
          PCRE2_SPTR end_subpattern = code;
3174
0
          PCRE2_SIZE charcount = local_offsets[1] - local_offsets[0];
3175
0
          int next_state_offset, repeat_state_offset;
3176
3177
0
          do { end_subpattern += GET(end_subpattern, 1); }
3178
0
            while (*end_subpattern == OP_ALT);
3179
0
          next_state_offset =
3180
0
            (int)(end_subpattern - start_code + LINK_SIZE + 1);
3181
3182
          /* If the end of this subpattern is KETRMAX or KETRMIN, we must
3183
          arrange for the repeat state also to be added to the relevant list.
3184
          Calculate the offset, or set -1 for no repeat. */
3185
3186
0
          repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
3187
0
                                 *end_subpattern == OP_KETRMIN)?
3188
0
            (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
3189
3190
          /* If we have matched an empty string, add the next state at the
3191
          current character pointer. This is important so that the duplicate
3192
          checking kicks in, which is what breaks infinite loops that match an
3193
          empty string. */
3194
3195
0
          if (charcount == 0)
3196
0
            {
3197
0
            ADD_ACTIVE(next_state_offset, 0);
3198
0
            }
3199
3200
          /* Optimization: if there are no more active states, and there
3201
          are no new states yet set up, then skip over the subject string
3202
          right here, to save looping. Otherwise, set up the new state to swing
3203
          into action when the end of the matched substring is reached. */
3204
3205
0
          else if (i + 1 >= active_count && new_count == 0)
3206
0
            {
3207
0
            ptr += charcount;
3208
0
            clen = 0;
3209
0
            ADD_NEW(next_state_offset, 0);
3210
3211
            /* If we are adding a repeat state at the new character position,
3212
            we must fudge things so that it is the only current state.
3213
            Otherwise, it might be a duplicate of one we processed before, and
3214
            that would cause it to be skipped. */
3215
3216
0
            if (repeat_state_offset >= 0)
3217
0
              {
3218
0
              next_active_state = active_states;
3219
0
              active_count = 0;
3220
0
              i = -1;
3221
0
              ADD_ACTIVE(repeat_state_offset, 0);
3222
0
              }
3223
0
            }
3224
0
          else
3225
0
            {
3226
0
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3227
0
            if (utf)
3228
0
              {
3229
0
              PCRE2_SPTR p = start_subject + local_offsets[0];
3230
0
              PCRE2_SPTR pp = start_subject + local_offsets[1];
3231
0
              while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
3232
0
              }
3233
0
#endif
3234
0
            ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1));
3235
0
            if (repeat_state_offset >= 0)
3236
0
              { ADD_NEW_DATA(-repeat_state_offset, 0, (int)(charcount - 1)); }
3237
0
            }
3238
0
          }
3239
0
        else if (rc != PCRE2_ERROR_NOMATCH) return rc;
3240
0
        }
3241
0
      break;
3242
3243
3244
/* ========================================================================== */
3245
      /* Handle callouts */
3246
3247
0
      case OP_CALLOUT:
3248
0
      case OP_CALLOUT_STR:
3249
0
        {
3250
0
        PCRE2_SIZE callout_length;
3251
0
        rrc = do_callout_dfa(code, offsets, current_subject, ptr, mb, 0,
3252
0
          &callout_length);
3253
0
        if (rrc < 0) return rrc;   /* Abandon */
3254
0
        if (rrc == 0)
3255
0
          { ADD_ACTIVE(state_offset + (int)callout_length, 0); }
3256
0
        }
3257
0
      break;
3258
3259
3260
/* ========================================================================== */
3261
0
      default:        /* Unsupported opcode */
3262
0
      return PCRE2_ERROR_DFA_UITEM;
3263
0
      }
3264
3265
0
    NEXT_ACTIVE_STATE: continue;
3266
3267
0
    }      /* End of loop scanning active states */
3268
3269
  /* We have finished the processing at the current subject character. If no
3270
  new states have been set for the next character, we have found all the
3271
  matches that we are going to find. If partial matching has been requested,
3272
  check for appropriate conditions.
3273
3274
  The "forced_fail" variable counts the number of (*F) encountered for the
3275
  character. If it is equal to the original active_count (saved in
3276
  workspace[1]) it means that (*F) was found on every active state. In this
3277
  case we don't want to give a partial match.
3278
3279
  The "could_continue" variable is true if a state could have continued but
3280
  for the fact that the end of the subject was reached. */
3281
3282
0
  if (new_count <= 0)
3283
0
    {
3284
0
    if (could_continue &&                            /* Some could go on, and */
3285
0
        forced_fail != workspace[1] &&               /* Not all forced fail & */
3286
0
        (                                            /* either... */
3287
0
        (mb->moptions & PCRE2_PARTIAL_HARD) != 0      /* Hard partial */
3288
0
        ||                                           /* or... */
3289
0
        ((mb->moptions & PCRE2_PARTIAL_SOFT) != 0 &&  /* Soft partial and */
3290
0
         match_count < 0)                             /* no matches */
3291
0
        ) &&                                         /* And... */
3292
0
        (
3293
0
        partial_newline ||                   /* Either partial NL */
3294
0
          (                                  /* or ... */
3295
0
          ptr >= end_subject &&              /* End of subject and */
3296
0
            (                                  /* either */
3297
0
            ptr > mb->start_used_ptr ||        /* Inspected non-empty string */
3298
0
            mb->allowemptypartial              /* or pattern has lookbehind */
3299
0
            )                                  /* or could match empty */
3300
0
          )
3301
0
        ))
3302
0
      match_count = PCRE2_ERROR_PARTIAL;
3303
0
    break;  /* Exit from loop along the subject string */
3304
0
    }
3305
3306
  /* One or more states are active for the next character. */
3307
3308
0
  ptr += clen;    /* Advance to next subject character */
3309
0
  }               /* Loop to move along the subject string */
3310
3311
/* Control gets here from "break" a few lines above. If we have a match and
3312
PCRE2_ENDANCHORED is set, the match fails unless ptr is at the subject end. */
3313
3314
0
if (match_count >= 0 &&
3315
0
    ((mb->moptions | mb->poptions) & PCRE2_ENDANCHORED) != 0 &&
3316
0
    ptr < end_subject)
3317
0
  match_count = PCRE2_ERROR_NOMATCH;
3318
3319
0
return match_count;
3320
0
}
3321
3322
3323
3324
/*************************************************
3325
*     Match a pattern using the DFA algorithm    *
3326
*************************************************/
3327
3328
/* This function matches a compiled pattern to a subject string, using the
3329
alternate matching algorithm that finds all matches at once.
3330
3331
Arguments:
3332
  code          points to the compiled pattern
3333
  subject       subject string
3334
  length        length of subject string
3335
  start_offset  where to start matching in the subject
3336
  options       option bits
3337
  match_data    points to a match data structure
3338
  mcontext      points to a match context, or NULL to use the defaults
3339
  workspace     pointer to workspace
3340
  wscount       size of workspace
3341
3342
Returns:        > 0 => number of match offset pairs placed in offsets
3343
                = 0 => offsets overflowed; longest matches are present
3344
                 -1 => failed to match
3345
               < -1 => some kind of unexpected problem
3346
*/
3347
3348
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
3349
pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
3350
  PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
3351
  pcre2_match_context *mcontext, int *workspace, PCRE2_SIZE wscount)
3352
0
{
3353
0
int rc;
3354
0
int was_zero_terminated = 0;
3355
3356
0
const pcre2_real_code *re = (const pcre2_real_code *)code;
3357
3358
0
PCRE2_SPTR start_match;
3359
0
PCRE2_SPTR end_subject;
3360
0
PCRE2_SPTR bumpalong_limit;
3361
0
PCRE2_SPTR req_cu_ptr;
3362
3363
0
BOOL utf, anchored, startline, firstline;
3364
0
BOOL has_first_cu = FALSE;
3365
0
BOOL has_req_cu = FALSE;
3366
3367
0
#if PCRE2_CODE_UNIT_WIDTH == 8
3368
0
PCRE2_SPTR memchr_found_first_cu = NULL;
3369
0
PCRE2_SPTR memchr_found_first_cu2 = NULL;
3370
0
#endif
3371
3372
0
PCRE2_UCHAR first_cu = 0;
3373
0
PCRE2_UCHAR first_cu2 = 0;
3374
0
PCRE2_UCHAR req_cu = 0;
3375
0
PCRE2_UCHAR req_cu2 = 0;
3376
3377
0
const uint8_t *start_bits = NULL;
3378
3379
/* We need to have mb pointing to a match block, because the IS_NEWLINE macro
3380
is used below, and it expects NLBLOCK to be defined as a pointer. */
3381
3382
0
pcre2_callout_block cb;
3383
0
dfa_match_block actual_match_block;
3384
0
dfa_match_block *mb = &actual_match_block;
3385
3386
/* Set up a starting block of memory for use during recursive calls to
3387
internal_dfa_match(). By putting this on the stack, it minimizes resource use
3388
in the case when it is not needed. If this is too small, more memory is
3389
obtained from the heap. At the start of each block is an anchor structure. */
3390
3391
0
int base_recursion_workspace[RWS_BASE_SIZE];
3392
0
RWS_anchor *rws = (RWS_anchor *)base_recursion_workspace;
3393
0
rws->next = NULL;
3394
0
rws->size = RWS_BASE_SIZE;
3395
0
rws->free = RWS_BASE_SIZE - RWS_ANCHOR_SIZE;
3396
3397
/* Recognize NULL, length 0 as an empty string. */
3398
3399
0
if (subject == NULL && length == 0) subject = (PCRE2_SPTR)"";
3400
3401
/* Plausibility checks */
3402
3403
0
if ((options & ~PUBLIC_DFA_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION;
3404
0
if (re == NULL || subject == NULL || workspace == NULL || match_data == NULL)
3405
0
  return PCRE2_ERROR_NULL;
3406
3407
0
if (length == PCRE2_ZERO_TERMINATED)
3408
0
  {
3409
0
  length = PRIV(strlen)(subject);
3410
0
  was_zero_terminated = 1;
3411
0
  }
3412
3413
0
if (wscount < 20) return PCRE2_ERROR_DFA_WSSIZE;
3414
0
if (start_offset > length) return PCRE2_ERROR_BADOFFSET;
3415
3416
/* Partial matching and PCRE2_ENDANCHORED are currently not allowed at the same
3417
time. */
3418
3419
0
if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
3420
0
   ((re->overall_options | options) & PCRE2_ENDANCHORED) != 0)
3421
0
  return PCRE2_ERROR_BADOPTION;
3422
3423
/* Invalid UTF support is not available for DFA matching. */
3424
3425
0
if ((re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0)
3426
0
  return PCRE2_ERROR_DFA_UINVALID_UTF;
3427
3428
/* Check that the first field in the block is the magic number. If it is not,
3429
return with PCRE2_ERROR_BADMAGIC. */
3430
3431
0
if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC;
3432
3433
/* Check the code unit width. */
3434
3435
0
if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8)
3436
0
  return PCRE2_ERROR_BADMODE;
3437
3438
/* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the
3439
options variable for this function. Users of PCRE2 who are not calling the
3440
function directly would like to have a way of setting these flags, in the same
3441
way that they can set pcre2_compile() flags like PCRE2_NO_AUTOPOSSESS with
3442
constructions like (*NO_AUTOPOSSESS). To enable this, (*NOTEMPTY) and
3443
(*NOTEMPTY_ATSTART) set bits in the pattern's "flags" field which can now be
3444
transferred to the options for this function. The bits are guaranteed to be
3445
adjacent, but do not have the same values. This bit of Boolean trickery assumes
3446
that the match-time bits are not more significant than the flag bits. If by
3447
accident this is not the case, a compile-time division by zero error will
3448
occur. */
3449
3450
0
#define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET)
3451
0
#define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART)
3452
0
options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1)));
3453
0
#undef FF
3454
0
#undef OO
3455
3456
/* If restarting after a partial match, do some sanity checks on the contents
3457
of the workspace. */
3458
3459
0
if ((options & PCRE2_DFA_RESTART) != 0)
3460
0
  {
3461
0
  if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
3462
0
    workspace[1] > (int)((wscount - 2)/INTS_PER_STATEBLOCK))
3463
0
      return PCRE2_ERROR_DFA_BADRESTART;
3464
0
  }
3465
3466
/* Set some local values */
3467
3468
0
utf = (re->overall_options & PCRE2_UTF) != 0;
3469
0
start_match = subject + start_offset;
3470
0
end_subject = subject + length;
3471
0
req_cu_ptr = start_match - 1;
3472
0
anchored = (options & (PCRE2_ANCHORED|PCRE2_DFA_RESTART)) != 0 ||
3473
0
  (re->overall_options & PCRE2_ANCHORED) != 0;
3474
3475
/* The "must be at the start of a line" flags are used in a loop when finding
3476
where to start. */
3477
3478
0
startline = (re->flags & PCRE2_STARTLINE) != 0;
3479
0
firstline = !anchored && (re->overall_options & PCRE2_FIRSTLINE) != 0;
3480
0
bumpalong_limit = end_subject;
3481
3482
/* Initialize and set up the fixed fields in the callout block, with a pointer
3483
in the match block. */
3484
3485
0
mb->cb = &cb;
3486
0
cb.version = 2;
3487
0
cb.subject = subject;
3488
0
cb.subject_length = (PCRE2_SIZE)(end_subject - subject);
3489
0
cb.callout_flags = 0;
3490
0
cb.capture_top      = 1;      /* No capture support */
3491
0
cb.capture_last     = 0;
3492
0
cb.mark             = NULL;   /* No (*MARK) support */
3493
3494
/* Get data from the match context, if present, and fill in the remaining
3495
fields in the match block. It is an error to set an offset limit without
3496
setting the flag at compile time. */
3497
3498
0
if (mcontext == NULL)
3499
0
  {
3500
0
  mb->callout = NULL;
3501
0
  mb->memctl = re->memctl;
3502
0
  mb->match_limit = PRIV(default_match_context).match_limit;
3503
0
  mb->match_limit_depth = PRIV(default_match_context).depth_limit;
3504
0
  mb->heap_limit = PRIV(default_match_context).heap_limit;
3505
0
  }
3506
0
else
3507
0
  {
3508
0
  if (mcontext->offset_limit != PCRE2_UNSET)
3509
0
    {
3510
0
    if ((re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0)
3511
0
      return PCRE2_ERROR_BADOFFSETLIMIT;
3512
0
    bumpalong_limit = subject + mcontext->offset_limit;
3513
0
    }
3514
0
  mb->callout = mcontext->callout;
3515
0
  mb->callout_data = mcontext->callout_data;
3516
0
  mb->memctl = mcontext->memctl;
3517
0
  mb->match_limit = mcontext->match_limit;
3518
0
  mb->match_limit_depth = mcontext->depth_limit;
3519
0
  mb->heap_limit = mcontext->heap_limit;
3520
0
  }
3521
3522
0
if (mb->match_limit > re->limit_match)
3523
0
  mb->match_limit = re->limit_match;
3524
3525
0
if (mb->match_limit_depth > re->limit_depth)
3526
0
  mb->match_limit_depth = re->limit_depth;
3527
3528
0
if (mb->heap_limit > re->limit_heap)
3529
0
  mb->heap_limit = re->limit_heap;
3530
3531
0
mb->start_code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
3532
0
  re->name_count * re->name_entry_size;
3533
0
mb->tables = re->tables;
3534
0
mb->start_subject = subject;
3535
0
mb->end_subject = end_subject;
3536
0
mb->start_offset = start_offset;
3537
0
mb->allowemptypartial = (re->max_lookbehind > 0) ||
3538
0
  (re->flags & PCRE2_MATCH_EMPTY) != 0;
3539
0
mb->moptions = options;
3540
0
mb->poptions = re->overall_options;
3541
0
mb->match_call_count = 0;
3542
0
mb->heap_used = 0;
3543
3544
/* Process the \R and newline settings. */
3545
3546
0
mb->bsr_convention = re->bsr_convention;
3547
0
mb->nltype = NLTYPE_FIXED;
3548
0
switch(re->newline_convention)
3549
0
  {
3550
0
  case PCRE2_NEWLINE_CR:
3551
0
  mb->nllen = 1;
3552
0
  mb->nl[0] = CHAR_CR;
3553
0
  break;
3554
3555
0
  case PCRE2_NEWLINE_LF:
3556
0
  mb->nllen = 1;
3557
0
  mb->nl[0] = CHAR_NL;
3558
0
  break;
3559
3560
0
  case PCRE2_NEWLINE_NUL:
3561
0
  mb->nllen = 1;
3562
0
  mb->nl[0] = CHAR_NUL;
3563
0
  break;
3564
3565
0
  case PCRE2_NEWLINE_CRLF:
3566
0
  mb->nllen = 2;
3567
0
  mb->nl[0] = CHAR_CR;
3568
0
  mb->nl[1] = CHAR_NL;
3569
0
  break;
3570
3571
0
  case PCRE2_NEWLINE_ANY:
3572
0
  mb->nltype = NLTYPE_ANY;
3573
0
  break;
3574
3575
0
  case PCRE2_NEWLINE_ANYCRLF:
3576
0
  mb->nltype = NLTYPE_ANYCRLF;
3577
0
  break;
3578
3579
0
  default: return PCRE2_ERROR_INTERNAL;
3580
0
  }
3581
3582
/* Check a UTF string for validity if required. For 8-bit and 16-bit strings,
3583
we must also check that a starting offset does not point into the middle of a
3584
multiunit character. We check only the portion of the subject that is going to
3585
be inspected during matching - from the offset minus the maximum lookbehind
3586
to the given length. This saves time when a small part of a large subject is
3587
being matched by the use of a starting offset. Note that the maximum lookbehind
3588
is a number of characters, not code units. */
3589
3590
0
#ifdef SUPPORT_UNICODE
3591
0
if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
3592
0
  {
3593
0
  PCRE2_SPTR check_subject = start_match;  /* start_match includes offset */
3594
3595
0
  if (start_offset > 0)
3596
0
    {
3597
0
#if PCRE2_CODE_UNIT_WIDTH != 32
3598
0
    unsigned int i;
3599
0
    if (start_match < end_subject && NOT_FIRSTCU(*start_match))
3600
0
      return PCRE2_ERROR_BADUTFOFFSET;
3601
0
    for (i = re->max_lookbehind; i > 0 && check_subject > subject; i--)
3602
0
      {
3603
0
      check_subject--;
3604
0
      while (check_subject > subject &&
3605
0
#if PCRE2_CODE_UNIT_WIDTH == 8
3606
0
      (*check_subject & 0xc0) == 0x80)
3607
#else  /* 16-bit */
3608
      (*check_subject & 0xfc00) == 0xdc00)
3609
#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
3610
0
        check_subject--;
3611
0
      }
3612
#else   /* In the 32-bit library, one code unit equals one character. */
3613
    check_subject -= re->max_lookbehind;
3614
    if (check_subject < subject) check_subject = subject;
3615
#endif  /* PCRE2_CODE_UNIT_WIDTH != 32 */
3616
0
    }
3617
3618
  /* Validate the relevant portion of the subject. After an error, adjust the
3619
  offset to be an absolute offset in the whole string. */
3620
3621
0
  match_data->rc = PRIV(valid_utf)(check_subject,
3622
0
    length - (PCRE2_SIZE)(check_subject - subject), &(match_data->startchar));
3623
0
  if (match_data->rc != 0)
3624
0
    {
3625
0
    match_data->startchar += (PCRE2_SIZE)(check_subject - subject);
3626
0
    return match_data->rc;
3627
0
    }
3628
0
  }
3629
0
#endif  /* SUPPORT_UNICODE */
3630
3631
/* Set up the first code unit to match, if available. If there's no first code
3632
unit there may be a bitmap of possible first characters. */
3633
3634
0
if ((re->flags & PCRE2_FIRSTSET) != 0)
3635
0
  {
3636
0
  has_first_cu = TRUE;
3637
0
  first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
3638
0
  if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
3639
0
    {
3640
0
    first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu);
3641
0
#ifdef SUPPORT_UNICODE
3642
0
#if PCRE2_CODE_UNIT_WIDTH == 8
3643
0
    if (first_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0)
3644
0
      first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
3645
#else
3646
    if (first_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0))
3647
      first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
3648
#endif
3649
0
#endif  /* SUPPORT_UNICODE */
3650
0
    }
3651
0
  }
3652
0
else
3653
0
  if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
3654
0
    start_bits = re->start_bitmap;
3655
3656
/* There may be a "last known required code unit" set. */
3657
3658
0
if ((re->flags & PCRE2_LASTSET) != 0)
3659
0
  {
3660
0
  has_req_cu = TRUE;
3661
0
  req_cu = req_cu2 = (PCRE2_UCHAR)(re->last_codeunit);
3662
0
  if ((re->flags & PCRE2_LASTCASELESS) != 0)
3663
0
    {
3664
0
    req_cu2 = TABLE_GET(req_cu, mb->tables + fcc_offset, req_cu);
3665
0
#ifdef SUPPORT_UNICODE
3666
0
#if PCRE2_CODE_UNIT_WIDTH == 8
3667
0
    if (req_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0)
3668
0
      req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
3669
#else
3670
    if (req_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0))
3671
      req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
3672
#endif
3673
0
#endif  /* SUPPORT_UNICODE */
3674
0
    }
3675
0
  }
3676
3677
/* If the match data block was previously used with PCRE2_COPY_MATCHED_SUBJECT,
3678
free the memory that was obtained. */
3679
3680
0
if ((match_data->flags & PCRE2_MD_COPIED_SUBJECT) != 0)
3681
0
  {
3682
0
  match_data->memctl.free((void *)match_data->subject,
3683
0
    match_data->memctl.memory_data);
3684
0
  match_data->flags &= ~PCRE2_MD_COPIED_SUBJECT;
3685
0
  }
3686
3687
/* Fill in fields that are always returned in the match data. */
3688
3689
0
match_data->code = re;
3690
0
match_data->subject = NULL;  /* Default for no match */
3691
0
match_data->mark = NULL;
3692
0
match_data->matchedby = PCRE2_MATCHEDBY_DFA_INTERPRETER;
3693
3694
/* Call the main matching function, looping for a non-anchored regex after a
3695
failed match. If not restarting, perform certain optimizations at the start of
3696
a match. */
3697
3698
0
for (;;)
3699
0
  {
3700
  /* ----------------- Start of match optimizations ---------------- */
3701
3702
  /* There are some optimizations that avoid running the match if a known
3703
  starting point is not found, or if a known later code unit is not present.
3704
  However, there is an option (settable at compile time) that disables
3705
  these, for testing and for ensuring that all callouts do actually occur.
3706
  The optimizations must also be avoided when restarting a DFA match. */
3707
3708
0
  if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0 &&
3709
0
      (options & PCRE2_DFA_RESTART) == 0)
3710
0
    {
3711
    /* If firstline is TRUE, the start of the match is constrained to the first
3712
    line of a multiline string. That is, the match must be before or at the
3713
    first newline following the start of matching. Temporarily adjust
3714
    end_subject so that we stop the optimization scans for a first code unit
3715
    immediately after the first character of a newline (the first code unit can
3716
    legitimately be a newline). If the match fails at the newline, later code
3717
    breaks this loop. */
3718
3719
0
    if (firstline)
3720
0
      {
3721
0
      PCRE2_SPTR t = start_match;
3722
0
#ifdef SUPPORT_UNICODE
3723
0
      if (utf)
3724
0
        {
3725
0
        while (t < end_subject && !IS_NEWLINE(t))
3726
0
          {
3727
0
          t++;
3728
0
          ACROSSCHAR(t < end_subject, t, t++);
3729
0
          }
3730
0
        }
3731
0
      else
3732
0
#endif
3733
0
      while (t < end_subject && !IS_NEWLINE(t)) t++;
3734
0
      end_subject = t;
3735
0
      }
3736
3737
    /* Anchored: check the first code unit if one is recorded. This may seem
3738
    pointless but it can help in detecting a no match case without scanning for
3739
    the required code unit. */
3740
3741
0
    if (anchored)
3742
0
      {
3743
0
      if (has_first_cu || start_bits != NULL)
3744
0
        {
3745
0
        BOOL ok = start_match < end_subject;
3746
0
        if (ok)
3747
0
          {
3748
0
          PCRE2_UCHAR c = UCHAR21TEST(start_match);
3749
0
          ok = has_first_cu && (c == first_cu || c == first_cu2);
3750
0
          if (!ok && start_bits != NULL)
3751
0
            {
3752
#if PCRE2_CODE_UNIT_WIDTH != 8
3753
            if (c > 255) c = 255;
3754
#endif
3755
0
            ok = (start_bits[c/8] & (1u << (c&7))) != 0;
3756
0
            }
3757
0
          }
3758
0
        if (!ok) break;
3759
0
        }
3760
0
      }
3761
3762
    /* Not anchored. Advance to a unique first code unit if there is one. */
3763
3764
0
    else
3765
0
      {
3766
0
      if (has_first_cu)
3767
0
        {
3768
0
        if (first_cu != first_cu2)  /* Caseless */
3769
0
          {
3770
          /* In 16-bit and 32-bit modes we have to do our own search, so can
3771
          look for both cases at once. */
3772
3773
#if PCRE2_CODE_UNIT_WIDTH != 8
3774
          PCRE2_UCHAR smc;
3775
          while (start_match < end_subject &&
3776
                (smc = UCHAR21TEST(start_match)) != first_cu &&
3777
                 smc != first_cu2)
3778
            start_match++;
3779
#else
3780
          /* In 8-bit mode, the use of memchr() gives a big speed up, even
3781
          though we have to call it twice in order to find the earliest
3782
          occurrence of the code unit in either of its cases. Caching is used
3783
          to remember the positions of previously found code units. This can
3784
          make a huge difference when the strings are very long and only one
3785
          case is actually present. */
3786
3787
0
          PCRE2_SPTR pp1 = NULL;
3788
0
          PCRE2_SPTR pp2 = NULL;
3789
0
          PCRE2_SIZE searchlength = end_subject - start_match;
3790
3791
          /* If we haven't got a previously found position for first_cu, or if
3792
          the current starting position is later, we need to do a search. If
3793
          the code unit is not found, set it to the end. */
3794
3795
0
          if (memchr_found_first_cu == NULL ||
3796
0
              start_match > memchr_found_first_cu)
3797
0
            {
3798
0
            pp1 = memchr(start_match, first_cu, searchlength);
3799
0
            memchr_found_first_cu = (pp1 == NULL)? end_subject : pp1;
3800
0
            }
3801
3802
          /* If the start is before a previously found position, use the
3803
          previous position, or NULL if a previous search failed. */
3804
3805
0
          else pp1 = (memchr_found_first_cu == end_subject)? NULL :
3806
0
            memchr_found_first_cu;
3807
3808
          /* Do the same thing for the other case. */
3809
3810
0
          if (memchr_found_first_cu2 == NULL ||
3811
0
              start_match > memchr_found_first_cu2)
3812
0
            {
3813
0
            pp2 = memchr(start_match, first_cu2, searchlength);
3814
0
            memchr_found_first_cu2 = (pp2 == NULL)? end_subject : pp2;
3815
0
            }
3816
3817
0
          else pp2 = (memchr_found_first_cu2 == end_subject)? NULL :
3818
0
            memchr_found_first_cu2;
3819
3820
          /* Set the start to the end of the subject if neither case was found.
3821
          Otherwise, use the earlier found point. */
3822
3823
0
          if (pp1 == NULL)
3824
0
            start_match = (pp2 == NULL)? end_subject : pp2;
3825
0
          else
3826
0
            start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
3827
3828
0
#endif  /* 8-bit handling */
3829
0
          }
3830
3831
        /* The caseful case is much simpler. */
3832
3833
0
        else
3834
0
          {
3835
#if PCRE2_CODE_UNIT_WIDTH != 8
3836
          while (start_match < end_subject && UCHAR21TEST(start_match) !=
3837
                 first_cu)
3838
            start_match++;
3839
#else  /* 8-bit code units */
3840
0
          start_match = memchr(start_match, first_cu, end_subject - start_match);
3841
0
          if (start_match == NULL) start_match = end_subject;
3842
0
#endif
3843
0
          }
3844
3845
        /* If we can't find the first code unit, having reached the true end
3846
        of the subject, break the bumpalong loop, to force a match failure,
3847
        except when doing partial matching, when we let the next cycle run at
3848
        the end of the subject. To see why, consider the pattern /(?<=abc)def/,
3849
        which partially matches "abc", even though the string does not contain
3850
        the starting character "d". If we have not reached the true end of the
3851
        subject (PCRE2_FIRSTLINE caused end_subject to be temporarily modified)
3852
        we also let the cycle run, because the matching string is legitimately
3853
        allowed to start with the first code unit of a newline. */
3854
3855
0
        if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 &&
3856
0
            start_match >= mb->end_subject)
3857
0
          break;
3858
0
        }
3859
3860
      /* If there's no first code unit, advance to just after a linebreak for a
3861
      multiline match if required. */
3862
3863
0
      else if (startline)
3864
0
        {
3865
0
        if (start_match > mb->start_subject + start_offset)
3866
0
          {
3867
0
#ifdef SUPPORT_UNICODE
3868
0
          if (utf)
3869
0
            {
3870
0
            while (start_match < end_subject && !WAS_NEWLINE(start_match))
3871
0
              {
3872
0
              start_match++;
3873
0
              ACROSSCHAR(start_match < end_subject, start_match, start_match++);
3874
0
              }
3875
0
            }
3876
0
          else
3877
0
#endif
3878
0
          while (start_match < end_subject && !WAS_NEWLINE(start_match))
3879
0
            start_match++;
3880
3881
          /* If we have just passed a CR and the newline option is ANY or
3882
          ANYCRLF, and we are now at a LF, advance the match position by one
3883
          more code unit. */
3884
3885
0
          if (start_match[-1] == CHAR_CR &&
3886
0
               (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
3887
0
               start_match < end_subject &&
3888
0
               UCHAR21TEST(start_match) == CHAR_NL)
3889
0
            start_match++;
3890
0
          }
3891
0
        }
3892
3893
      /* If there's no first code unit or a requirement for a multiline line
3894
      start, advance to a non-unique first code unit if any have been
3895
      identified. The bitmap contains only 256 bits. When code units are 16 or
3896
      32 bits wide, all code units greater than 254 set the 255 bit. */
3897
3898
0
      else if (start_bits != NULL)
3899
0
        {
3900
0
        while (start_match < end_subject)
3901
0
          {
3902
0
          uint32_t c = UCHAR21TEST(start_match);
3903
#if PCRE2_CODE_UNIT_WIDTH != 8
3904
          if (c > 255) c = 255;
3905
#endif
3906
0
          if ((start_bits[c/8] & (1u << (c&7))) != 0) break;
3907
0
          start_match++;
3908
0
          }
3909
3910
        /* See comment above in first_cu checking about the next line. */
3911
3912
0
        if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 &&
3913
0
            start_match >= mb->end_subject)
3914
0
          break;
3915
0
        }
3916
0
      }  /* End of first code unit handling */
3917
3918
    /* Restore fudged end_subject */
3919
3920
0
    end_subject = mb->end_subject;
3921
3922
    /* The following two optimizations are disabled for partial matching. */
3923
3924
0
    if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0)
3925
0
      {
3926
0
      PCRE2_SPTR p;
3927
3928
      /* The minimum matching length is a lower bound; no actual string of that
3929
      length may actually match the pattern. Although the value is, strictly,
3930
      in characters, we treat it as code units to avoid spending too much time
3931
      in this optimization. */
3932
3933
0
      if (end_subject - start_match < re->minlength) goto NOMATCH_EXIT;
3934
3935
      /* If req_cu is set, we know that that code unit must appear in the
3936
      subject for the match to succeed. If the first code unit is set, req_cu
3937
      must be later in the subject; otherwise the test starts at the match
3938
      point. This optimization can save a huge amount of backtracking in
3939
      patterns with nested unlimited repeats that aren't going to match.
3940
      Writing separate code for cased/caseless versions makes it go faster, as
3941
      does using an autoincrement and backing off on a match. As in the case of
3942
      the first code unit, using memchr() in the 8-bit library gives a big
3943
      speed up. Unlike the first_cu check above, we do not always need to call
3944
      memchr() twice in the caseless case because we only need to check for the
3945
      presence of the character in either case, not find the first occurrence.
3946
3947
      The search can be skipped if the code unit was found later than the
3948
      current starting point in a previous iteration of the bumpalong loop.
3949
3950
      HOWEVER: when the subject string is very, very long, searching to its end
3951
      can take a long time, and give bad performance on quite ordinary
3952
      patterns. This showed up when somebody was matching something like
3953
      /^\d+C/ on a 32-megabyte string... so we don't do this when the string is
3954
      sufficiently long, but it's worth searching a lot more for unanchored
3955
      patterns. */
3956
3957
0
      p = start_match + (has_first_cu? 1:0);
3958
0
      if (has_req_cu && p > req_cu_ptr)
3959
0
        {
3960
0
        PCRE2_SIZE check_length = end_subject - start_match;
3961
3962
0
        if (check_length < REQ_CU_MAX ||
3963
0
              (!anchored && check_length < REQ_CU_MAX * 1000))
3964
0
          {
3965
0
          if (req_cu != req_cu2)  /* Caseless */
3966
0
            {
3967
#if PCRE2_CODE_UNIT_WIDTH != 8
3968
            while (p < end_subject)
3969
              {
3970
              uint32_t pp = UCHAR21INCTEST(p);
3971
              if (pp == req_cu || pp == req_cu2) { p--; break; }
3972
              }
3973
#else  /* 8-bit code units */
3974
0
            PCRE2_SPTR pp = p;
3975
0
            p = memchr(pp, req_cu, end_subject - pp);
3976
0
            if (p == NULL)
3977
0
              {
3978
0
              p = memchr(pp, req_cu2, end_subject - pp);
3979
0
              if (p == NULL) p = end_subject;
3980
0
              }
3981
0
#endif /* PCRE2_CODE_UNIT_WIDTH != 8 */
3982
0
            }
3983
3984
          /* The caseful case */
3985
3986
0
          else
3987
0
            {
3988
#if PCRE2_CODE_UNIT_WIDTH != 8
3989
            while (p < end_subject)
3990
              {
3991
              if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
3992
              }
3993
3994
#else  /* 8-bit code units */
3995
0
            p = memchr(p, req_cu, end_subject - p);
3996
0
            if (p == NULL) p = end_subject;
3997
0
#endif
3998
0
            }
3999
4000
          /* If we can't find the required code unit, break the matching loop,
4001
          forcing a match failure. */
4002
4003
0
          if (p >= end_subject) break;
4004
4005
          /* If we have found the required code unit, save the point where we
4006
          found it, so that we don't search again next time round the loop if
4007
          the start hasn't passed this code unit yet. */
4008
4009
0
          req_cu_ptr = p;
4010
0
          }
4011
0
        }
4012
0
      }
4013
0
    }
4014
4015
  /* ------------ End of start of match optimizations ------------ */
4016
4017
  /* Give no match if we have passed the bumpalong limit. */
4018
4019
0
  if (start_match > bumpalong_limit) break;
4020
4021
  /* OK, now we can do the business */
4022
4023
0
  mb->start_used_ptr = start_match;
4024
0
  mb->last_used_ptr = start_match;
4025
0
  mb->recursive = NULL;
4026
4027
0
  rc = internal_dfa_match(
4028
0
    mb,                           /* fixed match data */
4029
0
    mb->start_code,               /* this subexpression's code */
4030
0
    start_match,                  /* where we currently are */
4031
0
    start_offset,                 /* start offset in subject */
4032
0
    match_data->ovector,          /* offset vector */
4033
0
    (uint32_t)match_data->oveccount * 2,  /* actual size of same */
4034
0
    workspace,                    /* workspace vector */
4035
0
    (int)wscount,                 /* size of same */
4036
0
    0,                            /* function recurse level */
4037
0
    base_recursion_workspace);    /* initial workspace for recursion */
4038
4039
  /* Anything other than "no match" means we are done, always; otherwise, carry
4040
  on only if not anchored. */
4041
4042
0
  if (rc != PCRE2_ERROR_NOMATCH || anchored)
4043
0
    {
4044
0
    if (rc == PCRE2_ERROR_PARTIAL && match_data->oveccount > 0)
4045
0
      {
4046
0
      match_data->ovector[0] = (PCRE2_SIZE)(start_match - subject);
4047
0
      match_data->ovector[1] = (PCRE2_SIZE)(end_subject - subject);
4048
0
      }
4049
0
    match_data->subject_length = length;
4050
0
    match_data->leftchar = (PCRE2_SIZE)(mb->start_used_ptr - subject);
4051
0
    match_data->rightchar = (PCRE2_SIZE)(mb->last_used_ptr - subject);
4052
0
    match_data->startchar = (PCRE2_SIZE)(start_match - subject);
4053
0
    match_data->rc = rc;
4054
4055
0
    if (rc >= 0 &&(options & PCRE2_COPY_MATCHED_SUBJECT) != 0)
4056
0
      {
4057
0
      length = CU2BYTES(length + was_zero_terminated);
4058
0
      match_data->subject = match_data->memctl.malloc(length,
4059
0
        match_data->memctl.memory_data);
4060
0
      if (match_data->subject == NULL) return PCRE2_ERROR_NOMEMORY;
4061
0
      memcpy((void *)match_data->subject, subject, length);
4062
0
      match_data->flags |= PCRE2_MD_COPIED_SUBJECT;
4063
0
      }
4064
0
    else
4065
0
      {
4066
0
      if (rc >= 0 || rc == PCRE2_ERROR_PARTIAL) match_data->subject = subject;
4067
0
      }
4068
0
    goto EXIT;
4069
0
    }
4070
4071
  /* Advance to the next subject character unless we are at the end of a line
4072
  and firstline is set. */
4073
4074
0
  if (firstline && IS_NEWLINE(start_match)) break;
4075
0
  start_match++;
4076
0
#ifdef SUPPORT_UNICODE
4077
0
  if (utf)
4078
0
    {
4079
0
    ACROSSCHAR(start_match < end_subject, start_match, start_match++);
4080
0
    }
4081
0
#endif
4082
0
  if (start_match > end_subject) break;
4083
4084
  /* If we have just passed a CR and we are now at a LF, and the pattern does
4085
  not contain any explicit matches for \r or \n, and the newline option is CRLF
4086
  or ANY or ANYCRLF, advance the match position by one more character. */
4087
4088
0
  if (UCHAR21TEST(start_match - 1) == CHAR_CR &&
4089
0
      start_match < end_subject &&
4090
0
      UCHAR21TEST(start_match) == CHAR_NL &&
4091
0
      (re->flags & PCRE2_HASCRORLF) == 0 &&
4092
0
        (mb->nltype == NLTYPE_ANY ||
4093
0
         mb->nltype == NLTYPE_ANYCRLF ||
4094
0
         mb->nllen == 2))
4095
0
    start_match++;
4096
4097
0
  }   /* "Bumpalong" loop */
4098
4099
0
NOMATCH_EXIT:
4100
0
rc = PCRE2_ERROR_NOMATCH;
4101
4102
0
EXIT:
4103
0
while (rws->next != NULL)
4104
0
  {
4105
0
  RWS_anchor *next = rws->next;
4106
0
  rws->next = next->next;
4107
0
  mb->memctl.free(next, mb->memctl.memory_data);
4108
0
  }
4109
4110
0
return rc;
4111
0
}
4112
4113
/* These #undefs are here to enable unity builds with CMake. */
4114
4115
#undef NLBLOCK /* Block containing newline information */
4116
#undef PSSTART /* Field containing processed string start */
4117
#undef PSEND   /* Field containing processed string end */
4118
4119
/* End of pcre2_dfa_match.c */