Coverage Report

Created: 2025-06-13 06:43

/src/php-src/ext/pcre/pcre2lib/pcre2_dfa_match.c
Line
Count
Source (jump to first uncovered line)
1
/*************************************************
2
*      Perl-Compatible Regular Expressions       *
3
*************************************************/
4
5
/* PCRE is a library of functions to support regular expressions whose syntax
6
and semantics are as close as possible to those of the Perl 5 language.
7
8
                       Written by Philip Hazel
9
     Original API code Copyright (c) 1997-2012 University of Cambridge
10
          New API code Copyright (c) 2016-2024 University of Cambridge
11
12
-----------------------------------------------------------------------------
13
Redistribution and use in source and binary forms, with or without
14
modification, are permitted provided that the following conditions are met:
15
16
    * Redistributions of source code must retain the above copyright notice,
17
      this list of conditions and the following disclaimer.
18
19
    * Redistributions in binary form must reproduce the above copyright
20
      notice, this list of conditions and the following disclaimer in the
21
      documentation and/or other materials provided with the distribution.
22
23
    * Neither the name of the University of Cambridge nor the names of its
24
      contributors may be used to endorse or promote products derived from
25
      this software without specific prior written permission.
26
27
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37
POSSIBILITY OF SUCH DAMAGE.
38
-----------------------------------------------------------------------------
39
*/
40
41
42
/* This module contains the external function pcre2_dfa_match(), which is an
43
alternative matching function that uses a sort of DFA algorithm (not a true
44
FSM). This is NOT Perl-compatible, but it has advantages in certain
45
applications. */
46
47
48
/* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49
the performance of his patterns greatly. I could not use it as it stood, as it
50
was not thread safe, and made assumptions about pattern sizes. Also, it caused
51
test 7 to loop, and test 9 to crash with a segfault.
52
53
The issue is the check for duplicate states, which is done by a simple linear
54
search up the state list. (Grep for "duplicate" below to find the code.) For
55
many patterns, there will never be many states active at one time, so a simple
56
linear search is fine. In patterns that have many active states, it might be a
57
bottleneck. The suggested code used an indexing scheme to remember which states
58
had previously been used for each character, and avoided the linear search when
59
it knew there was no chance of a duplicate. This was implemented when adding
60
states to the state lists.
61
62
I wrote some thread-safe, not-limited code to try something similar at the time
63
of checking for duplicates (instead of when adding states), using index vectors
64
on the stack. It did give a 13% improvement with one specially constructed
65
pattern for certain subject strings, but on other strings and on many of the
66
simpler patterns in the test suite it did worse. The major problem, I think,
67
was the extra time to initialize the index. This had to be done for each call
68
of internal_dfa_match(). (The supplied patch used a static vector, initialized
69
only once - I suspect this was the cause of the problems with the tests.)
70
71
Overall, I concluded that the gains in some cases did not outweigh the losses
72
in others, so I abandoned this code. */
73
74
75
#ifdef HAVE_CONFIG_H
76
#include "config.h"
77
#endif
78
79
0
/* These three names are defined ahead of the pcre2_internal.h include that
follows; presumably macros in that header expand to them -- TODO confirm. */
#define NLBLOCK mb             /* Name of the block holding newline information */
80
0
#define PSSTART start_subject  /* Name of the field holding the processed string start */
81
0
#define PSEND   end_subject    /* Name of the field holding the processed string end */
82
83
#include "pcre2_internal.h"
84
85
/* Bit mask formed by OR-ing together the PCRE2_xxx option bits listed below.
NOTE(review): presumably this is the set of options a caller may pass to
pcre2_dfa_match(); the place where the mask is tested is outside this part of
the file. */

#define PUBLIC_DFA_MATCH_OPTIONS ( \
  PCRE2_ANCHORED             | \
  PCRE2_ENDANCHORED          | \
  PCRE2_NOTBOL               | \
  PCRE2_NOTEOL               | \
  PCRE2_NOTEMPTY             | \
  PCRE2_NOTEMPTY_ATSTART     | \
  PCRE2_NO_UTF_CHECK         | \
  PCRE2_PARTIAL_HARD         | \
  PCRE2_PARTIAL_SOFT         | \
  PCRE2_DFA_SHORTEST         | \
  PCRE2_DFA_RESTART          | \
  PCRE2_COPY_MATCHED_SUBJECT )
90
91
92
/*************************************************
93
*      Code parameters and static tables         *
94
*************************************************/
95
96
/* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
97
into others, under special conditions. A gap of 20 between the blocks should be
98
enough. The resulting opcodes don't have to be less than 256 because they are
99
never stored, so we push them well clear of the normal opcodes. */
100
101
0
#define OP_PROP_EXTRA       300   /* Base of the first block of never-stored opcodes: 300-319 */
102
0
#define OP_EXTUNI_EXTRA     320   /* Base of the block 320-339 */
103
0
#define OP_ANYNL_EXTRA      340   /* Base of the block 340-359 */
104
0
#define OP_HSPACE_EXTRA     360   /* Base of the block 360-379 */
105
0
#define OP_VSPACE_EXTRA     380   /* Base of the last block, starting at 380 */
106
107
108
/* This table identifies those opcodes that are followed immediately by a
109
character that is to be tested in some way. This makes it possible to
110
centralize the loading of these characters. In the case of Type * etc, the
111
"character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
112
small value. Non-zero values in the table are the offsets from the opcode where
113
the character is to be found. ***NOTE*** If the start of this table is
114
modified, the three tables that follow must also be modified. */
115
116
/* One byte per opcode, in opcode order. A zero entry means no character is
loaded centrally for that opcode; a non-zero entry is the distance from the
opcode to the character (or type code) to be tested. Entries written as
1+IMM2_SIZE therefore place the character IMM2_SIZE code units further on than
the entries written as plain 1. */
static const uint8_t coptable[] = {
117
  0,                             /* End                                    */
118
  0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
119
  0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
120
  0, 0, 0,                       /* Any, AllAny, Anybyte                   */
121
  0, 0,                          /* \P, \p                                 */
122
  0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
123
  0,                             /* \X                                     */
124
  0, 0, 0, 0, 0, 0,              /* \Z, \z, $, $M, ^, ^M                   */
125
  1,                             /* Char                                   */
126
  1,                             /* Chari                                  */
127
  1,                             /* not                                    */
128
  1,                             /* noti                                   */
129
  /* Positive single-char repeats                                          */
130
  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
131
  1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto, minupto                          */
132
  1+IMM2_SIZE,                   /* exact                                  */
133
  1, 1, 1, 1+IMM2_SIZE,          /* *+, ++, ?+, upto+                      */
134
  1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
135
  1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto I, minupto I                      */
136
  1+IMM2_SIZE,                   /* exact I                                */
137
  1, 1, 1, 1+IMM2_SIZE,          /* *+I, ++I, ?+I, upto+I                  */
138
  /* Negative single-char repeats - only for chars < 256                   */
139
  1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
140
  1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto, minupto                      */
141
  1+IMM2_SIZE,                   /* NOT exact                              */
142
  1, 1, 1, 1+IMM2_SIZE,          /* NOT *+, ++, ?+, upto+                  */
143
  1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
144
  1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto I, minupto I                  */
145
  1+IMM2_SIZE,                   /* NOT exact I                            */
146
  1, 1, 1, 1+IMM2_SIZE,          /* NOT *+I, ++I, ?+I, upto+I              */
147
  /* Positive type repeats                                                 */
148
  1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
149
  1+IMM2_SIZE, 1+IMM2_SIZE,      /* Type upto, minupto                     */
150
  1+IMM2_SIZE,                   /* Type exact                             */
151
  1, 1, 1, 1+IMM2_SIZE,          /* Type *+, ++, ?+, upto+                 */
152
  /* Character class & ref repeats                                         */
153
  0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
154
  0, 0,                          /* CRRANGE, CRMINRANGE                    */
155
  0, 0, 0, 0,                    /* Possessive *+, ++, ?+, CRPOSRANGE      */
156
  0,                             /* CLASS                                  */
157
  0,                             /* NCLASS                                 */
158
  0,                             /* XCLASS - variable length               */
159
  0,                             /* ECLASS - variable length               */
160
  0,                             /* REF                                    */
161
  0,                             /* REFI                                   */
162
  0,                             /* DNREF                                  */
163
  0,                             /* DNREFI                                 */
164
  0,                             /* RECURSE                                */
165
  0,                             /* CALLOUT                                */
166
  0,                             /* CALLOUT_STR                            */
167
  0,                             /* Alt                                    */
168
  0,                             /* Ket                                    */
169
  0,                             /* KetRmax                                */
170
  0,                             /* KetRmin                                */
171
  0,                             /* KetRpos                                */
172
  0, 0,                          /* Reverse, Vreverse                      */
173
  0,                             /* Assert                                 */
174
  0,                             /* Assert not                             */
175
  0,                             /* Assert behind                          */
176
  0,                             /* Assert behind not                      */
177
  0,                             /* NA assert                              */
178
  0,                             /* NA assert behind                       */
179
  0,                             /* Assert scan substring                  */
180
  0,                             /* ONCE                                   */
181
  0,                             /* SCRIPT_RUN                             */
182
  0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
183
  0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
184
  0, 0,                          /* CREF, DNCREF                           */
185
  0, 0,                          /* RREF, DNRREF                           */
186
  0, 0,                          /* FALSE, TRUE                            */
187
  0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
188
  0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
189
  0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
190
  0, 0,                          /* COMMIT, COMMIT_ARG                     */
191
  0, 0, 0,                       /* FAIL, ACCEPT, ASSERT_ACCEPT            */
192
  0, 0, 0,                       /* CLOSE, SKIPZERO, DEFINE                */
193
  0, 0,                          /* \B and \b in UCP mode                  */
194
};
195
196
/* This table identifies those opcodes that inspect a character. It is used to
197
remember the fact that a character could have been inspected when the end of
198
the subject is reached. ***NOTE*** If the start of this table is modified, the
199
two tables that follow must also be modified. */
200
201
/* One byte per opcode, in the same order as coptable. An entry of 1 marks an
opcode that inspects a subject character; 0 marks one that does not. */
static const uint8_t poptable[] = {
202
  0,                             /* End                                    */
203
  0, 0, 0, 1, 1,                 /* \A, \G, \K, \B, \b                     */
204
  1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w                 */
205
  1, 1, 1,                       /* Any, AllAny, Anybyte                   */
206
  1, 1,                          /* \P, \p                                 */
207
  1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v                     */
208
  1,                             /* \X                                     */
209
  0, 0, 0, 0, 0, 0,              /* \Z, \z, $, $M, ^, ^M                   */
210
  1,                             /* Char                                   */
211
  1,                             /* Chari                                  */
212
  1,                             /* not                                    */
213
  1,                             /* noti                                   */
214
  /* Positive single-char repeats                                          */
215
  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
216
  1, 1, 1,                       /* upto, minupto, exact                   */
217
  1, 1, 1, 1,                    /* *+, ++, ?+, upto+                      */
218
  1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
219
  1, 1, 1,                       /* upto I, minupto I, exact I             */
220
  1, 1, 1, 1,                    /* *+I, ++I, ?+I, upto+I                  */
221
  /* Negative single-char repeats - only for chars < 256                   */
222
  1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
223
  1, 1, 1,                       /* NOT upto, minupto, exact               */
224
  1, 1, 1, 1,                    /* NOT *+, ++, ?+, upto+                  */
225
  1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
226
  1, 1, 1,                       /* NOT upto I, minupto I, exact I         */
227
  1, 1, 1, 1,                    /* NOT *+I, ++I, ?+I, upto+I              */
228
  /* Positive type repeats                                                 */
229
  1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
230
  1, 1, 1,                       /* Type upto, minupto, exact              */
231
  1, 1, 1, 1,                    /* Type *+, ++, ?+, upto+                 */
232
  /* Character class & ref repeats                                         */
233
  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
234
  1, 1,                          /* CRRANGE, CRMINRANGE                    */
235
  1, 1, 1, 1,                    /* Possessive *+, ++, ?+, CRPOSRANGE      */
236
  1,                             /* CLASS                                  */
237
  1,                             /* NCLASS                                 */
238
  1,                             /* XCLASS - variable length               */
239
  1,                             /* ECLASS - variable length               */
240
  0,                             /* REF                                    */
241
  0,                             /* REFI                                   */
242
  0,                             /* DNREF                                  */
243
  0,                             /* DNREFI                                 */
244
  0,                             /* RECURSE                                */
245
  0,                             /* CALLOUT                                */
246
  0,                             /* CALLOUT_STR                            */
247
  0,                             /* Alt                                    */
248
  0,                             /* Ket                                    */
249
  0,                             /* KetRmax                                */
250
  0,                             /* KetRmin                                */
251
  0,                             /* KetRpos                                */
252
  0, 0,                          /* Reverse, Vreverse                      */
253
  0,                             /* Assert                                 */
254
  0,                             /* Assert not                             */
255
  0,                             /* Assert behind                          */
256
  0,                             /* Assert behind not                      */
257
  0,                             /* NA assert                              */
258
  0,                             /* NA assert behind                       */
259
  0,                             /* Assert scan substring                  */
260
  0,                             /* ONCE                                   */
261
  0,                             /* SCRIPT_RUN                             */
262
  0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
263
  0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
264
  0, 0,                          /* CREF, DNCREF                           */
265
  0, 0,                          /* RREF, DNRREF                           */
266
  0, 0,                          /* FALSE, TRUE                            */
267
  0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
268
  0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
269
  0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
270
  0, 0,                          /* COMMIT, COMMIT_ARG                     */
271
  0, 0, 0,                       /* FAIL, ACCEPT, ASSERT_ACCEPT            */
272
  0, 0, 0,                       /* CLOSE, SKIPZERO, DEFINE                */
273
  1, 1,                          /* \B and \b in UCP mode                  */
274
};
275
276
/* Compile-time check that coptable and poptable each hold exactly
OP_TABLE_LENGTH entries. The elements are uint8_t, so sizeof() of each array
is its element count. */
277
STATIC_ASSERT(sizeof(coptable) == OP_TABLE_LENGTH, coptable);
278
STATIC_ASSERT(sizeof(poptable) == OP_TABLE_LENGTH, poptable);
279
280
/* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
281
and \w */
282
283
/* Fourteen entries laid out like the start of coptable: six leading zeros,
then pairs of entries holding the ctype_digit, ctype_space and ctype_word
bits, then two zeros for OP_ANY and OP_ALLANY. Presumably the three pairs
correspond to \D, \d, \S, \s, \W, \w in that order -- confirm against the
opcode list. How this table is combined with toptable2 is not visible in this
part of the file. */
static const uint8_t toptable1[] = {
284
  0, 0, 0, 0, 0, 0,
285
  ctype_digit, ctype_digit,
286
  ctype_space, ctype_space,
287
  ctype_word,  ctype_word,
288
  0, 0                            /* OP_ANY, OP_ALLANY */
289
};
290
291
/* Companion to toptable1 with the same fourteen-slot layout. In each pair of
type entries only the first carries the ctype_xxx bit and the second is 0; the
OP_ANY and OP_ALLANY slots hold 1. NOTE(review): presumably the non-zero
entries flip the sense of the test for the negated types -- the code that uses
the table is not visible here, so confirm. */
static const uint8_t toptable2[] = {
292
  0, 0, 0, 0, 0, 0,
293
  ctype_digit, 0,
294
  ctype_space, 0,
295
  ctype_word,  0,
296
  1, 1                            /* OP_ANY, OP_ALLANY */
297
};
298
299
300
/* Structure for holding data about a particular state, which is in effect the
301
current data for an active path through the match tree. It must consist
302
entirely of ints because the working vector we are passed, and which we put
303
these structures in, is a vector of ints. */
304
305
/* One entry in a state list. Every member is an int so that an array of
these can be overlaid on the int workspace vector. */
typedef struct stateblock {
306
  int offset;                     /* Offset to opcode (-ve has meaning) */
307
  int count;                      /* Count for repeats */
308
  int data;                       /* Extra data, used by only some states */
309
} stateblock;
310
311
0
#define INTS_PER_STATEBLOCK  (int)(sizeof(stateblock)/sizeof(int))  /* Size of one stateblock, in ints */
312
313
314
/* Before version 10.32 the recursive calls of internal_dfa_match() were passed
315
local working space and output vectors that were created on the stack. This has
316
caused issues for some patterns, especially in small-stack environments such as
317
Windows. A new scheme is now in use which sets up a vector on the stack, but if
318
this is too small, heap memory is used, up to the heap_limit. The main
319
parameters are all numbers of ints because the workspace is a vector of ints.
320
321
The size of the starting stack vector, DFA_START_RWS_SIZE, is in bytes, and is
322
defined in pcre2_internal.h so as to be available to pcre2test when it is
323
finding the minimum heap requirement for a match. */
324
325
0
#define OVEC_UNIT  (sizeof(PCRE2_SIZE)/sizeof(int))  /* Ints occupied by one PCRE2_SIZE */
326
327
0
#define RWS_BASE_SIZE   (DFA_START_RWS_SIZE/sizeof(int))  /* Stack vector size, converted from bytes to ints */
328
0
#define RWS_RSIZE       1000                    /* Work size for recursion, in ints */
329
0
#define RWS_OVEC_RSIZE  (1000*OVEC_UNIT)        /* Ovector for recursion: 1000 PCRE2_SIZE values, in ints */
330
0
#define RWS_OVEC_OSIZE  (2*OVEC_UNIT)           /* Ovector in other cases: 2 PCRE2_SIZE values, in ints */
331
332
/* This structure is at the start of each workspace block. */
333
334
/* Header of a workspace block. Blocks form a singly linked list through
"next"; both counters are expressed in ints, not bytes. */
typedef struct RWS_anchor {
335
  struct RWS_anchor *next;  /* Following block in the chain, or NULL if none */
336
  uint32_t size;  /* Number of ints in the whole block, this header included */
337
  uint32_t free;  /* Number of ints still available in the block */
338
} RWS_anchor;
339
340
0
#define RWS_ANCHOR_SIZE (sizeof(RWS_anchor)/sizeof(int))  /* Size of the header itself, in ints */
341
342
343
344
/*************************************************
345
*               Process a callout                *
346
*************************************************/
347
348
/* This function is called to perform a callout.
349
350
Arguments:
351
  code              current code pointer
352
  offsets           points to current capture offsets
353
  current_subject   start of current subject match
354
  ptr               current position in subject
355
  mb                the match block
356
  extracode         extra code offset when called from condition
357
  lengthptr         where to return the callout length
358
359
Returns:            the return from the callout
360
*/
361
362
static int
363
do_callout_dfa(PCRE2_SPTR code, PCRE2_SIZE *offsets, PCRE2_SPTR current_subject,
364
  PCRE2_SPTR ptr, dfa_match_block *mb, PCRE2_SIZE extracode,
365
  PCRE2_SIZE *lengthptr)
366
0
{
367
0
pcre2_callout_block *cb = mb->cb;
368
369
0
*lengthptr = (code[extracode] == OP_CALLOUT)?
370
0
  (PCRE2_SIZE)PRIV(OP_lengths)[OP_CALLOUT] :
371
0
  (PCRE2_SIZE)GET(code, 1 + 2*LINK_SIZE + extracode);
372
373
0
if (mb->callout == NULL) return 0;    /* No callout provided */
374
375
/* Fixed fields in the callout block are set once and for all at the start of
376
matching. */
377
378
0
cb->offset_vector    = offsets;
379
0
cb->start_match      = (PCRE2_SIZE)(current_subject - mb->start_subject);
380
0
cb->current_position = (PCRE2_SIZE)(ptr - mb->start_subject);
381
0
cb->pattern_position = GET(code, 1 + extracode);
382
0
cb->next_item_length = GET(code, 1 + LINK_SIZE + extracode);
383
384
0
if (code[extracode] == OP_CALLOUT)
385
0
  {
386
0
  cb->callout_number = code[1 + 2*LINK_SIZE + extracode];
387
0
  cb->callout_string_offset = 0;
388
0
  cb->callout_string = NULL;
389
0
  cb->callout_string_length = 0;
390
0
  }
391
0
else
392
0
  {
393
0
  cb->callout_number = 0;
394
0
  cb->callout_string_offset = GET(code, 1 + 3*LINK_SIZE + extracode);
395
0
  cb->callout_string = code + (1 + 4*LINK_SIZE + extracode) + 1;
396
0
  cb->callout_string_length = *lengthptr - (1 + 4*LINK_SIZE) - 2;
397
0
  }
398
399
0
return (mb->callout)(cb, mb->callout_data);
400
0
}
401
402
403
404
/*************************************************
405
*         Expand local workspace memory          *
406
*************************************************/
407
408
/* This function is called when internal_dfa_match() is about to be called
409
recursively and there is insufficient working space left in the current
410
workspace block. If there's an existing next block, use it; otherwise get a new
411
block unless the heap limit is reached.
412
413
Arguments:
414
  rwsptr     pointer to block pointer (updated)
415
  ovecsize   space needed for an ovector
416
  mb         the match block
417
418
Returns:     0 rwsptr has been updated
419
            !0 an error code
420
*/
421
422
static int
423
more_workspace(RWS_anchor **rwsptr, unsigned int ovecsize, dfa_match_block *mb)
424
0
{
425
0
RWS_anchor *rws = *rwsptr;
426
0
RWS_anchor *new;
427
428
0
if (rws->next != NULL)
429
0
  {
430
0
  new = rws->next;
431
0
  }
432
433
/* Sizes in the RWS_anchor blocks are in units of sizeof(int), but
434
mb->heap_limit and mb->heap_used are in kibibytes. Play carefully, to avoid
435
overflow. */
436
437
0
else
438
0
  {
439
0
  uint32_t newsize = (rws->size >= UINT32_MAX/(sizeof(int)*2))? UINT32_MAX/sizeof(int) : rws->size * 2;
440
0
  uint32_t newsizeK = newsize/(1024/sizeof(int));
441
442
0
  if (newsizeK + mb->heap_used > mb->heap_limit)
443
0
    newsizeK = (uint32_t)(mb->heap_limit - mb->heap_used);
444
0
  newsize = newsizeK*(1024/sizeof(int));
445
446
0
  if (newsize < RWS_RSIZE + ovecsize + RWS_ANCHOR_SIZE)
447
0
    return PCRE2_ERROR_HEAPLIMIT;
448
0
  new = mb->memctl.malloc(newsize*sizeof(int), mb->memctl.memory_data);
449
0
  if (new == NULL) return PCRE2_ERROR_NOMEMORY;
450
0
  mb->heap_used += newsizeK;
451
0
  new->next = NULL;
452
0
  new->size = newsize;
453
0
  rws->next = new;
454
0
  }
455
456
0
new->free = new->size - RWS_ANCHOR_SIZE;
457
0
*rwsptr = new;
458
0
return 0;
459
0
}
460
461
462
463
/*************************************************
464
*     Match a Regular Expression - DFA engine    *
465
*************************************************/
466
467
/* This internal function applies a compiled pattern to a subject string,
468
starting at a given point, using a DFA engine. This function is called from the
469
external one, possibly multiple times if the pattern is not anchored. The
470
function calls itself recursively for some kinds of subpattern.
471
472
Arguments:
473
  mb                the match_data block with fixed information
474
  this_start_code   the opening bracket of this subexpression's code
475
  current_subject   where we currently are in the subject string
476
  start_offset      start offset in the subject string
477
  offsets           vector to contain the matching string offsets
478
  offsetcount       size of same
479
  workspace         vector of workspace
480
  wscount           size of same
481
  rlevel            function call recursion level
482
483
Returns:            > 0 => number of match offset pairs placed in offsets
484
                    = 0 => offsets overflowed; longest matches are present
485
                     -1 => failed to match
486
                   < -1 => some kind of unexpected problem
487
488
The following macros are used for adding states to the two state vectors (one
489
for the current character, one for the following character). */
490
491
/* Each of the four macros below claims one slot in a state list. The list's
counter (active_count or new_count) is always incremented; if its value before
the increment was below wscount, the slot addressed by next_active_state or
next_new_state is filled in and that pointer is stepped on. Otherwise the
enclosing function returns PCRE2_ERROR_DFA_WSSIZE. The _DATA variants also set
the "data" field. Every macro expands to a single do { ... } while (0)
statement and must be followed by a semicolon. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE2_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state++; \
    } \
  while (0)

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE2_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state->data   = (z); \
    next_active_state++; \
    } \
  while (0)

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE2_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state++; \
    } \
  while (0)

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE2_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state->data   = (z); \
    next_new_state++; \
    } \
  while (0)
528
529
/* And now, here is the code */
530
531
static int
532
internal_dfa_match(
533
  dfa_match_block *mb,
534
  PCRE2_SPTR this_start_code,
535
  PCRE2_SPTR current_subject,
536
  PCRE2_SIZE start_offset,
537
  PCRE2_SIZE *offsets,
538
  uint32_t offsetcount,
539
  int *workspace,
540
  int wscount,
541
  uint32_t rlevel,
542
  int *RWS)
543
0
{
544
0
stateblock *active_states, *new_states, *temp_states;
545
0
stateblock *next_active_state, *next_new_state;
546
0
const uint8_t *ctypes, *lcc, *fcc;
547
0
PCRE2_SPTR ptr;
548
0
PCRE2_SPTR end_code;
549
0
dfa_recursion_info new_recursive;
550
0
int active_count, new_count, match_count;
551
552
/* Some fields in the mb block are frequently referenced, so we load them into
553
independent variables in the hope that this will perform better. */
554
555
0
PCRE2_SPTR start_subject = mb->start_subject;
556
0
PCRE2_SPTR end_subject = mb->end_subject;
557
0
PCRE2_SPTR start_code = mb->start_code;
558
559
0
#ifdef SUPPORT_UNICODE
560
0
BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
561
0
BOOL utf_or_ucp = utf || (mb->poptions & PCRE2_UCP) != 0;
562
#else
563
BOOL utf = FALSE;
564
#endif
565
566
0
BOOL reset_could_continue = FALSE;
567
568
0
if (mb->match_call_count++ >= mb->match_limit) return PCRE2_ERROR_MATCHLIMIT;
569
0
if (rlevel++ > mb->match_limit_depth) return PCRE2_ERROR_DEPTHLIMIT;
570
0
offsetcount &= (uint32_t)(-2);  /* Round down */
571
572
0
wscount -= 2;
573
0
wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
574
0
          (2 * INTS_PER_STATEBLOCK);
575
576
0
ctypes = mb->tables + ctypes_offset;
577
0
lcc = mb->tables + lcc_offset;
578
0
fcc = mb->tables + fcc_offset;
579
580
0
match_count = PCRE2_ERROR_NOMATCH;   /* A negative number */
581
582
0
active_states = (stateblock *)(workspace + 2);
583
0
next_new_state = new_states = active_states + wscount;
584
0
new_count = 0;
585
586
/* The first thing in any (sub) pattern is a bracket of some sort. Push all
587
the alternative states onto the list, and find out where the end is. This
588
makes it possible to use this function recursively, when we want to stop at a
589
matching internal ket rather than at the end.
590
591
If we are dealing with a backward assertion we have to find out the maximum
592
amount to move back, and set up each alternative appropriately. */
593
594
0
if (*this_start_code == OP_ASSERTBACK || *this_start_code == OP_ASSERTBACK_NOT)
595
0
  {
596
0
  size_t max_back = 0;
597
0
  size_t gone_back;
598
599
0
  end_code = this_start_code;
600
0
  do
601
0
    {
602
0
    size_t back = (size_t)GET2(end_code, 2+LINK_SIZE);
603
0
    if (back > max_back) max_back = back;
604
0
    end_code += GET(end_code, 1);
605
0
    }
606
0
  while (*end_code == OP_ALT);
607
608
  /* If we can't go back the amount required for the longest lookbehind
609
  pattern, go back as far as we can; some alternatives may still be viable. */
610
611
0
#ifdef SUPPORT_UNICODE
612
  /* In character mode we have to step back character by character */
613
614
0
  if (utf)
615
0
    {
616
0
    for (gone_back = 0; gone_back < max_back; gone_back++)
617
0
      {
618
0
      if (current_subject <= start_subject) break;
619
0
      current_subject--;
620
0
      ACROSSCHAR(current_subject > start_subject, current_subject,
621
0
        current_subject--);
622
0
      }
623
0
    }
624
0
  else
625
0
#endif
626
627
  /* In byte-mode we can do this quickly. */
628
629
0
    {
630
0
    size_t current_offset = (size_t)(current_subject - start_subject);
631
0
    gone_back = (current_offset < max_back)? current_offset : max_back;
632
0
    current_subject -= gone_back;
633
0
    }
634
635
  /* Save the earliest consulted character */
636
637
0
  if (current_subject < mb->start_used_ptr)
638
0
    mb->start_used_ptr = current_subject;
639
640
  /* Now we can process the individual branches. There will be an OP_REVERSE at
641
  the start of each branch, except when the length of the branch is zero. */
642
643
0
  end_code = this_start_code;
644
0
  do
645
0
    {
646
0
    uint32_t revlen = (end_code[1+LINK_SIZE] == OP_REVERSE)? 1 + IMM2_SIZE : 0;
647
0
    size_t back = (revlen == 0)? 0 : (size_t)GET2(end_code, 2+LINK_SIZE);
648
0
    if (back <= gone_back)
649
0
      {
650
0
      int bstate = (int)(end_code - start_code + 1 + LINK_SIZE + revlen);
651
0
      ADD_NEW_DATA(-bstate, 0, (int)(gone_back - back));
652
0
      }
653
0
    end_code += GET(end_code, 1);
654
0
    }
655
0
  while (*end_code == OP_ALT);
656
0
 }
657
658
/* This is the code for a "normal" subpattern (not a backward assertion). The
659
start of a whole pattern is always one of these. If we are at the top level,
660
we may be asked to restart matching from the same point that we reached for a
661
previous partial match. We still have to scan through the top-level branches to
662
find the end state. */
663
664
0
else
665
0
  {
666
0
  end_code = this_start_code;
667
668
  /* Restarting */
669
670
0
  if (rlevel == 1 && (mb->moptions & PCRE2_DFA_RESTART) != 0)
671
0
    {
672
0
    do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
673
0
    new_count = workspace[1];
674
0
    if (!workspace[0])
675
0
      memcpy(new_states, active_states, (size_t)new_count * sizeof(stateblock));
676
0
    }
677
678
  /* Not restarting */
679
680
0
  else
681
0
    {
682
0
    int length = 1 + LINK_SIZE +
683
0
      ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
684
0
        *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
685
0
        ? IMM2_SIZE:0);
686
0
    do
687
0
      {
688
0
      ADD_NEW((int)(end_code - start_code + length), 0);
689
0
      end_code += GET(end_code, 1);
690
0
      length = 1 + LINK_SIZE;
691
0
      }
692
0
    while (*end_code == OP_ALT);
693
0
    }
694
0
  }
695
696
0
workspace[0] = 0;    /* Bit indicating which vector is current */
697
698
/* Loop for scanning the subject */
699
700
0
ptr = current_subject;
701
0
for (;;)
702
0
  {
703
0
  int i, j;
704
0
  int clen, dlen;
705
0
  uint32_t c, d;
706
0
  BOOL partial_newline = FALSE;
707
0
  BOOL could_continue = reset_could_continue;
708
0
  reset_could_continue = FALSE;
709
710
0
  if (ptr > mb->last_used_ptr) mb->last_used_ptr = ptr;
711
712
  /* Make the new state list into the active state list and empty the
713
  new state list. */
714
715
0
  temp_states = active_states;
716
0
  active_states = new_states;
717
0
  new_states = temp_states;
718
0
  active_count = new_count;
719
0
  new_count = 0;
720
721
0
  workspace[0] ^= 1;              /* Remember for the restarting feature */
722
0
  workspace[1] = active_count;
723
724
  /* Set the pointers for adding new states */
725
726
0
  next_active_state = active_states + active_count;
727
0
  next_new_state = new_states;
728
729
  /* Load the current character from the subject outside the loop, as many
730
  different states may want to look at it, and we assume that at least one
731
  will. */
732
733
0
  if (ptr < end_subject)
734
0
    {
735
0
    clen = 1;        /* Number of data items in the character */
736
0
#ifdef SUPPORT_UNICODE
737
0
    GETCHARLENTEST(c, ptr, clen);
738
#else
739
    c = *ptr;
740
#endif  /* SUPPORT_UNICODE */
741
0
    }
742
0
  else
743
0
    {
744
0
    clen = 0;        /* This indicates the end of the subject */
745
0
    c = NOTACHAR;    /* This value should never actually be used */
746
0
    }
747
748
  /* Scan up the active states and act on each one. The result of an action
749
  may be to add more states to the currently active list (e.g. on hitting a
750
  parenthesis) or it may be to put states on the new list, for considering
751
  when we move the character pointer on. */
752
753
0
  for (i = 0; i < active_count; i++)
754
0
    {
755
0
    stateblock *current_state = active_states + i;
756
0
    BOOL caseless = FALSE;
757
0
    PCRE2_SPTR code;
758
0
    uint32_t codevalue;
759
0
    int state_offset = current_state->offset;
760
0
    int rrc;
761
0
    int count;
762
763
    /* A negative offset is a special case meaning "hold off going to this
764
    (negated) state until the number of characters in the data field have
765
    been skipped". If the could_continue flag was passed over from a previous
766
    state, arrange for it to be passed on. */
767
768
0
    if (state_offset < 0)
769
0
      {
770
0
      if (current_state->data > 0)
771
0
        {
772
0
        ADD_NEW_DATA(state_offset, current_state->count,
773
0
          current_state->data - 1);
774
0
        if (could_continue) reset_could_continue = TRUE;
775
0
        continue;
776
0
        }
777
0
      else
778
0
        {
779
0
        current_state->offset = state_offset = -state_offset;
780
0
        }
781
0
      }
782
783
    /* Check for a duplicate state with the same count, and skip if found.
784
    See the note at the head of this module about the possibility of improving
785
    performance here. */
786
787
0
    for (j = 0; j < i; j++)
788
0
      {
789
0
      if (active_states[j].offset == state_offset &&
790
0
          active_states[j].count == current_state->count)
791
0
        goto NEXT_ACTIVE_STATE;
792
0
      }
793
794
    /* The state offset is the offset to the opcode */
795
796
0
    code = start_code + state_offset;
797
0
    codevalue = *code;
798
799
    /* If this opcode inspects a character, but we are at the end of the
800
    subject, remember the fact for use when testing for a partial match. */
801
802
0
    if (clen == 0 && poptable[codevalue] != 0)
803
0
      could_continue = TRUE;
804
805
    /* If this opcode is followed by an inline character, load it. It is
806
    tempting to test for the presence of a subject character here, but that
807
    is wrong, because sometimes zero repetitions of the subject are
808
    permitted.
809
810
    We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
811
    argument that is not a data character - but is always one byte long because
812
    the values are small. We have to take special action to deal with  \P, \p,
813
    \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
814
    these ones to new opcodes. */
815
816
0
    if (coptable[codevalue] > 0)
817
0
      {
818
0
      dlen = 1;
819
0
#ifdef SUPPORT_UNICODE
820
0
      if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
821
0
#endif  /* SUPPORT_UNICODE */
822
0
      d = code[coptable[codevalue]];
823
0
      if (codevalue >= OP_TYPESTAR)
824
0
        {
825
0
        switch(d)
826
0
          {
827
0
          case OP_ANYBYTE: return PCRE2_ERROR_DFA_UITEM;
828
0
          case OP_NOTPROP:
829
0
          case OP_PROP: codevalue += OP_PROP_EXTRA; break;
830
0
          case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
831
0
          case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
832
0
          case OP_NOT_HSPACE:
833
0
          case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
834
0
          case OP_NOT_VSPACE:
835
0
          case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
836
0
          default: break;
837
0
          }
838
0
        }
839
0
      }
840
0
    else
841
0
      {
842
0
      dlen = 0;         /* Not strictly necessary, but compilers moan */
843
0
      d = NOTACHAR;     /* if these variables are not set. */
844
0
      }
845
846
847
    /* Now process the individual opcodes */
848
849
0
    switch (codevalue)
850
0
      {
851
/* ========================================================================== */
852
      /* Reached a closing bracket. If not at the end of the pattern, carry
853
      on with the next opcode. For repeating opcodes, also add the repeat
854
      state. Note that KETRPOS will always be encountered at the end of the
855
      subpattern, because the possessive subpattern repeats are always handled
856
      using recursive calls. Thus, it never adds any new states.
857
858
      At the end of the (sub)pattern, unless we have an empty string and
859
      PCRE2_NOTEMPTY is set, or PCRE2_NOTEMPTY_ATSTART is set and we are at the
860
      start of the subject, save the match data, shifting up all previous
861
      matches so we always have the longest first. */
862
863
0
      case OP_KET:
864
0
      case OP_KETRMIN:
865
0
      case OP_KETRMAX:
866
0
      case OP_KETRPOS:
867
0
      if (code != end_code)
868
0
        {
869
0
        ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
870
0
        if (codevalue != OP_KET)
871
0
          {
872
0
          ADD_ACTIVE(state_offset - (int)GET(code, 1), 0);
873
0
          }
874
0
        }
875
0
      else
876
0
        {
877
0
        if (ptr > current_subject ||
878
0
            ((mb->moptions & PCRE2_NOTEMPTY) == 0 &&
879
0
              ((mb->moptions & PCRE2_NOTEMPTY_ATSTART) == 0 ||
880
0
                current_subject > start_subject + mb->start_offset)))
881
0
          {
882
0
          if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
883
0
            else if (match_count > 0 && ++match_count * 2 > (int)offsetcount)
884
0
              match_count = 0;
885
0
          count = ((match_count == 0)? (int)offsetcount : match_count * 2) - 2;
886
0
          if (count > 0) (void)memmove(offsets + 2, offsets,
887
0
            (size_t)count * sizeof(PCRE2_SIZE));
888
0
          if (offsetcount >= 2)
889
0
            {
890
0
            offsets[0] = (PCRE2_SIZE)(current_subject - start_subject);
891
0
            offsets[1] = (PCRE2_SIZE)(ptr - start_subject);
892
0
            }
893
0
          if ((mb->moptions & PCRE2_DFA_SHORTEST) != 0) return match_count;
894
0
          }
895
0
        }
896
0
      break;
897
898
/* ========================================================================== */
899
      /* These opcodes add to the current list of states without looking
900
      at the current character. */
901
902
      /*-----------------------------------------------------------------*/
903
0
      case OP_ALT:
904
0
      do { code += GET(code, 1); } while (*code == OP_ALT);
905
0
      ADD_ACTIVE((int)(code - start_code), 0);
906
0
      break;
907
908
      /*-----------------------------------------------------------------*/
909
0
      case OP_BRA:
910
0
      case OP_SBRA:
911
0
      do
912
0
        {
913
0
        ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
914
0
        code += GET(code, 1);
915
0
        }
916
0
      while (*code == OP_ALT);
917
0
      break;
918
919
      /*-----------------------------------------------------------------*/
920
0
      case OP_CBRA:
921
0
      case OP_SCBRA:
922
0
      ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE),  0);
923
0
      code += GET(code, 1);
924
0
      while (*code == OP_ALT)
925
0
        {
926
0
        ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE),  0);
927
0
        code += GET(code, 1);
928
0
        }
929
0
      break;
930
931
      /*-----------------------------------------------------------------*/
932
0
      case OP_BRAZERO:
933
0
      case OP_BRAMINZERO:
934
0
      ADD_ACTIVE(state_offset + 1, 0);
935
0
      code += 1 + GET(code, 2);
936
0
      while (*code == OP_ALT) code += GET(code, 1);
937
0
      ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
938
0
      break;
939
940
      /*-----------------------------------------------------------------*/
941
0
      case OP_SKIPZERO:
942
0
      code += 1 + GET(code, 2);
943
0
      while (*code == OP_ALT) code += GET(code, 1);
944
0
      ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
945
0
      break;
946
947
      /*-----------------------------------------------------------------*/
948
0
      case OP_CIRC:
949
0
      if (ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0)
950
0
        { ADD_ACTIVE(state_offset + 1, 0); }
951
0
      break;
952
953
      /*-----------------------------------------------------------------*/
954
0
      case OP_CIRCM:
955
0
      if ((ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0) ||
956
0
          ((ptr != end_subject || (mb->poptions & PCRE2_ALT_CIRCUMFLEX) != 0 )
957
0
            && WAS_NEWLINE(ptr)))
958
0
        { ADD_ACTIVE(state_offset + 1, 0); }
959
0
      break;
960
961
      /*-----------------------------------------------------------------*/
962
0
      case OP_EOD:
963
0
      if (ptr >= end_subject)
964
0
        {
965
0
        if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
966
0
          return PCRE2_ERROR_PARTIAL;
967
0
        else { ADD_ACTIVE(state_offset + 1, 0); }
968
0
        }
969
0
      break;
970
971
      /*-----------------------------------------------------------------*/
972
0
      case OP_SOD:
973
0
      if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
974
0
      break;
975
976
      /*-----------------------------------------------------------------*/
977
0
      case OP_SOM:
978
0
      if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
979
0
      break;
980
981
982
/* ========================================================================== */
983
      /* These opcodes inspect the next subject character, and sometimes
984
      the previous one as well, but do not have an argument. The variable
985
      clen contains the length of the current character and is zero if we are
986
      at the end of the subject. */
987
988
      /*-----------------------------------------------------------------*/
989
0
      case OP_ANY:
990
0
      if (clen > 0 && !IS_NEWLINE(ptr))
991
0
        {
992
0
        if (ptr + 1 >= mb->end_subject &&
993
0
            (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
994
0
            NLBLOCK->nltype == NLTYPE_FIXED &&
995
0
            NLBLOCK->nllen == 2 &&
996
0
            c == NLBLOCK->nl[0])
997
0
          {
998
0
          could_continue = partial_newline = TRUE;
999
0
          }
1000
0
        else
1001
0
          {
1002
0
          ADD_NEW(state_offset + 1, 0);
1003
0
          }
1004
0
        }
1005
0
      break;
1006
1007
      /*-----------------------------------------------------------------*/
1008
0
      case OP_ALLANY:
1009
0
      if (clen > 0)
1010
0
        { ADD_NEW(state_offset + 1, 0); }
1011
0
      break;
1012
1013
      /*-----------------------------------------------------------------*/
1014
0
      case OP_EODN:
1015
0
      if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - mb->nllen))
1016
0
        {
1017
0
        if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1018
0
          return PCRE2_ERROR_PARTIAL;
1019
0
        ADD_ACTIVE(state_offset + 1, 0);
1020
0
        }
1021
0
      break;
1022
1023
      /*-----------------------------------------------------------------*/
1024
0
      case OP_DOLL:
1025
0
      if ((mb->moptions & PCRE2_NOTEOL) == 0)
1026
0
        {
1027
0
        if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1028
0
          could_continue = TRUE;
1029
0
        else if (clen == 0 ||
1030
0
            ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
1031
0
               (ptr == end_subject - mb->nllen)
1032
0
            ))
1033
0
          { ADD_ACTIVE(state_offset + 1, 0); }
1034
0
        else if (ptr + 1 >= mb->end_subject &&
1035
0
                 (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
1036
0
                 NLBLOCK->nltype == NLTYPE_FIXED &&
1037
0
                 NLBLOCK->nllen == 2 &&
1038
0
                 c == NLBLOCK->nl[0])
1039
0
          {
1040
0
          if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1041
0
            {
1042
0
            reset_could_continue = TRUE;
1043
0
            ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1044
0
            }
1045
0
          else could_continue = partial_newline = TRUE;
1046
0
          }
1047
0
        }
1048
0
      break;
1049
1050
      /*-----------------------------------------------------------------*/
1051
0
      case OP_DOLLM:
1052
0
      if ((mb->moptions & PCRE2_NOTEOL) == 0)
1053
0
        {
1054
0
        if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1055
0
          could_continue = TRUE;
1056
0
        else if (clen == 0 ||
1057
0
            ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
1058
0
          { ADD_ACTIVE(state_offset + 1, 0); }
1059
0
        else if (ptr + 1 >= mb->end_subject &&
1060
0
                 (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
1061
0
                 NLBLOCK->nltype == NLTYPE_FIXED &&
1062
0
                 NLBLOCK->nllen == 2 &&
1063
0
                 c == NLBLOCK->nl[0])
1064
0
          {
1065
0
          if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1066
0
            {
1067
0
            reset_could_continue = TRUE;
1068
0
            ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1069
0
            }
1070
0
          else could_continue = partial_newline = TRUE;
1071
0
          }
1072
0
        }
1073
0
      else if (IS_NEWLINE(ptr))
1074
0
        { ADD_ACTIVE(state_offset + 1, 0); }
1075
0
      break;
1076
1077
      /*-----------------------------------------------------------------*/
1078
1079
0
      case OP_DIGIT:
1080
0
      case OP_WHITESPACE:
1081
0
      case OP_WORDCHAR:
1082
0
      if (clen > 0 && c < 256 &&
1083
0
            ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
1084
0
        { ADD_NEW(state_offset + 1, 0); }
1085
0
      break;
1086
1087
      /*-----------------------------------------------------------------*/
1088
0
      case OP_NOT_DIGIT:
1089
0
      case OP_NOT_WHITESPACE:
1090
0
      case OP_NOT_WORDCHAR:
1091
0
      if (clen > 0 && (c >= 256 ||
1092
0
            ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
1093
0
        { ADD_NEW(state_offset + 1, 0); }
1094
0
      break;
1095
1096
      /*-----------------------------------------------------------------*/
1097
0
      case OP_WORD_BOUNDARY:
1098
0
      case OP_NOT_WORD_BOUNDARY:
1099
0
      case OP_NOT_UCP_WORD_BOUNDARY:
1100
0
      case OP_UCP_WORD_BOUNDARY:
1101
0
        {
1102
0
        int left_word, right_word;
1103
1104
0
        if (ptr > start_subject)
1105
0
          {
1106
0
          PCRE2_SPTR temp = ptr - 1;
1107
0
          if (temp < mb->start_used_ptr) mb->start_used_ptr = temp;
1108
0
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1109
0
          if (utf) { BACKCHAR(temp); }
1110
0
#endif
1111
0
          GETCHARTEST(d, temp);
1112
0
#ifdef SUPPORT_UNICODE
1113
0
          if (codevalue == OP_UCP_WORD_BOUNDARY ||
1114
0
              codevalue == OP_NOT_UCP_WORD_BOUNDARY)
1115
0
            {
1116
0
            int chartype = UCD_CHARTYPE(d);
1117
0
            int category = PRIV(ucp_gentype)[chartype];
1118
0
            left_word = (category == ucp_L || category == ucp_N ||
1119
0
              chartype == ucp_Mn || chartype == ucp_Pc);
1120
0
            }
1121
0
          else
1122
0
#endif
1123
0
          left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
1124
0
          }
1125
0
        else left_word = FALSE;
1126
1127
0
        if (clen > 0)
1128
0
          {
1129
0
          if (ptr >= mb->last_used_ptr)
1130
0
            {
1131
0
            PCRE2_SPTR temp = ptr + 1;
1132
0
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1133
0
            if (utf) { FORWARDCHARTEST(temp, mb->end_subject); }
1134
0
#endif
1135
0
            mb->last_used_ptr = temp;
1136
0
            }
1137
0
#ifdef SUPPORT_UNICODE
1138
0
          if (codevalue == OP_UCP_WORD_BOUNDARY ||
1139
0
              codevalue == OP_NOT_UCP_WORD_BOUNDARY)
1140
0
            {
1141
0
            int chartype = UCD_CHARTYPE(c);
1142
0
            int category = PRIV(ucp_gentype)[chartype];
1143
0
            right_word = (category == ucp_L || category == ucp_N ||
1144
0
              chartype == ucp_Mn || chartype == ucp_Pc);
1145
0
            }
1146
0
          else
1147
0
#endif
1148
0
          right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
1149
0
          }
1150
0
        else right_word = FALSE;
1151
1152
0
        if ((left_word == right_word) ==
1153
0
            (codevalue == OP_NOT_WORD_BOUNDARY ||
1154
0
             codevalue == OP_NOT_UCP_WORD_BOUNDARY))
1155
0
          { ADD_ACTIVE(state_offset + 1, 0); }
1156
0
        }
1157
0
      break;
1158
1159
1160
      /*-----------------------------------------------------------------*/
1161
      /* Check the next character by Unicode property. We will get here only
1162
      if the support is in the binary; otherwise a compile-time error occurs.
1163
      */
1164
1165
0
#ifdef SUPPORT_UNICODE
1166
0
      case OP_PROP:
1167
0
      case OP_NOTPROP:
1168
0
      if (clen > 0)
1169
0
        {
1170
0
        BOOL OK;
1171
0
        int chartype;
1172
0
        const uint32_t *cp;
1173
0
        const ucd_record * prop = GET_UCD(c);
1174
0
        switch(code[1])
1175
0
          {
1176
0
          case PT_LAMP:
1177
0
          chartype = prop->chartype;
1178
0
          OK = chartype == ucp_Lu || chartype == ucp_Ll ||
1179
0
               chartype == ucp_Lt;
1180
0
          break;
1181
1182
0
          case PT_GC:
1183
0
          OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1184
0
          break;
1185
1186
0
          case PT_PC:
1187
0
          OK = prop->chartype == code[2];
1188
0
          break;
1189
1190
0
          case PT_SC:
1191
0
          OK = prop->script == code[2];
1192
0
          break;
1193
1194
0
          case PT_SCX:
1195
0
          OK = (prop->script == code[2] ||
1196
0
                MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), code[2]) != 0);
1197
0
          break;
1198
1199
          /* These are specials for combination cases. */
1200
1201
0
          case PT_ALNUM:
1202
0
          chartype = prop->chartype;
1203
0
          OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1204
0
               PRIV(ucp_gentype)[chartype] == ucp_N;
1205
0
          break;
1206
1207
          /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1208
          which means that Perl space and POSIX space are now identical. PCRE
1209
          was changed at release 8.34. */
1210
1211
0
          case PT_SPACE:    /* Perl space */
1212
0
          case PT_PXSPACE:  /* POSIX space */
1213
0
          switch(c)
1214
0
            {
1215
0
            HSPACE_CASES:
1216
0
            VSPACE_CASES:
1217
0
            OK = TRUE;
1218
0
            break;
1219
1220
0
            default:
1221
0
            OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1222
0
            break;
1223
0
            }
1224
0
          break;
1225
1226
0
          case PT_WORD:
1227
0
          chartype = prop->chartype;
1228
0
          OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1229
0
               PRIV(ucp_gentype)[chartype] == ucp_N ||
1230
0
               chartype == ucp_Mn || chartype == ucp_Pc;
1231
0
          break;
1232
1233
0
          case PT_CLIST:
1234
#if PCRE2_CODE_UNIT_WIDTH == 32
1235
          if (c > MAX_UTF_CODE_POINT)
1236
            {
1237
            OK = FALSE;
1238
            break;
1239
            }
1240
#endif
1241
0
          cp = PRIV(ucd_caseless_sets) + code[2];
1242
0
          for (;;)
1243
0
            {
1244
0
            if (c < *cp) { OK = FALSE; break; }
1245
0
            if (c == *cp++) { OK = TRUE; break; }
1246
0
            }
1247
0
          break;
1248
1249
0
          case PT_UCNC:
1250
0
          OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1251
0
               c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1252
0
               c >= 0xe000;
1253
0
          break;
1254
1255
0
          case PT_BIDICL:
1256
0
          OK = UCD_BIDICLASS(c) == code[2];
1257
0
          break;
1258
1259
0
          case PT_BOOL:
1260
0
          OK = MAPBIT(PRIV(ucd_boolprop_sets) +
1261
0
            UCD_BPROPS_PROP(prop), code[2]) != 0;
1262
0
          break;
1263
1264
          /* Should never occur, but keep compilers from grumbling. */
1265
1266
0
          default:
1267
0
          OK = codevalue != OP_PROP;
1268
0
          break;
1269
0
          }
1270
1271
0
        if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1272
0
        }
1273
0
      break;
1274
0
#endif
1275
1276
1277
1278
/* ========================================================================== */
1279
      /* These opcodes likewise inspect the subject character, but have an
1280
      argument that is not a data character. It is one of these opcodes:
1281
      OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
1282
      OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1283
1284
0
      case OP_TYPEPLUS:
1285
0
      case OP_TYPEMINPLUS:
1286
0
      case OP_TYPEPOSPLUS:
1287
0
      count = current_state->count;  /* Already matched */
1288
0
      if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1289
0
      if (clen > 0)
1290
0
        {
1291
0
        if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1292
0
            (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1293
0
            NLBLOCK->nltype == NLTYPE_FIXED &&
1294
0
            NLBLOCK->nllen == 2 &&
1295
0
            c == NLBLOCK->nl[0])
1296
0
          {
1297
0
          could_continue = partial_newline = TRUE;
1298
0
          }
1299
0
        else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1300
0
            (c < 256 &&
1301
0
              (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1302
0
              ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1303
0
          {
1304
0
          if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1305
0
            {
1306
0
            active_count--;            /* Remove non-match possibility */
1307
0
            next_active_state--;
1308
0
            }
1309
0
          count++;
1310
0
          ADD_NEW(state_offset, count);
1311
0
          }
1312
0
        }
1313
0
      break;
1314
1315
      /*-----------------------------------------------------------------*/
1316
0
      case OP_TYPEQUERY:
1317
0
      case OP_TYPEMINQUERY:
1318
0
      case OP_TYPEPOSQUERY:
1319
0
      ADD_ACTIVE(state_offset + 2, 0);
1320
0
      if (clen > 0)
1321
0
        {
1322
0
        if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1323
0
            (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1324
0
            NLBLOCK->nltype == NLTYPE_FIXED &&
1325
0
            NLBLOCK->nllen == 2 &&
1326
0
            c == NLBLOCK->nl[0])
1327
0
          {
1328
0
          could_continue = partial_newline = TRUE;
1329
0
          }
1330
0
        else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1331
0
            (c < 256 &&
1332
0
              (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1333
0
              ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1334
0
          {
1335
0
          if (codevalue == OP_TYPEPOSQUERY)
1336
0
            {
1337
0
            active_count--;            /* Remove non-match possibility */
1338
0
            next_active_state--;
1339
0
            }
1340
0
          ADD_NEW(state_offset + 2, 0);
1341
0
          }
1342
0
        }
1343
0
      break;
1344
1345
      /*-----------------------------------------------------------------*/
1346
0
      case OP_TYPESTAR:
1347
0
      case OP_TYPEMINSTAR:
1348
0
      case OP_TYPEPOSSTAR:
1349
0
      ADD_ACTIVE(state_offset + 2, 0);
1350
0
      if (clen > 0)
1351
0
        {
1352
0
        if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1353
0
            (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1354
0
            NLBLOCK->nltype == NLTYPE_FIXED &&
1355
0
            NLBLOCK->nllen == 2 &&
1356
0
            c == NLBLOCK->nl[0])
1357
0
          {
1358
0
          could_continue = partial_newline = TRUE;
1359
0
          }
1360
0
        else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1361
0
            (c < 256 &&
1362
0
              (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1363
0
              ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1364
0
          {
1365
0
          if (codevalue == OP_TYPEPOSSTAR)
1366
0
            {
1367
0
            active_count--;            /* Remove non-match possibility */
1368
0
            next_active_state--;
1369
0
            }
1370
0
          ADD_NEW(state_offset, 0);
1371
0
          }
1372
0
        }
1373
0
      break;
1374
1375
      /*-----------------------------------------------------------------*/
1376
0
      case OP_TYPEEXACT:
1377
0
      count = current_state->count;  /* Number already matched */
1378
0
      if (clen > 0)
1379
0
        {
1380
0
        if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1381
0
            (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1382
0
            NLBLOCK->nltype == NLTYPE_FIXED &&
1383
0
            NLBLOCK->nllen == 2 &&
1384
0
            c == NLBLOCK->nl[0])
1385
0
          {
1386
0
          could_continue = partial_newline = TRUE;
1387
0
          }
1388
0
        else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1389
0
            (c < 256 &&
1390
0
              (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1391
0
              ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1392
0
          {
1393
0
          if (++count >= (int)GET2(code, 1))
1394
0
            { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1395
0
          else
1396
0
            { ADD_NEW(state_offset, count); }
1397
0
          }
1398
0
        }
1399
0
      break;
1400
1401
      /*-----------------------------------------------------------------*/
1402
0
      case OP_TYPEUPTO:
1403
0
      case OP_TYPEMINUPTO:
1404
0
      case OP_TYPEPOSUPTO:
1405
0
      ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1406
0
      count = current_state->count;  /* Number already matched */
1407
0
      if (clen > 0)
1408
0
        {
1409
0
        if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1410
0
            (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1411
0
            NLBLOCK->nltype == NLTYPE_FIXED &&
1412
0
            NLBLOCK->nllen == 2 &&
1413
0
            c == NLBLOCK->nl[0])
1414
0
          {
1415
0
          could_continue = partial_newline = TRUE;
1416
0
          }
1417
0
        else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1418
0
            (c < 256 &&
1419
0
              (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1420
0
              ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1421
0
          {
1422
0
          if (codevalue == OP_TYPEPOSUPTO)
1423
0
            {
1424
0
            active_count--;           /* Remove non-match possibility */
1425
0
            next_active_state--;
1426
0
            }
1427
0
          if (++count >= (int)GET2(code, 1))
1428
0
            { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1429
0
          else
1430
0
            { ADD_NEW(state_offset, count); }
1431
0
          }
1432
0
        }
1433
0
      break;
1434
1435
/* ========================================================================== */
1436
      /* These are virtual opcodes that are used when something like
1437
      OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its
1438
      argument. It keeps the code above fast for the other cases. The argument
1439
      is in the d variable. */
1440
1441
0
#ifdef SUPPORT_UNICODE
1442
0
      case OP_PROP_EXTRA + OP_TYPEPLUS:
1443
0
      case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1444
0
      case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1445
0
      count = current_state->count;           /* Already matched */
1446
0
      if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1447
0
      if (clen > 0)
1448
0
        {
1449
0
        BOOL OK;
1450
0
        int chartype;
1451
0
        const uint32_t *cp;
1452
0
        const ucd_record * prop = GET_UCD(c);
1453
0
        switch(code[2])
1454
0
          {
1455
0
          case PT_LAMP:
1456
0
          chartype = prop->chartype;
1457
0
          OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1458
0
          break;
1459
1460
0
          case PT_GC:
1461
0
          OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1462
0
          break;
1463
1464
0
          case PT_PC:
1465
0
          OK = prop->chartype == code[3];
1466
0
          break;
1467
1468
0
          case PT_SC:
1469
0
          OK = prop->script == code[3];
1470
0
          break;
1471
1472
0
          case PT_SCX:
1473
0
          OK = (prop->script == code[3] ||
1474
0
                MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), code[3]) != 0);
1475
0
          break;
1476
1477
          /* These are specials for combination cases. */
1478
1479
0
          case PT_ALNUM:
1480
0
          chartype = prop->chartype;
1481
0
          OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1482
0
               PRIV(ucp_gentype)[chartype] == ucp_N;
1483
0
          break;
1484
1485
          /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1486
          which means that Perl space and POSIX space are now identical. PCRE
1487
          was changed at release 8.34. */
1488
1489
0
          case PT_SPACE:    /* Perl space */
1490
0
          case PT_PXSPACE:  /* POSIX space */
1491
0
          switch(c)
1492
0
            {
1493
0
            HSPACE_CASES:
1494
0
            VSPACE_CASES:
1495
0
            OK = TRUE;
1496
0
            break;
1497
1498
0
            default:
1499
0
            OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1500
0
            break;
1501
0
            }
1502
0
          break;
1503
1504
0
          case PT_WORD:
1505
0
          chartype = prop->chartype;
1506
0
          OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1507
0
               PRIV(ucp_gentype)[chartype] == ucp_N ||
1508
0
               chartype == ucp_Mn || chartype == ucp_Pc;
1509
0
          break;
1510
1511
0
          case PT_CLIST:
1512
#if PCRE2_CODE_UNIT_WIDTH == 32
1513
          if (c > MAX_UTF_CODE_POINT)
1514
            {
1515
            OK = FALSE;
1516
            break;
1517
            }
1518
#endif
1519
0
          cp = PRIV(ucd_caseless_sets) + code[3];
1520
0
          for (;;)
1521
0
            {
1522
0
            if (c < *cp) { OK = FALSE; break; }
1523
0
            if (c == *cp++) { OK = TRUE; break; }
1524
0
            }
1525
0
          break;
1526
1527
0
          case PT_UCNC:
1528
0
          OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1529
0
               c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1530
0
               c >= 0xe000;
1531
0
          break;
1532
1533
0
          case PT_BIDICL:
1534
0
          OK = UCD_BIDICLASS(c) == code[3];
1535
0
          break;
1536
1537
0
          case PT_BOOL:
1538
0
          OK = MAPBIT(PRIV(ucd_boolprop_sets) +
1539
0
            UCD_BPROPS_PROP(prop), code[3]) != 0;
1540
0
          break;
1541
1542
          /* Should never occur, but keep compilers from grumbling. */
1543
1544
0
          default:
1545
0
          OK = codevalue != OP_PROP;
1546
0
          break;
1547
0
          }
1548
1549
0
        if (OK == (d == OP_PROP))
1550
0
          {
1551
0
          if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1552
0
            {
1553
0
            active_count--;           /* Remove non-match possibility */
1554
0
            next_active_state--;
1555
0
            }
1556
0
          count++;
1557
0
          ADD_NEW(state_offset, count);
1558
0
          }
1559
0
        }
1560
0
      break;
1561
1562
      /*-----------------------------------------------------------------*/
1563
0
      case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1564
0
      case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1565
0
      case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1566
0
      count = current_state->count;  /* Already matched */
1567
0
      if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1568
0
      if (clen > 0)
1569
0
        {
1570
0
        int ncount = 0;
1571
0
        if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1572
0
          {
1573
0
          active_count--;           /* Remove non-match possibility */
1574
0
          next_active_state--;
1575
0
          }
1576
0
        (void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
1577
0
          &ncount);
1578
0
        count++;
1579
0
        ADD_NEW_DATA(-state_offset, count, ncount);
1580
0
        }
1581
0
      break;
1582
0
#endif
1583
1584
      /*-----------------------------------------------------------------*/
1585
0
      case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1586
0
      case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1587
0
      case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1588
0
      count = current_state->count;  /* Already matched */
1589
0
      if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1590
0
      if (clen > 0)
1591
0
        {
1592
0
        int ncount = 0;
1593
0
        switch (c)
1594
0
          {
1595
0
          case CHAR_VT:
1596
0
          case CHAR_FF:
1597
0
          case CHAR_NEL:
1598
0
#ifndef EBCDIC
1599
0
          case 0x2028:
1600
0
          case 0x2029:
1601
0
#endif  /* Not EBCDIC */
1602
0
          if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1603
0
          goto ANYNL01;
1604
1605
0
          case CHAR_CR:
1606
0
          if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1607
          /* Fall through */
1608
1609
0
          ANYNL01:
1610
0
          case CHAR_LF:
1611
0
          if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1612
0
            {
1613
0
            active_count--;           /* Remove non-match possibility */
1614
0
            next_active_state--;
1615
0
            }
1616
0
          count++;
1617
0
          ADD_NEW_DATA(-state_offset, count, ncount);
1618
0
          break;
1619
1620
0
          default:
1621
0
          break;
1622
0
          }
1623
0
        }
1624
0
      break;
1625
1626
      /*-----------------------------------------------------------------*/
1627
0
      case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1628
0
      case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1629
0
      case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1630
0
      count = current_state->count;  /* Already matched */
1631
0
      if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1632
0
      if (clen > 0)
1633
0
        {
1634
0
        BOOL OK;
1635
0
        switch (c)
1636
0
          {
1637
0
          VSPACE_CASES:
1638
0
          OK = TRUE;
1639
0
          break;
1640
1641
0
          default:
1642
0
          OK = FALSE;
1643
0
          break;
1644
0
          }
1645
1646
0
        if (OK == (d == OP_VSPACE))
1647
0
          {
1648
0
          if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1649
0
            {
1650
0
            active_count--;           /* Remove non-match possibility */
1651
0
            next_active_state--;
1652
0
            }
1653
0
          count++;
1654
0
          ADD_NEW_DATA(-state_offset, count, 0);
1655
0
          }
1656
0
        }
1657
0
      break;
1658
1659
      /*-----------------------------------------------------------------*/
1660
0
      case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1661
0
      case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1662
0
      case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1663
0
      count = current_state->count;  /* Already matched */
1664
0
      if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1665
0
      if (clen > 0)
1666
0
        {
1667
0
        BOOL OK;
1668
0
        switch (c)
1669
0
          {
1670
0
          HSPACE_CASES:
1671
0
          OK = TRUE;
1672
0
          break;
1673
1674
0
          default:
1675
0
          OK = FALSE;
1676
0
          break;
1677
0
          }
1678
1679
0
        if (OK == (d == OP_HSPACE))
1680
0
          {
1681
0
          if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1682
0
            {
1683
0
            active_count--;           /* Remove non-match possibility */
1684
0
            next_active_state--;
1685
0
            }
1686
0
          count++;
1687
0
          ADD_NEW_DATA(-state_offset, count, 0);
1688
0
          }
1689
0
        }
1690
0
      break;
1691
1692
      /*-----------------------------------------------------------------*/
1693
0
#ifdef SUPPORT_UNICODE
1694
0
      case OP_PROP_EXTRA + OP_TYPEQUERY:
1695
0
      case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1696
0
      case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1697
0
      count = 4;
1698
0
      goto QS1;
1699
1700
0
      case OP_PROP_EXTRA + OP_TYPESTAR:
1701
0
      case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1702
0
      case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1703
0
      count = 0;
1704
1705
0
      QS1:
1706
1707
0
      ADD_ACTIVE(state_offset + 4, 0);
1708
0
      if (clen > 0)
1709
0
        {
1710
0
        BOOL OK;
1711
0
        int chartype;
1712
0
        const uint32_t *cp;
1713
0
        const ucd_record * prop = GET_UCD(c);
1714
0
        switch(code[2])
1715
0
          {
1716
0
          case PT_LAMP:
1717
0
          chartype = prop->chartype;
1718
0
          OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1719
0
          break;
1720
1721
0
          case PT_GC:
1722
0
          OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1723
0
          break;
1724
1725
0
          case PT_PC:
1726
0
          OK = prop->chartype == code[3];
1727
0
          break;
1728
1729
0
          case PT_SC:
1730
0
          OK = prop->script == code[3];
1731
0
          break;
1732
1733
0
          case PT_SCX:
1734
0
          OK = (prop->script == code[3] ||
1735
0
                MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), code[3]) != 0);
1736
0
          break;
1737
1738
          /* These are specials for combination cases. */
1739
1740
0
          case PT_ALNUM:
1741
0
          chartype = prop->chartype;
1742
0
          OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1743
0
               PRIV(ucp_gentype)[chartype] == ucp_N;
1744
0
          break;
1745
1746
          /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1747
          which means that Perl space and POSIX space are now identical. PCRE
1748
          was changed at release 8.34. */
1749
1750
0
          case PT_SPACE:    /* Perl space */
1751
0
          case PT_PXSPACE:  /* POSIX space */
1752
0
          switch(c)
1753
0
            {
1754
0
            HSPACE_CASES:
1755
0
            VSPACE_CASES:
1756
0
            OK = TRUE;
1757
0
            break;
1758
1759
0
            default:
1760
0
            OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1761
0
            break;
1762
0
            }
1763
0
          break;
1764
1765
0
          case PT_WORD:
1766
0
          chartype = prop->chartype;
1767
0
          OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1768
0
               PRIV(ucp_gentype)[chartype] == ucp_N ||
1769
0
               chartype == ucp_Mn || chartype == ucp_Pc;
1770
0
          break;
1771
1772
0
          case PT_CLIST:
1773
#if PCRE2_CODE_UNIT_WIDTH == 32
1774
          if (c > MAX_UTF_CODE_POINT)
1775
            {
1776
            OK = FALSE;
1777
            break;
1778
            }
1779
#endif
1780
0
          cp = PRIV(ucd_caseless_sets) + code[3];
1781
0
          for (;;)
1782
0
            {
1783
0
            if (c < *cp) { OK = FALSE; break; }
1784
0
            if (c == *cp++) { OK = TRUE; break; }
1785
0
            }
1786
0
          break;
1787
1788
0
          case PT_UCNC:
1789
0
          OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1790
0
               c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1791
0
               c >= 0xe000;
1792
0
          break;
1793
1794
0
          case PT_BIDICL:
1795
0
          OK = UCD_BIDICLASS(c) == code[3];
1796
0
          break;
1797
1798
0
          case PT_BOOL:
1799
0
          OK = MAPBIT(PRIV(ucd_boolprop_sets) +
1800
0
            UCD_BPROPS_PROP(prop), code[3]) != 0;
1801
0
          break;
1802
1803
          /* Should never occur, but keep compilers from grumbling. */
1804
1805
0
          default:
1806
0
          OK = codevalue != OP_PROP;
1807
0
          break;
1808
0
          }
1809
1810
0
        if (OK == (d == OP_PROP))
1811
0
          {
1812
0
          if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1813
0
              codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1814
0
            {
1815
0
            active_count--;           /* Remove non-match possibility */
1816
0
            next_active_state--;
1817
0
            }
1818
0
          ADD_NEW(state_offset + count, 0);
1819
0
          }
1820
0
        }
1821
0
      break;
1822
1823
      /*-----------------------------------------------------------------*/
1824
0
      case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1825
0
      case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1826
0
      case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1827
0
      count = 2;
1828
0
      goto QS2;
1829
1830
0
      case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1831
0
      case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1832
0
      case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1833
0
      count = 0;
1834
1835
0
      QS2:
1836
1837
0
      ADD_ACTIVE(state_offset + 2, 0);
1838
0
      if (clen > 0)
1839
0
        {
1840
0
        int ncount = 0;
1841
0
        if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1842
0
            codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1843
0
          {
1844
0
          active_count--;           /* Remove non-match possibility */
1845
0
          next_active_state--;
1846
0
          }
1847
0
        (void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
1848
0
          &ncount);
1849
0
        ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1850
0
        }
1851
0
      break;
1852
0
#endif
1853
1854
      /*-----------------------------------------------------------------*/
1855
0
      case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1856
0
      case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1857
0
      case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1858
0
      count = 2;
1859
0
      goto QS3;
1860
1861
0
      case OP_ANYNL_EXTRA + OP_TYPESTAR:
1862
0
      case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1863
0
      case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1864
0
      count = 0;
1865
1866
0
      QS3:
1867
0
      ADD_ACTIVE(state_offset + 2, 0);
1868
0
      if (clen > 0)
1869
0
        {
1870
0
        int ncount = 0;
1871
0
        switch (c)
1872
0
          {
1873
0
          case CHAR_VT:
1874
0
          case CHAR_FF:
1875
0
          case CHAR_NEL:
1876
0
#ifndef EBCDIC
1877
0
          case 0x2028:
1878
0
          case 0x2029:
1879
0
#endif  /* Not EBCDIC */
1880
0
          if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1881
0
          goto ANYNL02;
1882
1883
0
          case CHAR_CR:
1884
0
          if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1885
          /* Fall through */
1886
1887
0
          ANYNL02:
1888
0
          case CHAR_LF:
1889
0
          if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1890
0
              codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1891
0
            {
1892
0
            active_count--;           /* Remove non-match possibility */
1893
0
            next_active_state--;
1894
0
            }
1895
0
          ADD_NEW_DATA(-(state_offset + (int)count), 0, ncount);
1896
0
          break;
1897
1898
0
          default:
1899
0
          break;
1900
0
          }
1901
0
        }
1902
0
      break;
1903
1904
      /*-----------------------------------------------------------------*/
1905
0
      case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1906
0
      case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1907
0
      case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1908
0
      count = 2;
1909
0
      goto QS4;
1910
1911
0
      case OP_VSPACE_EXTRA + OP_TYPESTAR:
1912
0
      case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1913
0
      case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1914
0
      count = 0;
1915
1916
0
      QS4:
1917
0
      ADD_ACTIVE(state_offset + 2, 0);
1918
0
      if (clen > 0)
1919
0
        {
1920
0
        BOOL OK;
1921
0
        switch (c)
1922
0
          {
1923
0
          VSPACE_CASES:
1924
0
          OK = TRUE;
1925
0
          break;
1926
1927
0
          default:
1928
0
          OK = FALSE;
1929
0
          break;
1930
0
          }
1931
0
        if (OK == (d == OP_VSPACE))
1932
0
          {
1933
0
          if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1934
0
              codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1935
0
            {
1936
0
            active_count--;           /* Remove non-match possibility */
1937
0
            next_active_state--;
1938
0
            }
1939
0
          ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1940
0
          }
1941
0
        }
1942
0
      break;
1943
1944
      /*-----------------------------------------------------------------*/
1945
0
      case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1946
0
      case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1947
0
      case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1948
0
      count = 2;
1949
0
      goto QS5;
1950
1951
0
      case OP_HSPACE_EXTRA + OP_TYPESTAR:
1952
0
      case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1953
0
      case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1954
0
      count = 0;
1955
1956
0
      QS5:
1957
0
      ADD_ACTIVE(state_offset + 2, 0);
1958
0
      if (clen > 0)
1959
0
        {
1960
0
        BOOL OK;
1961
0
        switch (c)
1962
0
          {
1963
0
          HSPACE_CASES:
1964
0
          OK = TRUE;
1965
0
          break;
1966
1967
0
          default:
1968
0
          OK = FALSE;
1969
0
          break;
1970
0
          }
1971
1972
0
        if (OK == (d == OP_HSPACE))
1973
0
          {
1974
0
          if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1975
0
              codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1976
0
            {
1977
0
            active_count--;           /* Remove non-match possibility */
1978
0
            next_active_state--;
1979
0
            }
1980
0
          ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1981
0
          }
1982
0
        }
1983
0
      break;
1984
1985
      /*-----------------------------------------------------------------*/
1986
0
#ifdef SUPPORT_UNICODE
1987
0
      case OP_PROP_EXTRA + OP_TYPEEXACT:
1988
0
      case OP_PROP_EXTRA + OP_TYPEUPTO:
1989
0
      case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1990
0
      case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1991
0
      if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1992
0
        { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1993
0
      count = current_state->count;  /* Number already matched */
1994
0
      if (clen > 0)
1995
0
        {
1996
0
        BOOL OK;
1997
0
        int chartype;
1998
0
        const uint32_t *cp;
1999
0
        const ucd_record * prop = GET_UCD(c);
2000
0
        switch(code[1 + IMM2_SIZE + 1])
2001
0
          {
2002
0
          case PT_LAMP:
2003
0
          chartype = prop->chartype;
2004
0
          OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
2005
0
          break;
2006
2007
0
          case PT_GC:
2008
0
          OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
2009
0
          break;
2010
2011
0
          case PT_PC:
2012
0
          OK = prop->chartype == code[1 + IMM2_SIZE + 2];
2013
0
          break;
2014
2015
0
          case PT_SC:
2016
0
          OK = prop->script == code[1 + IMM2_SIZE + 2];
2017
0
          break;
2018
2019
0
          case PT_SCX:
2020
0
          OK = (prop->script == code[1 + IMM2_SIZE + 2] ||
2021
0
                MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop),
2022
0
                  code[1 + IMM2_SIZE + 2]) != 0);
2023
0
          break;
2024
2025
          /* These are specials for combination cases. */
2026
2027
0
          case PT_ALNUM:
2028
0
          chartype = prop->chartype;
2029
0
          OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
2030
0
               PRIV(ucp_gentype)[chartype] == ucp_N;
2031
0
          break;
2032
2033
          /* Perl space used to exclude VT, but from Perl 5.18 it is included,
2034
          which means that Perl space and POSIX space are now identical. PCRE
2035
          was changed at release 8.34. */
2036
2037
0
          case PT_SPACE:    /* Perl space */
2038
0
          case PT_PXSPACE:  /* POSIX space */
2039
0
          switch(c)
2040
0
            {
2041
0
            HSPACE_CASES:
2042
0
            VSPACE_CASES:
2043
0
            OK = TRUE;
2044
0
            break;
2045
2046
0
            default:
2047
0
            OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
2048
0
            break;
2049
0
            }
2050
0
          break;
2051
2052
0
          case PT_WORD:
2053
0
          chartype = prop->chartype;
2054
0
          OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
2055
0
               PRIV(ucp_gentype)[chartype] == ucp_N ||
2056
0
               chartype == ucp_Mn || chartype == ucp_Pc;
2057
0
          break;
2058
2059
0
          case PT_CLIST:
2060
#if PCRE2_CODE_UNIT_WIDTH == 32
2061
          if (c > MAX_UTF_CODE_POINT)
2062
            {
2063
            OK = FALSE;
2064
            break;
2065
            }
2066
#endif
2067
0
          cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2];
2068
0
          for (;;)
2069
0
            {
2070
0
            if (c < *cp) { OK = FALSE; break; }
2071
0
            if (c == *cp++) { OK = TRUE; break; }
2072
0
            }
2073
0
          break;
2074
2075
0
          case PT_UCNC:
2076
0
          OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
2077
0
               c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
2078
0
               c >= 0xe000;
2079
0
          break;
2080
2081
0
          case PT_BIDICL:
2082
0
          OK = UCD_BIDICLASS(c) == code[1 + IMM2_SIZE + 2];
2083
0
          break;
2084
2085
0
          case PT_BOOL:
2086
0
          OK = MAPBIT(PRIV(ucd_boolprop_sets) +
2087
0
            UCD_BPROPS_PROP(prop), code[1 + IMM2_SIZE + 2]) != 0;
2088
0
          break;
2089
2090
          /* Should never occur, but keep compilers from grumbling. */
2091
2092
0
          default:
2093
0
          OK = codevalue != OP_PROP;
2094
0
          break;
2095
0
          }
2096
2097
0
        if (OK == (d == OP_PROP))
2098
0
          {
2099
0
          if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
2100
0
            {
2101
0
            active_count--;           /* Remove non-match possibility */
2102
0
            next_active_state--;
2103
0
            }
2104
0
          if (++count >= (int)GET2(code, 1))
2105
0
            { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
2106
0
          else
2107
0
            { ADD_NEW(state_offset, count); }
2108
0
          }
2109
0
        }
2110
0
      break;
2111
2112
      /*-----------------------------------------------------------------*/
2113
0
      case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
2114
0
      case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
2115
0
      case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
2116
0
      case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
2117
0
      if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
2118
0
        { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2119
0
      count = current_state->count;  /* Number already matched */
2120
0
      if (clen > 0)
2121
0
        {
2122
0
        PCRE2_SPTR nptr;
2123
0
        int ncount = 0;
2124
0
        if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
2125
0
          {
2126
0
          active_count--;           /* Remove non-match possibility */
2127
0
          next_active_state--;
2128
0
          }
2129
0
        nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
2130
0
          &ncount);
2131
0
        if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2132
0
            reset_could_continue = TRUE;
2133
0
        if (++count >= (int)GET2(code, 1))
2134
0
          { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
2135
0
        else
2136
0
          { ADD_NEW_DATA(-state_offset, count, ncount); }
2137
0
        }
2138
0
      break;
2139
0
#endif
2140
2141
      /*-----------------------------------------------------------------*/
2142
0
      case OP_ANYNL_EXTRA + OP_TYPEEXACT:
2143
0
      case OP_ANYNL_EXTRA + OP_TYPEUPTO:
2144
0
      case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
2145
0
      case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
2146
0
      if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
2147
0
        { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2148
0
      count = current_state->count;  /* Number already matched */
2149
0
      if (clen > 0)
2150
0
        {
2151
0
        int ncount = 0;
2152
0
        switch (c)
2153
0
          {
2154
0
          case CHAR_VT:
2155
0
          case CHAR_FF:
2156
0
          case CHAR_NEL:
2157
0
#ifndef EBCDIC
2158
0
          case 0x2028:
2159
0
          case 0x2029:
2160
0
#endif  /* Not EBCDIC */
2161
0
          if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
2162
0
          goto ANYNL03;
2163
2164
0
          case CHAR_CR:
2165
0
          if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
2166
          /* Fall through */
2167
2168
0
          ANYNL03:
2169
0
          case CHAR_LF:
2170
0
          if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
2171
0
            {
2172
0
            active_count--;           /* Remove non-match possibility */
2173
0
            next_active_state--;
2174
0
            }
2175
0
          if (++count >= (int)GET2(code, 1))
2176
0
            { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
2177
0
          else
2178
0
            { ADD_NEW_DATA(-state_offset, count, ncount); }
2179
0
          break;
2180
2181
0
          default:
2182
0
          break;
2183
0
          }
2184
0
        }
2185
0
      break;
2186
2187
      /*-----------------------------------------------------------------*/
2188
0
      case OP_VSPACE_EXTRA + OP_TYPEEXACT:
2189
0
      case OP_VSPACE_EXTRA + OP_TYPEUPTO:
2190
0
      case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
2191
0
      case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
2192
0
      if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
2193
0
        { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2194
0
      count = current_state->count;  /* Number already matched */
2195
0
      if (clen > 0)
2196
0
        {
2197
0
        BOOL OK;
2198
0
        switch (c)
2199
0
          {
2200
0
          VSPACE_CASES:
2201
0
          OK = TRUE;
2202
0
          break;
2203
2204
0
          default:
2205
0
          OK = FALSE;
2206
0
          }
2207
2208
0
        if (OK == (d == OP_VSPACE))
2209
0
          {
2210
0
          if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
2211
0
            {
2212
0
            active_count--;           /* Remove non-match possibility */
2213
0
            next_active_state--;
2214
0
            }
2215
0
          if (++count >= (int)GET2(code, 1))
2216
0
            { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2217
0
          else
2218
0
            { ADD_NEW_DATA(-state_offset, count, 0); }
2219
0
          }
2220
0
        }
2221
0
      break;
2222
2223
      /*-----------------------------------------------------------------*/
2224
0
      case OP_HSPACE_EXTRA + OP_TYPEEXACT:
2225
0
      case OP_HSPACE_EXTRA + OP_TYPEUPTO:
2226
0
      case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
2227
0
      case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
2228
0
      if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
2229
0
        { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2230
0
      count = current_state->count;  /* Number already matched */
2231
0
      if (clen > 0)
2232
0
        {
2233
0
        BOOL OK;
2234
0
        switch (c)
2235
0
          {
2236
0
          HSPACE_CASES:
2237
0
          OK = TRUE;
2238
0
          break;
2239
2240
0
          default:
2241
0
          OK = FALSE;
2242
0
          break;
2243
0
          }
2244
2245
0
        if (OK == (d == OP_HSPACE))
2246
0
          {
2247
0
          if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
2248
0
            {
2249
0
            active_count--;           /* Remove non-match possibility */
2250
0
            next_active_state--;
2251
0
            }
2252
0
          if (++count >= (int)GET2(code, 1))
2253
0
            { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2254
0
          else
2255
0
            { ADD_NEW_DATA(-state_offset, count, 0); }
2256
0
          }
2257
0
        }
2258
0
      break;
2259
2260
/* ========================================================================== */
2261
      /* These opcodes are followed by a character that is usually compared
2262
      to the current subject character; it is loaded into d. We still get
2263
      here even if there is no subject character, because in some cases zero
2264
      repetitions are permitted. */
2265
2266
      /*-----------------------------------------------------------------*/
2267
0
      case OP_CHAR:
2268
0
      if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
2269
0
      break;
2270
2271
      /*-----------------------------------------------------------------*/
2272
0
      case OP_CHARI:
2273
0
      if (clen == 0) break;
2274
2275
0
#ifdef SUPPORT_UNICODE
2276
0
      if (utf_or_ucp)
2277
0
        {
2278
0
        if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
2279
0
          {
2280
0
          unsigned int othercase;
2281
0
          if (c < 128)
2282
0
            othercase = fcc[c];
2283
0
          else
2284
0
            othercase = UCD_OTHERCASE(c);
2285
0
          if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2286
0
          }
2287
0
        }
2288
0
      else
2289
0
#endif  /* SUPPORT_UNICODE */
2290
      /* Not UTF or UCP mode */
2291
0
        {
2292
0
        if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2293
0
          { ADD_NEW(state_offset + 2, 0); }
2294
0
        }
2295
0
      break;
2296
2297
2298
0
#ifdef SUPPORT_UNICODE
2299
      /*-----------------------------------------------------------------*/
2300
      /* This is a tricky one because it can match more than one character.
2301
      Find out how many characters to skip, and then set up a negative state
2302
      to wait for them to pass before continuing. */
2303
2304
0
      case OP_EXTUNI:
2305
0
      if (clen > 0)
2306
0
        {
2307
0
        int ncount = 0;
2308
0
        PCRE2_SPTR nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject,
2309
0
          end_subject, utf, &ncount);
2310
0
        if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2311
0
            reset_could_continue = TRUE;
2312
0
        ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2313
0
        }
2314
0
      break;
2315
0
#endif
2316
2317
      /*-----------------------------------------------------------------*/
2318
      /* This is a tricky one like EXTUNI because it too can match more than one
2319
      character (when CR is followed by LF). In this case, set up a negative
2320
      state to wait for one character to pass before continuing. */
2321
2322
0
      case OP_ANYNL:
2323
0
      if (clen > 0) switch(c)
2324
0
        {
2325
0
        case CHAR_VT:
2326
0
        case CHAR_FF:
2327
0
        case CHAR_NEL:
2328
0
#ifndef EBCDIC
2329
0
        case 0x2028:
2330
0
        case 0x2029:
2331
0
#endif  /* Not EBCDIC */
2332
0
        if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
2333
        /* Fall through */
2334
2335
0
        case CHAR_LF:
2336
0
        ADD_NEW(state_offset + 1, 0);
2337
0
        break;
2338
2339
0
        case CHAR_CR:
2340
0
        if (ptr + 1 >= end_subject)
2341
0
          {
2342
0
          ADD_NEW(state_offset + 1, 0);
2343
0
          if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2344
0
            reset_could_continue = TRUE;
2345
0
          }
2346
0
        else if (UCHAR21TEST(ptr + 1) == CHAR_LF)
2347
0
          {
2348
0
          ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2349
0
          }
2350
0
        else
2351
0
          {
2352
0
          ADD_NEW(state_offset + 1, 0);
2353
0
          }
2354
0
        break;
2355
0
        }
2356
0
      break;
2357
2358
      /*-----------------------------------------------------------------*/
2359
0
      case OP_NOT_VSPACE:
2360
0
      if (clen > 0) switch(c)
2361
0
        {
2362
0
        VSPACE_CASES:
2363
0
        break;
2364
2365
0
        default:
2366
0
        ADD_NEW(state_offset + 1, 0);
2367
0
        break;
2368
0
        }
2369
0
      break;
2370
2371
      /*-----------------------------------------------------------------*/
2372
0
      case OP_VSPACE:
2373
0
      if (clen > 0) switch(c)
2374
0
        {
2375
0
        VSPACE_CASES:
2376
0
        ADD_NEW(state_offset + 1, 0);
2377
0
        break;
2378
2379
0
        default:
2380
0
        break;
2381
0
        }
2382
0
      break;
2383
2384
      /*-----------------------------------------------------------------*/
2385
0
      case OP_NOT_HSPACE:
2386
0
      if (clen > 0) switch(c)
2387
0
        {
2388
0
        HSPACE_CASES:
2389
0
        break;
2390
2391
0
        default:
2392
0
        ADD_NEW(state_offset + 1, 0);
2393
0
        break;
2394
0
        }
2395
0
      break;
2396
2397
      /*-----------------------------------------------------------------*/
2398
0
      case OP_HSPACE:
2399
0
      if (clen > 0) switch(c)
2400
0
        {
2401
0
        HSPACE_CASES:
2402
0
        ADD_NEW(state_offset + 1, 0);
2403
0
        break;
2404
2405
0
        default:
2406
0
        break;
2407
0
        }
2408
0
      break;
2409
2410
      /*-----------------------------------------------------------------*/
2411
      /* Match a negated single character casefully. */
2412
2413
0
      case OP_NOT:
2414
0
      if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2415
0
      break;
2416
2417
      /*-----------------------------------------------------------------*/
2418
      /* Match a negated single character caselessly. */
2419
2420
0
      case OP_NOTI:
2421
0
      if (clen > 0)
2422
0
        {
2423
0
        uint32_t otherd;
2424
0
#ifdef SUPPORT_UNICODE
2425
0
        if (utf_or_ucp && d >= 128)
2426
0
          otherd = UCD_OTHERCASE(d);
2427
0
        else
2428
0
#endif  /* SUPPORT_UNICODE */
2429
0
        otherd = TABLE_GET(d, fcc, d);
2430
0
        if (c != d && c != otherd)
2431
0
          { ADD_NEW(state_offset + dlen + 1, 0); }
2432
0
        }
2433
0
      break;
2434
2435
      /*-----------------------------------------------------------------*/
2436
0
      case OP_PLUSI:
2437
0
      case OP_MINPLUSI:
2438
0
      case OP_POSPLUSI:
2439
0
      case OP_NOTPLUSI:
2440
0
      case OP_NOTMINPLUSI:
2441
0
      case OP_NOTPOSPLUSI:
2442
0
      caseless = TRUE;
2443
0
      codevalue -= OP_STARI - OP_STAR;
2444
2445
      /* Fall through */
2446
0
      case OP_PLUS:
2447
0
      case OP_MINPLUS:
2448
0
      case OP_POSPLUS:
2449
0
      case OP_NOTPLUS:
2450
0
      case OP_NOTMINPLUS:
2451
0
      case OP_NOTPOSPLUS:
2452
0
      count = current_state->count;  /* Already matched */
2453
0
      if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2454
0
      if (clen > 0)
2455
0
        {
2456
0
        uint32_t otherd = NOTACHAR;
2457
0
        if (caseless)
2458
0
          {
2459
0
#ifdef SUPPORT_UNICODE
2460
0
          if (utf_or_ucp && d >= 128)
2461
0
            otherd = UCD_OTHERCASE(d);
2462
0
          else
2463
0
#endif  /* SUPPORT_UNICODE */
2464
0
          otherd = TABLE_GET(d, fcc, d);
2465
0
          }
2466
0
        if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2467
0
          {
2468
0
          if (count > 0 &&
2469
0
              (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2470
0
            {
2471
0
            active_count--;             /* Remove non-match possibility */
2472
0
            next_active_state--;
2473
0
            }
2474
0
          count++;
2475
0
          ADD_NEW(state_offset, count);
2476
0
          }
2477
0
        }
2478
0
      break;
2479
2480
      /*-----------------------------------------------------------------*/
2481
0
      case OP_QUERYI:
2482
0
      case OP_MINQUERYI:
2483
0
      case OP_POSQUERYI:
2484
0
      case OP_NOTQUERYI:
2485
0
      case OP_NOTMINQUERYI:
2486
0
      case OP_NOTPOSQUERYI:
2487
0
      caseless = TRUE;
2488
0
      codevalue -= OP_STARI - OP_STAR;
2489
      /* Fall through */
2490
0
      case OP_QUERY:
2491
0
      case OP_MINQUERY:
2492
0
      case OP_POSQUERY:
2493
0
      case OP_NOTQUERY:
2494
0
      case OP_NOTMINQUERY:
2495
0
      case OP_NOTPOSQUERY:
2496
0
      ADD_ACTIVE(state_offset + dlen + 1, 0);
2497
0
      if (clen > 0)
2498
0
        {
2499
0
        uint32_t otherd = NOTACHAR;
2500
0
        if (caseless)
2501
0
          {
2502
0
#ifdef SUPPORT_UNICODE
2503
0
          if (utf_or_ucp && d >= 128)
2504
0
            otherd = UCD_OTHERCASE(d);
2505
0
          else
2506
0
#endif  /* SUPPORT_UNICODE */
2507
0
          otherd = TABLE_GET(d, fcc, d);
2508
0
          }
2509
0
        if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2510
0
          {
2511
0
          if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2512
0
            {
2513
0
            active_count--;            /* Remove non-match possibility */
2514
0
            next_active_state--;
2515
0
            }
2516
0
          ADD_NEW(state_offset + dlen + 1, 0);
2517
0
          }
2518
0
        }
2519
0
      break;
2520
2521
      /*-----------------------------------------------------------------*/
2522
0
      case OP_STARI:
2523
0
      case OP_MINSTARI:
2524
0
      case OP_POSSTARI:
2525
0
      case OP_NOTSTARI:
2526
0
      case OP_NOTMINSTARI:
2527
0
      case OP_NOTPOSSTARI:
2528
0
      caseless = TRUE;
2529
0
      codevalue -= OP_STARI - OP_STAR;
2530
      /* Fall through */
2531
0
      case OP_STAR:
2532
0
      case OP_MINSTAR:
2533
0
      case OP_POSSTAR:
2534
0
      case OP_NOTSTAR:
2535
0
      case OP_NOTMINSTAR:
2536
0
      case OP_NOTPOSSTAR:
2537
0
      ADD_ACTIVE(state_offset + dlen + 1, 0);
2538
0
      if (clen > 0)
2539
0
        {
2540
0
        uint32_t otherd = NOTACHAR;
2541
0
        if (caseless)
2542
0
          {
2543
0
#ifdef SUPPORT_UNICODE
2544
0
          if (utf_or_ucp && d >= 128)
2545
0
            otherd = UCD_OTHERCASE(d);
2546
0
          else
2547
0
#endif  /* SUPPORT_UNICODE */
2548
0
          otherd = TABLE_GET(d, fcc, d);
2549
0
          }
2550
0
        if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2551
0
          {
2552
0
          if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2553
0
            {
2554
0
            active_count--;            /* Remove non-match possibility */
2555
0
            next_active_state--;
2556
0
            }
2557
0
          ADD_NEW(state_offset, 0);
2558
0
          }
2559
0
        }
2560
0
      break;
2561
2562
      /*-----------------------------------------------------------------*/
2563
0
      case OP_EXACTI:
2564
0
      case OP_NOTEXACTI:
2565
0
      caseless = TRUE;
2566
0
      codevalue -= OP_STARI - OP_STAR;
2567
      /* Fall through */
2568
0
      case OP_EXACT:
2569
0
      case OP_NOTEXACT:
2570
0
      count = current_state->count;  /* Number already matched */
2571
0
      if (clen > 0)
2572
0
        {
2573
0
        uint32_t otherd = NOTACHAR;
2574
0
        if (caseless)
2575
0
          {
2576
0
#ifdef SUPPORT_UNICODE
2577
0
          if (utf_or_ucp && d >= 128)
2578
0
            otherd = UCD_OTHERCASE(d);
2579
0
          else
2580
0
#endif  /* SUPPORT_UNICODE */
2581
0
          otherd = TABLE_GET(d, fcc, d);
2582
0
          }
2583
0
        if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2584
0
          {
2585
0
          if (++count >= (int)GET2(code, 1))
2586
0
            { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2587
0
          else
2588
0
            { ADD_NEW(state_offset, count); }
2589
0
          }
2590
0
        }
2591
0
      break;
2592
2593
      /*-----------------------------------------------------------------*/
2594
0
      case OP_UPTOI:
2595
0
      case OP_MINUPTOI:
2596
0
      case OP_POSUPTOI:
2597
0
      case OP_NOTUPTOI:
2598
0
      case OP_NOTMINUPTOI:
2599
0
      case OP_NOTPOSUPTOI:
2600
0
      caseless = TRUE;
2601
0
      codevalue -= OP_STARI - OP_STAR;
2602
      /* Fall through */
2603
0
      case OP_UPTO:
2604
0
      case OP_MINUPTO:
2605
0
      case OP_POSUPTO:
2606
0
      case OP_NOTUPTO:
2607
0
      case OP_NOTMINUPTO:
2608
0
      case OP_NOTPOSUPTO:
2609
0
      ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2610
0
      count = current_state->count;  /* Number already matched */
2611
0
      if (clen > 0)
2612
0
        {
2613
0
        uint32_t otherd = NOTACHAR;
2614
0
        if (caseless)
2615
0
          {
2616
0
#ifdef SUPPORT_UNICODE
2617
0
          if (utf_or_ucp && d >= 128)
2618
0
            otherd = UCD_OTHERCASE(d);
2619
0
          else
2620
0
#endif  /* SUPPORT_UNICODE */
2621
0
          otherd = TABLE_GET(d, fcc, d);
2622
0
          }
2623
0
        if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2624
0
          {
2625
0
          if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2626
0
            {
2627
0
            active_count--;             /* Remove non-match possibility */
2628
0
            next_active_state--;
2629
0
            }
2630
0
          if (++count >= (int)GET2(code, 1))
2631
0
            { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2632
0
          else
2633
0
            { ADD_NEW(state_offset, count); }
2634
0
          }
2635
0
        }
2636
0
      break;
2637
2638
2639
/* ========================================================================== */
2640
      /* These are the class-handling opcodes */
2641
2642
0
      case OP_CLASS:
2643
0
      case OP_NCLASS:
2644
0
#ifdef SUPPORT_WIDE_CHARS
2645
0
      case OP_XCLASS:
2646
0
      case OP_ECLASS:
2647
0
#endif
2648
0
        {
2649
0
        BOOL isinclass = FALSE;
2650
0
        int next_state_offset;
2651
0
        PCRE2_SPTR ecode;
2652
2653
0
#ifdef SUPPORT_WIDE_CHARS
2654
        /* An extended class may have a table or a list of single characters,
2655
        ranges, or both, and it may be positive or negative. There's a
2656
        function that sorts all this out. */
2657
2658
0
        if (codevalue == OP_XCLASS)
2659
0
         {
2660
0
         ecode = code + GET(code, 1);
2661
0
         if (clen > 0)
2662
0
           isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE,
2663
0
             (const uint8_t*)mb->start_code, utf);
2664
0
         }
2665
2666
        /* A nested set-based class has internal opcodes for performing
2667
        set operations. */
2668
2669
0
        else if (codevalue == OP_ECLASS)
2670
0
         {
2671
0
         ecode = code + GET(code, 1);
2672
0
         if (clen > 0)
2673
0
           isinclass = PRIV(eclass)(c, code + 1 + LINK_SIZE, ecode,
2674
0
             (const uint8_t*)mb->start_code, utf);
2675
0
         }
2676
2677
0
        else
2678
0
#endif /* SUPPORT_WIDE_CHARS */
2679
2680
        /* For a simple class, there is always just a 32-byte table, and we
2681
        can set isinclass from it. */
2682
2683
0
          {
2684
0
          ecode = code + 1 + (32 / sizeof(PCRE2_UCHAR));
2685
0
          if (clen > 0)
2686
0
            {
2687
0
            isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2688
0
              ((((const uint8_t *)(code + 1))[c/8] & (1u << (c&7))) != 0);
2689
0
            }
2690
0
          }
2691
2692
        /* At this point, isinclass is set for all kinds of class, and ecode
2693
        points to the byte after the end of the class. If there is a
2694
        quantifier, this is where it will be. */
2695
2696
0
        next_state_offset = (int)(ecode - start_code);
2697
2698
0
        switch (*ecode)
2699
0
          {
2700
0
          case OP_CRSTAR:
2701
0
          case OP_CRMINSTAR:
2702
0
          case OP_CRPOSSTAR:
2703
0
          ADD_ACTIVE(next_state_offset + 1, 0);
2704
0
          if (isinclass)
2705
0
            {
2706
0
            if (*ecode == OP_CRPOSSTAR)
2707
0
              {
2708
0
              active_count--;           /* Remove non-match possibility */
2709
0
              next_active_state--;
2710
0
              }
2711
0
            ADD_NEW(state_offset, 0);
2712
0
            }
2713
0
          break;
2714
2715
0
          case OP_CRPLUS:
2716
0
          case OP_CRMINPLUS:
2717
0
          case OP_CRPOSPLUS:
2718
0
          count = current_state->count;  /* Already matched */
2719
0
          if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2720
0
          if (isinclass)
2721
0
            {
2722
0
            if (count > 0 && *ecode == OP_CRPOSPLUS)
2723
0
              {
2724
0
              active_count--;           /* Remove non-match possibility */
2725
0
              next_active_state--;
2726
0
              }
2727
0
            count++;
2728
0
            ADD_NEW(state_offset, count);
2729
0
            }
2730
0
          break;
2731
2732
0
          case OP_CRQUERY:
2733
0
          case OP_CRMINQUERY:
2734
0
          case OP_CRPOSQUERY:
2735
0
          ADD_ACTIVE(next_state_offset + 1, 0);
2736
0
          if (isinclass)
2737
0
            {
2738
0
            if (*ecode == OP_CRPOSQUERY)
2739
0
              {
2740
0
              active_count--;           /* Remove non-match possibility */
2741
0
              next_active_state--;
2742
0
              }
2743
0
            ADD_NEW(next_state_offset + 1, 0);
2744
0
            }
2745
0
          break;
2746
2747
0
          case OP_CRRANGE:
2748
0
          case OP_CRMINRANGE:
2749
0
          case OP_CRPOSRANGE:
2750
0
          count = current_state->count;  /* Already matched */
2751
0
          if (count >= (int)GET2(ecode, 1))
2752
0
            { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2753
0
          if (isinclass)
2754
0
            {
2755
0
            int max = (int)GET2(ecode, 1 + IMM2_SIZE);
2756
2757
0
            if (*ecode == OP_CRPOSRANGE && count >= (int)GET2(ecode, 1))
2758
0
              {
2759
0
              active_count--;           /* Remove non-match possibility */
2760
0
              next_active_state--;
2761
0
              }
2762
2763
0
            if (++count >= max && max != 0)   /* Max 0 => no limit */
2764
0
              { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2765
0
            else
2766
0
              { ADD_NEW(state_offset, count); }
2767
0
            }
2768
0
          break;
2769
2770
0
          default:
2771
0
          if (isinclass) { ADD_NEW(next_state_offset, 0); }
2772
0
          break;
2773
0
          }
2774
0
        }
2775
0
      break;
2776
2777
/* ========================================================================== */
2778
      /* These are the opcodes for fancy brackets of various kinds. We have
2779
      to use recursion in order to handle them. The "always failing" assertion
2780
      (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2781
      though the other "backtracking verbs" are not supported. */
2782
2783
0
      case OP_FAIL:
2784
0
      break;
2785
2786
0
      case OP_ASSERT:
2787
0
      case OP_ASSERT_NOT:
2788
0
      case OP_ASSERTBACK:
2789
0
      case OP_ASSERTBACK_NOT:
2790
0
        {
2791
0
        int rc;
2792
0
        int *local_workspace;
2793
0
        PCRE2_SIZE *local_offsets;
2794
0
        PCRE2_SPTR endasscode = code + GET(code, 1);
2795
0
        RWS_anchor *rws = (RWS_anchor *)RWS;
2796
2797
0
        if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2798
0
          {
2799
0
          rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2800
0
          if (rc != 0) return rc;
2801
0
          RWS = (int *)rws;
2802
0
          }
2803
2804
0
        local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2805
0
        local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
2806
0
        rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
2807
2808
0
        while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2809
2810
0
        rc = internal_dfa_match(
2811
0
          mb,                                   /* static match data */
2812
0
          code,                                 /* this subexpression's code */
2813
0
          ptr,                                  /* where we currently are */
2814
0
          (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
2815
0
          local_offsets,                        /* offset vector */
2816
0
          RWS_OVEC_OSIZE/OVEC_UNIT,             /* size of same */
2817
0
          local_workspace,                      /* workspace vector */
2818
0
          RWS_RSIZE,                            /* size of same */
2819
0
          rlevel,                               /* function recursion level */
2820
0
          RWS);                                 /* recursion workspace */
2821
2822
0
        rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
2823
2824
0
        if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc;
2825
0
        if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2826
0
            { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2827
0
        }
2828
0
      break;
2829
2830
      /*-----------------------------------------------------------------*/
2831
0
      case OP_COND:
2832
0
      case OP_SCOND:
2833
0
        {
2834
0
        int codelink = (int)GET(code, 1);
2835
0
        PCRE2_UCHAR condcode;
2836
2837
        /* Because of the way auto-callout works during compile, a callout item
2838
        is inserted between OP_COND and an assertion condition. This does not
2839
        happen for the other conditions. */
2840
2841
0
        if (code[LINK_SIZE + 1] == OP_CALLOUT
2842
0
            || code[LINK_SIZE + 1] == OP_CALLOUT_STR)
2843
0
          {
2844
0
          PCRE2_SIZE callout_length;
2845
0
          rrc = do_callout_dfa(code, offsets, current_subject, ptr, mb,
2846
0
            1 + LINK_SIZE, &callout_length);
2847
0
          if (rrc < 0) return rrc;                 /* Abandon */
2848
0
          if (rrc > 0) break;                      /* Fail this thread */
2849
0
          code += callout_length;                  /* Skip callout data */
2850
0
          }
2851
2852
0
        condcode = code[LINK_SIZE+1];
2853
2854
        /* Back reference conditions and duplicate named recursion conditions
2855
        are not supported */
2856
2857
0
        if (condcode == OP_CREF || condcode == OP_DNCREF ||
2858
0
            condcode == OP_DNRREF)
2859
0
          return PCRE2_ERROR_DFA_UCOND;
2860
2861
        /* The DEFINE condition is always false, and the assertion (?!) is
2862
        converted to OP_FAIL. */
2863
2864
0
        if (condcode == OP_FALSE || condcode == OP_FAIL)
2865
0
          { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2866
2867
        /* There is also an always-true condition */
2868
2869
0
        else if (condcode == OP_TRUE)
2870
0
          { ADD_ACTIVE(state_offset + LINK_SIZE + 2, 0); }
2871
2872
        /* The only supported version of OP_RREF is for the value RREF_ANY,
2873
        which means "test if in any recursion". We can't test for specifically
2874
        recursed groups. */
2875
2876
0
        else if (condcode == OP_RREF)
2877
0
          {
2878
0
          unsigned int value = GET2(code, LINK_SIZE + 2);
2879
0
          if (value != RREF_ANY) return PCRE2_ERROR_DFA_UCOND;
2880
0
          if (mb->recursive != NULL)
2881
0
            { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2882
0
          else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2883
0
          }
2884
2885
        /* Otherwise, the condition is an assertion */
2886
2887
0
        else
2888
0
          {
2889
0
          int rc;
2890
0
          int *local_workspace;
2891
0
          PCRE2_SIZE *local_offsets;
2892
0
          PCRE2_SPTR asscode = code + LINK_SIZE + 1;
2893
0
          PCRE2_SPTR endasscode = asscode + GET(asscode, 1);
2894
0
          RWS_anchor *rws = (RWS_anchor *)RWS;
2895
2896
0
          if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2897
0
            {
2898
0
            rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2899
0
            if (rc != 0) return rc;
2900
0
            RWS = (int *)rws;
2901
0
            }
2902
2903
0
          local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2904
0
          local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
2905
0
          rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
2906
2907
0
          while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2908
2909
0
          rc = internal_dfa_match(
2910
0
            mb,                                   /* fixed match data */
2911
0
            asscode,                              /* this subexpression's code */
2912
0
            ptr,                                  /* where we currently are */
2913
0
            (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
2914
0
            local_offsets,                        /* offset vector */
2915
0
            RWS_OVEC_OSIZE/OVEC_UNIT,             /* size of same */
2916
0
            local_workspace,                      /* workspace vector */
2917
0
            RWS_RSIZE,                            /* size of same */
2918
0
            rlevel,                               /* function recursion level */
2919
0
            RWS);                                 /* recursion workspace */
2920
2921
0
          rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
2922
2923
0
          if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc;
2924
0
          if ((rc >= 0) ==
2925
0
                (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2926
0
            { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2927
0
          else
2928
0
            { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2929
0
          }
2930
0
        }
2931
0
      break;
2932
2933
      /*-----------------------------------------------------------------*/
2934
0
      case OP_RECURSE:
2935
0
        {
2936
0
        int rc;
2937
0
        int *local_workspace;
2938
0
        PCRE2_SIZE *local_offsets;
2939
0
        RWS_anchor *rws = (RWS_anchor *)RWS;
2940
0
        PCRE2_SPTR callpat = start_code + GET(code, 1);
2941
0
        uint32_t recno = (callpat == mb->start_code)? 0 :
2942
0
          GET2(callpat, 1 + LINK_SIZE);
2943
2944
0
        if (rws->free < RWS_RSIZE + RWS_OVEC_RSIZE)
2945
0
          {
2946
0
          rc = more_workspace(&rws, RWS_OVEC_RSIZE, mb);
2947
0
          if (rc != 0) return rc;
2948
0
          RWS = (int *)rws;
2949
0
          }
2950
2951
0
        local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2952
0
        local_workspace = ((int *)local_offsets) + RWS_OVEC_RSIZE;
2953
0
        rws->free -= RWS_RSIZE + RWS_OVEC_RSIZE;
2954
2955
        /* Check for repeating a recursion without advancing the subject
2956
        pointer or last used character. This should catch convoluted mutual
2957
        recursions. (Some simple cases are caught at compile time.) */
2958
2959
0
        for (dfa_recursion_info *ri = mb->recursive;
2960
0
             ri != NULL;
2961
0
             ri = ri->prevrec)
2962
0
          {
2963
0
          if (recno == ri->group_num && ptr == ri->subject_position &&
2964
0
              mb->last_used_ptr == ri->last_used_ptr)
2965
0
            return PCRE2_ERROR_RECURSELOOP;
2966
0
          }
2967
2968
        /* Remember this recursion and where we started it so as to
2969
        catch infinite loops. */
2970
2971
0
        new_recursive.group_num = recno;
2972
0
        new_recursive.subject_position = ptr;
2973
0
        new_recursive.last_used_ptr = mb->last_used_ptr;
2974
0
        new_recursive.prevrec = mb->recursive;
2975
0
        mb->recursive = &new_recursive;
2976
2977
0
        rc = internal_dfa_match(
2978
0
          mb,                                   /* fixed match data */
2979
0
          callpat,                              /* this subexpression's code */
2980
0
          ptr,                                  /* where we currently are */
2981
0
          (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
2982
0
          local_offsets,                        /* offset vector */
2983
0
          RWS_OVEC_RSIZE/OVEC_UNIT,             /* size of same */
2984
0
          local_workspace,                      /* workspace vector */
2985
0
          RWS_RSIZE,                            /* size of same */
2986
0
          rlevel,                               /* function recursion level */
2987
0
          RWS);                                 /* recursion workspace */
2988
2989
0
        rws->free += RWS_RSIZE + RWS_OVEC_RSIZE;
2990
0
        mb->recursive = new_recursive.prevrec;  /* Done this recursion */
2991
2992
        /* Ran out of internal offsets */
2993
2994
0
        if (rc == 0) return PCRE2_ERROR_DFA_RECURSE;
2995
2996
        /* For each successful matched substring, set up the next state with a
2997
        count of characters to skip before trying it. Note that the count is in
2998
        characters, not bytes. */
2999
3000
0
        if (rc > 0)
3001
0
          {
3002
0
          for (rc = rc*2 - 2; rc >= 0; rc -= 2)
3003
0
            {
3004
0
            PCRE2_SIZE charcount = local_offsets[rc+1] - local_offsets[rc];
3005
0
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3006
0
            if (utf)
3007
0
              {
3008
0
              PCRE2_SPTR p = start_subject + local_offsets[rc];
3009
0
              PCRE2_SPTR pp = start_subject + local_offsets[rc+1];
3010
0
              while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
3011
0
              }
3012
0
#endif
3013
0
            if (charcount > 0)
3014
0
              {
3015
0
              ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0,
3016
0
                (int)(charcount - 1));
3017
0
              }
3018
0
            else
3019
0
              {
3020
0
              ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
3021
0
              }
3022
0
            }
3023
0
          }
3024
0
        else if (rc != PCRE2_ERROR_NOMATCH) return rc;
3025
0
        }
3026
0
      break;
3027
3028
      /*-----------------------------------------------------------------*/
3029
0
      case OP_BRAPOS:
3030
0
      case OP_SBRAPOS:
3031
0
      case OP_CBRAPOS:
3032
0
      case OP_SCBRAPOS:
3033
0
      case OP_BRAPOSZERO:
3034
0
        {
3035
0
        int rc;
3036
0
        int *local_workspace;
3037
0
        PCRE2_SIZE *local_offsets;
3038
0
        PCRE2_SIZE charcount, matched_count;
3039
0
        PCRE2_SPTR local_ptr = ptr;
3040
0
        RWS_anchor *rws = (RWS_anchor *)RWS;
3041
0
        BOOL allow_zero;
3042
3043
0
        if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
3044
0
          {
3045
0
          rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
3046
0
          if (rc != 0) return rc;
3047
0
          RWS = (int *)rws;
3048
0
          }
3049
3050
0
        local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
3051
0
        local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
3052
0
        rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
3053
3054
0
        if (codevalue == OP_BRAPOSZERO)
3055
0
          {
3056
0
          allow_zero = TRUE;
3057
0
          ++code;  /* The following opcode will be one of the above BRAs */
3058
0
          }
3059
0
        else allow_zero = FALSE;
3060
3061
        /* Loop to match the subpattern as many times as possible as if it were
3062
        a complete pattern. */
3063
3064
0
        for (matched_count = 0;; matched_count++)
3065
0
          {
3066
0
          rc = internal_dfa_match(
3067
0
            mb,                                   /* fixed match data */
3068
0
            code,                                 /* this subexpression's code */
3069
0
            local_ptr,                            /* where we currently are */
3070
0
            (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
3071
0
            local_offsets,                        /* offset vector */
3072
0
            RWS_OVEC_OSIZE/OVEC_UNIT,             /* size of same */
3073
0
            local_workspace,                      /* workspace vector */
3074
0
            RWS_RSIZE,                            /* size of same */
3075
0
            rlevel,                               /* function recursion level */
3076
0
            RWS);                                 /* recursion workspace */
3077
3078
          /* Failed to match */
3079
3080
0
          if (rc < 0)
3081
0
            {
3082
0
            if (rc != PCRE2_ERROR_NOMATCH) return rc;
3083
0
            break;
3084
0
            }
3085
3086
          /* Matched: break the loop if zero characters matched. */
3087
3088
0
          charcount = local_offsets[1] - local_offsets[0];
3089
0
          if (charcount == 0) break;
3090
0
          local_ptr += charcount;    /* Advance temporary position ptr */
3091
0
          }
3092
3093
0
        rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
3094
3095
        /* At this point we have matched the subpattern matched_count
3096
        times, and local_ptr is pointing to the character after the end of the
3097
        last match. */
3098
3099
0
        if (matched_count > 0 || allow_zero)
3100
0
          {
3101
0
          PCRE2_SPTR end_subpattern = code;
3102
0
          int next_state_offset;
3103
3104
0
          do { end_subpattern += GET(end_subpattern, 1); }
3105
0
            while (*end_subpattern == OP_ALT);
3106
0
          next_state_offset =
3107
0
            (int)(end_subpattern - start_code + LINK_SIZE + 1);
3108
3109
          /* Optimization: if there are no more active states, and there
3110
          are no new states yet set up, then skip over the subject string
3111
          right here, to save looping. Otherwise, set up the new state to swing
3112
          into action when the end of the matched substring is reached. */
3113
3114
0
          if (i + 1 >= active_count && new_count == 0)
3115
0
            {
3116
0
            ptr = local_ptr;
3117
0
            clen = 0;
3118
0
            ADD_NEW(next_state_offset, 0);
3119
0
            }
3120
0
          else
3121
0
            {
3122
0
            PCRE2_SPTR p = ptr;
3123
0
            PCRE2_SPTR pp = local_ptr;
3124
0
            charcount = (PCRE2_SIZE)(pp - p);
3125
0
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3126
0
            if (utf) while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
3127
0
#endif
3128
0
            ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1));
3129
0
            }
3130
0
          }
3131
0
        }
3132
0
      break;
3133
3134
      /*-----------------------------------------------------------------*/
3135
0
      case OP_ONCE:
3136
0
        {
3137
0
        int rc;
3138
0
        int *local_workspace;
3139
0
        PCRE2_SIZE *local_offsets;
3140
0
        RWS_anchor *rws = (RWS_anchor *)RWS;
3141
3142
0
        if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
3143
0
          {
3144
0
          rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
3145
0
          if (rc != 0) return rc;
3146
0
          RWS = (int *)rws;
3147
0
          }
3148
3149
0
        local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
3150
0
        local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
3151
0
        rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
3152
3153
0
        rc = internal_dfa_match(
3154
0
          mb,                                   /* fixed match data */
3155
0
          code,                                 /* this subexpression's code */
3156
0
          ptr,                                  /* where we currently are */
3157
0
          (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
3158
0
          local_offsets,                        /* offset vector */
3159
0
          RWS_OVEC_OSIZE/OVEC_UNIT,             /* size of same */
3160
0
          local_workspace,                      /* workspace vector */
3161
0
          RWS_RSIZE,                            /* size of same */
3162
0
          rlevel,                               /* function recursion level */
3163
0
          RWS);                                 /* recursion workspace */
3164
3165
0
        rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
3166
3167
0
        if (rc >= 0)
3168
0
          {
3169
0
          PCRE2_SPTR end_subpattern = code;
3170
0
          PCRE2_SIZE charcount = local_offsets[1] - local_offsets[0];
3171
0
          int next_state_offset, repeat_state_offset;
3172
3173
0
          do { end_subpattern += GET(end_subpattern, 1); }
3174
0
            while (*end_subpattern == OP_ALT);
3175
0
          next_state_offset =
3176
0
            (int)(end_subpattern - start_code + LINK_SIZE + 1);
3177
3178
          /* If the end of this subpattern is KETRMAX or KETRMIN, we must
3179
          arrange for the repeat state also to be added to the relevant list.
3180
          Calculate the offset, or set -1 for no repeat. */
3181
3182
0
          repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
3183
0
                                 *end_subpattern == OP_KETRMIN)?
3184
0
            (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
3185
3186
          /* If we have matched an empty string, add the next state at the
3187
          current character pointer. This is important so that the duplicate
3188
          checking kicks in, which is what breaks infinite loops that match an
3189
          empty string. */
3190
3191
0
          if (charcount == 0)
3192
0
            {
3193
0
            ADD_ACTIVE(next_state_offset, 0);
3194
0
            }
3195
3196
          /* Optimization: if there are no more active states, and there
3197
          are no new states yet set up, then skip over the subject string
3198
          right here, to save looping. Otherwise, set up the new state to swing
3199
          into action when the end of the matched substring is reached. */
3200
3201
0
          else if (i + 1 >= active_count && new_count == 0)
3202
0
            {
3203
0
            ptr += charcount;
3204
0
            clen = 0;
3205
0
            ADD_NEW(next_state_offset, 0);
3206
3207
            /* If we are adding a repeat state at the new character position,
3208
            we must fudge things so that it is the only current state.
3209
            Otherwise, it might be a duplicate of one we processed before, and
3210
            that would cause it to be skipped. */
3211
3212
0
            if (repeat_state_offset >= 0)
3213
0
              {
3214
0
              next_active_state = active_states;
3215
0
              active_count = 0;
3216
0
              i = -1;
3217
0
              ADD_ACTIVE(repeat_state_offset, 0);
3218
0
              }
3219
0
            }
3220
0
          else
3221
0
            {
3222
0
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3223
0
            if (utf)
3224
0
              {
3225
0
              PCRE2_SPTR p = start_subject + local_offsets[0];
3226
0
              PCRE2_SPTR pp = start_subject + local_offsets[1];
3227
0
              while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
3228
0
              }
3229
0
#endif
3230
0
            ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1));
3231
0
            if (repeat_state_offset >= 0)
3232
0
              { ADD_NEW_DATA(-repeat_state_offset, 0, (int)(charcount - 1)); }
3233
0
            }
3234
0
          }
3235
0
        else if (rc != PCRE2_ERROR_NOMATCH) return rc;
3236
0
        }
3237
0
      break;
3238
3239
3240
/* ========================================================================== */
3241
      /* Handle callouts */
3242
3243
0
      case OP_CALLOUT:
3244
0
      case OP_CALLOUT_STR:
3245
0
        {
3246
0
        PCRE2_SIZE callout_length;
3247
0
        rrc = do_callout_dfa(code, offsets, current_subject, ptr, mb, 0,
3248
0
          &callout_length);
3249
0
        if (rrc < 0) return rrc;   /* Abandon */
3250
0
        if (rrc == 0)
3251
0
          { ADD_ACTIVE(state_offset + (int)callout_length, 0); }
3252
0
        }
3253
0
      break;
3254
3255
3256
/* ========================================================================== */
3257
0
      default:        /* Unsupported opcode */
3258
0
      return PCRE2_ERROR_DFA_UITEM;
3259
0
      }
3260
3261
0
    NEXT_ACTIVE_STATE: continue;
3262
3263
0
    }      /* End of loop scanning active states */
3264
3265
  /* We have finished the processing at the current subject character. If no
3266
  new states have been set for the next character, we have found all the
3267
  matches that we are going to find. If partial matching has been requested,
3268
  check for appropriate conditions.
3269
3270
  The "could_continue" variable is true if a state could have continued but
3271
  for the fact that the end of the subject was reached. */
3272
3273
0
  if (new_count <= 0)
3274
0
    {
3275
0
    if (could_continue &&                            /* Some could go on, and */
3276
0
        (                                            /* either... */
3277
0
        (mb->moptions & PCRE2_PARTIAL_HARD) != 0      /* Hard partial */
3278
0
        ||                                           /* or... */
3279
0
        ((mb->moptions & PCRE2_PARTIAL_SOFT) != 0 &&  /* Soft partial and */
3280
0
         match_count < 0)                             /* no matches */
3281
0
        ) &&                                         /* And... */
3282
0
        (
3283
0
        partial_newline ||                   /* Either partial NL */
3284
0
          (                                  /* or ... */
3285
0
          ptr >= end_subject &&              /* End of subject and */
3286
0
            (                                  /* either */
3287
0
            ptr > mb->start_used_ptr ||        /* Inspected non-empty string */
3288
0
            mb->allowemptypartial              /* or pattern has lookbehind */
3289
0
            )                                  /* or could match empty */
3290
0
          )
3291
0
        ))
3292
0
      match_count = PCRE2_ERROR_PARTIAL;
3293
0
    break;  /* Exit from loop along the subject string */
3294
0
    }
3295
3296
  /* One or more states are active for the next character. */
3297
3298
0
  ptr += clen;    /* Advance to next subject character */
3299
0
  }               /* Loop to move along the subject string */
3300
3301
/* Control gets here from "break" a few lines above. If we have a match and
3302
PCRE2_ENDANCHORED is set, the match fails unless ptr is at the subject end. */
3303
3304
0
if (match_count >= 0 &&
3305
0
    ((mb->moptions | mb->poptions) & PCRE2_ENDANCHORED) != 0 &&
3306
0
    ptr < end_subject)
3307
0
  match_count = PCRE2_ERROR_NOMATCH;
3308
3309
0
return match_count;
3310
0
}
3311
3312
3313
3314
/*************************************************
3315
*     Match a pattern using the DFA algorithm    *
3316
*************************************************/
3317
3318
/* This function matches a compiled pattern to a subject string, using the
3319
alternate matching algorithm that finds all matches at once.
3320
3321
Arguments:
3322
  code          points to the compiled pattern
3323
  subject       subject string
3324
  length        length of subject string
3325
  start_offset  where to start matching in the subject
3326
  options       option bits
3327
  match_data    points to a match data structure
3328
  mcontext      points to a match context
3329
  workspace     pointer to workspace
3330
  wscount       size of workspace
3331
3332
Returns:        > 0 => number of match offset pairs placed in offsets
3333
                = 0 => offsets overflowed; longest matches are present
3334
                 -1 => failed to match
3335
               < -1 => some kind of unexpected problem
3336
*/
3337
3338
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
3339
pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
3340
  PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
3341
  pcre2_match_context *mcontext, int *workspace, PCRE2_SIZE wscount)
3342
0
{
3343
0
int rc;
3344
0
int was_zero_terminated = 0;
3345
3346
0
const pcre2_real_code *re = (const pcre2_real_code *)code;
3347
3348
0
PCRE2_SPTR start_match;
3349
0
PCRE2_SPTR end_subject;
3350
0
PCRE2_SPTR bumpalong_limit;
3351
0
PCRE2_SPTR req_cu_ptr;
3352
3353
0
BOOL utf, anchored, startline, firstline;
3354
0
BOOL has_first_cu = FALSE;
3355
0
BOOL has_req_cu = FALSE;
3356
3357
0
#if PCRE2_CODE_UNIT_WIDTH == 8
3358
0
PCRE2_SPTR memchr_found_first_cu = NULL;
3359
0
PCRE2_SPTR memchr_found_first_cu2 = NULL;
3360
0
#endif
3361
3362
0
PCRE2_UCHAR first_cu = 0;
3363
0
PCRE2_UCHAR first_cu2 = 0;
3364
0
PCRE2_UCHAR req_cu = 0;
3365
0
PCRE2_UCHAR req_cu2 = 0;
3366
3367
0
const uint8_t *start_bits = NULL;
3368
3369
/* We need to have mb pointing to a match block, because the IS_NEWLINE macro
3370
is used below, and it expects NLBLOCK to be defined as a pointer. */
3371
3372
0
pcre2_callout_block cb;
3373
0
dfa_match_block actual_match_block;
3374
0
dfa_match_block *mb = &actual_match_block;
3375
3376
/* Set up a starting block of memory for use during recursive calls to
3377
internal_dfa_match(). By putting this on the stack, it minimizes resource use
3378
in the case when it is not needed. If this is too small, more memory is
3379
obtained from the heap. At the start of each block is an anchor structure.*/
3380
3381
0
int base_recursion_workspace[RWS_BASE_SIZE];
3382
0
RWS_anchor *rws = (RWS_anchor *)base_recursion_workspace;
3383
0
rws->next = NULL;
3384
0
rws->size = RWS_BASE_SIZE;
3385
0
rws->free = RWS_BASE_SIZE - RWS_ANCHOR_SIZE;
3386
3387
/* Recognize NULL, length 0 as an empty string. */
3388
3389
0
if (subject == NULL && length == 0) subject = (PCRE2_SPTR)"";
3390
3391
/* Plausibility checks */
3392
3393
0
if ((options & ~PUBLIC_DFA_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION;
3394
0
if (re == NULL || subject == NULL || workspace == NULL || match_data == NULL)
3395
0
  return PCRE2_ERROR_NULL;
3396
3397
0
if (length == PCRE2_ZERO_TERMINATED)
3398
0
  {
3399
0
  length = PRIV(strlen)(subject);
3400
0
  was_zero_terminated = 1;
3401
0
  }
3402
3403
0
if (wscount < 20) return PCRE2_ERROR_DFA_WSSIZE;
3404
0
if (start_offset > length) return PCRE2_ERROR_BADOFFSET;
3405
3406
/* Partial matching and PCRE2_ENDANCHORED are currently not allowed at the same
3407
time. */
3408
3409
0
if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
3410
0
   ((re->overall_options | options) & PCRE2_ENDANCHORED) != 0)
3411
0
  return PCRE2_ERROR_BADOPTION;
3412
3413
/* Invalid UTF support is not available for DFA matching. */
3414
3415
0
if ((re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0)
3416
0
  return PCRE2_ERROR_DFA_UINVALID_UTF;
3417
3418
/* Check that the first field in the block is the magic number. If it is not,
3419
return with PCRE2_ERROR_BADMAGIC. */
3420
3421
0
if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC;
3422
3423
/* Check the code unit width. */
3424
3425
0
if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8)
3426
0
  return PCRE2_ERROR_BADMODE;
3427
3428
/* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the
3429
options variable for this function. Users of PCRE2 who are not calling the
3430
function directly would like to have a way of setting these flags, in the same
3431
way that they can set pcre2_compile() flags like PCRE2_NO_AUTO_POSSESS with
3432
constructions like (*NO_AUTOPOSSESS). To enable this, (*NOTEMPTY) and
3433
(*NOTEMPTY_ATSTART) set bits in the pattern's "flags" field which can now be
3434
transferred to the options for this function. The bits are guaranteed to be
3435
adjacent, but do not have the same values. This bit of Boolean trickery assumes
3436
that the match-time bits are not more significant than the flag bits. If by
3437
accident this is not the case, a compile-time division by zero error will
3438
occur. */
3439
3440
0
#define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET)
3441
0
#define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART)
3442
0
options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1)));
3443
0
#undef FF
3444
0
#undef OO
3445
3446
/* If restarting after a partial match, do some sanity checks on the contents
3447
of the workspace. */
3448
3449
0
if ((options & PCRE2_DFA_RESTART) != 0)
3450
0
  {
3451
0
  if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
3452
0
    workspace[1] > (int)((wscount - 2)/INTS_PER_STATEBLOCK))
3453
0
      return PCRE2_ERROR_DFA_BADRESTART;
3454
0
  }
3455
3456
/* Set some local values */
3457
3458
0
utf = (re->overall_options & PCRE2_UTF) != 0;
3459
0
start_match = subject + start_offset;
3460
0
end_subject = subject + length;
3461
0
req_cu_ptr = start_match - 1;
3462
0
anchored = (options & (PCRE2_ANCHORED|PCRE2_DFA_RESTART)) != 0 ||
3463
0
  (re->overall_options & PCRE2_ANCHORED) != 0;
3464
3465
/* The "must be at the start of a line" flags are used in a loop when finding
3466
where to start. */
3467
3468
0
startline = (re->flags & PCRE2_STARTLINE) != 0;
3469
0
firstline = !anchored && (re->overall_options & PCRE2_FIRSTLINE) != 0;
3470
0
bumpalong_limit = end_subject;
3471
3472
/* Initialize and set up the fixed fields in the callout block, with a pointer
3473
in the match block. */
3474
3475
0
mb->cb = &cb;
3476
0
cb.version = 2;
3477
0
cb.subject = subject;
3478
0
cb.subject_length = (PCRE2_SIZE)(end_subject - subject);
3479
0
cb.callout_flags = 0;
3480
0
cb.capture_top      = 1;      /* No capture support */
3481
0
cb.capture_last     = 0;
3482
0
cb.mark             = NULL;   /* No (*MARK) support */
3483
3484
/* Get data from the match context, if present, and fill in the remaining
3485
fields in the match block. It is an error to set an offset limit without
3486
setting the flag at compile time. */
3487
3488
0
if (mcontext == NULL)
3489
0
  {
3490
0
  mb->callout = NULL;
3491
0
  mb->memctl = re->memctl;
3492
0
  mb->match_limit = PRIV(default_match_context).match_limit;
3493
0
  mb->match_limit_depth = PRIV(default_match_context).depth_limit;
3494
0
  mb->heap_limit = PRIV(default_match_context).heap_limit;
3495
0
  }
3496
0
else
3497
0
  {
3498
0
  if (mcontext->offset_limit != PCRE2_UNSET)
3499
0
    {
3500
0
    if ((re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0)
3501
0
      return PCRE2_ERROR_BADOFFSETLIMIT;
3502
0
    bumpalong_limit = subject + mcontext->offset_limit;
3503
0
    }
3504
0
  mb->callout = mcontext->callout;
3505
0
  mb->callout_data = mcontext->callout_data;
3506
0
  mb->memctl = mcontext->memctl;
3507
0
  mb->match_limit = mcontext->match_limit;
3508
0
  mb->match_limit_depth = mcontext->depth_limit;
3509
0
  mb->heap_limit = mcontext->heap_limit;
3510
0
  }
3511
3512
0
if (mb->match_limit > re->limit_match)
3513
0
  mb->match_limit = re->limit_match;
3514
3515
0
if (mb->match_limit_depth > re->limit_depth)
3516
0
  mb->match_limit_depth = re->limit_depth;
3517
3518
0
if (mb->heap_limit > re->limit_heap)
3519
0
  mb->heap_limit = re->limit_heap;
3520
3521
0
mb->start_code = (PCRE2_SPTR)((const uint8_t *)re + re->code_start);
3522
0
mb->tables = re->tables;
3523
0
mb->start_subject = subject;
3524
0
mb->end_subject = end_subject;
3525
0
mb->start_offset = start_offset;
3526
0
mb->allowemptypartial = (re->max_lookbehind > 0) ||
3527
0
  (re->flags & PCRE2_MATCH_EMPTY) != 0;
3528
0
mb->moptions = options;
3529
0
mb->poptions = re->overall_options;
3530
0
mb->match_call_count = 0;
3531
0
mb->heap_used = 0;
3532
3533
/* Process the \R and newline settings. */
3534
3535
0
mb->bsr_convention = re->bsr_convention;
3536
0
mb->nltype = NLTYPE_FIXED;
3537
0
switch(re->newline_convention)
3538
0
  {
3539
0
  case PCRE2_NEWLINE_CR:
3540
0
  mb->nllen = 1;
3541
0
  mb->nl[0] = CHAR_CR;
3542
0
  break;
3543
3544
0
  case PCRE2_NEWLINE_LF:
3545
0
  mb->nllen = 1;
3546
0
  mb->nl[0] = CHAR_NL;
3547
0
  break;
3548
3549
0
  case PCRE2_NEWLINE_NUL:
3550
0
  mb->nllen = 1;
3551
0
  mb->nl[0] = CHAR_NUL;
3552
0
  break;
3553
3554
0
  case PCRE2_NEWLINE_CRLF:
3555
0
  mb->nllen = 2;
3556
0
  mb->nl[0] = CHAR_CR;
3557
0
  mb->nl[1] = CHAR_NL;
3558
0
  break;
3559
3560
0
  case PCRE2_NEWLINE_ANY:
3561
0
  mb->nltype = NLTYPE_ANY;
3562
0
  break;
3563
3564
0
  case PCRE2_NEWLINE_ANYCRLF:
3565
0
  mb->nltype = NLTYPE_ANYCRLF;
3566
0
  break;
3567
3568
0
  default:
3569
0
  PCRE2_DEBUG_UNREACHABLE();
3570
0
  return PCRE2_ERROR_INTERNAL;
3571
0
  }
3572
3573
/* Check a UTF string for validity if required. For 8-bit and 16-bit strings,
3574
we must also check that a starting offset does not point into the middle of a
3575
multiunit character. We check only the portion of the subject that is going to
3576
be inspected during matching - from the offset minus the maximum lookbehind
3577
to the given length. This saves time when a small part of a large subject is
3578
being matched by the use of a starting offset. Note that the maximum lookbehind
3579
is a number of characters, not code units. */
3580
3581
0
#ifdef SUPPORT_UNICODE
3582
0
if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
3583
0
  {
3584
0
  PCRE2_SPTR check_subject = start_match;  /* start_match includes offset */
3585
3586
0
  if (start_offset > 0)
3587
0
    {
3588
0
#if PCRE2_CODE_UNIT_WIDTH != 32
3589
0
    unsigned int i;
3590
0
    if (start_match < end_subject && NOT_FIRSTCU(*start_match))
3591
0
      return PCRE2_ERROR_BADUTFOFFSET;
3592
0
    for (i = re->max_lookbehind; i > 0 && check_subject > subject; i--)
3593
0
      {
3594
0
      check_subject--;
3595
0
      while (check_subject > subject &&
3596
0
#if PCRE2_CODE_UNIT_WIDTH == 8
3597
0
      (*check_subject & 0xc0) == 0x80)
3598
#else  /* 16-bit */
3599
      (*check_subject & 0xfc00) == 0xdc00)
3600
#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
3601
0
        check_subject--;
3602
0
      }
3603
#else   /* In the 32-bit library, one code unit equals one character. */
3604
    check_subject -= re->max_lookbehind;
3605
    if (check_subject < subject) check_subject = subject;
3606
#endif  /* PCRE2_CODE_UNIT_WIDTH != 32 */
3607
0
    }
3608
3609
  /* Validate the relevant portion of the subject. After an error, adjust the
3610
  offset to be an absolute offset in the whole string. */
3611
3612
0
  match_data->rc = PRIV(valid_utf)(check_subject,
3613
0
    length - (PCRE2_SIZE)(check_subject - subject), &(match_data->startchar));
3614
0
  if (match_data->rc != 0)
3615
0
    {
3616
0
    match_data->startchar += (PCRE2_SIZE)(check_subject - subject);
3617
0
    return match_data->rc;
3618
0
    }
3619
0
  }
3620
0
#endif  /* SUPPORT_UNICODE */
3621
3622
/* Set up the first code unit to match, if available. If there's no first code
3623
unit there may be a bitmap of possible first characters. */
3624
3625
0
if ((re->flags & PCRE2_FIRSTSET) != 0)
3626
0
  {
3627
0
  has_first_cu = TRUE;
3628
0
  first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
3629
0
  if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
3630
0
    {
3631
0
    first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu);
3632
0
#ifdef SUPPORT_UNICODE
3633
0
#if PCRE2_CODE_UNIT_WIDTH == 8
3634
0
    if (first_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0)
3635
0
      first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
3636
#else
3637
    if (first_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0))
3638
      first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
3639
#endif
3640
0
#endif  /* SUPPORT_UNICODE */
3641
0
    }
3642
0
  }
3643
0
else
3644
0
  if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
3645
0
    start_bits = re->start_bitmap;
3646
3647
/* There may be a "last known required code unit" set. */
3648
3649
0
if ((re->flags & PCRE2_LASTSET) != 0)
3650
0
  {
3651
0
  has_req_cu = TRUE;
3652
0
  req_cu = req_cu2 = (PCRE2_UCHAR)(re->last_codeunit);
3653
0
  if ((re->flags & PCRE2_LASTCASELESS) != 0)
3654
0
    {
3655
0
    req_cu2 = TABLE_GET(req_cu, mb->tables + fcc_offset, req_cu);
3656
0
#ifdef SUPPORT_UNICODE
3657
0
#if PCRE2_CODE_UNIT_WIDTH == 8
3658
0
    if (req_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0)
3659
0
      req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
3660
#else
3661
    if (req_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0))
3662
      req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
3663
#endif
3664
0
#endif  /* SUPPORT_UNICODE */
3665
0
    }
3666
0
  }
3667
3668
/* If the match data block was previously used with PCRE2_COPY_MATCHED_SUBJECT,
3669
free the memory that was obtained. */
3670
3671
0
if ((match_data->flags & PCRE2_MD_COPIED_SUBJECT) != 0)
3672
0
  {
3673
0
  match_data->memctl.free((void *)match_data->subject,
3674
0
    match_data->memctl.memory_data);
3675
0
  match_data->flags &= ~PCRE2_MD_COPIED_SUBJECT;
3676
0
  }
3677
3678
/* Fill in fields that are always returned in the match data. */
3679
3680
0
match_data->code = re;
3681
0
match_data->subject = NULL;  /* Default for no match */
3682
0
match_data->mark = NULL;
3683
0
match_data->matchedby = PCRE2_MATCHEDBY_DFA_INTERPRETER;
3684
3685
/* Call the main matching function, looping for a non-anchored regex after a
3686
failed match. If not restarting, perform certain optimizations at the start of
3687
a match. */
3688
3689
0
for (;;)
3690
0
  {
3691
  /* ----------------- Start of match optimizations ---------------- */
3692
3693
  /* There are some optimizations that avoid running the match if a known
3694
  starting point is not found, or if a known later code unit is not present.
3695
  However, there is an option (settable at compile time) that disables
3696
  these, for testing and for ensuring that all callouts do actually occur.
3697
  The optimizations must also be avoided when restarting a DFA match. */
3698
3699
0
  if ((re->optimization_flags & PCRE2_OPTIM_START_OPTIMIZE) != 0 &&
3700
0
      (options & PCRE2_DFA_RESTART) == 0)
3701
0
    {
3702
    /* If firstline is TRUE, the start of the match is constrained to the first
3703
    line of a multiline string. That is, the match must be before or at the
3704
    first newline following the start of matching. Temporarily adjust
3705
    end_subject so that we stop the optimization scans for a first code unit
3706
    immediately after the first character of a newline (the first code unit can
3707
    legitimately be a newline). If the match fails at the newline, later code
3708
    breaks this loop. */
3709
3710
0
    if (firstline)
3711
0
      {
3712
0
      PCRE2_SPTR t = start_match;
3713
0
#ifdef SUPPORT_UNICODE
3714
0
      if (utf)
3715
0
        {
3716
0
        while (t < end_subject && !IS_NEWLINE(t))
3717
0
          {
3718
0
          t++;
3719
0
          ACROSSCHAR(t < end_subject, t, t++);
3720
0
          }
3721
0
        }
3722
0
      else
3723
0
#endif
3724
0
      while (t < end_subject && !IS_NEWLINE(t)) t++;
3725
0
      end_subject = t;
3726
0
      }
3727
3728
    /* Anchored: check the first code unit if one is recorded. This may seem
3729
    pointless but it can help in detecting a no match case without scanning for
3730
    the required code unit. */
3731
3732
0
    if (anchored)
3733
0
      {
3734
0
      if (has_first_cu || start_bits != NULL)
3735
0
        {
3736
0
        BOOL ok = start_match < end_subject;
3737
0
        if (ok)
3738
0
          {
3739
0
          PCRE2_UCHAR c = UCHAR21TEST(start_match);
3740
0
          ok = has_first_cu && (c == first_cu || c == first_cu2);
3741
0
          if (!ok && start_bits != NULL)
3742
0
            {
3743
#if PCRE2_CODE_UNIT_WIDTH != 8
3744
            if (c > 255) c = 255;
3745
#endif
3746
0
            ok = (start_bits[c/8] & (1u << (c&7))) != 0;
3747
0
            }
3748
0
          }
3749
0
        if (!ok) break;
3750
0
        }
3751
0
      }
3752
3753
    /* Not anchored. Advance to a unique first code unit if there is one. */
3754
3755
0
    else
3756
0
      {
3757
0
      if (has_first_cu)
3758
0
        {
3759
0
        if (first_cu != first_cu2)  /* Caseless */
3760
0
          {
3761
          /* In 16-bit and 32-bit modes we have to do our own search, so can
3762
          look for both cases at once. */
3763
3764
#if PCRE2_CODE_UNIT_WIDTH != 8
3765
          PCRE2_UCHAR smc;
3766
          while (start_match < end_subject &&
3767
                (smc = UCHAR21TEST(start_match)) != first_cu &&
3768
                 smc != first_cu2)
3769
            start_match++;
3770
#else
3771
          /* In 8-bit mode, the use of memchr() gives a big speed up, even
3772
          though we have to call it twice in order to find the earliest
3773
          occurrence of the code unit in either of its cases. Caching is used
3774
          to remember the positions of previously found code units. This can
3775
          make a huge difference when the strings are very long and only one
3776
          case is actually present. */
3777
3778
0
          PCRE2_SPTR pp1 = NULL;
3779
0
          PCRE2_SPTR pp2 = NULL;
3780
0
          PCRE2_SIZE searchlength = end_subject - start_match;
3781
3782
          /* If we haven't got a previously found position for first_cu, or if
3783
          the current starting position is later, we need to do a search. If
3784
          the code unit is not found, set it to the end. */
3785
3786
0
          if (memchr_found_first_cu == NULL ||
3787
0
              start_match > memchr_found_first_cu)
3788
0
            {
3789
0
            pp1 = memchr(start_match, first_cu, searchlength);
3790
0
            memchr_found_first_cu = (pp1 == NULL)? end_subject : pp1;
3791
0
            }
3792
3793
          /* If the start is before a previously found position, use the
3794
          previous position, or NULL if a previous search failed. */
3795
3796
0
          else pp1 = (memchr_found_first_cu == end_subject)? NULL :
3797
0
            memchr_found_first_cu;
3798
3799
          /* Do the same thing for the other case. */
3800
3801
0
          if (memchr_found_first_cu2 == NULL ||
3802
0
              start_match > memchr_found_first_cu2)
3803
0
            {
3804
0
            pp2 = memchr(start_match, first_cu2, searchlength);
3805
0
            memchr_found_first_cu2 = (pp2 == NULL)? end_subject : pp2;
3806
0
            }
3807
3808
0
          else pp2 = (memchr_found_first_cu2 == end_subject)? NULL :
3809
0
            memchr_found_first_cu2;
3810
3811
          /* Set the start to the end of the subject if neither case was found.
3812
          Otherwise, use the earlier found point. */
3813
3814
0
          if (pp1 == NULL)
3815
0
            start_match = (pp2 == NULL)? end_subject : pp2;
3816
0
          else
3817
0
            start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
3818
3819
0
#endif  /* 8-bit handling */
3820
0
          }
3821
3822
        /* The caseful case is much simpler. */
3823
3824
0
        else
3825
0
          {
3826
#if PCRE2_CODE_UNIT_WIDTH != 8
3827
          while (start_match < end_subject && UCHAR21TEST(start_match) !=
3828
                 first_cu)
3829
            start_match++;
3830
#else  /* 8-bit code units */
3831
0
          start_match = memchr(start_match, first_cu, end_subject - start_match);
3832
0
          if (start_match == NULL) start_match = end_subject;
3833
0
#endif
3834
0
          }
3835
3836
        /* If we can't find the first code unit, having reached the true end
3837
        of the subject, break the bumpalong loop, to force a match failure,
3838
        except when doing partial matching, when we let the next cycle run at
3839
        the end of the subject. To see why, consider the pattern /(?<=abc)def/,
3840
        which partially matches "abc", even though the string does not contain
3841
        the starting character "d". If we have not reached the true end of the
3842
        subject (PCRE2_FIRSTLINE caused end_subject to be temporarily modified)
3843
        we also let the cycle run, because the matching string is legitimately
3844
        allowed to start with the first code unit of a newline. */
3845
3846
0
        if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 &&
3847
0
            start_match >= mb->end_subject)
3848
0
          break;
3849
0
        }
3850
3851
      /* If there's no first code unit, advance to just after a linebreak for a
3852
      multiline match if required. */
3853
3854
0
      else if (startline)
3855
0
        {
3856
0
        if (start_match > mb->start_subject + start_offset)
3857
0
          {
3858
0
#ifdef SUPPORT_UNICODE
3859
0
          if (utf)
3860
0
            {
3861
0
            while (start_match < end_subject && !WAS_NEWLINE(start_match))
3862
0
              {
3863
0
              start_match++;
3864
0
              ACROSSCHAR(start_match < end_subject, start_match, start_match++);
3865
0
              }
3866
0
            }
3867
0
          else
3868
0
#endif
3869
0
          while (start_match < end_subject && !WAS_NEWLINE(start_match))
3870
0
            start_match++;
3871
3872
          /* If we have just passed a CR and the newline option is ANY or
3873
          ANYCRLF, and we are now at a LF, advance the match position by one
3874
          more code unit. */
3875
3876
0
          if (start_match[-1] == CHAR_CR &&
3877
0
               (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
3878
0
               start_match < end_subject &&
3879
0
               UCHAR21TEST(start_match) == CHAR_NL)
3880
0
            start_match++;
3881
0
          }
3882
0
        }
3883
3884
      /* If there's no first code unit or a requirement for a multiline line
3885
      start, advance to a non-unique first code unit if any have been
3886
      identified. The bitmap contains only 256 bits. When code units are 16 or
3887
      32 bits wide, all code units greater than 254 set the 255 bit. */
3888
3889
0
      else if (start_bits != NULL)
3890
0
        {
3891
0
        while (start_match < end_subject)
3892
0
          {
3893
0
          uint32_t c = UCHAR21TEST(start_match);
3894
#if PCRE2_CODE_UNIT_WIDTH != 8
3895
          if (c > 255) c = 255;
3896
#endif
3897
0
          if ((start_bits[c/8] & (1u << (c&7))) != 0) break;
3898
0
          start_match++;
3899
0
          }
3900
3901
        /* See comment above in first_cu checking about the next line. */
3902
3903
0
        if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 &&
3904
0
            start_match >= mb->end_subject)
3905
0
          break;
3906
0
        }
3907
0
      }  /* End of first code unit handling */
3908
3909
    /* Restore fudged end_subject */
3910
3911
0
    end_subject = mb->end_subject;
3912
3913
    /* The following two optimizations are disabled for partial matching. */
3914
3915
0
    if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0)
3916
0
      {
3917
0
      PCRE2_SPTR p;
3918
3919
      /* The minimum matching length is only a lower bound: there need not be
3920
      any string of exactly that length that matches the pattern. Although the
3921
      value is, strictly, in characters, we treat it as code units to avoid
3922
      spending too much time in this optimization. */
3923
3924
0
      if (end_subject - start_match < re->minlength) goto NOMATCH_EXIT;
3925
3926
      /* If req_cu is set, we know that that code unit must appear in the
3927
      subject for the match to succeed. If the first code unit is set, req_cu
3928
      must be later in the subject; otherwise the test starts at the match
3929
      point. This optimization can save a huge amount of backtracking in
3930
      patterns with nested unlimited repeats that aren't going to match.
3931
      Writing separate code for cased/caseless versions makes it go faster, as
3932
      does using an autoincrement and backing off on a match. As in the case of
3933
      the first code unit, using memchr() in the 8-bit library gives a big
3934
      speed up. Unlike the first_cu check above, we do not need to call
3935
      memchr() twice in the caseless case because we only need to check for the
3936
      presence of the character in either case, not find the first occurrence.
3937
3938
      The search can be skipped if the code unit was found later than the
3939
      current starting point in a previous iteration of the bumpalong loop.
3940
3941
      HOWEVER: when the subject string is very, very long, searching to its end
3942
      can take a long time, and give bad performance on quite ordinary
3943
      patterns. This showed up when somebody was matching something like
3944
      /^\d+C/ on a 32-megabyte string... so we don't do this when the string is
3945
      sufficiently long, but it's worth searching a lot more for unanchored
3946
      patterns. */
3947
3948
0
      p = start_match + (has_first_cu? 1:0);
3949
0
      if (has_req_cu && p > req_cu_ptr)
3950
0
        {
3951
0
        PCRE2_SIZE check_length = end_subject - start_match;
3952
3953
0
        if (check_length < REQ_CU_MAX ||
3954
0
              (!anchored && check_length < REQ_CU_MAX * 1000))
3955
0
          {
3956
0
          if (req_cu != req_cu2)  /* Caseless */
3957
0
            {
3958
#if PCRE2_CODE_UNIT_WIDTH != 8
3959
            while (p < end_subject)
3960
              {
3961
              uint32_t pp = UCHAR21INCTEST(p);
3962
              if (pp == req_cu || pp == req_cu2) { p--; break; }
3963
              }
3964
#else  /* 8-bit code units */
3965
0
            PCRE2_SPTR pp = p;
3966
0
            p = memchr(pp, req_cu, end_subject - pp);
3967
0
            if (p == NULL)
3968
0
              {
3969
0
              p = memchr(pp, req_cu2, end_subject - pp);
3970
0
              if (p == NULL) p = end_subject;
3971
0
              }
3972
0
#endif /* PCRE2_CODE_UNIT_WIDTH != 8 */
3973
0
            }
3974
3975
          /* The caseful case */
3976
3977
0
          else
3978
0
            {
3979
#if PCRE2_CODE_UNIT_WIDTH != 8
3980
            while (p < end_subject)
3981
              {
3982
              if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
3983
              }
3984
3985
#else  /* 8-bit code units */
3986
0
            p = memchr(p, req_cu, end_subject - p);
3987
0
            if (p == NULL) p = end_subject;
3988
0
#endif
3989
0
            }
3990
3991
          /* If we can't find the required code unit, break the matching loop,
3992
          forcing a match failure. */
3993
3994
0
          if (p >= end_subject) break;
3995
3996
          /* If we have found the required code unit, save the point where we
3997
          found it, so that we don't search again next time round the loop if
3998
          the start hasn't passed this code unit yet. */
3999
4000
0
          req_cu_ptr = p;
4001
0
          }
4002
0
        }
4003
0
      }
4004
0
    }
4005
4006
  /* ------------ End of start of match optimizations ------------ */
4007
4008
  /* Give no match if we have passed the bumpalong limit. */
4009
4010
0
  if (start_match > bumpalong_limit) break;
4011
4012
  /* OK, now we can do the business */
4013
4014
0
  mb->start_used_ptr = start_match;
4015
0
  mb->last_used_ptr = start_match;
4016
0
  mb->recursive = NULL;
4017
4018
0
  rc = internal_dfa_match(
4019
0
    mb,                           /* fixed match data */
4020
0
    mb->start_code,               /* this subexpression's code */
4021
0
    start_match,                  /* where we currently are */
4022
0
    start_offset,                 /* start offset in subject */
4023
0
    match_data->ovector,          /* offset vector */
4024
0
    (uint32_t)match_data->oveccount * 2,  /* actual size of same */
4025
0
    workspace,                    /* workspace vector */
4026
0
    (int)wscount,                 /* size of same */
4027
0
    0,                            /* function recurse level */
4028
0
    base_recursion_workspace);    /* initial workspace for recursion */
4029
4030
  /* Anything other than "no match" means we are done, always; otherwise, carry
4031
  on only if not anchored. */
4032
4033
0
  if (rc != PCRE2_ERROR_NOMATCH || anchored)
4034
0
    {
4035
0
    if (rc == PCRE2_ERROR_PARTIAL && match_data->oveccount > 0)
4036
0
      {
4037
0
      match_data->ovector[0] = (PCRE2_SIZE)(start_match - subject);
4038
0
      match_data->ovector[1] = (PCRE2_SIZE)(end_subject - subject);
4039
0
      }
4040
0
    match_data->subject_length = length;
4041
0
    match_data->leftchar = (PCRE2_SIZE)(mb->start_used_ptr - subject);
4042
0
    match_data->rightchar = (PCRE2_SIZE)(mb->last_used_ptr - subject);
4043
0
    match_data->startchar = (PCRE2_SIZE)(start_match - subject);
4044
0
    match_data->rc = rc;
4045
4046
0
    if (rc >= 0 &&(options & PCRE2_COPY_MATCHED_SUBJECT) != 0)
4047
0
      {
4048
0
      length = CU2BYTES(length + was_zero_terminated);
4049
0
      match_data->subject = match_data->memctl.malloc(length,
4050
0
        match_data->memctl.memory_data);
4051
0
      if (match_data->subject == NULL) return PCRE2_ERROR_NOMEMORY;
4052
0
      memcpy((void *)match_data->subject, subject, length);
4053
0
      match_data->flags |= PCRE2_MD_COPIED_SUBJECT;
4054
0
      }
4055
0
    else
4056
0
      {
4057
0
      if (rc >= 0 || rc == PCRE2_ERROR_PARTIAL) match_data->subject = subject;
4058
0
      }
4059
0
    goto EXIT;
4060
0
    }
4061
4062
  /* Advance to the next subject character unless we are at the end of a line
4063
  and firstline is set. */
4064
4065
0
  if (firstline && IS_NEWLINE(start_match)) break;
4066
0
  start_match++;
4067
0
#ifdef SUPPORT_UNICODE
4068
0
  if (utf)
4069
0
    {
4070
0
    ACROSSCHAR(start_match < end_subject, start_match, start_match++);
4071
0
    }
4072
0
#endif
4073
0
  if (start_match > end_subject) break;
4074
4075
  /* If we have just passed a CR and we are now at a LF, and the pattern does
4076
  not contain any explicit matches for \r or \n, and the newline option is CRLF
4077
  or ANY or ANYCRLF, advance the match position by one more character. */
4078
4079
0
  if (UCHAR21TEST(start_match - 1) == CHAR_CR &&
4080
0
      start_match < end_subject &&
4081
0
      UCHAR21TEST(start_match) == CHAR_NL &&
4082
0
      (re->flags & PCRE2_HASCRORLF) == 0 &&
4083
0
        (mb->nltype == NLTYPE_ANY ||
4084
0
         mb->nltype == NLTYPE_ANYCRLF ||
4085
0
         mb->nllen == 2))
4086
0
    start_match++;
4087
4088
0
  }   /* "Bumpalong" loop */
4089
4090
0
NOMATCH_EXIT:
4091
0
rc = PCRE2_ERROR_NOMATCH;
4092
4093
0
EXIT:
4094
0
while (rws->next != NULL)
4095
0
  {
4096
0
  RWS_anchor *next = rws->next;
4097
0
  rws->next = next->next;
4098
0
  mb->memctl.free(next, mb->memctl.memory_data);
4099
0
  }
4100
4101
0
return rc;
4102
0
}
4103
4104
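/* These #undefs are here to enable unity builds with CMake, in which several
source files are compiled as a single translation unit, so macros left defined
here would still be visible in the files that follow. */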
4105
4106
#undef NLBLOCK /* Block containing newline information */
4107
#undef PSSTART /* Field containing processed string start */
4108
#undef PSEND   /* Field containing processed string end */
4109
4110
/* End of pcre2_dfa_match.c */