Coverage Report

Created: 2025-12-31 07:28

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/php-src/ext/pcre/pcre2lib/pcre2_compile.c
Line
Count
Source
1
/*************************************************
2
*      Perl-Compatible Regular Expressions       *
3
*************************************************/
4
5
/* PCRE is a library of functions to support regular expressions whose syntax
6
and semantics are as close as possible to those of the Perl 5 language.
7
8
                       Written by Philip Hazel
9
     Original API code Copyright (c) 1997-2012 University of Cambridge
10
          New API code Copyright (c) 2016-2024 University of Cambridge
11
12
-----------------------------------------------------------------------------
13
Redistribution and use in source and binary forms, with or without
14
modification, are permitted provided that the following conditions are met:
15
16
    * Redistributions of source code must retain the above copyright notice,
17
      this list of conditions and the following disclaimer.
18
19
    * Redistributions in binary form must reproduce the above copyright
20
      notice, this list of conditions and the following disclaimer in the
21
      documentation and/or other materials provided with the distribution.
22
23
    * Neither the name of the University of Cambridge nor the names of its
24
      contributors may be used to endorse or promote products derived from
25
      this software without specific prior written permission.
26
27
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37
POSSIBILITY OF SUCH DAMAGE.
38
-----------------------------------------------------------------------------
39
*/
40
41
42
#ifdef HAVE_CONFIG_H
43
#include "config.h"
44
#endif
45
46
0
#define NLBLOCK cb             /* Block containing newline information */
47
#define PSSTART start_pattern  /* Field containing processed string start */
48
0
#define PSEND   end_pattern    /* Field containing processed string end */
49
50
#include "pcre2_internal.h"
51
52
/* In rare error cases debugging might require calling pcre2_printint(). */
53
54
#if 0
55
#ifdef EBCDIC
56
#define PRINTABLE(c) ((c) >= 64 && (c) < 255)
57
#else
58
#define PRINTABLE(c) ((c) >= 32 && (c) < 127)
59
#endif
60
#include "pcre2_printint.c"
61
#define DEBUG_CALL_PRINTINT
62
#endif
63
64
/* Other debugging code can be enabled by these defines. */
65
66
/* #define DEBUG_SHOW_CAPTURES */
67
/* #define DEBUG_SHOW_PARSED */
68
69
/* There are a few things that vary with different code unit sizes. Handle them
70
by defining macros in order to minimize #if usage. */
71
72
#if PCRE2_CODE_UNIT_WIDTH == 8
73
#define STRING_UTFn_RIGHTPAR     STRING_UTF8_RIGHTPAR, 5
74
49
#define XDIGIT(c)                xdigitab[c]
75
76
#else  /* Either 16-bit or 32-bit */
77
#define XDIGIT(c)                (MAX_255(c)? xdigitab[c] : 0xff)
78
79
#if PCRE2_CODE_UNIT_WIDTH == 16
80
#define STRING_UTFn_RIGHTPAR     STRING_UTF16_RIGHTPAR, 6
81
82
#else  /* 32-bit */
83
#define STRING_UTFn_RIGHTPAR     STRING_UTF32_RIGHTPAR, 6
84
#endif
85
#endif
86
87
/* Macros to store and retrieve a PCRE2_SIZE value in the parsed pattern, which
88
consists of uint32_t elements. Assume that if uint32_t can't hold it, two of
89
them will be able to (i.e. assume a 64-bit world). */
90
91
#if PCRE2_SIZE_MAX <= UINT32_MAX
92
#define PUTOFFSET(s,p) *p++ = s
93
#define GETOFFSET(s,p) s = *p++
94
#define GETPLUSOFFSET(s,p) s = *(++p)
95
#define READPLUSOFFSET(s,p) s = p[1]
96
#define SKIPOFFSET(p) p++
97
#define SIZEOFFSET 1
98
#else
99
#define PUTOFFSET(s,p) \
100
138
  { *p++ = (uint32_t)(s >> 32); *p++ = (uint32_t)(s & 0xffffffff); }
101
#define GETOFFSET(s,p) \
102
  { s = ((PCRE2_SIZE)p[0] << 32) | (PCRE2_SIZE)p[1]; p += 2; }
103
#define GETPLUSOFFSET(s,p) \
104
148
  { s = ((PCRE2_SIZE)p[1] << 32) | (PCRE2_SIZE)p[2]; p += 2; }
105
#define READPLUSOFFSET(s,p) \
106
1
  { s = ((PCRE2_SIZE)p[1] << 32) | (PCRE2_SIZE)p[2]; }
107
0
#define SKIPOFFSET(p) p += 2
108
3
#define SIZEOFFSET 2
109
#endif
110
111
/* Macros for manipulating elements of the parsed pattern vector. */
112
113
147k
#define META_CODE(x)   ((x) & 0xffff0000u)  /* top 16 bits: the item's code */
114
145k
#define META_DATA(x)   ((x) & 0x0000ffffu)  /* low 16 bits: the item's data */
115
#define META_DIFF(x,y) (((x)-(y))>>16)  /* distance between two codes, in code steps */
116
117
/* Function definitions to allow mutual recursion */
118
119
#ifdef SUPPORT_UNICODE
120
static unsigned int
121
  add_list_to_class_internal(uint8_t *, PCRE2_UCHAR **, uint32_t, uint32_t,
122
    compile_block *, const uint32_t *, unsigned int);
123
#endif
124
125
static int
126
  compile_regex(uint32_t, uint32_t, PCRE2_UCHAR **, uint32_t **, int *,
127
    uint32_t, uint32_t *, uint32_t *, uint32_t *, uint32_t *, branch_chain *,
128
    open_capitem *, compile_block *, PCRE2_SIZE *);
129
130
static int
131
  get_branchlength(uint32_t **, int *, int *, int *, parsed_recurse_check *,
132
    compile_block *);
133
134
static BOOL
135
  set_lookbehind_lengths(uint32_t **, int *, int *, parsed_recurse_check *,
136
    compile_block *);
137
138
static int
139
  check_lookbehinds(uint32_t *, uint32_t **, parsed_recurse_check *,
140
    compile_block *, int *);
141
142
143
/*************************************************
144
*      Code parameters and static tables         *
145
*************************************************/
146
147
2.22k
#define MAX_GROUP_NUMBER   65535u
148
14.0k
#define MAX_REPEAT_COUNT   65535u
149
14.0k
#define REPEAT_UNLIMITED   (MAX_REPEAT_COUNT+1)
150
151
/* COMPILE_WORK_SIZE specifies the size of stack workspace, which is used in
152
different ways in the different pattern scans. The parsing and group-
153
identifying pre-scan uses it to handle nesting, and needs it to be 16-bit
154
aligned for this. Having defined the size in code units, we set up
155
C16_WORK_SIZE as the number of elements in the 16-bit vector.
156
157
During the first compiling phase, when determining how much memory is required,
158
the regex is partly compiled into this space, but the compiled parts are
159
discarded as soon as they can be, so that hopefully there will never be an
160
overrun. The code does, however, check for an overrun, which can occur for
161
pathological patterns. The size of the workspace depends on LINK_SIZE because
162
the length of compiled items varies with this.
163
164
In the real compile phase, this workspace is not currently used. */
165
166
708
#define COMPILE_WORK_SIZE (3000*LINK_SIZE)   /* Size in code units */
167
168
#define C16_WORK_SIZE \
169
  ((COMPILE_WORK_SIZE * sizeof(PCRE2_UCHAR))/sizeof(uint16_t))
170
171
/* A uint32_t vector is used for caching information about the size of
172
capturing groups, to improve performance. A default is created on the stack of
173
this size. */
174
175
1
#define GROUPINFO_DEFAULT_SIZE 256
176
177
/* The overrun tests check for a slightly smaller size so that they detect the
178
overrun before it actually does run off the end of the data block. */
179
180
75.6k
#define WORK_SIZE_SAFETY_MARGIN (100)
181
182
/* This value determines the size of the initial vector that is used for
183
remembering named groups during the pre-compile. It is allocated on the stack,
184
but if it is too small, it is expanded, in a similar way to the workspace. The
185
value is the number of slots in the list. */
186
187
1.41k
#define NAMED_GROUP_LIST_SIZE  20
188
189
/* The pre-compiling pass over the pattern creates a parsed pattern in a vector
190
of uint32_t. For short patterns this lives on the stack, with this size. Heap
191
memory is used for longer patterns. */
192
193
704
#define PARSED_PATTERN_DEFAULT_SIZE 1024
194
195
/* Maximum length value to check against when making sure that the variable
196
that holds the compiled pattern length does not overflow. We make it a bit less
197
than INT_MAX to allow for adding in group terminating code units, so that we
198
don't have to check them every time. */
199
200
73.4k
#define OFLOW_MAX (INT_MAX - 20)
201
202
/* Code values for parsed patterns, which are stored in a vector of 32-bit
203
unsigned ints. Values less than META_END are literal data values. The coding
204
for identifying the item is in the top 16-bits, leaving 16 bits for the
205
additional data that some of them need. The META_CODE, META_DATA, and META_DIFF
206
macros are used to manipulate parsed pattern elements.
207
208
NOTE: When these definitions are changed, the table of extra lengths for each
209
code (meta_extra_lengths, just below) must be updated to remain in step. */
210
211
168k
#define META_END              0x80000000u  /* End of pattern */
212
213
17.1k
#define META_ALT              0x80010000u  /* alternation */
214
0
#define META_ATOMIC           0x80020000u  /* atomic group */
215
1.70k
#define META_BACKREF          0x80030000u  /* Back ref */
216
0
#define META_BACKREF_BYNAME   0x80040000u  /* \k'name' */
217
37.9k
#define META_BIGVALUE         0x80050000u  /* Next is a literal > META_END */
218
0
#define META_CALLOUT_NUMBER   0x80060000u  /* (?C with numerical argument */
219
0
#define META_CALLOUT_STRING   0x80070000u  /* (?C with string argument */
220
3.02k
#define META_CAPTURE          0x80080000u  /* Capturing parenthesis */
221
2.76k
#define META_CIRCUMFLEX       0x80090000u  /* ^ metacharacter */
222
18.6k
#define META_CLASS            0x800a0000u  /* start non-empty class */
223
4
#define META_CLASS_EMPTY      0x800b0000u  /* empty class */
224
4
#define META_CLASS_EMPTY_NOT  0x800c0000u  /* negative empty class */
225
50.2k
#define META_CLASS_END        0x800d0000u  /* end of non-empty class */
226
6.50k
#define META_CLASS_NOT        0x800e0000u  /* start non-empty negative class */
227
0
#define META_COND_ASSERT      0x800f0000u  /* (?(?assertion)... */
228
0
#define META_COND_DEFINE      0x80100000u  /* (?(DEFINE)... */
229
0
#define META_COND_NAME        0x80110000u  /* (?(<name>)... */
230
0
#define META_COND_NUMBER      0x80120000u  /* (?(digits)... */
231
0
#define META_COND_RNAME       0x80130000u  /* (?(R&name)... */
232
0
#define META_COND_RNUMBER     0x80140000u  /* (?(Rdigits)... */
233
0
#define META_COND_VERSION     0x80150000u  /* (?(VERSION<op>x.y)... */
234
425
#define META_DOLLAR           0x80160000u  /* $ metacharacter */
235
2.89k
#define META_DOT              0x80170000u  /* . metacharacter */
236
6.35k
#define META_ESCAPE           0x80180000u  /* \d and friends */
237
7.98k
#define META_KET              0x80190000u  /* closing parenthesis */
238
17
#define META_NOCAPTURE        0x801a0000u  /* no capture parens */
239
0
#define META_OPTIONS          0x801b0000u  /* (?i) and friends */
240
75.1k
#define META_POSIX            0x801c0000u  /* POSIX class item */
241
37.5k
#define META_POSIX_NEG        0x801d0000u  /* negative POSIX class item */
242
37.6k
#define META_RANGE_ESCAPED    0x801e0000u  /* range with at least one escape */
243
75.2k
#define META_RANGE_LITERAL    0x801f0000u  /* range defined literally */
244
0
#define META_RECURSE          0x80200000u  /* Recursion */
245
0
#define META_RECURSE_BYNAME   0x80210000u  /* (?&name) */
246
2
#define META_SCRIPT_RUN       0x80220000u  /* (*script_run:...) */
247
248
/* These must be kept together to make it easy to check that an assertion
249
is present where expected in a conditional group. */
250
251
14
#define META_LOOKAHEAD        0x80230000u  /* (?= */
252
4
#define META_LOOKAHEADNOT     0x80240000u  /* (?! */
253
4
#define META_LOOKBEHIND       0x80250000u  /* (?<= */
254
1
#define META_LOOKBEHINDNOT    0x80260000u  /* (?<! */
255
256
/* These cannot be conditions */
257
258
1
#define META_LOOKAHEAD_NA     0x80270000u  /* (*napla: */
259
2
#define META_LOOKBEHIND_NA    0x80280000u  /* (*naplb: */
260
261
/* These must be kept in this order, with consecutive values, and the _ARG
262
versions of COMMIT, PRUNE, SKIP, and THEN immediately after their non-argument
263
versions. */
264
265
1
#define META_MARK             0x80290000u  /* (*MARK) */
266
13.9k
#define META_ACCEPT           0x802a0000u  /* (*ACCEPT) */
267
5
#define META_FAIL             0x802b0000u  /* (*FAIL) */
268
4
#define META_COMMIT           0x802c0000u  /* These               */
269
0
#define META_COMMIT_ARG       0x802d0000u  /*   pairs             */
270
7
#define META_PRUNE            0x802e0000u  /*     must            */
271
0
#define META_PRUNE_ARG        0x802f0000u  /*       be            */
272
9
#define META_SKIP             0x80300000u  /*         kept        */
273
0
#define META_SKIP_ARG         0x80310000u  /*           in        */
274
9
#define META_THEN             0x80320000u  /*             this    */
275
0
#define META_THEN_ARG         0x80330000u  /*               order */
276
277
/* These must be kept in groups of adjacent 3 values, and all together. */
278
279
437k
#define META_ASTERISK         0x80340000u  /* *  */
280
1.10k
#define META_ASTERISK_PLUS    0x80350000u  /* *+ */
281
1.74k
#define META_ASTERISK_QUERY   0x80360000u  /* *? */
282
5.07k
#define META_PLUS             0x80370000u  /* +  */
283
3.35k
#define META_PLUS_PLUS        0x80380000u  /* ++ */
284
4.43k
#define META_PLUS_QUERY       0x80390000u  /* +? */
285
10.1k
#define META_QUERY            0x803a0000u  /* ?  */
286
3.42k
#define META_QUERY_PLUS       0x803b0000u  /* ?+ */
287
5.20k
#define META_QUERY_QUERY      0x803c0000u  /* ?? */
288
4.40k
#define META_MINMAX           0x803d0000u  /* {n,m}  repeat */
289
0
#define META_MINMAX_PLUS      0x803e0000u  /* {n,m}+ repeat */
290
12.6k
#define META_MINMAX_QUERY     0x803f0000u  /* {n,m}? repeat */
291
292
#define META_FIRST_QUANTIFIER META_ASTERISK
293
#define META_LAST_QUANTIFIER  META_MINMAX_QUERY
294
295
/* This is a special "meta code" that is used only to distinguish (*asr: from
296
(*sr: in the table of alphabetic assertions. It is never stored in the parsed
297
pattern because (*asr: is turned into (*sr:(*atomic: at that stage. There is
298
therefore no need for it to have a length entry, so use a high value. */
299
300
0
#define META_ATOMIC_SCRIPT_RUN 0x8fff0000u
301
302
/* Table of extra lengths for each of the meta codes. Must be kept in step with
303
the definitions above. For some items these values are a basic length to which
304
a variable amount has to be added. */
305
306
static unsigned char meta_extra_lengths[] = {
307
  0,             /* META_END */
308
  0,             /* META_ALT */
309
  0,             /* META_ATOMIC */
310
  0,             /* META_BACKREF - more if group is >= 10 */
311
  1+SIZEOFFSET,  /* META_BACKREF_BYNAME */
312
  1,             /* META_BIGVALUE */
313
  3,             /* META_CALLOUT_NUMBER */
314
  3+SIZEOFFSET,  /* META_CALLOUT_STRING */
315
  0,             /* META_CAPTURE */
316
  0,             /* META_CIRCUMFLEX */
317
  0,             /* META_CLASS */
318
  0,             /* META_CLASS_EMPTY */
319
  0,             /* META_CLASS_EMPTY_NOT */
320
  0,             /* META_CLASS_END */
321
  0,             /* META_CLASS_NOT */
322
  0,             /* META_COND_ASSERT */
323
  SIZEOFFSET,    /* META_COND_DEFINE */
324
  1+SIZEOFFSET,  /* META_COND_NAME */
325
  1+SIZEOFFSET,  /* META_COND_NUMBER */
326
  1+SIZEOFFSET,  /* META_COND_RNAME */
327
  1+SIZEOFFSET,  /* META_COND_RNUMBER */
328
  3,             /* META_COND_VERSION */
329
  0,             /* META_DOLLAR */
330
  0,             /* META_DOT */
331
  0,             /* META_ESCAPE - more for ESC_P, ESC_p, ESC_g, ESC_k */
332
  0,             /* META_KET */
333
  0,             /* META_NOCAPTURE */
334
  1,             /* META_OPTIONS */
335
  1,             /* META_POSIX */
336
  1,             /* META_POSIX_NEG */
337
  0,             /* META_RANGE_ESCAPED */
338
  0,             /* META_RANGE_LITERAL */
339
  SIZEOFFSET,    /* META_RECURSE */
340
  1+SIZEOFFSET,  /* META_RECURSE_BYNAME */
341
  0,             /* META_SCRIPT_RUN */
342
  0,             /* META_LOOKAHEAD */
343
  0,             /* META_LOOKAHEADNOT */
344
  SIZEOFFSET,    /* META_LOOKBEHIND */
345
  SIZEOFFSET,    /* META_LOOKBEHINDNOT */
346
  0,             /* META_LOOKAHEAD_NA */
347
  SIZEOFFSET,    /* META_LOOKBEHIND_NA */
348
  1,             /* META_MARK - plus the string length */
349
  0,             /* META_ACCEPT */
350
  0,             /* META_FAIL */
351
  0,             /* META_COMMIT */
352
  1,             /* META_COMMIT_ARG - plus the string length */
353
  0,             /* META_PRUNE */
354
  1,             /* META_PRUNE_ARG - plus the string length */
355
  0,             /* META_SKIP */
356
  1,             /* META_SKIP_ARG - plus the string length */
357
  0,             /* META_THEN */
358
  1,             /* META_THEN_ARG - plus the string length */
359
  0,             /* META_ASTERISK */
360
  0,             /* META_ASTERISK_PLUS */
361
  0,             /* META_ASTERISK_QUERY */
362
  0,             /* META_PLUS */
363
  0,             /* META_PLUS_PLUS */
364
  0,             /* META_PLUS_QUERY */
365
  0,             /* META_QUERY */
366
  0,             /* META_QUERY_PLUS */
367
  0,             /* META_QUERY_QUERY */
368
  2,             /* META_MINMAX */
369
  2,             /* META_MINMAX_PLUS */
370
  2              /* META_MINMAX_QUERY */
371
};
372
373
/* Types for skipping parts of a parsed pattern. */
374
375
enum { PSKIP_ALT, PSKIP_CLASS, PSKIP_KET };
376
377
/* Macro for setting individual bits in class bitmaps. It took some
378
experimenting to figure out how to stop gcc 5.3.0 from warning with
379
-Wconversion. This version gets a warning:
380
381
  #define SETBIT(a,b) a[(b)/8] |= (uint8_t)(1u << ((b)&7))
382
383
Let's hope the apparently less efficient version isn't actually so bad if the
384
compiler is clever with identical subexpressions. */
385
386
95.5k
/* Sets bit number b in the byte-array bitmap a. Both arguments are evaluated
twice, so they must be free of side effects. */
#define SETBIT(a,b) (a)[(b)/8] = (uint8_t)((a)[(b)/8] | (1u << ((b)&7)))
387
388
/* Values and flags for the unsigned xxcuflags variables that accompany xxcu
389
variables, which are concerned with first and required code units. A value
390
greater than or equal to REQ_NONE means "no code unit set"; otherwise the
391
matching xxcu variable is set, and the low valued bits are relevant. */
392
393
132k
#define REQ_UNSET     0xffffffffu  /* Not yet found anything */
394
23.6k
#define REQ_NONE      0xfffffffeu  /* Found not fixed character */
395
1.75k
#define REQ_CASELESS  0x00000001u  /* Code unit in xxcu is caseless */
396
20.9k
#define REQ_VARY      0x00000002u  /* Code unit is followed by non-literal */
397
398
/* These flags are used in the groupinfo vector. */
399
400
0
#define GI_SET_FIXED_LENGTH    0x80000000u
401
0
#define GI_NOT_FIXED_LENGTH    0x40000000u
402
0
#define GI_FIXED_LENGTH_MASK   0x0000ffffu
403
404
/* This simple test for a decimal digit works for both ASCII/Unicode and EBCDIC
405
and is fast (a good compiler can turn it into a subtraction and unsigned
406
comparison). */
407
408
3.40k
#define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
409
410
/* Table to identify hex digits. The tables in chartables are dependent on the
411
locale, and may mark arbitrary characters as digits. We want to recognize only
412
0-9, a-f, and A-F as hex digits, which is why we have a private table here. It
413
costs 256 bytes, but it is a lot faster than doing character value tests (at
414
least in some simple cases I timed), and in some applications one wants PCRE2
415
to compile efficiently as well as match efficiently. The value in the table is
416
the binary hex digit value, or 0xff for non-hex digits. */
417
418
/* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
419
UTF-8 mode. */
420
421
#ifndef EBCDIC
422
static const uint8_t xdigitab[] =
423
  {
424
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   0-  7 */
425
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   8- 15 */
426
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  16- 23 */
427
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  24- 31 */
428
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*    - '  */
429
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  ( - /  */
430
  0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /*  0 - 7  */
431
  0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff, /*  8 - ?  */
432
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  @ - G  */
433
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  H - O  */
434
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  P - W  */
435
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  X - _  */
436
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  ` - g  */
437
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  h - o  */
438
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  p - w  */
439
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  x -127 */
440
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 128-135 */
441
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 136-143 */
442
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144-151 */
443
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 152-159 */
444
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160-167 */
445
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 168-175 */
446
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 176-183 */
447
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191 */
448
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 192-199 */
449
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 200-207 */
450
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 208-215 */
451
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 216-223 */
452
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 224-231 */
453
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 232-239 */
454
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 240-247 */
455
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};/* 248-255 */
456
457
#else
458
459
/* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
460
461
static const uint8_t xdigitab[] =
462
  {
463
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   0-  7  0 */
464
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   8- 15    */
465
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  16- 23 10 */
466
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  24- 31    */
467
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  32- 39 20 */
468
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  40- 47    */
469
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  48- 55 30 */
470
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  56- 63    */
471
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*    - 71 40 */
472
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  72- |     */
473
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  & - 87 50 */
474
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  88- 95    */
475
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  - -103 60 */
476
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 104- ?     */
477
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 112-119 70 */
478
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 120- "     */
479
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* 128- g  80 */
480
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  h -143    */
481
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144- p  90 */
482
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  q -159    */
483
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160- x  A0 */
484
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  y -175    */
485
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  ^ -183 B0 */
486
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191    */
487
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  { - G  C0 */
488
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  H -207    */
489
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  } - P  D0 */
490
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  Q -223    */
491
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  \ - X  E0 */
492
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  Y -239    */
493
  0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /*  0 - 7  F0 */
494
  0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff};/*  8 -255    */
495
#endif  /* EBCDIC */
496
497
498
/* Table for handling alphanumeric escaped characters. Positive returns are
499
simple data values; negative values are for special things like \d and so on.
500
Zero means further processing is needed (for things like \x), or the escape is
501
invalid. */
502
503
/* This is the "normal" table for ASCII systems or for EBCDIC systems running
504
in UTF-8 mode. It runs from '0' to 'z'. */
505
506
#ifndef EBCDIC
507
15.8k
#define ESCAPES_FIRST       CHAR_0
508
7.43k
#define ESCAPES_LAST        CHAR_z
509
6
#define UPPER_CASE(c)       ((c)-32)  /* ASCII: 'a' - 32 == 'A' */
510
511
static const short int escapes[] = {
512
     0,                       0,
513
     0,                       0,
514
     0,                       0,
515
     0,                       0,
516
     0,                       0,
517
     CHAR_COLON,              CHAR_SEMICOLON,
518
     CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,
519
     CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,
520
     CHAR_COMMERCIAL_AT,      -ESC_A,
521
     -ESC_B,                  -ESC_C,
522
     -ESC_D,                  -ESC_E,
523
     0,                       -ESC_G,
524
     -ESC_H,                  0,
525
     0,                       -ESC_K,
526
     0,                       0,
527
     -ESC_N,                  0,
528
     -ESC_P,                  -ESC_Q,
529
     -ESC_R,                  -ESC_S,
530
     0,                       0,
531
     -ESC_V,                  -ESC_W,
532
     -ESC_X,                  0,
533
     -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,
534
     CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,
535
     CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,
536
     CHAR_GRAVE_ACCENT,       CHAR_BEL,
537
     -ESC_b,                  0,
538
     -ESC_d,                  CHAR_ESC,
539
     CHAR_FF,                 0,
540
     -ESC_h,                  0,
541
     0,                       -ESC_k,
542
     0,                       0,
543
     CHAR_LF,                 0,
544
     -ESC_p,                  0,
545
     CHAR_CR,                 -ESC_s,
546
     CHAR_HT,                 0,
547
     -ESC_v,                  -ESC_w,
548
     0,                       0,
549
     -ESC_z
550
};
551
552
#else
553
554
/* This is the "abnormal" table for EBCDIC systems without UTF-8 support.
555
It runs from 'a' to '9'. For some minimal testing of EBCDIC features, the code
556
is sometimes compiled on an ASCII system. In this case, we must not use CHAR_a
557
because it is defined as 'a', which of course picks up the ASCII value. */
558
559
#if 'a' == 0x81                    /* Check for a real EBCDIC environment */
560
#define ESCAPES_FIRST       CHAR_a
561
#define ESCAPES_LAST        CHAR_9
562
#define UPPER_CASE(c)       ((c)+64)  /* EBCDIC: 'a' (0x81) + 64 == 'A' (0xC1) */
563
#else                              /* Testing in an ASCII environment */
564
#define ESCAPES_FIRST  ((unsigned char)'\x81')   /* EBCDIC 'a' */
565
#define ESCAPES_LAST   ((unsigned char)'\xf9')   /* EBCDIC '9' */
566
#define UPPER_CASE(c)  ((c)-32)  /* ASCII host: 'a' - 32 == 'A' */
567
#endif
568
569
static const short int escapes[] = {
570
/*  80 */         CHAR_BEL, -ESC_b,       0, -ESC_d, CHAR_ESC, CHAR_FF,      0,
571
/*  88 */ -ESC_h,        0,      0,     '{',      0,        0,       0,      0,
572
/*  90 */      0,        0, -ESC_k,       0,      0,  CHAR_LF,       0, -ESC_p,
573
/*  98 */      0,  CHAR_CR,      0,     '}',      0,        0,       0,      0,
574
/*  A0 */      0,      '~', -ESC_s, CHAR_HT,      0,   -ESC_v,  -ESC_w,      0,
575
/*  A8 */      0,   -ESC_z,      0,       0,      0,      '[',       0,      0,
576
/*  B0 */      0,        0,      0,       0,      0,        0,       0,      0,
577
/*  B8 */      0,        0,      0,       0,      0,      ']',     '=',    '-',
578
/*  C0 */    '{',   -ESC_A, -ESC_B,  -ESC_C, -ESC_D,   -ESC_E,       0, -ESC_G,
579
/*  C8 */ -ESC_H,        0,      0,       0,      0,        0,       0,      0,
580
/*  D0 */    '}',        0, -ESC_K,       0,      0,   -ESC_N,       0, -ESC_P,
581
/*  D8 */ -ESC_Q,   -ESC_R,      0,       0,      0,        0,       0,      0,
582
/*  E0 */   '\\',        0, -ESC_S,       0,      0,   -ESC_V,  -ESC_W, -ESC_X,
583
/*  E8 */      0,   -ESC_Z,      0,       0,      0,        0,       0,      0,
584
/*  F0 */      0,        0,      0,       0,      0,        0,       0,      0,
585
/*  F8 */      0,        0
586
};
587
588
/* We also need a table of characters that may follow \c in an EBCDIC
589
environment for characters 0-31. */
590
591
static unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";
592
593
#endif   /* EBCDIC */
594
595
596
/* Table of special "verbs" like (*PRUNE). This is a short table, so it is
597
searched linearly. Put all the names into a single string, in order to reduce
598
the number of relocations when a shared library is dynamically linked. The
599
string is built from string macros so that it works in UTF-8 mode on EBCDIC
600
platforms. */
601
602
typedef struct verbitem {
603
  unsigned int len;          /* Length of verb name */
604
  uint32_t meta;             /* Base META_ code */
605
  int has_arg;               /* Argument requirement */
606
} verbitem;
607
608
static const char verbnames[] =
609
  "\0"                       /* Empty name is a shorthand for MARK */
610
  STRING_MARK0
611
  STRING_ACCEPT0
612
  STRING_F0
613
  STRING_FAIL0
614
  STRING_COMMIT0
615
  STRING_PRUNE0
616
  STRING_SKIP0
617
  STRING_THEN;
618
619
static const verbitem verbs[] = {
620
  { 0, META_MARK,   +1 },  /* > 0 => must have an argument */
621
  { 4, META_MARK,   +1 },
622
  { 6, META_ACCEPT, -1 },  /* < 0 => Optional argument, convert to pre-MARK */
623
  { 1, META_FAIL,   -1 },
624
  { 4, META_FAIL,   -1 },
625
  { 6, META_COMMIT,  0 },
626
  { 5, META_PRUNE,   0 },  /* Optional argument; bump META code if found */
627
  { 4, META_SKIP,    0 },
628
  { 4, META_THEN,    0 }
629
};
630
631
static const int verbcount = sizeof(verbs)/sizeof(verbs[0]);  /* entries in verbs[] */
632
633
/* Verb opcodes, indexed by their META code offset from META_MARK. */
634
635
static const uint32_t verbops[] = {
636
  OP_MARK, OP_ACCEPT, OP_FAIL, OP_COMMIT, OP_COMMIT_ARG, OP_PRUNE,
637
  OP_PRUNE_ARG, OP_SKIP, OP_SKIP_ARG, OP_THEN, OP_THEN_ARG };
638
639
/* Table of "alpha assertions" like (*pla:...), similar to the (*VERB) table. */
640
641
typedef struct alasitem {
642
  unsigned int len;          /* Length of name */
643
  uint32_t meta;             /* Base META_ code */
644
} alasitem;
645
646
static const char alasnames[] =
647
  STRING_pla0
648
  STRING_plb0
649
  STRING_napla0
650
  STRING_naplb0
651
  STRING_nla0
652
  STRING_nlb0
653
  STRING_positive_lookahead0
654
  STRING_positive_lookbehind0
655
  STRING_non_atomic_positive_lookahead0
656
  STRING_non_atomic_positive_lookbehind0
657
  STRING_negative_lookahead0
658
  STRING_negative_lookbehind0
659
  STRING_atomic0
660
  STRING_sr0
661
  STRING_asr0
662
  STRING_script_run0
663
  STRING_atomic_script_run;
664
665
static const alasitem alasmeta[] = {
666
  {  3, META_LOOKAHEAD         },
667
  {  3, META_LOOKBEHIND        },
668
  {  5, META_LOOKAHEAD_NA      },
669
  {  5, META_LOOKBEHIND_NA     },
670
  {  3, META_LOOKAHEADNOT      },
671
  {  3, META_LOOKBEHINDNOT     },
672
  { 18, META_LOOKAHEAD         },
673
  { 19, META_LOOKBEHIND        },
674
  { 29, META_LOOKAHEAD_NA      },
675
  { 30, META_LOOKBEHIND_NA     },
676
  { 18, META_LOOKAHEADNOT      },
677
  { 19, META_LOOKBEHINDNOT     },
678
  {  6, META_ATOMIC            },
679
  {  2, META_SCRIPT_RUN        }, /* sr = script run */
680
  {  3, META_ATOMIC_SCRIPT_RUN }, /* asr = atomic script run */
681
  { 10, META_SCRIPT_RUN        }, /* script run */
682
  { 17, META_ATOMIC_SCRIPT_RUN }  /* atomic script run */
683
};
684
685
/* Number of entries in the alasmeta table. */
static const int alascount = (int)(sizeof(alasmeta) / sizeof(alasitem));
686
687
/* Offsets from OP_STAR for case-independent and negative repeat opcodes. */
688
689
static uint32_t chartypeoffset[] = {
690
  OP_STAR - OP_STAR,    OP_STARI - OP_STAR,
691
  OP_NOTSTAR - OP_STAR, OP_NOTSTARI - OP_STAR };
692
693
/* Tables of names of POSIX character classes and their lengths. The names are
694
now all in a single string, to reduce the number of relocations when a shared
695
library is dynamically loaded. The list of lengths is terminated by a zero
696
length entry. The first three must be alpha, lower, upper, as this is assumed
697
for handling case independence. The indices for several classes are needed, so
698
identify them. */
699
700
static const char posix_names[] =
701
  STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
702
  STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
703
  STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
704
  STRING_word0  STRING_xdigit;
705
706
static const uint8_t posix_name_lengths[] = {
  5, 5, 5, 5,    /* alpha lower upper alnum */
  5, 5, 5, 5,    /* ascii blank cntrl digit */
  5, 5, 5, 5,    /* graph print punct space */
  4, 6,          /* word xdigit */
  0              /* end-of-list marker */
};
708
709
0
/* Positions of selected classes within posix_names / posix_name_lengths. */
#define PC_DIGIT    7   /* digit */
#define PC_GRAPH    8   /* graph */
#define PC_PRINT    9   /* print */
#define PC_PUNCT   10   /* punct */
#define PC_XDIGIT  13   /* xdigit */
714
715
/* Table of class bit maps for each POSIX class. Each class is formed from a
716
base map, with an optional addition or removal of another map. Then, for some
717
classes, there is some additional tweaking: for [:blank:] the vertical space
718
characters are removed, and for [:alpha:] and [:alnum:] the underscore
719
character is removed. The triples in the table consist of the base map offset,
720
second map offset or -1 if no second map, and a non-negative value for map
721
addition or a negative value for map subtraction (if there are two maps). The
722
absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
723
remove vertical space characters, 2 => remove underscore. */
724
725
static const int posix_class_maps[] = {
726
  cbit_word,   cbit_digit, -2,            /* alpha */
727
  cbit_lower,  -1,          0,            /* lower */
728
  cbit_upper,  -1,          0,            /* upper */
729
  cbit_word,   -1,          2,            /* alnum - word without underscore */
730
  cbit_print,  cbit_cntrl,  0,            /* ascii */
731
  cbit_space,  -1,          1,            /* blank - a GNU extension */
732
  cbit_cntrl,  -1,          0,            /* cntrl */
733
  cbit_digit,  -1,          0,            /* digit */
734
  cbit_graph,  -1,          0,            /* graph */
735
  cbit_print,  -1,          0,            /* print */
736
  cbit_punct,  -1,          0,            /* punct */
737
  cbit_space,  -1,          0,            /* space */
738
  cbit_word,   -1,          0,            /* word - a Perl extension */
739
  cbit_xdigit, -1,          0             /* xdigit */
740
};
741
742
#ifdef SUPPORT_UNICODE

/* The POSIX class Unicode property substitutes that are used in UCP mode must
be in the order of the POSIX class names, defined above. Each class has a pair
of values. A first value of -1 marks a class that is not replaced by a Unicode
property test; the second value then distinguishes the two such cases. */

static const int posix_substitutes[] = {
  PT_GC, ucp_L,     /* alpha */
  PT_PC, ucp_Ll,    /* lower */
  PT_PC, ucp_Lu,    /* upper */
  PT_ALNUM, 0,      /* alnum */
  -1, 0,            /* ascii, treat as non-UCP */
  -1, 1,            /* blank, treat as \h */
  PT_PC, ucp_Cc,    /* cntrl */
  PT_PC, ucp_Nd,    /* digit */
  PT_PXGRAPH, 0,    /* graph */
  PT_PXPRINT, 0,    /* print */
  PT_PXPUNCT, 0,    /* punct */
  PT_PXSPACE, 0,    /* space: Xps is POSIX space, but from 8.34 Perl and
                       POSIX space are the same */
  PT_WORD, 0,       /* word */
  PT_PXXDIGIT, 0    /* xdigit: Perl has additional hex digits */
};

/* Number of classes in the table (two entries per class). The divisor is
derived from the array's own element type so that it cannot drift out of step
with the declaration. */
#define POSIX_SUBSIZE \
  (sizeof(posix_substitutes) / (2*sizeof(posix_substitutes[0])))
#endif  /* SUPPORT_UNICODE */
765
766
/* Masks for checking option settings. When PCRE2_LITERAL is set, only a subset
767
are allowed. */
768
769
/* Option bits that remain valid when PCRE2_LITERAL is set. */
#define PUBLIC_LITERAL_COMPILE_OPTIONS \
  (PCRE2_ANCHORED|PCRE2_AUTO_CALLOUT| \
   PCRE2_CASELESS|PCRE2_ENDANCHORED| \
   PCRE2_FIRSTLINE|PCRE2_LITERAL| \
   PCRE2_MATCH_INVALID_UTF|PCRE2_NO_START_OPTIMIZE| \
   PCRE2_NO_UTF_CHECK|PCRE2_USE_OFFSET_LIMIT| \
   PCRE2_UTF)

/* The literal-mode bits plus the remaining bits of the options word. */
#define PUBLIC_COMPILE_OPTIONS \
  (PUBLIC_LITERAL_COMPILE_OPTIONS| \
   PCRE2_ALLOW_EMPTY_CLASS|PCRE2_ALT_BSUX| \
   PCRE2_ALT_CIRCUMFLEX|PCRE2_ALT_VERBNAMES| \
   PCRE2_DOLLAR_ENDONLY|PCRE2_DOTALL| \
   PCRE2_DUPNAMES|PCRE2_EXTENDED| \
   PCRE2_EXTENDED_MORE|PCRE2_MATCH_UNSET_BACKREF| \
   PCRE2_MULTILINE|PCRE2_NEVER_BACKSLASH_C| \
   PCRE2_NEVER_UCP|PCRE2_NEVER_UTF| \
   PCRE2_NO_AUTO_CAPTURE|PCRE2_NO_AUTO_POSSESS| \
   PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_UCP| \
   PCRE2_UNGREEDY)

/* Extra-option bits that remain valid when PCRE2_LITERAL is set. */
#define PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS \
   (PCRE2_EXTRA_MATCH_LINE| \
    PCRE2_EXTRA_MATCH_WORD| \
    PCRE2_EXTRA_CASELESS_RESTRICT)

/* The literal-mode extra bits plus the remaining extra-option bits. */
#define PUBLIC_COMPILE_EXTRA_OPTIONS \
   (PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS| \
    PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES| \
    PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL| \
    PCRE2_EXTRA_ESCAPED_CR_IS_LF| \
    PCRE2_EXTRA_ALT_BSUX| \
    PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK| \
    PCRE2_EXTRA_ASCII_BSD| \
    PCRE2_EXTRA_ASCII_BSS| \
    PCRE2_EXTRA_ASCII_BSW| \
    PCRE2_EXTRA_ASCII_POSIX| \
    PCRE2_EXTRA_ASCII_DIGIT)
793
794
/* Compile time error code numbers. They are given names so that they can more
795
easily be tracked. When a new number is added, the tables called eint1 and
796
eint2 in pcre2posix.c may need to be updated, and a new error text must be
797
added to compile_error_texts in pcre2_error.c. Also, the error codes in
798
pcre2.h.in must be updated - their values are exactly 100 greater than these
799
values. */
800
801
enum {
  ERR0 = COMPILE_ERROR_BASE,   /* Subsequent codes count up from the base */
  ERR1,  ERR2,  ERR3,  ERR4,  ERR5,  ERR6,  ERR7,  ERR8,  ERR9,  ERR10,
  ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, ERR20,
  ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29, ERR30,
  ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, ERR40,
  ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR50,
  ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60,
  ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
  ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80,
  ERR81, ERR82, ERR83, ERR84, ERR85, ERR86, ERR87, ERR88, ERR89, ERR90,
  ERR91, ERR92, ERR93, ERR94, ERR95, ERR96, ERR97, ERR98, ERR99, ERR100,
  ERR101
};
813
814
/* This is a table of start-of-pattern options such as (*UTF) and settings such
815
as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
816
compatibility, (*UTFn) is supported in the relevant libraries, but (*UTF) is
817
generic and always supported. */
818
819
/* Kinds of start-of-pattern item; stored in the type field of a pso entry. */
enum {
  PSO_OPT,     /* Value is an option bit */
  PSO_FLG,     /* Value is a flag bit */
  PSO_NL,      /* Value is a newline type */
  PSO_BSR,     /* Value is a \R type */
  PSO_LIMH,    /* Read integer value for heap limit */
  PSO_LIMM,    /* Read integer value for match limit */
  PSO_LIMD     /* Read integer value for depth limit */
};
827
828
typedef struct pso {
  const uint8_t *name;   /* Text of the item */
  uint16_t length;       /* Number of characters in name */
  uint16_t type;         /* One of the PSO_ kinds */
  uint32_t value;        /* Option bit, flag bit, or type value; 0 for limits */
} pso;
834
835
/* NB: STRING_UTFn_RIGHTPAR contains the length as well */
836
837
static const pso pso_list[] = {
838
  { (uint8_t *)STRING_UTFn_RIGHTPAR,                  PSO_OPT, PCRE2_UTF },
839
  { (uint8_t *)STRING_UTF_RIGHTPAR,                4, PSO_OPT, PCRE2_UTF },
840
  { (uint8_t *)STRING_UCP_RIGHTPAR,                4, PSO_OPT, PCRE2_UCP },
841
  { (uint8_t *)STRING_NOTEMPTY_RIGHTPAR,           9, PSO_FLG, PCRE2_NOTEMPTY_SET },
842
  { (uint8_t *)STRING_NOTEMPTY_ATSTART_RIGHTPAR,  17, PSO_FLG, PCRE2_NE_ATST_SET },
843
  { (uint8_t *)STRING_NO_AUTO_POSSESS_RIGHTPAR,   16, PSO_OPT, PCRE2_NO_AUTO_POSSESS },
844
  { (uint8_t *)STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR, 18, PSO_OPT, PCRE2_NO_DOTSTAR_ANCHOR },
845
  { (uint8_t *)STRING_NO_JIT_RIGHTPAR,             7, PSO_FLG, PCRE2_NOJIT },
846
  { (uint8_t *)STRING_NO_START_OPT_RIGHTPAR,      13, PSO_OPT, PCRE2_NO_START_OPTIMIZE },
847
  { (uint8_t *)STRING_LIMIT_HEAP_EQ,              11, PSO_LIMH, 0 },
848
  { (uint8_t *)STRING_LIMIT_MATCH_EQ,             12, PSO_LIMM, 0 },
849
  { (uint8_t *)STRING_LIMIT_DEPTH_EQ,             12, PSO_LIMD, 0 },
850
  { (uint8_t *)STRING_LIMIT_RECURSION_EQ,         16, PSO_LIMD, 0 },
851
  { (uint8_t *)STRING_CR_RIGHTPAR,                 3, PSO_NL,  PCRE2_NEWLINE_CR },
852
  { (uint8_t *)STRING_LF_RIGHTPAR,                 3, PSO_NL,  PCRE2_NEWLINE_LF },
853
  { (uint8_t *)STRING_CRLF_RIGHTPAR,               5, PSO_NL,  PCRE2_NEWLINE_CRLF },
854
  { (uint8_t *)STRING_ANY_RIGHTPAR,                4, PSO_NL,  PCRE2_NEWLINE_ANY },
855
  { (uint8_t *)STRING_NUL_RIGHTPAR,                4, PSO_NL,  PCRE2_NEWLINE_NUL },
856
  { (uint8_t *)STRING_ANYCRLF_RIGHTPAR,            8, PSO_NL,  PCRE2_NEWLINE_ANYCRLF },
857
  { (uint8_t *)STRING_BSR_ANYCRLF_RIGHTPAR,       12, PSO_BSR, PCRE2_BSR_ANYCRLF },
858
  { (uint8_t *)STRING_BSR_UNICODE_RIGHTPAR,       12, PSO_BSR, PCRE2_BSR_UNICODE }
859
};
860
861
/* This table is used when converting repeating opcodes into possessified
862
versions as a result of an explicit possessive quantifier such as ++. A zero
863
value means there is no possessified version - in those cases the item in
864
question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
865
because all relevant opcodes are less than that. */
866
867
static const uint8_t opcode_possessify[] = {
868
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0 - 15  */
869
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 16 - 31 */
870
871
  0,                       /* NOTI */
872
  OP_POSSTAR, 0,           /* STAR, MINSTAR */
873
  OP_POSPLUS, 0,           /* PLUS, MINPLUS */
874
  OP_POSQUERY, 0,          /* QUERY, MINQUERY */
875
  OP_POSUPTO, 0,           /* UPTO, MINUPTO */
876
  0,                       /* EXACT */
877
  0, 0, 0, 0,              /* POS{STAR,PLUS,QUERY,UPTO} */
878
879
  OP_POSSTARI, 0,          /* STARI, MINSTARI */
880
  OP_POSPLUSI, 0,          /* PLUSI, MINPLUSI */
881
  OP_POSQUERYI, 0,         /* QUERYI, MINQUERYI */
882
  OP_POSUPTOI, 0,          /* UPTOI, MINUPTOI */
883
  0,                       /* EXACTI */
884
  0, 0, 0, 0,              /* POS{STARI,PLUSI,QUERYI,UPTOI} */
885
886
  OP_NOTPOSSTAR, 0,        /* NOTSTAR, NOTMINSTAR */
887
  OP_NOTPOSPLUS, 0,        /* NOTPLUS, NOTMINPLUS */
888
  OP_NOTPOSQUERY, 0,       /* NOTQUERY, NOTMINQUERY */
889
  OP_NOTPOSUPTO, 0,        /* NOTUPTO, NOTMINUPTO */
890
  0,                       /* NOTEXACT */
891
  0, 0, 0, 0,              /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
892
893
  OP_NOTPOSSTARI, 0,       /* NOTSTARI, NOTMINSTARI */
894
  OP_NOTPOSPLUSI, 0,       /* NOTPLUSI, NOTMINPLUSI */
895
  OP_NOTPOSQUERYI, 0,      /* NOTQUERYI, NOTMINQUERYI */
896
  OP_NOTPOSUPTOI, 0,       /* NOTUPTOI, NOTMINUPTOI */
897
  0,                       /* NOTEXACTI */
898
  0, 0, 0, 0,              /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
899
900
  OP_TYPEPOSSTAR, 0,       /* TYPESTAR, TYPEMINSTAR */
901
  OP_TYPEPOSPLUS, 0,       /* TYPEPLUS, TYPEMINPLUS */
902
  OP_TYPEPOSQUERY, 0,      /* TYPEQUERY, TYPEMINQUERY */
903
  OP_TYPEPOSUPTO, 0,       /* TYPEUPTO, TYPEMINUPTO */
904
  0,                       /* TYPEEXACT */
905
  0, 0, 0, 0,              /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
906
907
  OP_CRPOSSTAR, 0,         /* CRSTAR, CRMINSTAR */
908
  OP_CRPOSPLUS, 0,         /* CRPLUS, CRMINPLUS */
909
  OP_CRPOSQUERY, 0,        /* CRQUERY, CRMINQUERY */
910
  OP_CRPOSRANGE, 0,        /* CRRANGE, CRMINRANGE */
911
  0, 0, 0, 0,              /* CRPOS{STAR,PLUS,QUERY,RANGE} */
912
913
  0, 0, 0,                 /* CLASS, NCLASS, XCLASS */
914
  0, 0,                    /* REF, REFI */
915
  0, 0,                    /* DNREF, DNREFI */
916
  0, 0                     /* RECURSE, CALLOUT */
917
};
918
919
920
#ifdef DEBUG_SHOW_PARSED
/*************************************************
*     Show the parsed pattern for debugging      *
*************************************************/

/* For debugging the pre-scan, this code, which outputs the parsed data vector,
can be enabled. Each item is written to stderr on its own line, prefixed by its
index in the vector and its raw value. Output stops at META_END, or at the
first META code that is not recognized.

Unsigned values (uint32_t items and PCRE2_SIZE offsets) are printed with
unsigned conversions so that large values such as an unset offset are not shown
as negative numbers.

Argument:   cb    the compile block whose parsed_pattern vector is shown
Returns:    nothing
*/

static void show_parsed(compile_block *cb)
{
uint32_t *pptr = cb->parsed_pattern;

for (;;)
  {
  int max, min;
  PCRE2_SIZE offset;
  uint32_t i;
  uint32_t length;
  uint32_t meta_arg = META_DATA(*pptr);

  fprintf(stderr, "+++ %02d %.8x ", (int)(pptr - cb->parsed_pattern), *pptr);

  /* Values below META_END are literal code points. */

  if (*pptr < META_END)
    {
    if (*pptr > 32 && *pptr < 128) fprintf(stderr, "%c", (int)*pptr);
    pptr++;
    }

  /* Otherwise this is a META item; pptr is advanced past the code itself and
  each case consumes whatever additional data follows it. */

  else switch (META_CODE(*pptr++))
    {
    default:
    fprintf(stderr, "**** OOPS - unknown META value - giving up ****\n");
    return;

    case META_END:
    fprintf(stderr, "META_END\n");
    return;

    case META_CAPTURE:
    fprintf(stderr, "META_CAPTURE %u", meta_arg);
    break;

    case META_RECURSE:
    GETOFFSET(offset, pptr);
    fprintf(stderr, "META_RECURSE %u %zu", meta_arg, offset);
    break;

    /* Small back reference numbers keep their offset in the compile block
    rather than in the parsed vector. */

    case META_BACKREF:
    if (meta_arg < 10)
      offset = cb->small_ref_offset[meta_arg];
    else
      GETOFFSET(offset, pptr);
    fprintf(stderr, "META_BACKREF %u %zu", meta_arg, offset);
    break;

    case META_ESCAPE:
    if (meta_arg == ESC_P || meta_arg == ESC_p)
      {
      uint32_t ptype = *pptr >> 16;
      uint32_t pvalue = *pptr++ & 0xffff;
      fprintf(stderr, "META \\%c %u %u", (meta_arg == ESC_P)? 'P':'p',
        ptype, pvalue);
      }
    else
      {
      uint32_t cc;
      /* There's just one escape we might have here that isn't negated in the
      escapes table. */
      if (meta_arg == ESC_g) cc = CHAR_g;
      else for (cc = ESCAPES_FIRST; cc <= ESCAPES_LAST; cc++)
        {
        if (meta_arg == (uint32_t)(-escapes[cc - ESCAPES_FIRST])) break;
        }
      if (cc > ESCAPES_LAST) cc = CHAR_QUESTION_MARK;
      fprintf(stderr, "META \\%c", (int)cc);
      }
    break;

    case META_MINMAX:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}", min, max);
    else
      fprintf(stderr, "META {%d,}", min);
    break;

    case META_MINMAX_QUERY:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}?", min, max);
    else
      fprintf(stderr, "META {%d,}?", min);
    break;

    case META_MINMAX_PLUS:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}+", min, max);
    else
      fprintf(stderr, "META {%d,}+", min);
    break;

    case META_BIGVALUE: fprintf(stderr, "META_BIGVALUE %.8x", *pptr++); break;
    case META_CIRCUMFLEX: fprintf(stderr, "META_CIRCUMFLEX"); break;
    case META_COND_ASSERT: fprintf(stderr, "META_COND_ASSERT"); break;
    case META_DOLLAR: fprintf(stderr, "META_DOLLAR"); break;
    case META_DOT: fprintf(stderr, "META_DOT"); break;
    case META_ASTERISK: fprintf(stderr, "META *"); break;
    case META_ASTERISK_QUERY: fprintf(stderr, "META *?"); break;
    case META_ASTERISK_PLUS: fprintf(stderr, "META *+"); break;
    case META_PLUS: fprintf(stderr, "META +"); break;
    case META_PLUS_QUERY: fprintf(stderr, "META +?"); break;
    case META_PLUS_PLUS: fprintf(stderr, "META ++"); break;
    case META_QUERY: fprintf(stderr, "META ?"); break;
    case META_QUERY_QUERY: fprintf(stderr, "META ??"); break;
    case META_QUERY_PLUS: fprintf(stderr, "META ?+"); break;

    case META_ATOMIC: fprintf(stderr, "META (?>"); break;
    case META_NOCAPTURE: fprintf(stderr, "META (?:"); break;
    case META_LOOKAHEAD: fprintf(stderr, "META (?="); break;
    case META_LOOKAHEADNOT: fprintf(stderr, "META (?!"); break;
    case META_LOOKAHEAD_NA: fprintf(stderr, "META (*napla:"); break;
    case META_SCRIPT_RUN: fprintf(stderr, "META (*sr:"); break;
    case META_KET: fprintf(stderr, "META )"); break;
    case META_ALT: fprintf(stderr, "META | %u", meta_arg); break;

    case META_CLASS: fprintf(stderr, "META ["); break;
    case META_CLASS_NOT: fprintf(stderr, "META [^"); break;
    case META_CLASS_END: fprintf(stderr, "META ]"); break;
    case META_CLASS_EMPTY: fprintf(stderr, "META []"); break;
    case META_CLASS_EMPTY_NOT: fprintf(stderr, "META [^]"); break;

    case META_RANGE_LITERAL: fprintf(stderr, "META - (literal)"); break;
    case META_RANGE_ESCAPED: fprintf(stderr, "META - (escaped)"); break;

    case META_POSIX: fprintf(stderr, "META_POSIX %u", *pptr++); break;
    case META_POSIX_NEG: fprintf(stderr, "META_POSIX_NEG %u", *pptr++); break;

    case META_ACCEPT: fprintf(stderr, "META (*ACCEPT)"); break;
    case META_FAIL: fprintf(stderr, "META (*FAIL)"); break;
    case META_COMMIT: fprintf(stderr, "META (*COMMIT)"); break;
    case META_PRUNE: fprintf(stderr, "META (*PRUNE)"); break;
    case META_SKIP: fprintf(stderr, "META (*SKIP)"); break;
    case META_THEN: fprintf(stderr, "META (*THEN)"); break;

    case META_OPTIONS:
    fprintf(stderr, "META_OPTIONS 0x%08x 0x%08x", pptr[0], pptr[1]);
    pptr += 2;
    break;

    /* The lookbehind items are followed by two data values, of which only the
    first is shown. */

    case META_LOOKBEHIND:
    fprintf(stderr, "META (?<= %u %u", meta_arg, *pptr);
    pptr += 2;
    break;

    case META_LOOKBEHIND_NA:
    fprintf(stderr, "META (*naplb: %u %u", meta_arg, *pptr);
    pptr += 2;
    break;

    case META_LOOKBEHINDNOT:
    fprintf(stderr, "META (?<! %u %u", meta_arg, *pptr);
    pptr += 2;
    break;

    case META_CALLOUT_NUMBER:
    fprintf(stderr, "META (?C%u) next=%u/%u", pptr[2], pptr[0],
       pptr[1]);
    pptr += 3;
    break;

    case META_CALLOUT_STRING:
      {
      uint32_t patoffset = *pptr++;    /* Offset of next pattern item */
      uint32_t patlength = *pptr++;    /* Length of next pattern item */
      fprintf(stderr, "META (?Cstring) length=%u offset=", *pptr++);
      GETOFFSET(offset, pptr);
      fprintf(stderr, "%zu next=%u/%u", offset, patoffset, patlength);
      }
    break;

    case META_RECURSE_BYNAME:
    fprintf(stderr, "META (?(&name) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_BACKREF_BYNAME:
    fprintf(stderr, "META_BACKREF_BYNAME length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* The group number is stored after the offset, so it is printed first by
    indexing past the offset, and skipped once the offset has been read. */

    case META_COND_NUMBER:
    fprintf(stderr, "META_COND_NUMBER %u offset=", pptr[SIZEOFFSET]);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    pptr++;
    break;

    case META_COND_DEFINE:
    fprintf(stderr, "META (?(DEFINE) offset=");
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_COND_VERSION:
    fprintf(stderr, "META (?(VERSION%s", (*pptr++ == 0)? "=" : ">=");
    fprintf(stderr, "%u.", *pptr++);
    fprintf(stderr, "%u)", *pptr++);
    break;

    case META_COND_NAME:
    fprintf(stderr, "META (?(<name>) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_COND_RNAME:
    fprintf(stderr, "META (?(R&name) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* This is kept as a name, because it might be. */

    case META_COND_RNUMBER:
    fprintf(stderr, "META (?(Rnumber) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* Verbs with an argument share the code that prints the argument: a
    length followed by that many code points. */

    case META_MARK:
    fprintf(stderr, "META (*MARK:");
    goto SHOWARG;

    case META_COMMIT_ARG:
    fprintf(stderr, "META (*COMMIT:");
    goto SHOWARG;

    case META_PRUNE_ARG:
    fprintf(stderr, "META (*PRUNE:");
    goto SHOWARG;

    case META_SKIP_ARG:
    fprintf(stderr, "META (*SKIP:");
    goto SHOWARG;

    case META_THEN_ARG:
    fprintf(stderr, "META (*THEN:");
    SHOWARG:
    length = *pptr++;
    for (i = 0; i < length; i++)
      {
      uint32_t cc = *pptr++;
      if (cc > 32 && cc < 128) fprintf(stderr, "%c", (int)cc);
        else fprintf(stderr, "\\x{%x}", cc);
      }
    fprintf(stderr, ") length=%u", length);
    break;
    }
  fprintf(stderr, "\n");
  }
}
#endif  /* DEBUG_SHOW_PARSED */
1189
1190
1191
1192
/*************************************************
1193
*               Copy compiled code               *
1194
*************************************************/
1195
1196
/* Compiled JIT code cannot be copied, so the new compiled block has no
1197
associated JIT data. */
1198
1199
PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_code_copy(const pcre2_code *code)
{
pcre2_code *copy;

/* A NULL input, or a failed allocation, yields NULL. The copy is obtained
from the allocator recorded in the original block. */

if (code == NULL) return NULL;

copy = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
if (copy == NULL) return NULL;

/* Duplicate the whole block, then detach the copy from any JIT data. */

memcpy(copy, code, code->blocksize);
copy->executable_jit = NULL;

/* The copy refers to the same tables as the original. When the tables carry
a reference count (it is stored immediately after them), record the extra
user. */

if ((code->flags & PCRE2_DEREF_TABLES) != 0)
  {
  PCRE2_SIZE *shared_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
  *shared_count += 1;
  }

return copy;
}
1222
1223
1224
1225
/*************************************************
1226
*     Copy compiled code and character tables    *
1227
*************************************************/
1228
1229
/* Compiled JIT code cannot be copied, so the new compiled block has no
1230
associated JIT data. This version of code_copy also makes a separate copy of
1231
the character tables. */
1232
1233
PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_code_copy_with_tables(const pcre2_code *code)
{
pcre2_code *copy;
uint8_t *tables_copy;
PCRE2_SIZE *count_slot;

/* A NULL input, or a failed allocation, yields NULL. */

if (code == NULL) return NULL;

copy = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
if (copy == NULL) return NULL;

/* Duplicate the whole block, then detach the copy from any JIT data. */

memcpy(copy, code, code->blocksize);
copy->executable_jit = NULL;

/* Get memory for a private set of tables, with room for a reference count
after them. If that fails, release the block obtained above. */

tables_copy = code->memctl.malloc(TABLES_LENGTH + sizeof(PCRE2_SIZE),
  code->memctl.memory_data);
if (tables_copy == NULL)
  {
  code->memctl.free((void *)copy, code->memctl.memory_data);
  return NULL;
  }

/* Fill in the new tables; the copy is their only user so far. */

memcpy(tables_copy, code->tables, TABLES_LENGTH);
count_slot = (PCRE2_SIZE *)(tables_copy + TABLES_LENGTH);
*count_slot = 1;

/* Attach the tables and flag them as reference counted. */

copy->tables = tables_copy;
copy->flags |= PCRE2_DEREF_TABLES;
return copy;
}
1261
1262
1263
1264
/*************************************************
1265
*               Free compiled code               *
1266
*************************************************/
1267
1268
PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
1269
pcre2_code_free(pcre2_code *code)
1270
360
{
1271
360
PCRE2_SIZE* ref_count;
1272
1273
360
if (code != NULL)
1274
0
  {
1275
#ifdef SUPPORT_JIT
1276
  if (code->executable_jit != NULL)
1277
    PRIV(jit_free)(code->executable_jit, &code->memctl);
1278
#endif
1279
1280
0
  if ((code->flags & PCRE2_DEREF_TABLES) != 0)
1281
0
    {
1282
    /* Decoded tables belong to the codes after deserialization, and they must
1283
    be freed when there are no more references to them. The *ref_count should
1284
    always be > 0. */
1285
1286
0
    ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
1287
0
    if (*ref_count > 0)
1288
0
      {
1289
0
      (*ref_count)--;
1290
0
      if (*ref_count == 0)
1291
0
        code->memctl.free((void *)code->tables, code->memctl.memory_data);
1292
0
      }
1293
0
    }
1294
1295
0
  code->memctl.free(code, code->memctl.memory_data);
1296
0
  }
1297
360
}
1298
1299
1300
1301
/*************************************************
1302
*         Read a number, possibly signed         *
1303
*************************************************/
1304
1305
/* This function is used to read numbers in the pattern. The initial pointer
1306
must be at the sign or first digit of the number. When relative values
1307
(introduced by + or -) are allowed, they are relative group numbers, and the
1308
result must be greater than zero.
1309
1310
Arguments:
1311
  ptrptr      points to the character pointer variable
1312
  ptrend      points to the end of the input string
1313
  allow_sign  if < 0, sign not allowed; if >= 0, sign is relative to this
1314
  max_value   the largest number allowed
1315
  max_error   the error to give for an over-large number
1316
  intptr      where to put the result
1317
  errorcodeptr where to put an error code
1318
1319
Returns:      TRUE  - a number was read
1320
              FALSE - errorcode == 0 => no number was found
1321
                      errorcode != 0 => an error occurred
1322
*/
1323
1324
static BOOL
1325
read_number(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, int32_t allow_sign,
1326
  uint32_t max_value, uint32_t max_error, int *intptr, int *errorcodeptr)
1327
957
{
1328
957
int sign = 0;
1329
957
uint32_t n = 0;
1330
957
PCRE2_SPTR ptr = *ptrptr;
1331
957
BOOL yield = FALSE;
1332
1333
957
*errorcodeptr = 0;
1334
1335
957
if (allow_sign >= 0 && ptr < ptrend)
1336
0
  {
1337
0
  if (*ptr == CHAR_PLUS)
1338
0
    {
1339
0
    sign = +1;
1340
0
    max_value -= allow_sign;
1341
0
    ptr++;
1342
0
    }
1343
0
  else if (*ptr == CHAR_MINUS)
1344
0
    {
1345
0
    sign = -1;
1346
0
    ptr++;
1347
0
    }
1348
0
  }
1349
1350
957
if (ptr >= ptrend || !IS_DIGIT(*ptr)) return FALSE;
1351
2.24k
while (ptr < ptrend && IS_DIGIT(*ptr))
1352
1.28k
  {
1353
1.28k
  n = n * 10 + *ptr++ - CHAR_0;
1354
1.28k
  if (n > max_value)
1355
1
    {
1356
1
    *errorcodeptr = max_error;
1357
1
    goto EXIT;
1358
1
    }
1359
1.28k
  }
1360
1361
956
if (allow_sign >= 0 && sign != 0)
1362
0
  {
1363
0
  if (n == 0)
1364
0
    {
1365
0
    *errorcodeptr = ERR26;  /* +0 and -0 are not allowed */
1366
0
    goto EXIT;
1367
0
    }
1368
1369
0
  if (sign > 0) n += allow_sign;
1370
0
  else if ((int)n > allow_sign)
1371
0
    {
1372
0
    *errorcodeptr = ERR15;  /* Non-existent subpattern */
1373
0
    goto EXIT;
1374
0
    }
1375
0
  else n = allow_sign + 1 - n;
1376
0
  }
1377
1378
956
yield = TRUE;
1379
1380
957
EXIT:
1381
957
*intptr = n;
1382
957
*ptrptr = ptr;
1383
957
return yield;
1384
956
}
1385
1386
1387
1388
/*************************************************
1389
*         Read repeat counts                     *
1390
*************************************************/
1391
1392
/* Read an item of the form {n,m} and return the values when non-NULL pointers
1393
are supplied. Repeat counts must be less than 65536 (MAX_REPEAT_COUNT); a
1394
larger value is used for "unlimited". We have to use signed arguments for
1395
read_number() because it is capable of returning a signed value. As of Perl
1396
5.34.0 either n or m may be absent, but not both. Perl also allows spaces and
1397
tabs after { and before } and between the numbers and the comma, so we do too.
1398
1399
Arguments:
1400
  ptrptr         points to pointer to character after '{'
1401
  ptrend         pointer to end of input
1402
  minp           if not NULL, pointer to uint32_t for min
1403
  maxp           if not NULL, pointer to uint32_t for max
1404
  errorcodeptr   points to error code variable
1405
1406
Returns:         FALSE if not a repeat quantifier, errorcode set zero
1407
                 FALSE on error, with errorcode set non-zero
1408
                 TRUE on success, with pointer updated to point after '}'
1409
*/
1410
1411
static BOOL
1412
read_repeat_counts(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *minp,
1413
  uint32_t *maxp, int *errorcodeptr)
1414
148
{
1415
148
PCRE2_SPTR p = *ptrptr;
1416
148
PCRE2_SPTR pp;
1417
148
BOOL yield = FALSE;
1418
148
BOOL had_minimum = FALSE;
1419
148
int32_t min = 0;
1420
148
int32_t max = REPEAT_UNLIMITED; /* This value is larger than MAX_REPEAT_COUNT */
1421
1422
148
*errorcodeptr = 0;
1423
166
while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1424
1425
/* Check the syntax before interpreting. Otherwise, a non-quantifier sequence
1426
such as "X{123456ABC" would incorrectly give a "number too big in quantifier"
1427
error. */
1428
1429
148
pp = p;
1430
148
if (pp < ptrend && IS_DIGIT(*pp))
1431
32
  {
1432
32
  had_minimum = TRUE;
1433
56
  while (++pp < ptrend && IS_DIGIT(*pp)) {}
1434
32
  }
1435
1436
156
while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;
1437
148
if (pp >= ptrend) return FALSE;
1438
1439
138
if (*pp == CHAR_RIGHT_CURLY_BRACKET)
1440
20
  {
1441
20
  if (!had_minimum) return FALSE;
1442
20
  }
1443
118
else
1444
118
  {
1445
118
  if (*pp++ != CHAR_COMMA) return FALSE;
1446
40
  while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;
1447
2
  if (pp >= ptrend) return FALSE;
1448
2
  if (IS_DIGIT(*pp))
1449
0
    {
1450
0
    while (++pp < ptrend && IS_DIGIT(*pp)) {}
1451
0
    }
1452
2
  else if (!had_minimum) return FALSE;
1453
2
  while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;
1454
2
  if (pp >= ptrend || *pp != CHAR_RIGHT_CURLY_BRACKET) return FALSE;
1455
2
  }
1456
1457
/* Now process the quantifier for real. We know it must be {n} or (n,} or {,m}
1458
or {n,m}. The only error that read_number() can return is for a number that is
1459
too big. If *errorcodeptr is returned as zero it means no number was found. */
1460
1461
/* Deal with {,m} or n too big. If we successfully read m there is no need to
1462
check m >= n because n defaults to zero. */
1463
1464
0
if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &min, errorcodeptr))
1465
0
  {
1466
0
  if (*errorcodeptr != 0) goto EXIT;    /* n too big */
1467
0
  p++;  /* Skip comma and subsequent spaces */
1468
0
  while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1469
0
  if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max, errorcodeptr))
1470
0
    {
1471
0
    if (*errorcodeptr != 0) goto EXIT;  /* m too big */
1472
0
    }
1473
0
  }
1474
1475
/* Have read one number. Deal with {n} or {n,} or {n,m} */
1476
1477
0
else
1478
0
  {
1479
0
  while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1480
0
  if (*p == CHAR_RIGHT_CURLY_BRACKET)
1481
0
    {
1482
0
    max = min;
1483
0
    }
1484
0
  else   /* Handle {n,} or {n,m} */
1485
0
    {
1486
0
    p++;    /* Skip comma and subsequent spaces */
1487
0
    while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1488
0
    if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max, errorcodeptr))
1489
0
      {
1490
0
      if (*errorcodeptr != 0) goto EXIT;   /* m too big */
1491
0
      }
1492
1493
0
    if (max < min)
1494
0
      {
1495
0
      *errorcodeptr = ERR4;
1496
0
      goto EXIT;
1497
0
      }
1498
0
    }
1499
0
  }
1500
1501
/* Valid quantifier exists */
1502
1503
0
while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1504
0
p++;
1505
0
yield = TRUE;
1506
0
if (minp != NULL) *minp = (uint32_t)min;
1507
0
if (maxp != NULL) *maxp = (uint32_t)max;
1508
1509
/* Update the pattern pointer */
1510
1511
0
EXIT:
1512
0
*ptrptr = p;
1513
0
return yield;
1514
0
}
1515
1516
1517
1518
/*************************************************
1519
*            Handle escapes                      *
1520
*************************************************/
1521
1522
/* This function is called when a \ has been encountered. It either returns a
1523
positive value for a simple escape such as \d, or 0 for a data character, which
1524
is placed in chptr. A backreference to group n is returned as negative n. On
1525
entry, ptr is pointing at the character after \. On exit, it points after the
1526
final code unit of the escape sequence.
1527
1528
This function is also called from pcre2_substitute() to handle escape sequences
1529
in replacement strings. In this case, the cb argument is NULL, and in the case
1530
of escapes that have further processing, only sequences that define a data
1531
character are recognised. The isclass argument is not relevant; the options
1532
argument is the final value of the compiled pattern's options.
1533
1534
Arguments:
1535
  ptrptr         points to the input position pointer
1536
  ptrend         points to the end of the input
1537
  chptr          points to a returned data character
1538
  errorcodeptr   points to the errorcode variable (containing zero)
1539
  options        the current options bits
1540
  xoptions       the current extra options bits
1541
  isclass        TRUE if inside a character class
1542
  cb             compile data block or NULL when called from pcre2_substitute()
1543
1544
Returns:         zero => a data character
1545
                 positive => a special escape sequence
1546
                 negative => a numerical back reference
1547
                 on error, errorcodeptr is set non-zero
1548
*/
1549
1550
int
1551
PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr,
1552
  int *errorcodeptr, uint32_t options, uint32_t xoptions, BOOL isclass,
1553
  compile_block *cb)
1554
8.88k
{
/* Summary: decode the escape that follows a backslash. On the common exit
path the resulting character value is stored in *chptr, the updated position
is stored in *ptrptr, and the return value is 0 for a data character, a
positive ESC_ code for a special escape, or the negated group number for a
numerical backreference. Errors are reported through *errorcodeptr. */
1555
8.88k
BOOL utf = (options & PCRE2_UTF) != 0;
1556
8.88k
BOOL alt_bsux =
1557
8.88k
  ((options & PCRE2_ALT_BSUX) | (xoptions & PCRE2_EXTRA_ALT_BSUX)) != 0;
1558
8.88k
PCRE2_SPTR ptr = *ptrptr;
1559
8.88k
uint32_t c, cc;
1560
8.88k
int escape = 0;
1561
8.88k
int i;
1562
1563
/* If backslash is at the end of the string, it's an error. */
1564
1565
8.88k
if (ptr >= ptrend)
1566
0
  {
1567
0
  *errorcodeptr = ERR1;
1568
0
  return 0;
1569
0
  }
1570
1571
8.88k
GETCHARINCTEST(c, ptr);         /* Get character value, increment pointer */
1572
8.88k
*errorcodeptr = 0;              /* Be optimistic */
1573
1574
/* Non-alphanumerics are literals, so we just leave the value in c. An initial
1575
value test saves a memory lookup for code points outside the alphanumeric
1576
range. */
1577
1578
8.88k
if (c < ESCAPES_FIRST || c > ESCAPES_LAST) {}  /* Definitely literal */
1579
1580
/* Otherwise, do a table lookup. Non-zero values need little processing here. A
1581
positive value is a literal value for something like \n. A negative value is
1582
the negation of one of the ESC_ macros that is passed back for handling by the
1583
calling function. Some extra checking is needed for \N because only \N{U+dddd}
1584
is supported. If the value is zero, further processing is handled below. */
1585
1586
7.00k
else if ((i = escapes[c - ESCAPES_FIRST]) != 0)
1587
5.69k
  {
1588
5.69k
  if (i > 0)
1589
2.05k
    {
1590
2.05k
    c = (uint32_t)i;
1591
2.05k
    if (c == CHAR_CR && (xoptions & PCRE2_EXTRA_ESCAPED_CR_IS_LF) != 0)
1592
0
      c = CHAR_LF;
1593
2.05k
    }
1594
3.63k
  else  /* Negative table entry */
1595
3.63k
    {
1596
3.63k
    escape = -i;                    /* Else return a special escape */
1597
3.63k
    if (cb != NULL && (escape == ESC_P || escape == ESC_p || escape == ESC_X))
1598
353
      cb->external_flags |= PCRE2_HASBKPORX;   /* Note \P, \p, or \X */
1599
1600
    /* Perl supports \N{name} for character names and \N{U+dddd} for numerical
1601
    Unicode code points, as well as plain \N for "not newline". PCRE does not
1602
    support \N{name}. However, it does support quantification such as \N{2,3},
1603
    so if \N{ is not followed by U+dddd we check for a quantifier. */
1604
1605
3.63k
    if (escape == ESC_N && ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
1606
0
      {
1607
0
      PCRE2_SPTR p = ptr + 1;
1608
1609
      /* Perl ignores spaces and tabs after { */
1610
1611
0
      while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1612
1613
      /* \N{U+ can be handled by the \x{ code. However, this construction is
1614
      not valid in EBCDIC environments because it specifies a Unicode
1615
      character, not a codepoint in the local code. For example \N{U+0041}
1616
      must be "A" in all environments. Also, in Perl, \N{U+ forces Unicode
1617
      casing semantics for the entire pattern, so allow it only in UTF (i.e.
1618
      Unicode) mode. */
1619
1620
0
      if (ptrend - p > 1 && *p == CHAR_U && p[1] == CHAR_PLUS)
1621
0
        {
1622
#ifdef EBCDIC
1623
        *errorcodeptr = ERR93;
1624
#else
1625
0
        if (utf)
1626
0
          {
1627
0
          ptr = p + 2;
1628
0
          escape = 0;   /* Not a fancy escape after all */
1629
0
          goto COME_FROM_NU;
1630
0
          }
1631
0
        else *errorcodeptr = ERR93;
1632
0
#endif
1633
0
        }
1634
1635
      /* Give an error if what follows is not a quantifier, but don't override
1636
      an error set by the quantifier reader (e.g. number overflow). */
1637
1638
0
      else
1639
0
        {
1640
0
        if (!read_repeat_counts(&p, ptrend, NULL, NULL, errorcodeptr) &&
1641
0
             *errorcodeptr == 0)
1642
0
          *errorcodeptr = ERR37;
1643
0
        }
1644
0
      }
1645
3.63k
    }
1646
5.69k
  }
1647
1648
/* Escapes that need further processing, including those that are unknown, have
1649
a zero entry in the lookup table. When called from pcre2_substitute(), only \c,
1650
\o, and \x are recognized (\u and \U can never appear as they are used for case
1651
forcing). */
1652
1653
1.31k
else
1654
1.31k
  {
1655
1.31k
  int s;
1656
1.31k
  PCRE2_SPTR oldptr;
1657
1.31k
  BOOL overflow;
1658
1659
  /* Filter calls from pcre2_substitute(). */
1660
1661
1.31k
  if (cb == NULL)
1662
0
    {
1663
0
    if (c != CHAR_c && c != CHAR_o && c != CHAR_x)
1664
0
      {
1665
0
      *errorcodeptr = ERR3;
1666
0
      return 0;
1667
0
      }
1668
0
    alt_bsux = FALSE;   /* Do not modify \x handling */
1669
0
    }
1670
1671
1.31k
  switch (c)
1672
1.31k
    {
1673
    /* A number of Perl escapes are not handled by PCRE. We give an explicit
1674
    error. */
1675
1676
1
    case CHAR_F:
1677
1
    case CHAR_l:
1678
1
    case CHAR_L:
1679
1
    *errorcodeptr = ERR37;
1680
1
    break;
1681
1682
    /* \u is unrecognized when neither PCRE2_ALT_BSUX nor PCRE2_EXTRA_ALT_BSUX
1683
    is set. Otherwise, \u must be followed by exactly four hex digits or, if
1684
    PCRE2_EXTRA_ALT_BSUX is set, by any number of hex digits in braces.
1685
    Otherwise it is a lowercase u letter. This gives some compatibility with
1686
    ECMAScript (aka JavaScript). Unlike other braced items, white space is NOT
1687
    allowed. When \u{ is not followed by hex digits, a special return is given
1688
    because otherwise \u{ 12} (for example) would be treated as u{12}. */
1689
1690
0
    case CHAR_u:
1691
0
    if (!alt_bsux) *errorcodeptr = ERR37; else
1692
0
      {
1693
0
      uint32_t xc;
1694
1695
0
      if (ptr >= ptrend) break;
1696
0
      if (*ptr == CHAR_LEFT_CURLY_BRACKET &&
1697
0
          (xoptions & PCRE2_EXTRA_ALT_BSUX) != 0)
1698
0
        {
1699
0
        PCRE2_SPTR hptr = ptr + 1;
1700
1701
0
        cc = 0;
1702
0
        while (hptr < ptrend && (xc = XDIGIT(*hptr)) != 0xff)
1703
0
          {
1704
0
          if ((cc & 0xf0000000) != 0)  /* Test for 32-bit overflow */
1705
0
            {
1706
0
            *errorcodeptr = ERR77;
1707
0
            ptr = hptr;   /* Show where */
1708
0
            break;        /* *hptr != } will cause another break below */
1709
0
            }
1710
0
          cc = (cc << 4) | xc;
1711
0
          hptr++;
1712
0
          }
1713
1714
0
        if (hptr == ptr + 1 ||   /* No hex digits */
1715
0
            hptr >= ptrend ||    /* Hit end of input */
1716
0
            *hptr != CHAR_RIGHT_CURLY_BRACKET)  /* No } terminator */
1717
0
          {
1718
0
          escape = ESC_ub;    /* Special return */
1719
0
          ptr++;              /* Skip { */
1720
0
          break;              /* Hex escape not recognized */
1721
0
          }
1722
1723
0
        c = cc;          /* Accept the code point */
1724
0
        ptr = hptr + 1;
1725
0
        }
1726
1727
0
      else  /* Must be exactly 4 hex digits */
1728
0
        {
1729
0
        if (ptrend - ptr < 4) break;               /* Less than 4 chars */
1730
0
        if ((cc = XDIGIT(ptr[0])) == 0xff) break;  /* Not a hex digit */
1731
0
        if ((xc = XDIGIT(ptr[1])) == 0xff) break;  /* Not a hex digit */
1732
0
        cc = (cc << 4) | xc;
1733
0
        if ((xc = XDIGIT(ptr[2])) == 0xff) break;  /* Not a hex digit */
1734
0
        cc = (cc << 4) | xc;
1735
0
        if ((xc = XDIGIT(ptr[3])) == 0xff) break;  /* Not a hex digit */
1736
0
        c = (cc << 4) | xc;
1737
0
        ptr += 4;
1738
0
        }
1739
1740
0
      if (utf)
1741
0
        {
1742
0
        if (c > 0x10ffffU) *errorcodeptr = ERR77;
1743
0
        else
1744
0
          if (c >= 0xd800 && c <= 0xdfff &&
1745
0
              (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1746
0
                *errorcodeptr = ERR73;
1747
0
        }
1748
0
      else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77;
1749
0
      }
1750
0
    break;
1751
1752
    /* \U is unrecognized unless PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set,
1753
    in which case it is an upper case letter. */
1754
1755
3
    case CHAR_U:
1756
3
    if (!alt_bsux) *errorcodeptr = ERR37;
1757
3
    break;
1758
1759
    /* In a character class, \g is just a literal "g". Outside a character
1760
    class, \g must be followed by one of a number of specific things:
1761
1762
    (1) A number, either plain or braced. If positive, it is an absolute
1763
    backreference. If negative, it is a relative backreference. This is a Perl
1764
    5.10 feature.
1765
1766
    (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1767
    is part of Perl's movement towards a unified syntax for back references. As
1768
    this is synonymous with \k{name}, we fudge it up by pretending it really
1769
    was \k{name}.
1770
1771
    (3) For Oniguruma compatibility we also support \g followed by a name or a
1772
    number either in angle brackets or in single quotes. However, these are
1773
    (possibly recursive) subroutine calls, _not_ backreferences. We return
1774
    the ESC_g code.
1775
1776
    Summary: Return a negative number for a numerical back reference, ESC_k for
1777
    a named back reference, and ESC_g for a named or numbered subroutine call.
1778
    */
1779
1780
0
    case CHAR_g:
1781
0
    if (isclass) break;
1782
1783
0
    if (ptr >= ptrend)
1784
0
      {
1785
0
      *errorcodeptr = ERR57;
1786
0
      break;
1787
0
      }
1788
1789
0
    if (*ptr == CHAR_LESS_THAN_SIGN || *ptr == CHAR_APOSTROPHE)
1790
0
      {
1791
0
      escape = ESC_g;
1792
0
      break;
1793
0
      }
1794
1795
    /* If there is a brace delimiter, try to read a numerical reference. If
1796
    there isn't one, assume we have a name and treat it as \k. */
1797
1798
0
    if (*ptr == CHAR_LEFT_CURLY_BRACKET)
1799
0
      {
1800
0
      PCRE2_SPTR p = ptr + 1;
1801
1802
0
      while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1803
0
      if (!read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s,
1804
0
          errorcodeptr))
1805
0
        {
1806
0
        if (*errorcodeptr == 0) escape = ESC_k;  /* No number found */
1807
0
        break;
1808
0
        }
1809
0
      while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1810
1811
0
      if (p >= ptrend || *p != CHAR_RIGHT_CURLY_BRACKET)
1812
0
        {
1813
0
        *errorcodeptr = ERR57;
1814
0
        break;
1815
0
        }
1816
0
      ptr = p + 1;
1817
0
      }
1818
1819
    /* Read an undelimited number */
1820
1821
0
    else
1822
0
      {
1823
0
      if (!read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s,
1824
0
          errorcodeptr))
1825
0
        {
1826
0
        if (*errorcodeptr == 0) *errorcodeptr = ERR57;  /* No number found */
1827
0
        break;
1828
0
        }
1829
0
      }
1830
1831
0
    if (s <= 0)
1832
0
      {
1833
0
      *errorcodeptr = ERR15;
1834
0
      break;
1835
0
      }
1836
1837
0
    escape = -s;
1838
0
    break;
1839
1840
    /* The handling of escape sequences consisting of a string of digits
1841
    starting with one that is not zero is not straightforward. Perl has changed
1842
    over the years. Nowadays \g{} for backreferences and \o{} for octal are
1843
    recommended to avoid the ambiguities in the old syntax.
1844
1845
    Outside a character class, the digits are read as a decimal number. If the
1846
    number is less than 10, or if there are that many previous extracting left
1847
    brackets, it is a back reference. Otherwise, up to three octal digits are
1848
    read to form an escaped character code. Thus \123 is likely to be octal 123
1849
    (cf \0123, which is octal 012 followed by the literal 3).
1850
1851
    Inside a character class, \ followed by a digit is always either a literal
1852
    8 or 9 or an octal number. */
1853
1854
677
    case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1855
978
    case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
1856
1857
978
    if (!isclass)
1858
957
      {
1859
957
      oldptr = ptr;
1860
957
      ptr--;   /* Back to the digit */
1861
1862
      /* As we know we are at a digit, the only possible error from
1863
      read_number() is a number that is too large to be a group number. In this
1864
      case we fall through and handle this as not a group reference. If we have
1865
      read a small enough number, check for a back reference.
1866
1867
      \1 to \9 are always back references. \8x and \9x are too; \1x to \7x
1868
      are octal escapes if there are not that many previous captures. */
1869
1870
957
      if (read_number(&ptr, ptrend, -1, INT_MAX/10 - 1, 0, &s, errorcodeptr) &&
1871
956
          (s < 10 || oldptr[-1] >= CHAR_8 || s <= (int)cb->bracount))
1872
806
        {
1873
806
        if (s > (int)MAX_GROUP_NUMBER) *errorcodeptr = ERR61;
1874
806
          else escape = -s;     /* Indicates a back reference */
1875
806
        break;
1876
806
        }
1877
1878
151
      ptr = oldptr;      /* Put the pointer back and fall through */
1879
151
      }
1880
1881
    /* Handle a digit following \ when the number is not a back reference, or
1882
    we are within a character class. If the first digit is 8 or 9, Perl used to
1883
    generate a binary zero and then treat the digit as a following literal. At
1884
    least by Perl 5.18 this changed so as not to insert the binary zero. */
1885
1886
172
    if (c >= CHAR_8) break;
1887
1888
    /* Fall through */
1889
1890
    /* \0 always starts an octal number, but we may drop through to here with a
1891
    larger first octal digit. The original code used just to take the least
1892
    significant 8 bits of octal numbers (I think this is what early Perls used
1893
    to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1894
    but no more than 3 octal digits. */
1895
1896
458
    case CHAR_0:
    /* The variable i is zero here: this part of the function is reached only
    when the escapes[] lookup above assigned zero to i, and nothing on the way
    to this case changes it. The loop below therefore reads at most two more
    octal digits after the first one. */
1897
458
    c -= CHAR_0;
1898
1.14k
    while(i++ < 2 && ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1899
688
        c = c * 8 + *ptr++ - CHAR_0;
1900
458
#if PCRE2_CODE_UNIT_WIDTH == 8
1901
458
    if (!utf && c > 0xff) *errorcodeptr = ERR51;
1902
458
#endif
1903
458
    break;
1904
1905
    /* \o is a relatively new Perl feature, supporting a more general way of
1906
    specifying character codes in octal. The only supported form is \o{ddd},
1907
    with optional spaces or tabs after { and before }. */
1908
1909
0
    case CHAR_o:
1910
0
    if (ptr >= ptrend || *ptr++ != CHAR_LEFT_CURLY_BRACKET)
1911
0
      {
1912
0
      ptr--;
1913
0
      *errorcodeptr = ERR55;
1914
0
      break;
1915
0
      }
1916
1917
0
    while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
1918
0
    if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
1919
0
      {
1920
0
      *errorcodeptr = ERR78;
1921
0
      break;
1922
0
      }
1923
1924
0
    c = 0;
1925
0
    overflow = FALSE;
1926
0
    while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1927
0
      {
1928
0
      cc = *ptr++;
1929
0
      if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1930
#if PCRE2_CODE_UNIT_WIDTH == 32
1931
      if (c >= 0x20000000l) { overflow = TRUE; break; }
1932
#endif
1933
0
      c = (c << 3) + (cc - CHAR_0);
1934
0
#if PCRE2_CODE_UNIT_WIDTH == 8
1935
0
      if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1936
#elif PCRE2_CODE_UNIT_WIDTH == 16
1937
      if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1938
#elif PCRE2_CODE_UNIT_WIDTH == 32
1939
      if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1940
#endif
1941
0
      }
1942
1943
0
    while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
1944
1945
0
    if (overflow)
1946
0
      {
1947
0
      while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1948
0
      *errorcodeptr = ERR34;
1949
0
      }
1950
0
    else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
1951
0
      {
1952
0
      if (utf && c >= 0xd800 && c <= 0xdfff &&
1953
0
          (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1954
0
        {
1955
0
        ptr--;
1956
0
        *errorcodeptr = ERR73;
1957
0
        }
1958
0
      }
1959
0
    else
1960
0
      {
1961
0
      ptr--;
1962
0
      *errorcodeptr = ERR64;
1963
0
      }
1964
0
    break;
1965
1966
    /* When PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set, \x must be followed
1967
    by two hexadecimal digits. Otherwise it is a lowercase x letter. */
1968
1969
27
    case CHAR_x:
1970
27
    if (alt_bsux)
1971
0
      {
1972
0
      uint32_t xc;
1973
0
      if (ptrend - ptr < 2) break;               /* Less than 2 characters */
1974
0
      if ((cc = XDIGIT(ptr[0])) == 0xff) break;  /* Not a hex digit */
1975
0
      if ((xc = XDIGIT(ptr[1])) == 0xff) break;  /* Not a hex digit */
1976
0
      c = (cc << 4) | xc;
1977
0
      ptr += 2;
1978
0
      }
1979
1980
    /* Handle \x in Perl's style. \x{ddd} is a character code which can be
1981
    greater than 0xff in UTF-8 or non-8bit mode, but only if the ddd are hex
1982
    digits. If not, { used to be treated as a data character. However, Perl
1983
    seems to read hex digits up to the first non-such, and ignore the rest, so
1984
    that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
1985
    now gives an error. */
1986
1987
27
    else
1988
27
      {
1989
27
      if (ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
1990
0
        {
1991
0
        ptr++;
1992
0
        while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
1993
        /* The label below is the target of the goto in the \N{U+ handling
        earlier in this function, which arrives with ptr just past "U+" and
        escape reset to zero. It exists only in non-EBCDIC builds because the
        goto itself is compiled only there. */
1994
0
#ifndef EBCDIC
1995
0
        COME_FROM_NU:
1996
0
#endif
1997
0
        if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
1998
0
          {
1999
0
          *errorcodeptr = ERR78;
2000
0
          break;
2001
0
          }
2002
0
        c = 0;
2003
0
        overflow = FALSE;
2004
2005
0
        while (ptr < ptrend && (cc = XDIGIT(*ptr)) != 0xff)
2006
0
          {
2007
0
          ptr++;
2008
0
          if (c == 0 && cc == 0) continue;   /* Leading zeroes */
2009
#if PCRE2_CODE_UNIT_WIDTH == 32
2010
          if (c >= 0x10000000l) { overflow = TRUE; break; }
2011
#endif
2012
0
          c = (c << 4) | cc;
2013
0
          if ((utf && c > 0x10ffffU) || (!utf && c > MAX_NON_UTF_CHAR))
2014
0
            {
2015
0
            overflow = TRUE;
2016
0
            break;
2017
0
            }
2018
0
          }
2019
2020
        /* Perl ignores spaces and tabs before } */
2021
2022
0
        while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2023
2024
        /* On overflow, skip remaining hex digits */
2025
2026
0
        if (overflow)
2027
0
          {
2028
0
          while (ptr < ptrend && XDIGIT(*ptr) != 0xff) ptr++;
2029
0
          *errorcodeptr = ERR34;
2030
0
          }
2031
0
        else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
2032
0
          {
2033
0
          if (utf && c >= 0xd800 && c <= 0xdfff &&
2034
0
              (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
2035
0
            {
2036
0
            ptr--;
2037
0
            *errorcodeptr = ERR73;
2038
0
            }
2039
0
          }
2040
2041
        /* If the sequence of hex digits (followed by optional space) does not
2042
        end with '}', give an error. We used just to recognize this construct
2043
        and fall through to the normal \x handling, but nowadays Perl gives an
2044
        error, which seems much more sensible, so we do too. */
2045
2046
0
        else
2047
0
          {
2048
0
          ptr--;
2049
0
          *errorcodeptr = ERR67;
2050
0
          }
2051
0
        }   /* End of \x{} processing */
2052
2053
      /* Read up to two hex digits after \x */
2054
2055
27
      else
2056
27
        {
2057
27
        c = 0;
2058
27
        if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break;  /* Not a hex digit */
2059
23
        ptr++;
2060
23
        c = cc;
2061
23
        if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break;  /* Not a hex digit */
2062
9
        ptr++;
2063
9
        c = (c << 4) | cc;
2064
9
        }     /* End of \xdd handling */
2065
27
      }       /* End of Perl-style \x handling */
2066
9
    break;
2067
2068
    /* The handling of \c is different in ASCII and EBCDIC environments. In an
2069
    ASCII (or Unicode) environment, an error is given if the character
2070
    following \c is not a printable ASCII character. Otherwise, the following
2071
    character is upper-cased if it is a letter, and after that the 0x40 bit is
2072
    flipped. The result is the value of the escape.
2073
2074
    In an EBCDIC environment the handling of \c is compatible with the
2075
    specification in the perlebcdic document. The following character must be
2076
    a letter or one of small number of special characters. These provide a
2077
    means of defining the character values 0-31.
2078
2079
    For testing the EBCDIC handling of \c in an ASCII environment, recognize
2080
    the EBCDIC value of 'c' explicitly. */
2081
2082
#if defined EBCDIC && 'a' != 0x81
2083
    case 0x83:
2084
#else
2085
13
    case CHAR_c:
2086
13
#endif
2087
13
    if (ptr >= ptrend)
2088
0
      {
2089
0
      *errorcodeptr = ERR2;
2090
0
      break;
2091
0
      }
2092
13
    c = *ptr;
2093
13
    if (c >= CHAR_a && c <= CHAR_z) c = UPPER_CASE(c);
2094
2095
    /* Handle \c in an ASCII/Unicode environment. */
2096
2097
13
#ifndef EBCDIC    /* ASCII/UTF-8 coding */
2098
13
    if (c < 32 || c > 126)  /* Excludes all non-printable ASCII */
2099
0
      {
2100
0
      *errorcodeptr = ERR68;
2101
0
      break;
2102
0
      }
2103
13
    c ^= 0x40;
2104
2105
    /* Handle \c in an EBCDIC environment. The special case \c? is converted to
2106
    255 (0xff) or 95 (0x5f) if other characters suggest we are using the
2107
    POSIX-BC encoding. (This is the way Perl indicates that it handles \c?.)
2108
    The other valid sequences correspond to a list of specific characters. */
2109
2110
#else
2111
    if (c == CHAR_QUESTION_MARK)
2112
      c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;
2113
    else
2114
      {
2115
      for (i = 0; i < 32; i++)
2116
        {
2117
        if (c == ebcdic_escape_c[i]) break;
2118
        }
2119
      if (i < 32) c = i; else *errorcodeptr = ERR68;
2120
      }
2121
#endif  /* EBCDIC */
2122
2123
13
    ptr++;
2124
13
    break;
2125
2126
    /* Any other alphanumeric following \ is an error. Perl gives an error only
2127
    if in warning mode, but PCRE doesn't have a warning mode. */
2128
2129
7
    default:
2130
7
    *errorcodeptr = ERR3;
2131
7
    *ptrptr = ptr - 1;     /* Point to the character at fault */
2132
7
    return 0;
2133
1.31k
    }
2134
1.31k
  }
2135
2136
/* Set the pointer to the next character before returning. */
2137
2138
8.87k
*ptrptr = ptr;
2139
8.87k
*chptr = c;
2140
8.87k
return escape;
2141
8.88k
}
2142
2143
2144
2145
#ifdef SUPPORT_UNICODE
2146
/*************************************************
2147
*               Handle \P and \p                 *
2148
*************************************************/
2149
2150
/* This function is called after \P or \p has been encountered, provided that
2151
PCRE2 is compiled with support for UTF and Unicode properties. On entry, the
2152
contents of ptrptr are pointing after the P or p. On exit, it is left pointing
2153
after the final code unit of the escape sequence.
2154
2155
Arguments:
2156
  ptrptr         the pattern position pointer
2157
  negptr         a boolean that is set TRUE for negation else FALSE
2158
  ptypeptr       an unsigned int that is set to the type value
2159
  pdataptr       an unsigned int that is set to the detailed property value
2160
  errorcodeptr   the error code variable
2161
  cb             the compile data
2162
2163
Returns:         TRUE if the type value was found, or FALSE for an invalid type
2164
*/
2165
2166
static BOOL
2167
get_ucp(PCRE2_SPTR *ptrptr, BOOL *negptr, uint16_t *ptypeptr,
2168
  uint16_t *pdataptr, int *errorcodeptr, compile_block *cb)
2169
304
{
/* Summary: collect the property name that follows \p or \P into name[], in
lower case and with '_', '-' and white space removed, then look it up in the
PRIV(utt) table. On success *ptypeptr and *pdataptr are set and TRUE is
returned; otherwise *errorcodeptr is set to ERR46 (malformed) or ERR47
(unrecognized) and FALSE is returned. */
2170
304
PCRE2_UCHAR c;
2171
304
PCRE2_SIZE i, bot, top;
2172
304
PCRE2_SPTR ptr = *ptrptr;
2173
304
PCRE2_UCHAR name[50];
2174
304
PCRE2_UCHAR *vptr = NULL;
2175
304
uint16_t ptscript = PT_NOTSCRIPT;
2176
2177
304
if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2178
304
c = *ptr++;
2179
304
*negptr = FALSE;
2180
2181
/* \P or \p can be followed by a name in {}, optionally preceded by ^ for
2182
negation. */
2183
2184
304
if (c == CHAR_LEFT_CURLY_BRACKET)
2185
3
  {
2186
3
  if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2187
2188
3
  if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
2189
0
    {
2190
0
    *negptr = TRUE;
2191
0
    ptr++;
2192
0
    }
2193
  /* The loop bound is one less than the number of elements in name[], which
  leaves room for the terminating zero stored after the loop. If the bound is
  reached without seeing '}', the test after the loop gives an error. */
2194
43
  for (i = 0; i < (int)(sizeof(name) / sizeof(PCRE2_UCHAR)) - 1; i++)
2195
43
    {
2196
43
    if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2197
41
    c = *ptr++;
2198
#if PCRE2_CODE_UNIT_WIDTH != 8
2199
    while (c == '_' || c == '-' || (c <= 0xff && isspace(c)))
2200
#else
2201
54
    while (c == '_' || c == '-' || isspace(c))
2202
13
#endif
2203
13
      {
2204
13
      if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2205
13
      c = *ptr++;
2206
13
      }
2207
41
    if (c == CHAR_NUL) goto ERROR_RETURN;
2208
41
    if (c == CHAR_RIGHT_CURLY_BRACKET) break;
2209
40
    name[i] = tolower(c);
2210
40
    if ((c == ':' || c == '=') && vptr == NULL) vptr = name + i;
2211
40
    }
2212
2213
1
  if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
2214
1
  name[i] = 0;
2215
1
  }
2216
2217
/* If { doesn't follow \p or \P there is just one following character, which
2218
must be an ASCII letter. */
2219
2220
301
else if (MAX_255(c) && (cb->ctypes[c] & ctype_letter) != 0)
2221
299
  {
2222
299
  name[0] = tolower(c);
2223
299
  name[1] = 0;
2224
299
  }
2225
2
else goto ERROR_RETURN;
2226
2227
300
*ptrptr = ptr;
2228
2229
/* If the property contains ':' or '=' we have class name and value separately
2230
specified. The following are supported:
2231
2232
  . Bidi_Class (synonym bc), for which the property names are "bidi<name>".
2233
  . Script (synonym sc) for which the property name is the script name
2234
  . Script_Extensions (synonym scx), ditto
2235
2236
As this is a small number, we currently just check the names directly. If this
2237
grows, a sorted table and a switch will be neater.
2238
2239
For both the script properties, set a PT_xxx value so that (1) they can be
2240
distinguished and (2) invalid script names that happen to be the name of
2241
another property can be diagnosed. */
2242
2243
300
if (vptr != NULL)
2244
1
  {
2245
1
  int offset = 0;
2246
1
  PCRE2_UCHAR sname[8];
2247
2248
1
  *vptr = 0;   /* Terminate property name */
2249
1
  if (PRIV(strcmp_c8)(name, STRING_bidiclass) == 0 ||
2250
1
      PRIV(strcmp_c8)(name, STRING_bc) == 0)
2251
0
    {
2252
0
    offset = 4;
2253
0
    sname[0] = CHAR_b;
2254
0
    sname[1] = CHAR_i;  /* There is no strcpy_c8 function */
2255
0
    sname[2] = CHAR_d;
2256
0
    sname[3] = CHAR_i;
2257
0
    }
2258
2259
1
  else if (PRIV(strcmp_c8)(name, STRING_script) == 0 ||
2260
1
           PRIV(strcmp_c8)(name, STRING_sc) == 0)
2261
0
    ptscript = PT_SC;
2262
2263
1
  else if (PRIV(strcmp_c8)(name, STRING_scriptextensions) == 0 ||
2264
1
           PRIV(strcmp_c8)(name, STRING_scx) == 0)
2265
0
    ptscript = PT_SCX;
2266
2267
1
  else
2268
1
    {
2269
1
    *errorcodeptr = ERR47;
2270
1
    return FALSE;
2271
1
    }
2272
2273
  /* Adjust the string in name[] as needed */
2274
  /* The first memmove copies the value part, which starts just after the
  separator at vptr and runs up to and including the terminating zero at
  name[i], to name + offset. For the bidi case the four characters saved in
  sname are then placed in front of it. */
2275
0
  memmove(name + offset, vptr + 1, (name + i - vptr)*sizeof(PCRE2_UCHAR));
2276
0
  if (offset != 0) memmove(name, sname, offset*sizeof(PCRE2_UCHAR));
2277
0
  }
2278
2279
/* Search for a recognized property using binary chop. */
2280
2281
299
bot = 0;
2282
299
top = PRIV(utt_size);
2283
2284
2.09k
while (bot < top)
2285
2.09k
  {
2286
2.09k
  int r;
2287
2.09k
  i = (bot + top) >> 1;
2288
2.09k
  r = PRIV(strcmp_c8)(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
2289
2290
  /* When a matching property is found, some extra checking is needed when the
2291
  \p{xx:yy} syntax is used and xx is either sc or scx. */
2292
2293
2.09k
  if (r == 0)
2294
299
    {
2295
299
    *pdataptr = PRIV(utt)[i].value;
2296
299
    if (vptr == NULL || ptscript == PT_NOTSCRIPT)
2297
299
      {
2298
299
      *ptypeptr = PRIV(utt)[i].type;
2299
299
      return TRUE;
2300
299
      }
2301
2302
0
    switch (PRIV(utt)[i].type)
2303
0
      {
2304
0
      case PT_SC:
2305
0
      *ptypeptr = PT_SC;
2306
0
      return TRUE;
2307
2308
0
      case PT_SCX:
2309
0
      *ptypeptr = ptscript;
2310
0
      return TRUE;
2311
0
      }
2312
    /* A name given with sc: or scx: that matches a table entry of any other
    type leaves the search loop here and is reported as ERR47 below. */
2313
0
    break;  /* Non-script found */
2314
0
    }
2315
2316
1.79k
  if (r > 0) bot = i + 1; else top = i;
2317
1.79k
  }
2318
2319
0
*errorcodeptr = ERR47;   /* Unrecognized property */
2320
0
return FALSE;
2321
2322
4
ERROR_RETURN:            /* Malformed \P or \p */
2323
4
*errorcodeptr = ERR46;
2324
4
*ptrptr = ptr;
2325
4
return FALSE;
2326
299
}
2327
#endif
2328
2329
2330
2331
/*************************************************
2332
*           Check for POSIX class syntax         *
2333
*************************************************/
2334
2335
/* This function is called when the sequence "[:" or "[." or "[=" is
2336
encountered in a character class. It checks whether this is followed by a
2337
sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2338
reach an unescaped ']' without the special preceding character, return FALSE.
2339
2340
Originally, this function only recognized a sequence of letters between the
2341
terminators, but it seems that Perl recognizes any sequence of characters,
2342
though of course unknown POSIX names are subsequently rejected. Perl gives an
2343
"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2344
didn't consider this to be a POSIX class. Likewise for [:1234:].
2345
2346
The problem in trying to be exactly like Perl is in the handling of escapes. We
2347
have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2348
class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2349
below handles the special cases \\ and \], but does not try to do any other
2350
escape processing. This makes it different from Perl for cases such as
2351
[:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
2352
not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
2353
when Perl does, I think.
2354
2355
A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2356
It seems that the appearance of a nested POSIX class supersedes an apparent
2357
external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2358
a digit. This is handled by returning FALSE if the start of a new group with
2359
the same terminator is encountered, since the next closing sequence must close
2360
the nested group, not the outer one.
2361
2362
In Perl, unescaped square brackets may also appear as part of class names. For
2363
example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2364
[:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2365
seem right at all. PCRE does not allow closing square brackets in POSIX class
2366
names.
2367
2368
Arguments:
2369
  ptr      pointer to the character after the initial [ (colon, dot, equals)
2370
  ptrend   pointer to the end of the pattern
2371
  endptr   where to return a pointer to the terminating ':', '.', or '='
2372
2373
Returns:   TRUE or FALSE
2374
*/
2375
2376
static BOOL
2377
check_posix_syntax(PCRE2_SPTR ptr, PCRE2_SPTR ptrend, PCRE2_SPTR *endptr)
2378
249
{
/* Summary: scan forward from the character after '[' looking for the matching
terminator followed by ']'. On success *endptr is set to point at the
terminator and TRUE is returned; *endptr is not written when FALSE is
returned. */
2379
249
PCRE2_UCHAR terminator;  /* Don't combine these lines; the Solaris cc */
2380
249
terminator = *ptr++;     /* compiler warns about "non-constant" initializer. */
2381
/* Every test in the loop may look at ptr[1], so the scan continues only while
at least two code units remain before ptrend. */
2382
6.65k
for (; ptrend - ptr >= 2; ptr++)
2383
6.64k
  {
  /* For \] and \\ step over the backslash here; the loop increment then steps
  over the escaped character, so neither is examined by the tests below. */
2384
6.64k
  if (*ptr == CHAR_BACKSLASH &&
2385
364
      (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET || ptr[1] == CHAR_BACKSLASH))
2386
6
    ptr++;
2387
  /* The start of a nested item with the same terminator, or an unescaped ']',
  means this is not a POSIX class. */
2388
6.64k
  else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
2389
6.62k
            *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2390
2391
6.39k
  else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2392
1
    {
2393
1
    *endptr = ptr;
2394
1
    return TRUE;
2395
1
    }
2396
6.64k
  }
2397
2398
2
return FALSE;
2399
249
}
2400
2401
2402
2403
/*************************************************
2404
*          Check POSIX class name                *
2405
*************************************************/
2406
2407
/* This function is called to check the name given in a POSIX-style class entry
2408
such as [:alnum:].
2409
2410
Arguments:
2411
  ptr        points to the first letter
2412
  len        the length of the name
2413
2414
Returns:     a value representing the name, or -1 if unknown
2415
*/
2416
2417
static int
2418
check_posix_name(PCRE2_SPTR ptr, int len)
2419
0
{
2420
0
const char *pn = posix_names;
2421
0
int yield = 0;
2422
0
while (posix_name_lengths[yield] != 0)
2423
0
  {
2424
0
  if (len == posix_name_lengths[yield] &&
2425
0
    PRIV(strncmp_c8)(ptr, pn, (unsigned int)len) == 0) return yield;
2426
0
  pn += posix_name_lengths[yield] + 1;
2427
0
  yield++;
2428
0
  }
2429
0
return -1;
2430
0
}
2431
2432
2433
2434
/*************************************************
2435
*       Read a subpattern or VERB name           *
2436
*************************************************/
2437
2438
/* This function is called from parse_regex() below whenever it needs to read
2439
the name of a subpattern or a (*VERB) or an (*alpha_assertion). The initial
2440
pointer must be to the preceding character. If that character is '*' we are
2441
reading a verb or alpha assertion name. The pointer is updated to point after
2442
the name, for a VERB or alpha assertion name, or after the name's terminator
2443
for a subpattern name. Returning both the offset and the name pointer is
2444
redundant information, but some callers use one and some the other, so it is
2445
simplest just to return both. When the name is in braces, spaces and tabs are
2446
allowed (and ignored) at either end.
2447
2448
Arguments:
2449
  ptrptr      points to the character pointer variable
2450
  ptrend      points to the end of the input string
2451
  utf         true if the input is UTF-encoded
2452
  terminator  the terminator of a subpattern name must be this
2453
  offsetptr   where to put the offset from the start of the pattern
2454
  nameptr     where to put a pointer to the name in the input
2455
  namelenptr  where to put the length of the name
2456
  errorcodeptr where to put an error code
2457
  cb          pointer to the compile data block
2458
2459
Returns:    TRUE if a name was read
2460
            FALSE otherwise, with error code set
2461
*/
2462
2463
static BOOL
2464
read_name(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, BOOL utf, uint32_t terminator,
2465
  PCRE2_SIZE *offsetptr, PCRE2_SPTR *nameptr, uint32_t *namelenptr,
2466
  int *errorcodeptr, compile_block *cb)
2467
2
{
2468
2
PCRE2_SPTR ptr = *ptrptr;
2469
2
BOOL is_group = (*ptr++ != CHAR_ASTERISK);
2470
2
BOOL is_braced = terminator == CHAR_RIGHT_CURLY_BRACKET;
2471
2472
2
if (is_braced)
2473
0
  while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2474
2475
2
if (ptr >= ptrend)                 /* No characters in name */
2476
0
  {
2477
0
  *errorcodeptr = is_group? ERR62: /* Subpattern name expected */
2478
0
                            ERR60; /* Verb not recognized or malformed */
2479
0
  goto FAILED;
2480
0
  }
2481
2482
2
*nameptr = ptr;
2483
2
*offsetptr = (PCRE2_SIZE)(ptr - cb->start_pattern);
2484
2485
/* In UTF mode, a group name may contain letters and decimal digits as defined
2486
by Unicode properties, and underscores, but must not start with a digit. */
2487
2488
2
#ifdef SUPPORT_UNICODE
2489
2
if (utf && is_group)
2490
0
  {
2491
0
  uint32_t c, type;
2492
2493
0
  GETCHAR(c, ptr);
2494
0
  type = UCD_CHARTYPE(c);
2495
2496
0
  if (type == ucp_Nd)
2497
0
    {
2498
0
    *errorcodeptr = ERR44;
2499
0
    goto FAILED;
2500
0
    }
2501
2502
0
  for(;;)
2503
0
    {
2504
0
    if (type != ucp_Nd && PRIV(ucp_gentype)[type] != ucp_L &&
2505
0
        c != CHAR_UNDERSCORE) break;
2506
0
    ptr++;
2507
0
    FORWARDCHARTEST(ptr, ptrend);
2508
0
    if (ptr >= ptrend) break;
2509
0
    GETCHAR(c, ptr);
2510
0
    type = UCD_CHARTYPE(c);
2511
0
    }
2512
0
  }
2513
2
else
2514
#else
2515
(void)utf;  /* Avoid compiler warning */
2516
#endif      /* SUPPORT_UNICODE */
2517
2518
/* Handle non-group names and group names in non-UTF modes. A group name must
2519
not start with a digit. If either of the others start with a digit it just
2520
won't be recognized. */
2521
2522
2
  {
2523
2
  if (is_group && IS_DIGIT(*ptr))
2524
0
    {
2525
0
    *errorcodeptr = ERR44;
2526
0
    goto FAILED;
2527
0
    }
2528
2529
19
  while (ptr < ptrend && MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_word) != 0)
2530
17
    {
2531
17
    ptr++;
2532
17
    }
2533
2
  }
2534
2535
/* Check name length */
2536
2537
2
if (ptr > *nameptr + MAX_NAME_SIZE)
2538
0
  {
2539
0
  *errorcodeptr = ERR48;
2540
0
  goto FAILED;
2541
0
  }
2542
2
*namelenptr = (uint32_t)(ptr - *nameptr);
2543
2544
/* Subpattern names must not be empty, and their terminator is checked here.
2545
(What follows a verb or alpha assertion name is checked separately.) */
2546
2547
2
if (is_group)
2548
0
  {
2549
0
  if (ptr == *nameptr)
2550
0
    {
2551
0
    *errorcodeptr = ERR62;   /* Subpattern name expected */
2552
0
    goto FAILED;
2553
0
    }
2554
0
  if (is_braced)
2555
0
    while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2556
0
  if (ptr >= ptrend || *ptr != (PCRE2_UCHAR)terminator)
2557
0
    {
2558
0
    *errorcodeptr = ERR42;
2559
0
    goto FAILED;
2560
0
    }
2561
0
  ptr++;
2562
0
  }
2563
2564
2
*ptrptr = ptr;
2565
2
return TRUE;
2566
2567
0
FAILED:
2568
0
*ptrptr = ptr;
2569
0
return FALSE;
2570
2
}
2571
2572
2573
2574
/*************************************************
2575
*          Manage callouts at start of cycle     *
2576
*************************************************/
2577
2578
/* At the start of a new item in parse_regex() we are able to record the
2579
details of the previous item in a prior callout, and also to set up an
2580
automatic callout if enabled. Avoid having two adjacent automatic callouts,
2581
which would otherwise happen for items such as \Q that contribute nothing to
2582
the parsed pattern.
2583
2584
Arguments:
2585
  ptr              current pattern pointer
2586
  pcalloutptr      points to a pointer to previous callout, or NULL
2587
  auto_callout     TRUE if auto_callouts are enabled
2588
  parsed_pattern   the parsed pattern pointer
2589
  cb               compile block
2590
2591
Returns: possibly updated parsed_pattern pointer.
2592
*/
2593
2594
static uint32_t *
2595
manage_callouts(PCRE2_SPTR ptr, uint32_t **pcalloutptr, BOOL auto_callout,
2596
  uint32_t *parsed_pattern, compile_block *cb)
2597
180k
{
2598
180k
uint32_t *previous_callout = *pcalloutptr;
2599
2600
180k
if (previous_callout != NULL) previous_callout[2] = (uint32_t)(ptr -
2601
0
  cb->start_pattern - (PCRE2_SIZE)previous_callout[1]);
2602
2603
180k
if (!auto_callout) previous_callout = NULL; else
2604
0
  {
2605
0
  if (previous_callout == NULL ||
2606
0
      previous_callout != parsed_pattern - 4 ||
2607
0
      previous_callout[3] != 255)
2608
0
    {
2609
0
    previous_callout = parsed_pattern;  /* Set up new automatic callout */
2610
0
    parsed_pattern += 4;
2611
0
    previous_callout[0] = META_CALLOUT_NUMBER;
2612
0
    previous_callout[2] = 0;
2613
0
    previous_callout[3] = 255;
2614
0
    }
2615
0
  previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
2616
0
  }
2617
2618
180k
*pcalloutptr = previous_callout;
2619
180k
return parsed_pattern;
2620
180k
}
2621
2622
2623
2624
/*************************************************
2625
*          Handle \d, \D, \s, \S, \w, \W         *
2626
*************************************************/
2627
2628
/* This function is called from parse_regex() below, both for freestanding
2629
escapes, and those within classes, to handle those escapes that may change when
2630
Unicode property support is requested. Note that PCRE2_UCP will never be set
2631
without Unicode support because that is checked when pcre2_compile() is called.
2632
2633
Arguments:
2634
  escape          the ESC_... value
2635
  parsed_pattern  where to add the code
2636
  options         options bits
2637
  xoptions        extra options bits
2638
2639
Returns:          updated value of parsed_pattern
2640
*/
2641
static uint32_t *
2642
handle_escdsw(int escape, uint32_t *parsed_pattern, uint32_t options,
2643
  uint32_t xoptions)
2644
2.82k
{
2645
2.82k
uint32_t ascii_option = 0;
2646
2.82k
uint32_t prop = ESC_p;
2647
2648
2.82k
switch(escape)
2649
2.82k
  {
2650
33
  case ESC_D:
2651
33
  prop = ESC_P;
2652
  /* Fall through */
2653
704
  case ESC_d:
2654
704
  ascii_option = PCRE2_EXTRA_ASCII_BSD;
2655
704
  break;
2656
2657
58
  case ESC_S:
2658
58
  prop = ESC_P;
2659
  /* Fall through */
2660
458
  case ESC_s:
2661
458
  ascii_option = PCRE2_EXTRA_ASCII_BSS;
2662
458
  break;
2663
2664
308
  case ESC_W:
2665
308
  prop = ESC_P;
2666
  /* Fall through */
2667
1.66k
  case ESC_w:
2668
1.66k
  ascii_option = PCRE2_EXTRA_ASCII_BSW;
2669
1.66k
  break;
2670
2.82k
  }
2671
2672
2.82k
if ((options & PCRE2_UCP) == 0 || (xoptions & ascii_option) != 0)
2673
2.63k
  {
2674
2.63k
  *parsed_pattern++ = META_ESCAPE + escape;
2675
2.63k
  }
2676
193
else
2677
193
  {
2678
193
  *parsed_pattern++ = META_ESCAPE + prop;
2679
193
  switch(escape)
2680
193
    {
2681
14
    case ESC_d:
2682
17
    case ESC_D:
2683
17
    *parsed_pattern++ = (PT_PC << 16) | ucp_Nd;
2684
17
    break;
2685
2686
5
    case ESC_s:
2687
42
    case ESC_S:
2688
42
    *parsed_pattern++ = PT_SPACE << 16;
2689
42
    break;
2690
2691
122
    case ESC_w:
2692
134
    case ESC_W:
2693
134
    *parsed_pattern++ = PT_WORD << 16;
2694
134
    break;
2695
193
    }
2696
193
  }
2697
2698
2.82k
return parsed_pattern;
2699
2.82k
}
2700
2701
2702
2703
/*************************************************
2704
*      Parse regex and identify named groups     *
2705
*************************************************/
2706
2707
/* This function is called first of all. It scans the pattern and does two
2708
things: (1) It identifies capturing groups and makes a table of named capturing
2709
groups so that information about them is fully available to both the compiling
2710
scans. (2) It writes a parsed version of the pattern with comments omitted and
2711
escapes processed into the parsed_pattern vector.
2712
2713
Arguments:
2714
  ptr             points to the start of the pattern
2715
  options         compiling dynamic options (may change during the scan)
2716
  has_lookbehind  points to a boolean, set TRUE if a lookbehind is found
2717
  cb              pointer to the compile data block
2718
2719
Returns:   zero on success or a non-zero error code, with the
2720
             error offset placed in the cb field
2721
*/
2722
2723
/* A structure and some flags for dealing with nested groups. */
2724
2725
typedef struct nest_save {
2726
  uint16_t  nest_depth;   /* Same type and name as parse_regex()'s nesting counter; presumably the depth this entry belongs to - confirm */
2727
  uint16_t  reset_group;  /* NOTE(review): meaning not visible in this part of the file; confirm against parse_regex() */
2728
  uint16_t  max_group;    /* NOTE(review): meaning not visible in this part of the file; confirm against parse_regex() */
2729
  uint16_t  flags;        /* Presumably a combination of the NSF_* bits defined below - confirm */
2730
  uint32_t  options;      /* Same width as parse_regex()'s options word; presumably a saved copy - confirm */
2731
  uint32_t  xoptions;     /* Same width as parse_regex()'s xoptions word; presumably a saved copy - confirm */
2732
} nest_save;
2733
2734
12
#define NSF_RESET          0x0001u  /* Three distinct single-bit values, so they can be */
2735
5
#define NSF_CONDASSERT     0x0002u  /* OR-ed together; presumably kept in the flags field */
2736
5
#define NSF_ATOMICSR       0x0004u  /* of nest_save - confirm against the uses. */
2737
2738
/* Options that are changeable within the pattern must be tracked during
2739
parsing. Some (e.g. PCRE2_EXTENDED) are implemented entirely during parsing,
2740
but all must be tracked so that META_OPTIONS items set the correct values for
2741
the main compiling phase. */
2742
2743
10
/* Mask made up of PCRE2_* bits of the main options word. */
#define PARSE_TRACKED_OPTIONS (PCRE2_CASELESS|PCRE2_DOTALL|PCRE2_DUPNAMES| \
2744
10
  PCRE2_EXTENDED|PCRE2_EXTENDED_MORE|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE| \
2745
10
  PCRE2_UNGREEDY)
2746
2747
10
/* Companion mask made up of PCRE2_EXTRA_* bits; presumably applied to the
extra options word (xoptions in parse_regex()) - confirm against the uses. */
#define PARSE_TRACKED_EXTRA_OPTIONS (PCRE2_EXTRA_CASELESS_RESTRICT| \
2748
10
  PCRE2_EXTRA_ASCII_BSD|PCRE2_EXTRA_ASCII_BSS|PCRE2_EXTRA_ASCII_BSW| \
2749
10
  PCRE2_EXTRA_ASCII_DIGIT|PCRE2_EXTRA_ASCII_POSIX)
2750
2751
/* States used for analyzing ranges in character classes. The two OK values
2752
must be last. */
2753
2754
/* The enumerators take the values 0 to 3 in the order written, so both
RANGE_OK_* values compare greater than RANGE_NO and RANGE_STARTED. Presumably
these are the values held in parse_regex()'s class_range_state - confirm. */
enum { RANGE_NO, RANGE_STARTED, RANGE_OK_ESCAPED, RANGE_OK_LITERAL };
2755
2756
/* Only in 32-bit mode can there be literals > META_END. A macro encapsulates
2757
the storing of literal values in the main parsed pattern, where they can always
2758
be quantified. */
2759
2760
/* PARSED_LITERAL(c, p) stores the literal c at *p, advances p, and sets the
okquantifier variable of the enclosing function to TRUE. When the code unit
width is 32, a value >= META_END is preceded by a META_BIGVALUE item.

Both variants are wrapped in do { ... } while (0) so that an invocation followed
by a semicolon is exactly one statement. Without the wrapper, using the macro as
the body of an unbraced "if" would make only the store conditional (okquantifier
would always be set), and the braced 32-bit variant could not be followed by
"else". The arguments are parenthesized so that expression arguments expand
safely. Note that c is evaluated more than once in the 32-bit variant, so it
must not have side effects. Every invocation must end with a semicolon. */

#if PCRE2_CODE_UNIT_WIDTH == 32
#define PARSED_LITERAL(c, p) \
  do { \
  if ((c) >= META_END) *(p)++ = META_BIGVALUE; \
  *(p)++ = (c); \
  okquantifier = TRUE; \
  } while (0)
#else
#define PARSED_LITERAL(c, p) \
  do { *(p)++ = (c); okquantifier = TRUE; } while (0)
#endif
2770
2771
/* Here's the actual function. */
2772
2773
static int parse_regex(PCRE2_SPTR ptr, uint32_t options, BOOL *has_lookbehind,
2774
  compile_block *cb)
2775
704
{
2776
704
uint32_t c;
2777
704
uint32_t delimiter;
2778
704
uint32_t namelen;
2779
704
uint32_t class_range_state;
2780
704
uint32_t *verblengthptr = NULL;     /* Value avoids compiler warning */
2781
704
uint32_t *verbstartptr = NULL;
2782
704
uint32_t *previous_callout = NULL;
2783
704
uint32_t *parsed_pattern = cb->parsed_pattern;
2784
704
uint32_t *parsed_pattern_end = cb->parsed_pattern_end;
2785
704
uint32_t *this_parsed_item = NULL;
2786
704
uint32_t *prev_parsed_item = NULL;
2787
704
uint32_t meta_quantifier = 0;
2788
704
uint32_t add_after_mark = 0;
2789
704
uint32_t xoptions = cb->cx->extra_options;
2790
704
uint16_t nest_depth = 0;
2791
704
int after_manual_callout = 0;
2792
704
int expect_cond_assert = 0;
2793
704
int errorcode = 0;
2794
704
int escape;
2795
704
int i;
2796
704
BOOL inescq = FALSE;
2797
704
BOOL inverbname = FALSE;
2798
704
BOOL utf = (options & PCRE2_UTF) != 0;
2799
704
BOOL auto_callout = (options & PCRE2_AUTO_CALLOUT) != 0;
2800
704
BOOL isdupname;
2801
704
BOOL negate_class;
2802
704
BOOL okquantifier = FALSE;
2803
704
PCRE2_SPTR thisptr;
2804
704
PCRE2_SPTR name;
2805
704
PCRE2_SPTR ptrend = cb->end_pattern;
2806
704
PCRE2_SPTR verbnamestart = NULL;    /* Value avoids compiler warning */
2807
704
named_group *ng;
2808
704
nest_save *top_nest, *end_nests;
2809
2810
/* Insert leading items for word and line matching (features provided for the
2811
benefit of pcre2grep). */
2812
2813
704
if ((xoptions & PCRE2_EXTRA_MATCH_LINE) != 0)
2814
0
  {
2815
0
  *parsed_pattern++ = META_CIRCUMFLEX;
2816
0
  *parsed_pattern++ = META_NOCAPTURE;
2817
0
  }
2818
704
else if ((xoptions & PCRE2_EXTRA_MATCH_WORD) != 0)
2819
0
  {
2820
0
  *parsed_pattern++ = META_ESCAPE + ESC_b;
2821
0
  *parsed_pattern++ = META_NOCAPTURE;
2822
0
  }
2823
2824
/* If the pattern is actually a literal string, process it separately to avoid
2825
cluttering up the main loop. */
2826
2827
704
if ((options & PCRE2_LITERAL) != 0)
2828
0
  {
2829
0
  while (ptr < ptrend)
2830
0
    {
2831
0
    if (parsed_pattern >= parsed_pattern_end)
2832
0
      {
2833
0
      errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
2834
0
      goto FAILED;
2835
0
      }
2836
0
    thisptr = ptr;
2837
0
    GETCHARINCTEST(c, ptr);
2838
0
    if (auto_callout)
2839
0
      parsed_pattern = manage_callouts(thisptr, &previous_callout,
2840
0
        auto_callout, parsed_pattern, cb);
2841
0
    PARSED_LITERAL(c, parsed_pattern);
2842
0
    }
2843
0
  goto PARSED_END;
2844
0
  }
2845
2846
/* Process a real regex which may contain meta-characters. */
2847
2848
704
top_nest = NULL;
2849
704
end_nests = (nest_save *)(cb->start_workspace + cb->workspace_size);
2850
2851
/* The size of the nest_save structure might not be a factor of the size of the
2852
workspace. Therefore we must round down end_nests so as to correctly avoid
2853
creating a nest_save that spans the end of the workspace. */
2854
2855
704
end_nests = (nest_save *)((char *)end_nests -
2856
704
  ((cb->workspace_size * sizeof(PCRE2_UCHAR)) % sizeof(nest_save)));
2857
2858
/* PCRE2_EXTENDED_MORE implies PCRE2_EXTENDED */
2859
2860
704
if ((options & PCRE2_EXTENDED_MORE) != 0) options |= PCRE2_EXTENDED;
2861
2862
/* Now scan the pattern */
2863
2864
198k
while (ptr < ptrend)
2865
198k
  {
2866
198k
  int prev_expect_cond_assert;
2867
198k
  uint32_t min_repeat = 0, max_repeat = 0;
2868
198k
  uint32_t set, unset, *optset;
2869
198k
  uint32_t xset, xunset, *xoptset;
2870
198k
  uint32_t terminator;
2871
198k
  uint32_t prev_meta_quantifier;
2872
198k
  BOOL prev_okquantifier;
2873
198k
  PCRE2_SPTR tempptr;
2874
198k
  PCRE2_SIZE offset;
2875
2876
198k
  if (parsed_pattern >= parsed_pattern_end)
2877
0
    {
2878
0
    errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
2879
0
    goto FAILED;
2880
0
    }
2881
2882
198k
  if (nest_depth > cb->cx->parens_nest_limit)
2883
0
    {
2884
0
    errorcode = ERR19;
2885
0
    goto FAILED;        /* Parentheses too deeply nested */
2886
0
    }
2887
2888
  /* If the last time round this loop something was added, parsed_pattern will
2889
  no longer be equal to this_parsed_item. Remember where the previous item
2890
  started and reset for the next item. Note that sometimes round the loop,
2891
  nothing gets added (e.g. for ignored white space). */
2892
2893
198k
  if (this_parsed_item != parsed_pattern)
2894
193k
    {
2895
193k
    prev_parsed_item = this_parsed_item;
2896
193k
    this_parsed_item = parsed_pattern;
2897
193k
    }
2898
2899
  /* Get next input character, save its position for callout handling. */
2900
2901
198k
  thisptr = ptr;
2902
198k
  GETCHARINCTEST(c, ptr);
2903
2904
  /* Copy quoted literals until \E, allowing for the possibility of automatic
2905
  callouts, except when processing a (*VERB) "name".  */
2906
2907
198k
  if (inescq)
2908
3.15k
    {
2909
3.15k
    if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
2910
0
      {
2911
0
      inescq = FALSE;
2912
0
      ptr++;   /* Skip E */
2913
0
      }
2914
3.15k
    else
2915
3.15k
      {
2916
3.15k
      if (expect_cond_assert > 0)   /* A literal is not allowed if we are */
2917
0
        {                           /* expecting a conditional assertion, */
2918
0
        ptr--;                      /* but an empty \Q\E sequence is OK.  */
2919
0
        errorcode = ERR28;
2920
0
        goto FAILED;
2921
0
        }
2922
3.15k
      if (inverbname)
2923
0
        {                          /* Don't use PARSED_LITERAL() because it */
2924
#if PCRE2_CODE_UNIT_WIDTH == 32    /* sets okquantifier. */
2925
        if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
2926
#endif
2927
0
        *parsed_pattern++ = c;
2928
0
        }
2929
3.15k
      else
2930
3.15k
        {
2931
3.15k
        if (after_manual_callout-- <= 0)
2932
3.15k
          parsed_pattern = manage_callouts(thisptr, &previous_callout,
2933
3.15k
            auto_callout, parsed_pattern, cb);
2934
3.15k
        PARSED_LITERAL(c, parsed_pattern);
2935
3.15k
        }
2936
3.15k
      meta_quantifier = 0;
2937
3.15k
      }
2938
3.15k
    continue;  /* Next character */
2939
3.15k
    }
2940
2941
  /* If we are processing the "name" part of a (*VERB:NAME) item, all
2942
  characters up to the closing parenthesis are literals except when
2943
  PCRE2_ALT_VERBNAMES is set. That causes backslash interpretation, but only \Q
2944
  and \E and escaped characters are allowed (no character types such as \d). If
2945
  PCRE2_EXTENDED is also set, we must ignore white space and # comments. Do
2946
  this by not entering the special (*VERB:NAME) processing - they are then
2947
  picked up below. Note that c is a character, not a code unit, so we must not
2948
  use MAX_255 to test its size because MAX_255 tests code units and is assumed
2949
  TRUE in 8-bit mode. */
2950
2951
195k
  if (inverbname &&
2952
10
       (
2953
        /* EITHER: not both options set */
2954
10
        ((options & (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) !=
2955
10
                    (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) ||
2956
0
#ifdef SUPPORT_UNICODE
2957
        /* OR: character > 255 AND not Unicode Pattern White Space */
2958
0
        (c > 255 && (c|1) != 0x200f && (c|1) != 0x2029) ||
2959
0
#endif
2960
        /* OR: not a # comment or isspace() white space */
2961
0
        (c < 256 && c != CHAR_NUMBER_SIGN && (cb->ctypes[c] & ctype_space) == 0
2962
0
#ifdef SUPPORT_UNICODE
2963
        /* and not CHAR_NEL when Unicode is supported */
2964
0
          && c != CHAR_NEL
2965
0
#endif
2966
0
       )))
2967
10
    {
2968
10
    PCRE2_SIZE verbnamelength;
2969
2970
10
    switch(c)
2971
10
      {
2972
10
      default:                     /* Don't use PARSED_LITERAL() because it */
2973
#if PCRE2_CODE_UNIT_WIDTH == 32    /* sets okquantifier. */
2974
      if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
2975
#endif
2976
10
      *parsed_pattern++ = c;
2977
10
      break;
2978
2979
0
      case CHAR_RIGHT_PARENTHESIS:
2980
0
      inverbname = FALSE;
2981
      /* This is the length in characters */
2982
0
      verbnamelength = (PCRE2_SIZE)(parsed_pattern - verblengthptr - 1);
2983
      /* But the limit on the length is in code units */
2984
0
      if (ptr - verbnamestart - 1 > (int)MAX_MARK)
2985
0
        {
2986
0
        ptr--;
2987
0
        errorcode = ERR76;
2988
0
        goto FAILED;
2989
0
        }
2990
0
      *verblengthptr = (uint32_t)verbnamelength;
2991
2992
      /* If this name was on a verb such as (*ACCEPT) which does not continue,
2993
      a (*MARK) was generated for the name. We now add the original verb as the
2994
      next item. */
2995
2996
0
      if (add_after_mark != 0)
2997
0
        {
2998
0
        *parsed_pattern++ = add_after_mark;
2999
0
        add_after_mark = 0;
3000
0
        }
3001
0
      break;
3002
3003
0
      case CHAR_BACKSLASH:
3004
0
      if ((options & PCRE2_ALT_VERBNAMES) != 0)
3005
0
        {
3006
0
        escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3007
0
          xoptions, FALSE, cb);
3008
0
        if (errorcode != 0) goto FAILED;
3009
0
        }
3010
0
      else escape = 0;   /* Treat all as literal */
3011
3012
0
      switch(escape)
3013
0
        {
3014
0
        case 0:                    /* Don't use PARSED_LITERAL() because it */
3015
#if PCRE2_CODE_UNIT_WIDTH == 32    /* sets okquantifier. */
3016
        if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
3017
#endif
3018
0
        *parsed_pattern++ = c;
3019
0
        break;
3020
3021
0
        case ESC_ub:
3022
0
        *parsed_pattern++ = CHAR_u;
3023
0
        PARSED_LITERAL(CHAR_LEFT_CURLY_BRACKET, parsed_pattern);
3024
0
        break;
3025
3026
0
        case ESC_Q:
3027
0
        inescq = TRUE;
3028
0
        break;
3029
3030
0
        case ESC_E:           /* Ignore */
3031
0
        break;
3032
3033
0
        default:
3034
0
        errorcode = ERR40;    /* Invalid in verb name */
3035
0
        goto FAILED;
3036
0
        }
3037
10
      }
3038
10
    continue;   /* Next character in pattern */
3039
10
    }
3040
3041
  /* Not a verb name character. At this point we must process everything that
3042
  must not change the quantification state. This is mainly comments, but we
3043
  handle \Q and \E here as well, so that an item such as A\Q\E+ is treated as
3044
  A+, as in Perl. An isolated \E is ignored. */
3045
3046
194k
  if (c == CHAR_BACKSLASH && ptr < ptrend)
3047
7.16k
    {
3048
7.16k
    if (*ptr == CHAR_Q || *ptr == CHAR_E)
3049
3
      {
3050
3
      inescq = *ptr == CHAR_Q;
3051
3
      ptr++;
3052
3
      continue;
3053
3
      }
3054
7.16k
    }
3055
3056
  /* Skip over whitespace and # comments in extended mode. Note that c is a
3057
  character, not a code unit, so we must not use MAX_255 to test its size
3058
  because MAX_255 tests code units and is assumed TRUE in 8-bit mode. The
3059
  whitespace characters are those designated as "Pattern White Space" by
3060
  Unicode, which are the isspace() characters plus CHAR_NEL (newline), which is
3061
  U+0085 in Unicode, plus U+200E, U+200F, U+2028, and U+2029. These are a
3062
  subset of space characters that match \h and \v. */
3063
3064
194k
  if ((options & PCRE2_EXTENDED) != 0)
3065
29
    {
3066
29
    if (c < 256 && (cb->ctypes[c] & ctype_space) != 0) continue;
3067
28
#ifdef SUPPORT_UNICODE
3068
28
    if (c == CHAR_NEL || (c|1) == 0x200f || (c|1) == 0x2029) continue;
3069
28
#endif
3070
28
    if (c == CHAR_NUMBER_SIGN)
3071
0
      {
3072
0
      while (ptr < ptrend)
3073
0
        {
3074
0
        if (IS_NEWLINE(ptr))      /* For non-fixed-length newline cases, */
3075
0
          {                       /* IS_NEWLINE sets cb->nllen. */
3076
0
          ptr += cb->nllen;
3077
0
          break;
3078
0
          }
3079
0
        ptr++;
3080
0
#ifdef SUPPORT_UNICODE
3081
0
        if (utf) FORWARDCHARTEST(ptr, ptrend);
3082
0
#endif
3083
0
        }
3084
0
      continue;  /* Next character in pattern */
3085
0
      }
3086
28
    }
3087
3088
  /* Skip over bracketed comments */
3089
3090
194k
  if (c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 2 &&
3091
1.43k
      ptr[0] == CHAR_QUESTION_MARK && ptr[1] == CHAR_NUMBER_SIGN)
3092
0
    {
3093
0
    while (++ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS);
3094
0
    if (ptr >= ptrend)
3095
0
      {
3096
0
      errorcode = ERR18;  /* A special error for missing ) in a comment */
3097
0
      goto FAILED;        /* to make it easier to debug. */
3098
0
      }
3099
0
    ptr++;
3100
0
    continue;  /* Next character in pattern */
3101
0
    }
3102
3103
  /* If the next item is not a quantifier, fill in length of any previous
3104
  callout and create an auto callout if required. */
3105
3106
194k
  if (c != CHAR_ASTERISK && c != CHAR_PLUS && c != CHAR_QUESTION_MARK &&
3107
176k
       (c != CHAR_LEFT_CURLY_BRACKET ||
3108
74
         (tempptr = ptr,
3109
74
         !read_repeat_counts(&tempptr, ptrend, NULL, NULL, &errorcode))))
3110
176k
    {
3111
176k
    if (after_manual_callout-- <= 0)
3112
176k
      {
3113
176k
      parsed_pattern = manage_callouts(thisptr, &previous_callout, auto_callout,
3114
176k
        parsed_pattern, cb);
3115
176k
      this_parsed_item = parsed_pattern;  /* New start for current item */
3116
176k
      }
3117
176k
    }
3118
3119
  /* If expect_cond_assert is 2, we have just passed (?( and are expecting an
3120
  assertion, possibly preceded by a callout. If the value is 1, we have just
3121
  had the callout and expect an assertion. There must be at least 3 more
3122
  characters in all cases. When expect_cond_assert is 2, we know that the
3123
  current character is an opening parenthesis, as otherwise we wouldn't be
3124
  here. However, when it is 1, we need to check, and it's easiest just to check
3125
  always. Note that expect_cond_assert may be negative, since all callouts just
3126
  decrement it. */
3127
3128
194k
  if (expect_cond_assert > 0)
3129
0
    {
3130
0
    BOOL ok = c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 3 &&
3131
0
              (ptr[0] == CHAR_QUESTION_MARK || ptr[0] == CHAR_ASTERISK);
3132
0
    if (ok)
3133
0
      {
3134
0
      if (ptr[0] == CHAR_ASTERISK)  /* New alpha assertion format, possibly */
3135
0
        {
3136
0
        ok = MAX_255(ptr[1]) && (cb->ctypes[ptr[1]] & ctype_lcletter) != 0;
3137
0
        }
3138
0
      else switch(ptr[1])  /* Traditional symbolic format */
3139
0
        {
3140
0
        case CHAR_C:
3141
0
        ok = expect_cond_assert == 2;
3142
0
        break;
3143
3144
0
        case CHAR_EQUALS_SIGN:
3145
0
        case CHAR_EXCLAMATION_MARK:
3146
0
        break;
3147
3148
0
        case CHAR_LESS_THAN_SIGN:
3149
0
        ok = ptr[2] == CHAR_EQUALS_SIGN || ptr[2] == CHAR_EXCLAMATION_MARK;
3150
0
        break;
3151
3152
0
        default:
3153
0
        ok = FALSE;
3154
0
        }
3155
0
      }
3156
3157
0
    if (!ok)
3158
0
      {
3159
0
      ptr--;   /* Adjust error offset */
3160
0
      errorcode = ERR28;
3161
0
      goto FAILED;
3162
0
      }
3163
0
    }
3164
3165
  /* Remember whether we are expecting a conditional assertion, and set the
3166
  default for this item. */
3167
3168
194k
  prev_expect_cond_assert = expect_cond_assert;
3169
194k
  expect_cond_assert = 0;
3170
3171
  /* Remember quantification status for the previous significant item, then set
3172
  default for this item. */
3173
3174
194k
  prev_okquantifier = okquantifier;
3175
194k
  prev_meta_quantifier = meta_quantifier;
3176
194k
  okquantifier = FALSE;
3177
194k
  meta_quantifier = 0;
3178
3179
  /* If the previous significant item was a quantifier, adjust the parsed code
3180
  if there is a following modifier. The base meta value is always followed by
3181
  the PLUS and QUERY values, in that order. We do this here rather than after
3182
  reading a quantifier so that intervening comments and /x whitespace can be
3183
  ignored without having to replicate code. */
3184
3185
194k
  if (prev_meta_quantifier != 0 && (c == CHAR_QUESTION_MARK || c == CHAR_PLUS))
3186
4.40k
    {
3187
4.40k
    parsed_pattern[(prev_meta_quantifier == META_MINMAX)? -3 : -1] =
3188
4.40k
      prev_meta_quantifier + ((c == CHAR_QUESTION_MARK)?
3189
3.54k
        0x00020000u : 0x00010000u);
3190
4.40k
    continue;  /* Next character in pattern */
3191
4.40k
    }
3192
3193
  /* Process the next item in the main part of a pattern. */
3194
3195
190k
  switch(c)
3196
190k
    {
3197
152k
    default:              /* Non-special character */
3198
152k
    PARSED_LITERAL(c, parsed_pattern);
3199
152k
    break;
3200
3201
3202
    /* ---- Escape sequence ---- */
3203
3204
7.16k
    case CHAR_BACKSLASH:
3205
7.16k
    tempptr = ptr;
3206
7.16k
    escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3207
7.16k
      xoptions, FALSE, cb);
3208
7.16k
    if (errorcode != 0)
3209
14
      {
3210
18
      ESCAPE_FAILED:
3211
18
      if ((xoptions & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
3212
18
        goto FAILED;
3213
0
      ptr = tempptr;
3214
0
      if (ptr >= ptrend) c = CHAR_BACKSLASH; else
3215
0
        {
3216
0
        GETCHARINCTEST(c, ptr);   /* Get character value, increment pointer */
3217
0
        }
3218
0
      escape = 0;                 /* Treat as literal character */
3219
0
      }
3220
3221
    /* The escape was a data escape or literal character. */
3222
3223
7.14k
    if (escape == 0)
3224
3.57k
      {
3225
3.57k
      PARSED_LITERAL(c, parsed_pattern);
3226
3.57k
      }
3227
3228
    /* The escape was a back (or forward) reference. We keep the offset in
3229
    order to give a more useful diagnostic for a bad forward reference. For
3230
    references to groups numbered less than 10 we can't use more than two items
3231
    in parsed_pattern because they may be just two characters in the input (and
3232
    in a 64-bit world an offset may need two elements). So for them, the offset
3233
    of the first occurrence is held in a special vector. */
3234
3235
3.57k
    else if (escape < 0)
3236
806
      {
3237
806
      offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 1);
3238
806
      escape = -escape;
3239
806
      *parsed_pattern++ = META_BACKREF | (uint32_t)escape;
3240
806
      if (escape < 10)
3241
669
        {
3242
669
        if (cb->small_ref_offset[escape] == PCRE2_UNSET)
3243
103
          cb->small_ref_offset[escape] = offset;
3244
669
        }
3245
137
      else
3246
137
        {
3247
137
        PUTOFFSET(offset, parsed_pattern);
3248
137
        }
3249
806
      okquantifier = TRUE;
3250
806
      }
3251
3252
    /* The escape was a character class such as \d etc. or other special
3253
    escape indicator such as \A or \X. Most of them generate just a single
3254
    parsed item, but \P and \p are followed by a 16-bit type and a 16-bit
3255
    value. They are supported only when Unicode is available. The type and
3256
    value are packed into a single 32-bit value so that the whole sequence
3257
    uses only two elements in the parsed_vector. This is because the same
3258
    coding is used if \d (for example) is turned into \p{Nd} when PCRE2_UCP is
3259
    set.
3260
3261
    There are also some cases where the escape sequence is followed by a name:
3262
    \k{name}, \k<name>, and \k'name' are backreferences by name, and \g<name>
3263
    and \g'name' are subroutine calls by name; \g{name} is a synonym for
3264
    \k{name}. Note that \g<number> and \g'number' are handled by check_escape()
3265
    and returned as a negative value (handled above). A name is coded as an
3266
    offset into the pattern and a length. */
3267
3268
2.76k
    else switch (escape)
3269
2.76k
      {
3270
29
      case ESC_C:
3271
#ifdef NEVER_BACKSLASH_C
3272
      errorcode = ERR85;
3273
      goto ESCAPE_FAILED;
3274
#else
3275
29
      if ((options & PCRE2_NEVER_BACKSLASH_C) != 0)
3276
0
        {
3277
0
        errorcode = ERR83;
3278
0
        goto ESCAPE_FAILED;
3279
0
        }
3280
29
#endif
3281
29
      okquantifier = TRUE;
3282
29
      *parsed_pattern++ = META_ESCAPE + escape;
3283
29
      break;
3284
3285
      /* This is a special return that happens only in EXTRA_ALT_BSUX mode,
3286
      when \u{ is not followed by hex digits and }. It requests two literal
3287
      characters, u and { and we need this, as otherwise \u{ 12} (for example)
3288
      would be treated as u{12} now that spaces are allowed in quantifiers. */
3289
3290
0
      case ESC_ub:
3291
0
      *parsed_pattern++ = CHAR_u;
3292
0
      PARSED_LITERAL(CHAR_LEFT_CURLY_BRACKET, parsed_pattern);
3293
0
      break;
3294
3295
49
      case ESC_X:
3296
#ifndef SUPPORT_UNICODE
3297
      errorcode = ERR45;   /* Supported only with Unicode support */
3298
      goto ESCAPE_FAILED;
3299
#endif
3300
98
      case ESC_H:
3301
112
      case ESC_h:
3302
168
      case ESC_N:
3303
285
      case ESC_R:
3304
343
      case ESC_V:
3305
387
      case ESC_v:
3306
387
      okquantifier = TRUE;
3307
387
      *parsed_pattern++ = META_ESCAPE + escape;
3308
387
      break;
3309
3310
88
      default:  /* \A, \B, \b, \G, \K, \Z, \z cannot be quantified. */
3311
88
      *parsed_pattern++ = META_ESCAPE + escape;
3312
88
      break;
3313
3314
      /* Escapes that may change in UCP mode. */
3315
3316
383
      case ESC_d:
3317
413
      case ESC_D:
3318
798
      case ESC_s:
3319
855
      case ESC_S:
3320
1.93k
      case ESC_w:
3321
1.97k
      case ESC_W:
3322
1.97k
      okquantifier = TRUE;
3323
1.97k
      parsed_pattern = handle_escdsw(escape, parsed_pattern, options,
3324
1.97k
        xoptions);
3325
1.97k
      break;
3326
3327
      /* Unicode property matching */
3328
3329
267
      case ESC_P:
3330
285
      case ESC_p:
3331
285
#ifdef SUPPORT_UNICODE
3332
285
        {
3333
285
        BOOL negated;
3334
285
        uint16_t ptype = 0, pdata = 0;
3335
285
        if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
3336
4
          goto ESCAPE_FAILED;
3337
281
        if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
3338
281
        *parsed_pattern++ = META_ESCAPE + escape;
3339
281
        *parsed_pattern++ = (ptype << 16) | pdata;
3340
281
        okquantifier = TRUE;
3341
281
        }
3342
#else
3343
      errorcode = ERR45;
3344
      goto ESCAPE_FAILED;
3345
#endif
3346
0
      break;  /* End \P and \p */
3347
3348
      /* When \g is used with quotes or angle brackets as delimiters, it is a
3349
      numerical or named subroutine call, and control comes here. When used
3350
      with brace delimiters it is a numerical back reference and does not come
3351
      here because check_escape() returns it directly as a reference. \k is
3352
      always a named back reference. */
3353
3354
0
      case ESC_g:
3355
0
      case ESC_k:
3356
0
      if (ptr >= ptrend || (*ptr != CHAR_LEFT_CURLY_BRACKET &&
3357
0
          *ptr != CHAR_LESS_THAN_SIGN && *ptr != CHAR_APOSTROPHE))
3358
0
        {
3359
0
        errorcode = (escape == ESC_g)? ERR57 : ERR69;
3360
0
        goto ESCAPE_FAILED;
3361
0
        }
3362
0
      terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
3363
0
        CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
3364
0
        CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
3365
3366
      /* For a non-braced \g, check for a numerical recursion. */
3367
3368
0
      if (escape == ESC_g && terminator != CHAR_RIGHT_CURLY_BRACKET)
3369
0
        {
3370
0
        PCRE2_SPTR p = ptr + 1;
3371
3372
0
        if (read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
3373
0
            &errorcode))
3374
0
          {
3375
0
          if (p >= ptrend || *p != terminator)
3376
0
            {
3377
0
            errorcode = ERR57;
3378
0
            goto ESCAPE_FAILED;
3379
0
            }
3380
0
          ptr = p;
3381
0
          goto SET_RECURSION;
3382
0
          }
3383
0
        if (errorcode != 0) goto ESCAPE_FAILED;
3384
0
        }
3385
3386
      /* Not a numerical recursion. Perl allows spaces and tabs after { and
3387
      before } but not for other delimiters. */
3388
3389
0
      if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
3390
0
          &errorcode, cb)) goto ESCAPE_FAILED;
3391
3392
      /* \k and \g when used with braces are back references, whereas \g used
3393
      with quotes or angle brackets is a recursion */
3394
3395
0
      *parsed_pattern++ =
3396
0
        (escape == ESC_k || terminator == CHAR_RIGHT_CURLY_BRACKET)?
3397
0
          META_BACKREF_BYNAME : META_RECURSE_BYNAME;
3398
0
      *parsed_pattern++ = namelen;
3399
3400
0
      PUTOFFSET(offset, parsed_pattern);
3401
0
      okquantifier = TRUE;
3402
0
      break;  /* End special escape processing */
3403
2.76k
      }
3404
7.14k
    break;    /* End escape sequence processing */
3405
3406
3407
    /* ---- Single-character special items ---- */
3408
3409
7.14k
    case CHAR_CIRCUMFLEX_ACCENT:
3410
1.06k
    *parsed_pattern++ = META_CIRCUMFLEX;
3411
1.06k
    break;
3412
3413
162
    case CHAR_DOLLAR_SIGN:
3414
162
    *parsed_pattern++ = META_DOLLAR;
3415
162
    break;
3416
3417
1.87k
    case CHAR_DOT:
3418
1.87k
    *parsed_pattern++ = META_DOT;
3419
1.87k
    okquantifier = TRUE;
3420
1.87k
    break;
3421
3422
3423
    /* ---- Single-character quantifiers ---- */
3424
3425
3.94k
    case CHAR_ASTERISK:
3426
3.94k
    meta_quantifier = META_ASTERISK;
3427
3.94k
    goto CHECK_QUANTIFIER;
3428
3429
2.46k
    case CHAR_PLUS:
3430
2.46k
    meta_quantifier = META_PLUS;
3431
2.46k
    goto CHECK_QUANTIFIER;
3432
3433
7.50k
    case CHAR_QUESTION_MARK:
3434
7.50k
    meta_quantifier = META_QUERY;
3435
7.50k
    goto CHECK_QUANTIFIER;
3436
3437
3438
    /* ---- Potential {n,m} quantifier ---- */
3439
3440
74
    case CHAR_LEFT_CURLY_BRACKET:
3441
74
    if (!read_repeat_counts(&ptr, ptrend, &min_repeat, &max_repeat,
3442
74
        &errorcode))
3443
74
      {
3444
74
      if (errorcode != 0) goto FAILED;     /* Error in quantifier. */
3445
74
      PARSED_LITERAL(c, parsed_pattern);   /* Not a quantifier */
3446
74
      break;                               /* No more quantifier processing */
3447
74
      }
3448
0
    meta_quantifier = META_MINMAX;
3449
    /* Fall through */
3450
3451
3452
    /* ---- Quantifier post-processing ---- */
3453
3454
    /* Check that a quantifier is allowed after the previous item. This
3455
    guarantees that there is a previous item. */
3456
3457
13.9k
    CHECK_QUANTIFIER:
3458
13.9k
    if (!prev_okquantifier)
3459
0
      {
3460
0
      errorcode = ERR9;
3461
0
      goto FAILED_BACK;
3462
0
      }
3463
3464
    /* Most (*VERB)s are not allowed to be quantified, but an ungreedy
3465
    quantifier can be useful for (*ACCEPT) - meaning "succeed on backtrack", a
3466
    sort of negated (*COMMIT). We therefore allow (*ACCEPT) to be quantified by
3467
    wrapping it in non-capturing brackets, but we have to allow for a preceding
3468
    (*MARK) for when (*ACCEPT) has an argument. */
3469
3470
13.9k
    if (*prev_parsed_item == META_ACCEPT)
3471
0
      {
3472
0
      uint32_t *p;
3473
0
      for (p = parsed_pattern - 1; p >= verbstartptr; p--) p[1] = p[0];
3474
0
      *verbstartptr = META_NOCAPTURE;
3475
0
      parsed_pattern[1] = META_KET;
3476
0
      parsed_pattern += 2;
3477
0
      }
3478
3479
    /* Now we can put the quantifier into the parsed pattern vector. At this
3480
    stage, we have only the basic quantifier. The check for a following + or ?
3481
    modifier happens at the top of the loop, after any intervening comments
3482
    have been removed. */
3483
3484
13.9k
    *parsed_pattern++ = meta_quantifier;
3485
13.9k
    if (c == CHAR_LEFT_CURLY_BRACKET)
3486
0
      {
3487
0
      *parsed_pattern++ = min_repeat;
3488
0
      *parsed_pattern++ = max_repeat;
3489
0
      }
3490
13.9k
    break;
3491
3492
3493
    /* ---- Character class ---- */
3494
3495
5.09k
    case CHAR_LEFT_SQUARE_BRACKET:
3496
5.09k
    okquantifier = TRUE;
3497
3498
    /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
3499
    used for "start of word" and "end of word". As these are otherwise illegal
3500
    sequences, we don't break anything by recognizing them. They are replaced
3501
    by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
3502
    erroneous and are handled by the normal code below. */
3503
3504
5.09k
    if (ptrend - ptr >= 6 &&
3505
5.08k
         (PRIV(strncmp_c8)(ptr, STRING_WEIRD_STARTWORD, 6) == 0 ||
3506
5.08k
          PRIV(strncmp_c8)(ptr, STRING_WEIRD_ENDWORD, 6) == 0))
3507
0
      {
3508
0
      *parsed_pattern++ = META_ESCAPE + ESC_b;
3509
3510
0
      if (ptr[2] == CHAR_LESS_THAN_SIGN)
3511
0
        {
3512
0
        *parsed_pattern++ = META_LOOKAHEAD;
3513
0
        }
3514
0
      else
3515
0
        {
3516
0
        *parsed_pattern++ = META_LOOKBEHIND;
3517
0
        *has_lookbehind = TRUE;
3518
3519
        /* The offset is used only for the "non-fixed length" error; this won't
3520
        occur here, so just store zero. */
3521
3522
0
        PUTOFFSET((PCRE2_SIZE)0, parsed_pattern);
3523
0
        }
3524
3525
0
      if ((options & PCRE2_UCP) == 0)
3526
0
        *parsed_pattern++ = META_ESCAPE + ESC_w;
3527
0
      else
3528
0
        {
3529
0
        *parsed_pattern++ = META_ESCAPE + ESC_p;
3530
0
        *parsed_pattern++ = PT_WORD << 16;
3531
0
        }
3532
0
      *parsed_pattern++ = META_KET;
3533
0
      ptr += 6;
3534
0
      break;
3535
0
      }
3536
3537
    /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3538
    they are encountered at the top level, so we'll do that too. */
3539
3540
5.09k
    if (ptr < ptrend && (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
3541
5.04k
         *ptr == CHAR_EQUALS_SIGN) &&
3542
219
        check_posix_syntax(ptr, ptrend, &tempptr))
3543
1
      {
3544
1
      errorcode = (*ptr-- == CHAR_COLON)? ERR12 : ERR13;
3545
1
      goto FAILED;
3546
1
      }
3547
3548
    /* Process a regular character class. If the first character is '^', set
3549
    the negation flag. If the first few characters (either before or after ^)
3550
    are \Q\E or \E or space or tab in extended-more mode, we skip them too.
3551
    This makes for compatibility with Perl. */
3552
3553
5.09k
    negate_class = FALSE;
3554
6.43k
    while (ptr < ptrend)
3555
6.43k
      {
3556
6.43k
      GETCHARINCTEST(c, ptr);
3557
6.43k
      if (c == CHAR_BACKSLASH)
3558
11
        {
3559
11
        if (ptr < ptrend && *ptr == CHAR_E) ptr++;
3560
11
        else if (ptrend - ptr >= 3 &&
3561
11
             PRIV(strncmp_c8)(ptr, STR_Q STR_BACKSLASH STR_E, 3) == 0)
3562
0
          ptr += 3;
3563
11
        else
3564
11
          break;
3565
11
        }
3566
6.42k
      else if ((options & PCRE2_EXTENDED_MORE) != 0 &&
3567
0
               (c == CHAR_SPACE || c == CHAR_HT))  /* Note: just these two */
3568
0
        continue;
3569
6.42k
      else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3570
1.34k
        negate_class = TRUE;
3571
5.07k
      else break;
3572
6.43k
      }
3573
3574
    /* Now the real contents of the class; c has the first "real" character.
3575
    Empty classes are permitted only if the option is set. */
3576
3577
5.09k
    if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3578
160
        (cb->external_options & PCRE2_ALLOW_EMPTY_CLASS) != 0)
3579
0
      {
3580
0
      *parsed_pattern++ = negate_class? META_CLASS_EMPTY_NOT : META_CLASS_EMPTY;
3581
0
      break;  /* End of class processing */
3582
0
      }
3583
3584
    /* Process a non-empty class. */
3585
3586
5.09k
    *parsed_pattern++ = negate_class? META_CLASS_NOT : META_CLASS;
3587
5.09k
    class_range_state = RANGE_NO;
3588
3589
    /* In an EBCDIC environment, Perl treats alphabetic ranges specially
3590
    because there are holes in the encoding, and simply using the range A-Z
3591
    (for example) would include the characters in the holes. This applies only
3592
    to ranges where both values are literal; [\xC1-\xE9] is different to [A-Z]
3593
    in this respect. In order to accommodate this, we keep track of whether
3594
    character values are literal or not, and a state variable for handling
3595
    ranges. */
3596
3597
    /* Loop for the contents of the class */
3598
3599
5.09k
    for (;;)
3600
88.8k
      {
3601
88.8k
      BOOL char_is_literal = TRUE;
3602
3603
      /* Inside \Q...\E everything is literal except \E */
3604
3605
88.8k
      if (inescq)
3606
0
        {
3607
0
        if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
3608
0
          {
3609
0
          inescq = FALSE;                   /* Reset literal state */
3610
0
          ptr++;                            /* Skip the 'E' */
3611
0
          goto CLASS_CONTINUE;
3612
0
          }
3613
0
        goto CLASS_LITERAL;
3614
0
        }
3615
3616
      /* Skip over space and tab (only) in extended-more mode. */
3617
3618
88.8k
      if ((options & PCRE2_EXTENDED_MORE) != 0 &&
3619
0
          (c == CHAR_SPACE || c == CHAR_HT))
3620
0
        goto CLASS_CONTINUE;
3621
3622
      /* Handle POSIX class names. Perl allows a negation extension of the
3623
      form [:^name:]. A square bracket that doesn't match the syntax is
3624
      treated as a literal. We also recognize the POSIX constructions
3625
      [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3626
      5.6 and 5.8 do. */
3627
3628
88.8k
      if (c == CHAR_LEFT_SQUARE_BRACKET &&
3629
1.61k
          ptrend - ptr >= 3 &&
3630
1.61k
          (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
3631
1.60k
           *ptr == CHAR_EQUALS_SIGN) &&
3632
30
          check_posix_syntax(ptr, ptrend, &tempptr))
3633
0
        {
3634
0
        BOOL posix_negate = FALSE;
3635
0
        int posix_class;
3636
3637
        /* Perl treats a hyphen before a POSIX class as a literal, not the
3638
        start of a range. However, it gives a warning in its warning mode. PCRE
3639
        does not have a warning mode, so we give an error, because this is
3640
        likely an error on the user's part. */
3641
3642
0
        if (class_range_state == RANGE_STARTED)
3643
0
          {
3644
0
          errorcode = ERR50;
3645
0
          goto FAILED;
3646
0
          }
3647
3648
0
        if (*ptr != CHAR_COLON)
3649
0
          {
3650
0
          errorcode = ERR13;
3651
0
          goto FAILED_BACK;
3652
0
          }
3653
3654
0
        if (*(++ptr) == CHAR_CIRCUMFLEX_ACCENT)
3655
0
          {
3656
0
          posix_negate = TRUE;
3657
0
          ptr++;
3658
0
          }
3659
3660
0
        posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
3661
0
        if (posix_class < 0)
3662
0
          {
3663
0
          errorcode = ERR30;
3664
0
          goto FAILED;
3665
0
          }
3666
0
        ptr = tempptr + 2;
3667
3668
        /* Perl treats a hyphen after a POSIX class as a literal, not the
3669
        start of a range. However, it gives a warning in its warning mode
3670
        unless the hyphen is the last character in the class. PCRE does not
3671
        have a warning mode, so we give an error, because this is likely an
3672
        error on the user's part. */
3673
3674
0
        if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
3675
0
            ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
3676
0
          {
3677
0
          errorcode = ERR50;
3678
0
          goto FAILED;
3679
0
          }
3680
3681
        /* Set "a hyphen is not the start of a range" for the -] case, and also
3682
        in case the POSIX class is followed by \E or \Q\E (possibly repeated -
3683
        fuzzers do that kind of thing) and *then* a hyphen. This causes that
3684
        hyphen to be treated as a literal. I don't think it's worth setting up
3685
        special apparatus to do otherwise. */
3686
3687
0
        class_range_state = RANGE_NO;
3688
3689
        /* When PCRE2_UCP is set, unless PCRE2_EXTRA_ASCII_POSIX is set, some
3690
        of the POSIX classes are converted to use Unicode properties \p or \P
3691
        or, in one case, \h or \H. The substitutes table has two values per
3692
        class, containing the type and value of a \p or \P item. The special
3693
        cases are specified with a negative type: a non-zero value causes \h or
3694
        \H to be used, and a zero value falls through to behave like a non-UCP
3695
        POSIX class. There are now also some extra options that force ASCII for
3696
        some classes. */
3697
3698
0
#ifdef SUPPORT_UNICODE
3699
0
        if ((options & PCRE2_UCP) != 0 &&
3700
0
            (xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0 &&
3701
0
            !((xoptions & PCRE2_EXTRA_ASCII_DIGIT) != 0 &&
3702
0
              (posix_class == PC_DIGIT || posix_class == PC_XDIGIT)))
3703
0
          {
3704
0
          int ptype = posix_substitutes[2*posix_class];
3705
0
          int pvalue = posix_substitutes[2*posix_class + 1];
3706
3707
0
          if (ptype >= 0)
3708
0
            {
3709
0
            *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_P : ESC_p);
3710
0
            *parsed_pattern++ = (ptype << 16) | pvalue;
3711
0
            goto CLASS_CONTINUE;
3712
0
            }
3713
3714
0
          if (pvalue != 0)
3715
0
            {
3716
0
            *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_H : ESC_h);
3717
0
            goto CLASS_CONTINUE;
3718
0
            }
3719
3720
          /* Fall through */
3721
0
          }
3722
0
#endif  /* SUPPORT_UNICODE */
3723
3724
        /* Non-UCP POSIX class */
3725
3726
0
        *parsed_pattern++ = posix_negate? META_POSIX_NEG : META_POSIX;
3727
0
        *parsed_pattern++ = posix_class;
3728
0
        }
3729
3730
      /* Handle potential start of range */
3731
3732
88.8k
      else if (c == CHAR_MINUS && class_range_state >= RANGE_OK_ESCAPED)
3733
730
        {
3734
730
        *parsed_pattern++ = (class_range_state == RANGE_OK_LITERAL)?
3735
729
          META_RANGE_LITERAL : META_RANGE_ESCAPED;
3736
730
        class_range_state = RANGE_STARTED;
3737
730
        }
3738
3739
      /* Handle a literal character */
3740
3741
88.1k
      else if (c != CHAR_BACKSLASH)
3742
86.4k
        {
3743
87.2k
        CLASS_LITERAL:
3744
87.2k
        if (class_range_state == RANGE_STARTED)
3745
730
          {
3746
730
          if (c == parsed_pattern[-2])       /* Optimize one-char range */
3747
262
            parsed_pattern--;
3748
468
          else if (parsed_pattern[-2] > c)   /* Check range is in order */
3749
6
            {
3750
6
            errorcode = ERR8;
3751
6
            goto FAILED_BACK;
3752
6
            }
3753
462
          else
3754
462
            {
3755
462
            if (!char_is_literal && parsed_pattern[-1] == META_RANGE_LITERAL)
3756
1
              parsed_pattern[-1] = META_RANGE_ESCAPED;
3757
462
            PARSED_LITERAL(c, parsed_pattern);
3758
462
            }
3759
724
          class_range_state = RANGE_NO;
3760
724
          }
3761
86.5k
        else  /* Potential start of range */
3762
86.5k
          {
3763
86.5k
          class_range_state = char_is_literal?
3764
85.7k
            RANGE_OK_LITERAL : RANGE_OK_ESCAPED;
3765
86.5k
          PARSED_LITERAL(c, parsed_pattern);
3766
86.5k
          }
3767
87.2k
        }
3768
3769
      /* Handle escapes in a class */
3770
3771
1.72k
      else
3772
1.72k
        {
3773
1.72k
        tempptr = ptr;
3774
1.72k
        escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3775
1.72k
          xoptions, TRUE, cb);
3776
3777
1.72k
        if (errorcode != 0)
3778
0
          {
3779
0
          if ((xoptions & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
3780
0
            goto FAILED;
3781
0
          ptr = tempptr;
3782
0
          if (ptr >= ptrend) c = CHAR_BACKSLASH; else
3783
0
            {
3784
0
            GETCHARINCTEST(c, ptr);   /* Get character value, increment pointer */
3785
0
            }
3786
0
          escape = 0;                 /* Treat as literal character */
3787
0
          }
3788
3789
1.72k
        switch(escape)
3790
1.72k
          {
3791
848
          case 0:  /* Escaped character code point is in c */
3792
848
          char_is_literal = FALSE;
3793
848
          goto CLASS_LITERAL;      /* (a few lines above) */
3794
3795
1
          case ESC_b:
3796
1
          c = CHAR_BS;    /* \b is backspace in a class */
3797
1
          char_is_literal = FALSE;
3798
1
          goto CLASS_LITERAL;
3799
3800
0
          case ESC_Q:
3801
0
          inescq = TRUE;  /* Enter literal mode */
3802
0
          goto CLASS_CONTINUE;
3803
3804
0
          case ESC_E:     /* Ignore orphan \E */
3805
0
          goto CLASS_CONTINUE;
3806
3807
1
          case ESC_B:     /* Always an error in a class */
3808
1
          case ESC_R:
3809
1
          case ESC_X:
3810
1
          errorcode = ERR7;
3811
1
          ptr--;
3812
1
          goto FAILED;
3813
1.72k
          }
3814
3815
        /* The second part of a range can be a single-character escape
3816
        sequence (detected above), but not any of the other escapes. Perl
3817
        treats a hyphen as a literal in such circumstances. However, in Perl's
3818
        warning mode, a warning is given, so PCRE now faults it, as it is
3819
        almost certainly a mistake on the user's part. */
3820
3821
871
        if (class_range_state == RANGE_STARTED)
3822
0
          {
3823
0
          errorcode = ERR50;
3824
0
          goto FAILED;  /* Not CLASS_ESCAPE_FAILED; always an error */
3825
0
          }
3826
3827
        /* Of the remaining escapes, only those that define characters are
3828
        allowed in a class. None may start a range. */
3829
3830
871
        class_range_state = RANGE_NO;
3831
871
        switch(escape)
3832
871
          {
3833
0
          case ESC_N:
3834
0
          errorcode = ERR71;
3835
0
          goto FAILED;
3836
3837
4
          case ESC_H:
3838
6
          case ESC_h:
3839
6
          case ESC_V:
3840
6
          case ESC_v:
3841
6
          *parsed_pattern++ = META_ESCAPE + escape;
3842
6
          break;
3843
3844
          /* These escapes may be converted to Unicode property tests when
3845
          PCRE2_UCP is set. */
3846
3847
288
          case ESC_d:
3848
291
          case ESC_D:
3849
306
          case ESC_s:
3850
307
          case ESC_S:
3851
579
          case ESC_w:
3852
846
          case ESC_W:
3853
846
          parsed_pattern = handle_escdsw(escape, parsed_pattern, options,
3854
846
            xoptions);
3855
846
          break;
3856
3857
          /* Explicit Unicode property matching */
3858
3859
0
          case ESC_P:
3860
19
          case ESC_p:
3861
19
#ifdef SUPPORT_UNICODE
3862
19
            {
3863
19
            BOOL negated;
3864
19
            uint16_t ptype = 0, pdata = 0;
3865
19
            if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
3866
1
              goto FAILED;
3867
18
            if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
3868
18
            *parsed_pattern++ = META_ESCAPE + escape;
3869
18
            *parsed_pattern++ = (ptype << 16) | pdata;
3870
18
            }
3871
#else
3872
          errorcode = ERR45;
3873
          goto FAILED;
3874
#endif
3875
0
          break;  /* End \P and \p */
3876
3877
0
          default:    /* All others are not allowed in a class */
3878
0
          errorcode = ERR7;
3879
0
          ptr--;
3880
0
          goto FAILED;
3881
871
          }
3882
3883
        /* Perl gives a warning unless a following hyphen is the last character
3884
        in the class. PCRE throws an error. */
3885
3886
870
        if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
3887
0
            ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
3888
0
          {
3889
0
          errorcode = ERR50;
3890
0
          goto FAILED;
3891
0
          }
3892
870
        }
3893
3894
      /* Proceed to next thing in the class. */
3895
3896
88.8k
      CLASS_CONTINUE:
3897
88.8k
      if (ptr >= ptrend)
3898
276
        {
3899
276
        errorcode = ERR6;  /* Missing terminating ']' */
3900
276
        goto FAILED;
3901
276
        }
3902
88.6k
      GETCHARINCTEST(c, ptr);
3903
88.6k
      if (c == CHAR_RIGHT_SQUARE_BRACKET && !inescq) break;
3904
88.6k
      }     /* End of class-processing loop */
3905
3906
    /* -] at the end of a class is a literal '-' */
3907
3908
4.80k
    if (class_range_state == RANGE_STARTED)
3909
0
      {
3910
0
      parsed_pattern[-1] = CHAR_MINUS;
3911
0
      class_range_state = RANGE_NO;
3912
0
      }
3913
3914
4.80k
    *parsed_pattern++ = META_CLASS_END;
3915
4.80k
    break;  /* End of character class */
3916
3917
3918
    /* ---- Opening parenthesis ---- */
3919
3920
1.43k
    case CHAR_LEFT_PARENTHESIS:
3921
1.43k
    if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
3922
3923
    /* If ( is not followed by ? it is either a capture or a special verb or an
3924
    alpha assertion or a positive non-atomic lookahead. */
3925
3926
1.43k
    if (*ptr != CHAR_QUESTION_MARK)
3927
1.42k
      {
3928
1.42k
      const char *vn;
3929
3930
      /* Handle capturing brackets (or non-capturing if auto-capture is turned
3931
      off). */
3932
3933
1.42k
      if (*ptr != CHAR_ASTERISK)
3934
1.42k
        {
3935
1.42k
        nest_depth++;
3936
1.42k
        if ((options & PCRE2_NO_AUTO_CAPTURE) == 0)
3937
1.42k
          {
3938
1.42k
          if (cb->bracount >= MAX_GROUP_NUMBER)
3939
0
            {
3940
0
            errorcode = ERR97;
3941
0
            goto FAILED;
3942
0
            }
3943
1.42k
          cb->bracount++;
3944
1.42k
          *parsed_pattern++ = META_CAPTURE | cb->bracount;
3945
1.42k
          }
3946
0
        else *parsed_pattern++ = META_NOCAPTURE;
3947
1.42k
        }
3948
3949
      /* Do nothing for (* followed by end of pattern or ) so it gives a "bad
3950
      quantifier" error rather than "(*MARK) must have an argument". */
3951
3952
2
      else if (ptrend - ptr <= 1 || (c = ptr[1]) == CHAR_RIGHT_PARENTHESIS)
3953
0
        break;
3954
3955
      /* Handle "alpha assertions" such as (*pla:...). Most of these are
3956
      synonyms for the historical symbolic assertions, but the script run and
3957
      non-atomic lookaround ones are new. They are distinguished by starting
3958
      with a lower case letter. Checking both ends of the alphabet makes this
3959
      work in all character codes. */
3960
3961
2
      else if (CHMAX_255(c) && (cb->ctypes[c] & ctype_lcletter) != 0)
3962
1
        {
3963
1
        uint32_t meta;
3964
3965
1
        vn = alasnames;
3966
1
        if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
3967
1
          &errorcode, cb)) goto FAILED;
3968
1
        if (ptr >= ptrend || *ptr != CHAR_COLON)
3969
1
          {
3970
1
          errorcode = ERR95;  /* Malformed */
3971
1
          goto FAILED;
3972
1
          }
3973
3974
        /* Scan the table of alpha assertion names */
3975
3976
0
        for (i = 0; i < alascount; i++)
3977
0
          {
3978
0
          if (namelen == alasmeta[i].len &&
3979
0
              PRIV(strncmp_c8)(name, vn, namelen) == 0)
3980
0
            break;
3981
0
          vn += alasmeta[i].len + 1;
3982
0
          }
3983
3984
0
        if (i >= alascount)
3985
0
          {
3986
0
          errorcode = ERR95;  /* Alpha assertion not recognized */
3987
0
          goto FAILED;
3988
0
          }
3989
3990
        /* Check for expecting an assertion condition. If so, only atomic
3991
        lookaround assertions are valid. */
3992
3993
0
        meta = alasmeta[i].meta;
3994
0
        if (prev_expect_cond_assert > 0 &&
3995
0
            (meta < META_LOOKAHEAD || meta > META_LOOKBEHINDNOT))
3996
0
          {
3997
0
          errorcode = (meta == META_LOOKAHEAD_NA || meta == META_LOOKBEHIND_NA)?
3998
0
            ERR98 : ERR28;  /* (Atomic) assertion expected */
3999
0
          goto FAILED;
4000
0
          }
4001
4002
        /* The lookaround alphabetic synonyms can mostly be handled by jumping
4003
        to the code that handles the traditional symbolic forms. */
4004
4005
0
        switch(meta)
4006
0
          {
4007
0
          default:
4008
0
          errorcode = ERR89;  /* Unknown code; should never occur because */
4009
0
          goto FAILED;        /* the meta values come from a table above. */
4010
4011
0
          case META_ATOMIC:
4012
0
          goto ATOMIC_GROUP;
4013
4014
0
          case META_LOOKAHEAD:
4015
0
          goto POSITIVE_LOOK_AHEAD;
4016
4017
0
          case META_LOOKAHEAD_NA:
4018
0
          goto POSITIVE_NONATOMIC_LOOK_AHEAD;
4019
4020
0
          case META_LOOKAHEADNOT:
4021
0
          goto NEGATIVE_LOOK_AHEAD;
4022
4023
0
          case META_LOOKBEHIND:
4024
0
          case META_LOOKBEHINDNOT:
4025
0
          case META_LOOKBEHIND_NA:
4026
0
          *parsed_pattern++ = meta;
4027
0
          ptr--;
4028
0
          goto POST_LOOKBEHIND;
4029
4030
          /* The script run facilities are handled here. Unicode support is
4031
          required (give an error if not, as this is a security issue). Always
4032
          record a META_SCRIPT_RUN item. Then, for the atomic version, insert
4033
          META_ATOMIC and remember that we need two META_KETs at the end. */
4034
4035
0
          case META_SCRIPT_RUN:
4036
0
          case META_ATOMIC_SCRIPT_RUN:
4037
0
#ifdef SUPPORT_UNICODE
4038
0
          *parsed_pattern++ = META_SCRIPT_RUN;
4039
0
          nest_depth++;
4040
0
          ptr++;
4041
0
          if (meta == META_ATOMIC_SCRIPT_RUN)
4042
0
            {
4043
0
            *parsed_pattern++ = META_ATOMIC;
4044
0
            if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4045
0
            else if (++top_nest >= end_nests)
4046
0
              {
4047
0
              errorcode = ERR84;
4048
0
              goto FAILED;
4049
0
              }
4050
0
            top_nest->nest_depth = nest_depth;
4051
0
            top_nest->flags = NSF_ATOMICSR;
4052
0
            top_nest->options = options & PARSE_TRACKED_OPTIONS;
4053
0
            top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;
4054
0
            }
4055
0
          break;
4056
#else  /* SUPPORT_UNICODE */
4057
          errorcode = ERR96;
4058
          goto FAILED;
4059
#endif
4060
0
          }
4061
0
        }
4062
4063
4064
      /* ---- Handle (*VERB) and (*VERB:NAME) ---- */
4065
4066
1
      else
4067
1
        {
4068
1
        vn = verbnames;
4069
1
        if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
4070
1
          &errorcode, cb)) goto FAILED;
4071
1
        if (ptr >= ptrend || (*ptr != CHAR_COLON &&
4072
0
                              *ptr != CHAR_RIGHT_PARENTHESIS))
4073
0
          {
4074
0
          errorcode = ERR60;  /* Malformed */
4075
0
          goto FAILED;
4076
0
          }
4077
4078
        /* Scan the table of verb names */
4079
4080
1
        for (i = 0; i < verbcount; i++)
4081
1
          {
4082
1
          if (namelen == verbs[i].len &&
4083
1
              PRIV(strncmp_c8)(name, vn, namelen) == 0)
4084
1
            break;
4085
0
          vn += verbs[i].len + 1;
4086
0
          }
4087
4088
1
        if (i >= verbcount)
4089
0
          {
4090
0
          errorcode = ERR60;  /* Verb not recognized */
4091
0
          goto FAILED;
4092
0
          }
4093
4094
        /* An empty argument is treated as no argument. */
4095
4096
1
        if (*ptr == CHAR_COLON && ptr + 1 < ptrend &&
4097
1
             ptr[1] == CHAR_RIGHT_PARENTHESIS)
4098
0
          ptr++;    /* Advance to the closing parens */
4099
4100
        /* Check for mandatory non-empty argument; this is (*MARK) */
4101
4102
1
        if (verbs[i].has_arg > 0 && *ptr != CHAR_COLON)
4103
0
          {
4104
0
          errorcode = ERR66;
4105
0
          goto FAILED;
4106
0
          }
4107
4108
        /* Remember where this verb, possibly with a preceding (*MARK), starts,
4109
        for handling quantified (*ACCEPT). */
4110
4111
1
        verbstartptr = parsed_pattern;
4112
1
        okquantifier = (verbs[i].meta == META_ACCEPT);
4113
4114
        /* It appears that Perl allows any characters whatsoever, other than a
4115
        closing parenthesis, to appear in arguments ("names"), so we no longer
4116
        insist on letters, digits, and underscores. Perl does not, however, do
4117
        any interpretation within arguments, and has no means of including a
4118
        closing parenthesis. PCRE supports escape processing but only when it
4119
        is requested by an option. We set inverbname TRUE here, and let the
4120
        main loop take care of this so that escape and \x processing is done by
4121
        the main code above. */
4122
4123
1
        if (*ptr++ == CHAR_COLON)   /* Skip past : or ) */
4124
1
          {
4125
          /* Some optional arguments can be treated as a preceding (*MARK) */
4126
4127
1
          if (verbs[i].has_arg < 0)
4128
0
            {
4129
0
            add_after_mark = verbs[i].meta;
4130
0
            *parsed_pattern++ = META_MARK;
4131
0
            }
4132
4133
          /* The remaining verbs with arguments (except *MARK) need a different
4134
          opcode. */
4135
4136
1
          else
4137
1
            {
4138
1
            *parsed_pattern++ = verbs[i].meta +
4139
1
              ((verbs[i].meta != META_MARK)? 0x00010000u:0);
4140
1
            }
4141
4142
          /* Set up for reading the name in the main loop. */
4143
4144
1
          verblengthptr = parsed_pattern++;
4145
1
          verbnamestart = ptr;
4146
1
          inverbname = TRUE;
4147
1
          }
4148
0
        else  /* No verb "name" argument */
4149
0
          {
4150
0
          *parsed_pattern++ = verbs[i].meta;
4151
0
          }
4152
1
        }     /* End of (*VERB) handling */
4153
1.42k
      break;  /* Done with this parenthesis */
4154
1.42k
      }       /* End of groups that don't start with (? */
4155
4156
4157
    /* ---- Items starting (? ---- */
4158
4159
    /* The type of item is determined by what follows (?. Handle (?| and option
4160
    changes under "default" because both need a new block on the nest stack.
4161
    Comments starting with (?# are handled above. Note that there is some
4162
    ambiguity about the sequence (?- because if a digit follows it's a relative
4163
    recursion or subroutine call whereas otherwise it's an option unsetting. */
4164
4165
12
    if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4166
4167
12
    switch(*ptr)
4168
12
      {
4169
5
      default:
4170
5
      if (*ptr == CHAR_MINUS && ptrend - ptr > 1 && IS_DIGIT(ptr[1]))
4171
0
        goto RECURSION_BYNUMBER;  /* The + case is handled by CHAR_PLUS */
4172
4173
      /* We now have either (?| or a (possibly empty) option setting,
4174
      optionally followed by a non-capturing group. */
4175
4176
5
      nest_depth++;
4177
5
      if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4178
0
      else if (++top_nest >= end_nests)
4179
0
        {
4180
0
        errorcode = ERR84;
4181
0
        goto FAILED;
4182
0
        }
4183
5
      top_nest->nest_depth = nest_depth;
4184
5
      top_nest->flags = 0;
4185
5
      top_nest->options = options & PARSE_TRACKED_OPTIONS;
4186
5
      top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;
4187
4188
      /* Start of non-capturing group that resets the capture count for each
4189
      branch. */
4190
4191
5
      if (*ptr == CHAR_VERTICAL_LINE)
4192
0
        {
4193
0
        top_nest->reset_group = (uint16_t)cb->bracount;
4194
0
        top_nest->max_group = (uint16_t)cb->bracount;
4195
0
        top_nest->flags |= NSF_RESET;
4196
0
        cb->external_flags |= PCRE2_DUPCAPUSED;
4197
0
        *parsed_pattern++ = META_NOCAPTURE;
4198
0
        ptr++;
4199
0
        }
4200
4201
      /* Scan for options aimnrsxJU (also aD, aP, aS, aT, aW) to be set or unset. */
4202
4203
5
      else
4204
5
        {
4205
5
        BOOL hyphenok = TRUE;
4206
5
        uint32_t oldoptions = options;
4207
5
        uint32_t oldxoptions = xoptions;
4208
4209
5
        top_nest->reset_group = 0;
4210
5
        top_nest->max_group = 0;
4211
5
        set = unset = 0;
4212
5
        optset = &set;
4213
5
        xset = xunset = 0;
4214
5
        xoptset = &xset;
4215
4216
        /* ^ at the start unsets irmnsx and disables the subsequent use of - */
4217
4218
5
        if (ptr < ptrend && *ptr == CHAR_CIRCUMFLEX_ACCENT)
4219
0
          {
4220
0
          options &= ~(PCRE2_CASELESS|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE|
4221
0
                       PCRE2_DOTALL|PCRE2_EXTENDED|PCRE2_EXTENDED_MORE);
4222
0
          xoptions &= ~(PCRE2_EXTRA_CASELESS_RESTRICT);
4223
0
          hyphenok = FALSE;
4224
0
          ptr++;
4225
0
          }
4226
4227
5
        while (ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS &&
4228
5
                               *ptr != CHAR_COLON)
4229
0
          {
4230
0
          switch (*ptr++)
4231
0
            {
4232
0
            case CHAR_MINUS:
4233
0
            if (!hyphenok)
4234
0
              {
4235
0
              errorcode = ERR94;
4236
0
              ptr--;  /* Correct the offset */
4237
0
              goto FAILED;
4238
0
              }
4239
0
            optset = &unset;
4240
0
            xoptset = &xunset;
4241
0
            hyphenok = FALSE;
4242
0
            break;
4243
4244
            /* There are some two-character sequences that start with 'a'. */
4245
4246
0
            case CHAR_a:
4247
0
            if (ptr < ptrend)
4248
0
              {
4249
0
              if (*ptr == CHAR_D)
4250
0
                {
4251
0
                *xoptset |= PCRE2_EXTRA_ASCII_BSD;
4252
0
                ptr++;
4253
0
                break;
4254
0
                }
4255
0
              if (*ptr == CHAR_P)
4256
0
                {
4257
0
                *xoptset |= (PCRE2_EXTRA_ASCII_POSIX|PCRE2_EXTRA_ASCII_DIGIT);
4258
0
                ptr++;
4259
0
                break;
4260
0
                }
4261
0
              if (*ptr == CHAR_S)
4262
0
                {
4263
0
                *xoptset |= PCRE2_EXTRA_ASCII_BSS;
4264
0
                ptr++;
4265
0
                break;
4266
0
                }
4267
0
              if (*ptr == CHAR_T)
4268
0
                {
4269
0
                *xoptset |= PCRE2_EXTRA_ASCII_DIGIT;
4270
0
                ptr++;
4271
0
                break;
4272
0
                }
4273
0
              if (*ptr == CHAR_W)
4274
0
                {
4275
0
                *xoptset |= PCRE2_EXTRA_ASCII_BSW;
4276
0
                ptr++;
4277
0
                break;
4278
0
                }
4279
0
              }
4280
0
            *xoptset |= PCRE2_EXTRA_ASCII_BSD|PCRE2_EXTRA_ASCII_BSS|
4281
0
                        PCRE2_EXTRA_ASCII_BSW|
4282
0
                        PCRE2_EXTRA_ASCII_DIGIT|PCRE2_EXTRA_ASCII_POSIX;
4283
0
            break;
4284
4285
0
            case CHAR_J:  /* Record that it changed in the external options */
4286
0
            *optset |= PCRE2_DUPNAMES;
4287
0
            cb->external_flags |= PCRE2_JCHANGED;
4288
0
            break;
4289
4290
0
            case CHAR_i: *optset |= PCRE2_CASELESS; break;
4291
0
            case CHAR_m: *optset |= PCRE2_MULTILINE; break;
4292
0
            case CHAR_n: *optset |= PCRE2_NO_AUTO_CAPTURE; break;
4293
0
            case CHAR_r: *xoptset|= PCRE2_EXTRA_CASELESS_RESTRICT; break;
4294
0
            case CHAR_s: *optset |= PCRE2_DOTALL; break;
4295
0
            case CHAR_U: *optset |= PCRE2_UNGREEDY; break;
4296
4297
            /* If x appears twice it sets the extended extended option. */
4298
4299
0
            case CHAR_x:
4300
0
            *optset |= PCRE2_EXTENDED;
4301
0
            if (ptr < ptrend && *ptr == CHAR_x)
4302
0
              {
4303
0
              *optset |= PCRE2_EXTENDED_MORE;
4304
0
              ptr++;
4305
0
              }
4306
0
            break;
4307
4308
0
            default:
4309
0
            errorcode = ERR11;
4310
0
            ptr--;    /* Correct the offset */
4311
0
            goto FAILED;
4312
0
            }
4313
0
          }
4314
4315
        /* If we are setting extended without extended-more, ensure that any
4316
        existing extended-more gets unset. Also, unsetting extended must also
4317
        unset extended-more. */
4318
4319
5
        if ((set & (PCRE2_EXTENDED|PCRE2_EXTENDED_MORE)) == PCRE2_EXTENDED ||
4320
5
            (unset & PCRE2_EXTENDED) != 0)
4321
0
          unset |= PCRE2_EXTENDED_MORE;
4322
4323
5
        options = (options | set) & (~unset);
4324
5
        xoptions = (xoptions | xset) & (~xunset);
4325
4326
        /* If the options ended with ')' this is not the start of a nested
4327
        group with option changes, so the options change at this level.
4328
        In this case, if the previous level set up a nest block, discard the
4329
        one we have just created. Otherwise adjust it for the previous level.
4330
        If the options ended with ':' we are starting a non-capturing group,
4331
        possibly with an options setting. */
4332
4333
5
        if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4334
5
        if (*ptr++ == CHAR_RIGHT_PARENTHESIS)
4335
0
          {
4336
0
          nest_depth--;  /* This is not a nested group after all. */
4337
0
          if (top_nest > (nest_save *)(cb->start_workspace) &&
4338
0
              (top_nest-1)->nest_depth == nest_depth) top_nest--;
4339
0
          else top_nest->nest_depth = nest_depth;
4340
0
          }
4341
5
        else *parsed_pattern++ = META_NOCAPTURE;
4342
4343
        /* If nothing changed, no need to record. */
4344
4345
5
        if (options != oldoptions || xoptions != oldxoptions)
4346
0
          {
4347
0
          *parsed_pattern++ = META_OPTIONS;
4348
0
          *parsed_pattern++ = options;
4349
0
          *parsed_pattern++ = xoptions;
4350
0
          }
4351
5
        }     /* End options processing */
4352
5
      break;  /* End default case after (? */
4353
4354
4355
      /* ---- Python syntax support ---- */
4356
4357
5
      case CHAR_P:
4358
0
      if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4359
4360
      /* (?P<name> is the same as (?<name>, which defines a named group. */
4361
4362
0
      if (*ptr == CHAR_LESS_THAN_SIGN)
4363
0
        {
4364
0
        terminator = CHAR_GREATER_THAN_SIGN;
4365
0
        goto DEFINE_NAME;
4366
0
        }
4367
4368
      /* (?P>name) is the same as (?&name), which is a recursion or subroutine
4369
      call. */
4370
4371
0
      if (*ptr == CHAR_GREATER_THAN_SIGN) goto RECURSE_BY_NAME;
4372
4373
      /* (?P=name) is the same as \k<name>, a back reference by name. Anything
4374
      else after (?P is an error. */
4375
4376
0
      if (*ptr != CHAR_EQUALS_SIGN)
4377
0
        {
4378
0
        errorcode = ERR41;
4379
0
        goto FAILED;
4380
0
        }
4381
0
      if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
4382
0
          &namelen, &errorcode, cb)) goto FAILED;
4383
0
      *parsed_pattern++ = META_BACKREF_BYNAME;
4384
0
      *parsed_pattern++ = namelen;
4385
0
      PUTOFFSET(offset, parsed_pattern);
4386
0
      okquantifier = TRUE;
4387
0
      break;   /* End of (?P processing */
4388
4389
4390
      /* ---- Recursion/subroutine calls by number ---- */
4391
4392
0
      case CHAR_R:
4393
0
      i = 0;         /* (?R) == (?R0) */
4394
0
      ptr++;
4395
0
      if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4396
0
        {
4397
0
        errorcode = ERR58;
4398
0
        goto FAILED;
4399
0
        }
4400
0
      goto SET_RECURSION;
4401
4402
      /* An item starting (?- followed by a digit comes here via the "default"
4403
      case because (?- followed by a non-digit is an options setting. */
4404
4405
0
      case CHAR_PLUS:
4406
0
      if (ptrend - ptr < 2 || !IS_DIGIT(ptr[1]))
4407
0
        {
4408
0
        errorcode = ERR29;   /* Missing number */
4409
0
        goto FAILED;
4410
0
        }
4411
      /* Fall through */
4412
4413
0
      case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
4414
0
      case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
4415
0
      RECURSION_BYNUMBER:
4416
0
      if (!read_number(&ptr, ptrend,
4417
0
          (IS_DIGIT(*ptr))? -1:(int)(cb->bracount), /* + and - are relative */
4418
0
          MAX_GROUP_NUMBER, ERR61,
4419
0
          &i, &errorcode)) goto FAILED;
4420
0
      if (i < 0)  /* NB (?0) is permitted */
4421
0
        {
4422
0
        errorcode = ERR15;   /* Unknown group */
4423
0
        goto FAILED_BACK;
4424
0
        }
4425
0
      if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4426
0
        goto UNCLOSED_PARENTHESIS;
4427
4428
0
      SET_RECURSION:
4429
0
      *parsed_pattern++ = META_RECURSE | (uint32_t)i;
4430
0
      offset = (PCRE2_SIZE)(ptr - cb->start_pattern);
4431
0
      ptr++;
4432
0
      PUTOFFSET(offset, parsed_pattern);
4433
0
      okquantifier = TRUE;
4434
0
      break;  /* End of recursive call by number handling */
4435
4436
4437
      /* ---- Recursion/subroutine calls by name ---- */
4438
4439
0
      case CHAR_AMPERSAND:
4440
0
      RECURSE_BY_NAME:
4441
0
      if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
4442
0
          &namelen, &errorcode, cb)) goto FAILED;
4443
0
      *parsed_pattern++ = META_RECURSE_BYNAME;
4444
0
      *parsed_pattern++ = namelen;
4445
0
      PUTOFFSET(offset, parsed_pattern);
4446
0
      okquantifier = TRUE;
4447
0
      break;
4448
4449
      /* ---- Callout with numerical or string argument ---- */
4450
4451
0
      case CHAR_C:
4452
0
      if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4453
4454
      /* If the previous item was a condition starting (?(? an assertion,
4455
      optionally preceded by a callout, is expected. This is checked later on,
4456
      during actual compilation. However we need to identify this kind of
4457
      assertion in this pass because it must not be quantified. The value of
4458
      expect_cond_assert is set to 2 after (?(? is processed. We decrement it
4459
      for a callout - still leaving a positive value that identifies the
4460
      assertion. Multiple callouts or any other items will make it zero or
4461
      less, which doesn't matter because they will cause an error later. */
4462
4463
0
      expect_cond_assert = prev_expect_cond_assert - 1;
4464
4465
      /* If previous_callout is not NULL, it means this follows a previous
4466
      callout. If it was a manual callout, do nothing; this means its "length
4467
      of next pattern item" field will remain zero. If it was an automatic
4468
      callout, abolish it. */
4469
4470
0
      if (previous_callout != NULL && (options & PCRE2_AUTO_CALLOUT) != 0 &&
4471
0
          previous_callout == parsed_pattern - 4 &&
4472
0
          parsed_pattern[-1] == 255)
4473
0
        parsed_pattern = previous_callout;
4474
4475
      /* Save for updating next pattern item length, and skip one item before
4476
      completing. */
4477
4478
0
      previous_callout = parsed_pattern;
4479
0
      after_manual_callout = 1;
4480
4481
      /* Handle a string argument; specific delimiter is required. */
4482
4483
0
      if (*ptr != CHAR_RIGHT_PARENTHESIS && !IS_DIGIT(*ptr))
4484
0
        {
4485
0
        PCRE2_SIZE calloutlength;
4486
0
        PCRE2_SPTR startptr = ptr;
4487
4488
0
        delimiter = 0;
4489
0
        for (i = 0; PRIV(callout_start_delims)[i] != 0; i++)
4490
0
          {
4491
0
          if (*ptr == PRIV(callout_start_delims)[i])
4492
0
            {
4493
0
            delimiter = PRIV(callout_end_delims)[i];
4494
0
            break;
4495
0
            }
4496
0
          }
4497
0
        if (delimiter == 0)
4498
0
          {
4499
0
          errorcode = ERR82;
4500
0
          goto FAILED;
4501
0
          }
4502
4503
0
        *parsed_pattern = META_CALLOUT_STRING;
4504
0
        parsed_pattern += 3;   /* Skip pattern info */
4505
4506
0
        for (;;)
4507
0
          {
4508
0
          if (++ptr >= ptrend)
4509
0
            {
4510
0
            errorcode = ERR81;
4511
0
            ptr = startptr;   /* To give a more useful message */
4512
0
            goto FAILED;
4513
0
            }
4514
0
          if (*ptr == delimiter && (++ptr >= ptrend || *ptr != delimiter))
4515
0
            break;
4516
0
          }
4517
4518
0
        calloutlength = (PCRE2_SIZE)(ptr - startptr);
4519
0
        if (calloutlength > UINT32_MAX)
4520
0
          {
4521
0
          errorcode = ERR72;
4522
0
          goto FAILED;
4523
0
          }
4524
0
        *parsed_pattern++ = (uint32_t)calloutlength;
4525
0
        offset = (PCRE2_SIZE)(startptr - cb->start_pattern);
4526
0
        PUTOFFSET(offset, parsed_pattern);
4527
0
        }
4528
4529
      /* Handle a callout with an optional numerical argument, which must be
4530
      less than or equal to 255. A missing argument gives 0. */
4531
4532
0
      else
4533
0
        {
4534
0
        int n = 0;
4535
0
        *parsed_pattern = META_CALLOUT_NUMBER;     /* Numerical callout */
4536
0
        parsed_pattern += 3;                       /* Skip pattern info */
4537
0
        while (ptr < ptrend && IS_DIGIT(*ptr))
4538
0
          {
4539
0
          n = n * 10 + *ptr++ - CHAR_0;
4540
0
          if (n > 255)
4541
0
            {
4542
0
            errorcode = ERR38;
4543
0
            goto FAILED;
4544
0
            }
4545
0
          }
4546
0
        *parsed_pattern++ = n;
4547
0
        }
4548
4549
      /* Both formats must have a closing parenthesis */
4550
4551
0
      if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4552
0
        {
4553
0
        errorcode = ERR39;
4554
0
        goto FAILED;
4555
0
        }
4556
0
      ptr++;
4557
4558
      /* Remember the offset to the next item in the pattern, and set a default
4559
      length. This should get updated after the next item is read. */
4560
4561
0
      previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
4562
0
      previous_callout[2] = 0;
4563
0
      break;                  /* End callout */
4564
4565
4566
      /* ---- Conditional group ---- */
4567
4568
      /* A condition can be an assertion, a number (referring to a numbered
4569
      group's having been set), a name (referring to a named group), or 'R',
4570
      referring to overall recursion. R<digits> and R&name are also permitted
4571
      for recursion state tests. Numbers may be preceded by + or - to specify a
4572
      relative group number.
4573
4574
      There are several syntaxes for testing a named group: (?(name)) is used
4575
      by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4576
4577
      There are two unfortunate ambiguities. 'R' can be the recursive thing or
4578
      the name 'R' (and similarly for 'R' followed by digits). 'DEFINE' can be
4579
      the Perl DEFINE feature or the Python named test. We look for a name
4580
      first; if not found, we try the other case.
4581
4582
      For compatibility with auto-callouts, we allow a callout to be specified
4583
      before a condition that is an assertion. */
4584
4585
0
      case CHAR_LEFT_PARENTHESIS:
4586
0
      if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4587
0
      nest_depth++;
4588
4589
      /* If the next character is ? or * there must be an assertion next
4590
      (optionally preceded by a callout). We do not check this here, but
4591
      instead we set expect_cond_assert to 2. If this is still greater than
4592
      zero (callouts decrement it) when the next assertion is read, it will be
4593
      marked as a condition that must not be repeated. A value greater than
4594
      zero also causes checking that an assertion (possibly with callout)
4595
      follows. */
4596
4597
0
      if (*ptr == CHAR_QUESTION_MARK || *ptr == CHAR_ASTERISK)
4598
0
        {
4599
0
        *parsed_pattern++ = META_COND_ASSERT;
4600
0
        ptr--;   /* Pull pointer back to the opening parenthesis. */
4601
0
        expect_cond_assert = 2;
4602
0
        break;  /* End of conditional */
4603
0
        }
4604
4605
      /* Handle (?([+-]number)... */
4606
4607
0
      if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
4608
0
          &errorcode))
4609
0
        {
4610
0
        if (i <= 0)
4611
0
          {
4612
0
          errorcode = ERR15;
4613
0
          goto FAILED;
4614
0
          }
4615
0
        *parsed_pattern++ = META_COND_NUMBER;
4616
0
        offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
4617
0
        PUTOFFSET(offset, parsed_pattern);
4618
0
        *parsed_pattern++ = i;
4619
0
        }
4620
0
      else if (errorcode != 0) goto FAILED;   /* Number too big */
4621
4622
      /* No number found. Handle the special case (?(VERSION[>]=n.m)... */
4623
4624
0
      else if (ptrend - ptr >= 10 &&
4625
0
               PRIV(strncmp_c8)(ptr, STRING_VERSION, 7) == 0 &&
4626
0
               ptr[7] != CHAR_RIGHT_PARENTHESIS)
4627
0
        {
4628
0
        uint32_t ge = 0;
4629
0
        int major = 0;
4630
0
        int minor = 0;
4631
4632
0
        ptr += 7;
4633
0
        if (*ptr == CHAR_GREATER_THAN_SIGN)
4634
0
          {
4635
0
          ge = 1;
4636
0
          ptr++;
4637
0
          }
4638
4639
        /* NOTE: cannot write IS_DIGIT(*(++ptr)) here because IS_DIGIT
4640
        references its argument twice. */
4641
4642
0
        if (*ptr != CHAR_EQUALS_SIGN || (ptr++, !IS_DIGIT(*ptr)))
4643
0
          goto BAD_VERSION_CONDITION;
4644
4645
0
        if (!read_number(&ptr, ptrend, -1, 1000, ERR79, &major, &errorcode))
4646
0
          goto FAILED;
4647
4648
0
        if (ptr >= ptrend) goto BAD_VERSION_CONDITION;
4649
0
        if (*ptr == CHAR_DOT)
4650
0
          {
4651
0
          if (++ptr >= ptrend || !IS_DIGIT(*ptr)) goto BAD_VERSION_CONDITION;
4652
0
          minor = (*ptr++ - CHAR_0) * 10;
4653
0
          if (ptr >= ptrend) goto BAD_VERSION_CONDITION;
4654
0
          if (IS_DIGIT(*ptr)) minor += *ptr++ - CHAR_0;
4655
0
          if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4656
0
            goto BAD_VERSION_CONDITION;
4657
0
          }
4658
4659
0
        *parsed_pattern++ = META_COND_VERSION;
4660
0
        *parsed_pattern++ = ge;
4661
0
        *parsed_pattern++ = major;
4662
0
        *parsed_pattern++ = minor;
4663
0
        }
4664
4665
      /* All the remaining cases now require us to read a name. We cannot at
4666
      this stage distinguish ambiguous cases such as (?(R12) which might be a
4667
      recursion test by number or a name, because the named groups have not yet
4668
      all been identified. Those cases are treated as names, but given a
4669
      different META code. */
4670
4671
0
      else
4672
0
        {
4673
0
        BOOL was_r_ampersand = FALSE;
4674
4675
0
        if (*ptr == CHAR_R && ptrend - ptr > 1 && ptr[1] == CHAR_AMPERSAND)
4676
0
          {
4677
0
          terminator = CHAR_RIGHT_PARENTHESIS;
4678
0
          was_r_ampersand = TRUE;
4679
0
          ptr++;
4680
0
          }
4681
0
        else if (*ptr == CHAR_LESS_THAN_SIGN)
4682
0
          terminator = CHAR_GREATER_THAN_SIGN;
4683
0
        else if (*ptr == CHAR_APOSTROPHE)
4684
0
          terminator = CHAR_APOSTROPHE;
4685
0
        else
4686
0
          {
4687
0
          terminator = CHAR_RIGHT_PARENTHESIS;
4688
0
          ptr--;   /* Point to char before name */
4689
0
          }
4690
0
        if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
4691
0
            &errorcode, cb)) goto FAILED;
4692
4693
        /* Handle (?(R&name) */
4694
4695
0
        if (was_r_ampersand)
4696
0
          {
4697
0
          *parsed_pattern = META_COND_RNAME;
4698
0
          ptr--;   /* Back to closing parens */
4699
0
          }
4700
4701
        /* Handle (?(name). If the name is "DEFINE" we identify it with a
4702
        special code. Likewise if the name consists of R followed only by
4703
        digits. Otherwise, handle it like a quoted name. */
4704
4705
0
        else if (terminator == CHAR_RIGHT_PARENTHESIS)
4706
0
          {
4707
0
          if (namelen == 6 && PRIV(strncmp_c8)(name, STRING_DEFINE, 6) == 0)
4708
0
            *parsed_pattern = META_COND_DEFINE;
4709
0
          else
4710
0
            {
4711
0
            for (i = 1; i < (int)namelen; i++)
4712
0
              if (!IS_DIGIT(name[i])) break;
4713
0
            *parsed_pattern = (*name == CHAR_R && i >= (int)namelen)?
4714
0
              META_COND_RNUMBER : META_COND_NAME;
4715
0
            }
4716
0
          ptr--;   /* Back to closing parens */
4717
0
          }
4718
4719
        /* Handle (?('name') or (?(<name>) */
4720
4721
0
        else *parsed_pattern = META_COND_NAME;
4722
4723
        /* All these cases except DEFINE end with the name length and offset;
4724
        DEFINE just has an offset (for the "too many branches" error). */
4725
4726
0
        if (*parsed_pattern++ != META_COND_DEFINE) *parsed_pattern++ = namelen;
4727
0
        PUTOFFSET(offset, parsed_pattern);
4728
0
        }  /* End cases that read a name */
4729
4730
      /* Check the closing parenthesis of the condition */
4731
4732
0
      if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4733
0
        {
4734
0
        errorcode = ERR24;
4735
0
        goto FAILED;
4736
0
        }
4737
0
      ptr++;
4738
0
      break;  /* End of condition processing */
4739
4740
4741
      /* ---- Atomic group ---- */
4742
4743
0
      case CHAR_GREATER_THAN_SIGN:
4744
0
      ATOMIC_GROUP:                          /* Come from (*atomic: */
4745
0
      *parsed_pattern++ = META_ATOMIC;
4746
0
      nest_depth++;
4747
0
      ptr++;
4748
0
      break;
4749
4750
4751
      /* ---- Lookahead assertions ---- */
4752
4753
5
      case CHAR_EQUALS_SIGN:
4754
5
      POSITIVE_LOOK_AHEAD:                   /* Come from (*pla: */
4755
5
      *parsed_pattern++ = META_LOOKAHEAD;
4756
5
      ptr++;
4757
5
      goto POST_ASSERTION;
4758
4759
0
      case CHAR_ASTERISK:
4760
0
      POSITIVE_NONATOMIC_LOOK_AHEAD:         /* Come from (?* */
4761
0
      *parsed_pattern++ = META_LOOKAHEAD_NA;
4762
0
      ptr++;
4763
0
      goto POST_ASSERTION;
4764
4765
1
      case CHAR_EXCLAMATION_MARK:
4766
1
      NEGATIVE_LOOK_AHEAD:                   /* Come from (*nla: */
4767
1
      *parsed_pattern++ = META_LOOKAHEADNOT;
4768
1
      ptr++;
4769
1
      goto POST_ASSERTION;
4770
4771
4772
      /* ---- Lookbehind assertions ---- */
4773
4774
      /* (?< followed by = or ! or * is a lookbehind assertion. Otherwise (?<
4775
      is the start of the name of a capturing group. */
4776
4777
1
      case CHAR_LESS_THAN_SIGN:
4778
1
      if (ptrend - ptr <= 1 ||
4779
1
         (ptr[1] != CHAR_EQUALS_SIGN &&
4780
0
          ptr[1] != CHAR_EXCLAMATION_MARK &&
4781
0
          ptr[1] != CHAR_ASTERISK))
4782
0
        {
4783
0
        terminator = CHAR_GREATER_THAN_SIGN;
4784
0
        goto DEFINE_NAME;
4785
0
        }
4786
1
      *parsed_pattern++ = (ptr[1] == CHAR_EQUALS_SIGN)?
4787
1
        META_LOOKBEHIND : (ptr[1] == CHAR_EXCLAMATION_MARK)?
4788
0
        META_LOOKBEHINDNOT : META_LOOKBEHIND_NA;
4789
4790
1
      POST_LOOKBEHIND:           /* Come from (*plb: (*naplb: and (*nlb: */
4791
1
      *has_lookbehind = TRUE;
4792
1
      offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
4793
1
      PUTOFFSET(offset, parsed_pattern);
4794
1
      ptr += 2;
4795
      /* Fall through */
4796
4797
      /* If the previous item was a condition starting (?(? an assertion,
4798
      optionally preceded by a callout, is expected. This is checked later on,
4799
      during actual compilation. However we need to identify this kind of
4800
      assertion in this pass because it must not be quantified. The value of
4801
      expect_cond_assert is set to 2 after (?(? is processed. We decrement it
4802
      for a callout - still leaving a positive value that identifies the
4803
      assertion. Multiple callouts or any other items will make it zero or
4804
      less, which doesn't matter because they will cause an error later. */
4805
4806
7
      POST_ASSERTION:
4807
7
      nest_depth++;
4808
7
      if (prev_expect_cond_assert > 0)
4809
0
        {
4810
0
        if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4811
0
        else if (++top_nest >= end_nests)
4812
0
          {
4813
0
          errorcode = ERR84;
4814
0
          goto FAILED;
4815
0
          }
4816
0
        top_nest->nest_depth = nest_depth;
4817
0
        top_nest->flags = NSF_CONDASSERT;
4818
0
        top_nest->options = options & PARSE_TRACKED_OPTIONS;
4819
0
        top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;
4820
0
        }
4821
7
      break;
4822
4823
4824
      /* ---- Define a named group ---- */
4825
4826
      /* A named group may be defined as (?'name') or (?<name>). In the latter
4827
      case we jump to DEFINE_NAME from the disambiguation of (?< above with the
4828
      terminator set to '>'. */
4829
4830
7
      case CHAR_APOSTROPHE:
4831
0
      terminator = CHAR_APOSTROPHE;    /* Terminator */
4832
4833
0
      DEFINE_NAME:
4834
0
      if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
4835
0
          &errorcode, cb)) goto FAILED;
4836
4837
      /* We have a name for this capturing group. It is also assigned a number,
4838
      which is its primary means of identification. */
4839
4840
0
      if (cb->bracount >= MAX_GROUP_NUMBER)
4841
0
        {
4842
0
        errorcode = ERR97;
4843
0
        goto FAILED;
4844
0
        }
4845
0
      cb->bracount++;
4846
0
      *parsed_pattern++ = META_CAPTURE | cb->bracount;
4847
0
      nest_depth++;
4848
4849
      /* Check not too many names */
4850
4851
0
      if (cb->names_found >= MAX_NAME_COUNT)
4852
0
        {
4853
0
        errorcode = ERR49;
4854
0
        goto FAILED;
4855
0
        }
4856
4857
      /* Adjust the entry size to accommodate the longest name found. */
4858
4859
0
      if (namelen + IMM2_SIZE + 1 > cb->name_entry_size)
4860
0
        cb->name_entry_size = (uint16_t)(namelen + IMM2_SIZE + 1);
4861
4862
      /* Scan the list to check for duplicates. For duplicate names, if the
4863
      number is the same, break the loop, which causes the name to be
4864
      discarded; otherwise, if DUPNAMES is not set, give an error.
4865
      If it is set, allow the name with a different number, but continue
4866
      scanning in case this is a duplicate with the same number. For
4867
      non-duplicate names, give an error if the number is duplicated. */
4868
4869
0
      isdupname = FALSE;
4870
0
      ng = cb->named_groups;
4871
0
      for (i = 0; i < cb->names_found; i++, ng++)
4872
0
        {
4873
0
        if (namelen == ng->length &&
4874
0
            PRIV(strncmp)(name, ng->name, (PCRE2_SIZE)namelen) == 0)
4875
0
          {
4876
0
          if (ng->number == cb->bracount) break;
4877
0
          if ((options & PCRE2_DUPNAMES) == 0)
4878
0
            {
4879
0
            errorcode = ERR43;
4880
0
            goto FAILED;
4881
0
            }
4882
0
          isdupname = ng->isdup = TRUE;     /* Mark as a duplicate */
4883
0
          cb->dupnames = TRUE;              /* Duplicate names exist */
4884
0
          }
4885
0
        else if (ng->number == cb->bracount)
4886
0
          {
4887
0
          errorcode = ERR65;
4888
0
          goto FAILED;
4889
0
          }
4890
0
        }
4891
4892
0
      if (i < cb->names_found) break;   /* Ignore duplicate with same number */
4893
4894
      /* Increase the list size if necessary */
4895
4896
0
      if (cb->names_found >= cb->named_group_list_size)
4897
0
        {
4898
0
        uint32_t newsize = cb->named_group_list_size * 2;
4899
0
        named_group *newspace =
4900
0
          cb->cx->memctl.malloc(newsize * sizeof(named_group),
4901
0
          cb->cx->memctl.memory_data);
4902
0
        if (newspace == NULL)
4903
0
          {
4904
0
          errorcode = ERR21;
4905
0
          goto FAILED;
4906
0
          }
4907
4908
0
        memcpy(newspace, cb->named_groups,
4909
0
          cb->named_group_list_size * sizeof(named_group));
4910
0
        if (cb->named_group_list_size > NAMED_GROUP_LIST_SIZE)
4911
0
          cb->cx->memctl.free((void *)cb->named_groups,
4912
0
          cb->cx->memctl.memory_data);
4913
0
        cb->named_groups = newspace;
4914
0
        cb->named_group_list_size = newsize;
4915
0
        }
4916
4917
      /* Add this name to the list */
4918
4919
0
      cb->named_groups[cb->names_found].name = name;
4920
0
      cb->named_groups[cb->names_found].length = (uint16_t)namelen;
4921
0
      cb->named_groups[cb->names_found].number = cb->bracount;
4922
0
      cb->named_groups[cb->names_found].isdup = (uint16_t)isdupname;
4923
0
      cb->names_found++;
4924
0
      break;
4925
12
      }        /* End of (? switch */
4926
12
    break;     /* End of ( handling */
4927
4928
4929
    /* ---- Branch terminators ---- */
4930
4931
    /* Alternation: reset the capture count if we are in a (?| group. */
4932
4933
5.60k
    case CHAR_VERTICAL_LINE:
4934
5.60k
    if (top_nest != NULL && top_nest->nest_depth == nest_depth &&
4935
7
        (top_nest->flags & NSF_RESET) != 0)
4936
0
      {
4937
0
      if (cb->bracount > top_nest->max_group)
4938
0
        top_nest->max_group = (uint16_t)cb->bracount;
4939
0
      cb->bracount = top_nest->reset_group;
4940
0
      }
4941
5.60k
    *parsed_pattern++ = META_ALT;
4942
5.60k
    break;
4943
4944
    /* End of group; reset the capture count to the maximum if we are in a (?|
4945
    group and/or reset the options that are tracked during parsing. Disallow
4946
    quantifier for a condition that is an assertion. */
4947
4948
1.41k
    case CHAR_RIGHT_PARENTHESIS:
4949
1.41k
    okquantifier = TRUE;
4950
1.41k
    if (top_nest != NULL && top_nest->nest_depth == nest_depth)
4951
5
      {
4952
5
      options = (options & ~PARSE_TRACKED_OPTIONS) | top_nest->options;
4953
5
      xoptions = (xoptions & ~PARSE_TRACKED_EXTRA_OPTIONS) | top_nest->xoptions;
4954
5
      if ((top_nest->flags & NSF_RESET) != 0 &&
4955
0
          top_nest->max_group > cb->bracount)
4956
0
        cb->bracount = top_nest->max_group;
4957
5
      if ((top_nest->flags & NSF_CONDASSERT) != 0)
4958
0
        okquantifier = FALSE;
4959
4960
5
      if ((top_nest->flags & NSF_ATOMICSR) != 0)
4961
0
        {
4962
0
        *parsed_pattern++ = META_KET;
4963
0
        }
4964
4965
5
      if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL;
4966
0
        else top_nest--;
4967
5
      }
4968
1.41k
    if (nest_depth == 0)    /* Unmatched closing parenthesis */
4969
15
      {
4970
15
      errorcode = ERR22;
4971
15
      goto FAILED_BACK;
4972
15
      }
4973
1.40k
    nest_depth--;
4974
1.40k
    *parsed_pattern++ = META_KET;
4975
1.40k
    break;
4976
190k
    }  /* End of switch on pattern character */
4977
190k
  }    /* End of main character scan loop */
4978
4979
/* End of pattern reached. Check for missing ) at the end of a verb name. */
4980
4981
385
if (inverbname && ptr >= ptrend)
4982
1
  {
4983
1
  errorcode = ERR60;
4984
1
  goto FAILED;
4985
1
  }
4986
4987
/* Manage callout for the final item */
4988
4989
384
PARSED_END:
4990
384
parsed_pattern = manage_callouts(ptr, &previous_callout, auto_callout,
4991
384
  parsed_pattern, cb);
4992
4993
/* Insert trailing items for word and line matching (features provided for the
4994
benefit of pcre2grep). */
4995
4996
384
if ((xoptions & PCRE2_EXTRA_MATCH_LINE) != 0)
4997
0
  {
4998
0
  *parsed_pattern++ = META_KET;
4999
0
  *parsed_pattern++ = META_DOLLAR;
5000
0
  }
5001
384
else if ((xoptions & PCRE2_EXTRA_MATCH_WORD) != 0)
5002
0
  {
5003
0
  *parsed_pattern++ = META_KET;
5004
0
  *parsed_pattern++ = META_ESCAPE + ESC_b;
5005
0
  }
5006
5007
/* Terminate the parsed pattern, then return success if all groups are closed.
5008
Otherwise we have unclosed parentheses. */
5009
5010
384
if (parsed_pattern >= parsed_pattern_end)
5011
0
  {
5012
0
  errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
5013
0
  goto FAILED;
5014
0
  }
5015
5016
384
*parsed_pattern = META_END;
5017
384
if (nest_depth == 0) return 0;
5018
5019
16
UNCLOSED_PARENTHESIS:
5020
16
errorcode = ERR14;
5021
5022
/* Come here for all failures. */
5023
5024
336
FAILED:
5025
336
cb->erroroffset = (PCRE2_SIZE)(ptr - cb->start_pattern);
5026
336
return errorcode;
5027
5028
/* Some errors need to indicate the previous character. */
5029
5030
21
FAILED_BACK:
5031
21
ptr--;
5032
21
goto FAILED;
5033
5034
/* This failure happens several times. */
5035
5036
0
BAD_VERSION_CONDITION:
5037
0
errorcode = ERR79;
5038
0
goto FAILED;
5039
16
}
5040
5041
5042
5043
/*************************************************
5044
*       Find first significant opcode            *
5045
*************************************************/
5046
5047
/* This is called by several functions that scan a compiled expression looking
5048
for a fixed first character, or an anchoring opcode etc. It skips over things
5049
that do not influence this. For some calls, it makes sense to skip negative
5050
forward and all backward assertions, and also the \b assertion; for others it
5051
does not.
5052
5053
Arguments:
5054
  code         pointer to the start of the group
5055
  skipassert   TRUE if certain assertions are to be skipped
5056
5057
Returns:       pointer to the first significant opcode
5058
*/
5059
5060
static const PCRE2_UCHAR*
5061
first_significant_code(PCRE2_SPTR code, BOOL skipassert)
5062
934
{
5063
934
for (;;)
5064
972
  {
5065
972
  switch ((int)*code)
5066
972
    {
5067
0
    case OP_ASSERT_NOT:
5068
0
    case OP_ASSERTBACK:
5069
0
    case OP_ASSERTBACK_NOT:
5070
0
    case OP_ASSERTBACK_NA:
5071
0
    if (!skipassert) return code;
5072
0
    do code += GET(code, 1); while (*code == OP_ALT);
5073
0
    code += PRIV(OP_lengths)[*code];
5074
0
    break;
5075
5076
3
    case OP_WORD_BOUNDARY:
5077
31
    case OP_NOT_WORD_BOUNDARY:
5078
46
    case OP_UCP_WORD_BOUNDARY:
5079
118
    case OP_NOT_UCP_WORD_BOUNDARY:
5080
118
    if (!skipassert) return code;
5081
    /* Fall through */
5082
5083
38
    case OP_CALLOUT:
5084
38
    case OP_CREF:
5085
38
    case OP_DNCREF:
5086
38
    case OP_RREF:
5087
38
    case OP_DNRREF:
5088
38
    case OP_FALSE:
5089
38
    case OP_TRUE:
5090
38
    code += PRIV(OP_lengths)[*code];
5091
38
    break;
5092
5093
0
    case OP_CALLOUT_STR:
5094
0
    code += GET(code, 1 + 2*LINK_SIZE);
5095
0
    break;
5096
5097
0
    case OP_SKIPZERO:
5098
0
    code += 2 + GET(code, 2) + LINK_SIZE;
5099
0
    break;
5100
5101
0
    case OP_COND:
5102
0
    case OP_SCOND:
5103
0
    if (code[1+LINK_SIZE] != OP_FALSE ||   /* Not DEFINE */
5104
0
        code[GET(code, 1)] != OP_KET)      /* More than one branch */
5105
0
      return code;
5106
0
    code += GET(code, 1) + 1 + LINK_SIZE;
5107
0
    break;
5108
5109
0
    case OP_MARK:
5110
0
    case OP_COMMIT_ARG:
5111
0
    case OP_PRUNE_ARG:
5112
0
    case OP_SKIP_ARG:
5113
0
    case OP_THEN_ARG:
5114
0
    code += code[1] + PRIV(OP_lengths)[*code];
5115
0
    break;
5116
5117
854
    default:
5118
854
    return code;
5119
972
    }
5120
972
  }
5121
/* Control never reaches here */
5122
934
}
5123
5124
5125
5126
#ifdef SUPPORT_UNICODE
5127
/*************************************************
5128
*           Get othercase range                  *
5129
*************************************************/
5130
5131
/* This function is passed the start and end of a class range in UCP mode. For
5132
single characters the range may be just one character long. The function
5133
searches up the characters, looking for ranges of characters in the "other"
5134
case. Each call returns the next one, updating the start address. A character
5135
with multiple other cases is returned on its own with a special return value.
5136
5137
Arguments:
5138
  cptr        points to starting character value; updated
5139
  d           end value
5140
  ocptr       where to put start of othercase range
5141
  odptr       where to put end of othercase range
5142
  restricted  TRUE if caseless restriction applies
5143
5144
Yield:        -1 when no more
5145
               0 when a range is returned
5146
              >0 the CASESET offset for char with multiple other cases;
5147
                 for this return, *ocptr contains the original
5148
*/
5149
5150
static int
5151
get_othercase_range(uint32_t *cptr, uint32_t d, uint32_t *ocptr,
5152
  uint32_t *odptr, BOOL restricted)
5153
24.6k
{
5154
24.6k
uint32_t c, othercase, next;
5155
24.6k
unsigned int co;
5156
5157
/* Find the first character that has an other case. If it has multiple other
5158
cases, return its case offset value. When CASELESS_RESTRICT is set, ignore the
5159
multi-case entries that begin with ASCII values. In 32-bit mode, a value
5160
greater than the Unicode maximum ends the range. */
5161
5162
54.6k
for (c = *cptr; c <= d; c++)
5163
37.4k
  {
5164
#if PCRE2_CODE_UNIT_WIDTH == 32
5165
  if (c > MAX_UTF_CODE_POINT) return -1;
5166
#endif
5167
37.4k
  if ((co = UCD_CASESET(c)) != 0 &&
5168
1.19k
      (!restricted || PRIV(ucd_caseless_sets)[co] > 127))
5169
1.19k
    {
5170
1.19k
    *ocptr = c++;   /* Character that has the set */
5171
1.19k
    *cptr = c;      /* Rest of input range */
5172
1.19k
    return (int)co;
5173
1.19k
    }
5174
5175
   /* This is not a valid multiple-case character. Check that the single other
5176
   case is different to the original. We don't need to check "restricted" here
5177
   because the non-ASCII characters with multiple cases that include an ASCII
5178
   character don't have a different "othercase". */
5179
5180
36.2k
  if ((othercase = UCD_OTHERCASE(c)) != c) break;
5181
36.2k
  }
5182
5183
23.4k
if (c > d) return -1;  /* Reached end of range */
5184
5185
/* Found a character that has a single other case. Search for the end of the
5186
range, which is either the end of the input range, or a character that has zero
5187
or more than one other cases. */
5188
5189
6.17k
*ocptr = othercase;
5190
6.17k
next = othercase + 1;
5191
5192
13.5k
for (++c; c <= d; c++)
5193
9.04k
  {
5194
9.04k
  if ((co = UCD_CASESET(c)) != 0 || UCD_OTHERCASE(c) != next) break;
5195
7.33k
  next++;
5196
7.33k
  }
5197
5198
6.17k
*odptr = next - 1;     /* End of othercase range */
5199
6.17k
*cptr = c;             /* Rest of input range */
5200
6.17k
return 0;
5201
23.4k
}
5202
#endif  /* SUPPORT_UNICODE */
5203
5204
5205
5206
/*************************************************
5207
* Add a character or range to a class (internal) *
5208
*************************************************/
5209
5210
/* This function packages up the logic of adding a character or range of
5211
characters to a class. The character values in the arguments will be within the
5212
valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
5213
called only from within the "add to class" group of functions, some of which
5214
are recursive and mutually recursive. The external entry point is
5215
add_to_class().
5216
5217
Arguments:
5218
  classbits     the bit map for characters < 256
5219
  uchardptr     points to the pointer for extra data
5220
  options       the options bits
5221
  xoptions      the extra options bits
5222
  cb            compile data
5223
  start         start of range character
5224
  end           end of range character
5225
5226
Returns:        the number of < 256 characters added
5227
                the pointer to extra data is updated
5228
*/
5229
5230
static unsigned int
5231
add_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
5232
  uint32_t options, uint32_t xoptions, compile_block *cb, uint32_t start,
5233
  uint32_t end)
5234
45.2k
{
5235
45.2k
uint32_t c;
5236
45.2k
uint32_t classbits_end = (end <= 0xff ? end : 0xff);
5237
45.2k
unsigned int n8 = 0;
5238
5239
/* If caseless matching is required, scan the range and process alternate
5240
cases. In Unicode, there are 8-bit characters that have alternate cases that
5241
are greater than 255 and vice-versa (though these may be ignored if caseless
5242
restriction is in force). Sometimes we can just extend the original range. */
5243
5244
45.2k
if ((options & PCRE2_CASELESS) != 0)
5245
25.7k
  {
5246
25.7k
#ifdef SUPPORT_UNICODE
5247
25.7k
  if ((options & (PCRE2_UTF|PCRE2_UCP)) != 0)
5248
17.2k
    {
5249
17.2k
    int rc;
5250
17.2k
    uint32_t oc, od;
5251
5252
17.2k
    options &= ~PCRE2_CASELESS;   /* Remove for recursive calls */
5253
17.2k
    c = start;
5254
5255
24.6k
    while ((rc = get_othercase_range(&c, end, &oc, &od,
5256
24.6k
             (xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)) >= 0)
5257
7.37k
      {
5258
      /* Handle a single character that has more than one other case. */
5259
5260
7.37k
      if (rc > 0) n8 += add_list_to_class_internal(classbits, uchardptr,
5261
1.19k
        options, xoptions, cb, PRIV(ucd_caseless_sets) + rc, oc);
5262
5263
      /* Do nothing if the other case range is within the original range. */
5264
5265
6.17k
      else if (oc >= cb->class_range_start && od <= cb->class_range_end)
5266
652
        continue;
5267
5268
      /* Extend the original range if there is overlap, noting that if oc < c,
5269
      we can't have od > end because a subrange is always shorter than the
5270
      basic range. Otherwise, use a recursive call to add the additional range.
5271
      */
5272
5273
5.52k
      else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
5274
5.52k
      else if (od > end && oc <= end + 1)
5275
0
        {
5276
0
        end = od;       /* Extend upwards */
5277
0
        if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff);
5278
0
        }
5279
5.52k
      else n8 += add_to_class_internal(classbits, uchardptr, options, xoptions,
5280
5.52k
        cb, oc, od);
5281
7.37k
      }
5282
17.2k
    }
5283
8.56k
  else
5284
#else
5285
  (void)xoptions;   /* Avoid compiler warning */
5286
#endif  /* SUPPORT_UNICODE */
5287
5288
  /* Not UTF mode */
5289
5290
20.7k
  for (c = start; c <= classbits_end; c++)
5291
12.1k
    {
5292
12.1k
    SETBIT(classbits, cb->fcc[c]);
5293
12.1k
    n8++;
5294
12.1k
    }
5295
25.7k
  }
5296
5297
/* Now handle the originally supplied range. Adjust the final value according
5298
to the bit length - this means that the same lists of (e.g.) horizontal spaces
5299
can be used in all cases. */
5300
5301
45.2k
if ((options & PCRE2_UTF) == 0 && end > MAX_NON_UTF_CHAR)
5302
56
  end = MAX_NON_UTF_CHAR;
5303
5304
45.2k
if (start > cb->class_range_start && end < cb->class_range_end) return n8;
5305
5306
/* Use the bitmap for characters < 256. Otherwise use extra data.*/
5307
5308
128k
for (c = start; c <= classbits_end; c++)
5309
83.4k
  {
5310
  /* Regardless of start, c will always be <= 255. */
5311
83.4k
  SETBIT(classbits, c);
5312
83.4k
  n8++;
5313
83.4k
  }
5314
5315
45.2k
#ifdef SUPPORT_WIDE_CHARS
5316
45.2k
if (start <= 0xff) start = 0xff + 1;
5317
5318
45.2k
if (end >= start)
5319
1.33k
  {
5320
1.33k
  PCRE2_UCHAR *uchardata = *uchardptr;
5321
5322
1.33k
#ifdef SUPPORT_UNICODE
5323
1.33k
  if ((options & PCRE2_UTF) != 0)
5324
1.33k
    {
5325
1.33k
    if (start < end)
5326
8
      {
5327
8
      *uchardata++ = XCL_RANGE;
5328
8
      uchardata += PRIV(ord2utf)(start, uchardata);
5329
8
      uchardata += PRIV(ord2utf)(end, uchardata);
5330
8
      }
5331
1.32k
    else if (start == end)
5332
1.32k
      {
5333
1.32k
      *uchardata++ = XCL_SINGLE;
5334
1.32k
      uchardata += PRIV(ord2utf)(start, uchardata);
5335
1.32k
      }
5336
1.33k
    }
5337
0
  else
5338
0
#endif  /* SUPPORT_UNICODE */
5339
5340
  /* Without UTF support, character values are constrained by the bit length,
5341
  and can only be > 256 for 16-bit and 32-bit libraries. */
5342
5343
0
#if PCRE2_CODE_UNIT_WIDTH == 8
5344
0
    {}
5345
#else
5346
  if (start < end)
5347
    {
5348
    *uchardata++ = XCL_RANGE;
5349
    *uchardata++ = start;
5350
    *uchardata++ = end;
5351
    }
5352
  else if (start == end)
5353
    {
5354
    *uchardata++ = XCL_SINGLE;
5355
    *uchardata++ = start;
5356
    }
5357
#endif  /* PCRE2_CODE_UNIT_WIDTH == 8 */
5358
1.33k
  *uchardptr = uchardata;   /* Updata extra data pointer */
5359
1.33k
  }
5360
#else  /* SUPPORT_WIDE_CHARS */
5361
  (void)uchardptr;          /* Avoid compiler warning */
5362
#endif /* SUPPORT_WIDE_CHARS */
5363
5364
45.2k
return n8;    /* Number of 8-bit characters */
5365
45.2k
}
5366
5367
5368
5369
#ifdef SUPPORT_UNICODE
5370
/*************************************************
5371
* Add a list of characters to a class (internal) *
5372
*************************************************/
5373
5374
/* This function is used for adding a list of case-equivalent characters to a
5375
class when in UTF mode. This function is called only from within
5376
add_to_class_internal(), with which it is mutually recursive.
5377
5378
Arguments:
5379
  classbits     the bit map for characters < 256
5380
  uchardptr     points to the pointer for extra data
5381
  options       the options bits
5382
  xoptions      the extra options bits
5383
  cb            contains pointers to tables etc.
5384
  p             points to row of 32-bit values, terminated by NOTACHAR
5385
  except        character to omit; this is used when adding lists of
5386
                  case-equivalent characters to avoid including the one we
5387
                  already know about
5388
5389
Returns:        the number of < 256 characters added
5390
                the pointer to extra data is updated
5391
*/
5392
5393
static unsigned int
5394
add_list_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
5395
  uint32_t options, uint32_t xoptions, compile_block *cb, const uint32_t *p,
5396
  unsigned int except)
5397
1.19k
{
5398
1.19k
unsigned int n8 = 0;
5399
4.74k
while (p[0] < NOTACHAR)
5400
3.54k
  {
5401
3.54k
  unsigned int n = 0;
5402
3.54k
  if (p[0] != except)
5403
2.36k
    {
5404
2.40k
    while(p[n+1] == p[0] + n + 1) n++;
5405
2.36k
    n8 += add_to_class_internal(classbits, uchardptr, options, xoptions, cb,
5406
2.36k
      p[0], p[n]);
5407
2.36k
    }
5408
3.54k
  p += n + 1;
5409
3.54k
  }
5410
1.19k
return n8;
5411
1.19k
}
5412
#endif
5413
5414
5415
5416
/*************************************************
5417
*   External entry point for add range to class  *
5418
*************************************************/
5419
5420
/* This function sets the overall range so that the internal functions can try
5421
to avoid duplication when handling case-independence.
5422
5423
Arguments:
5424
  classbits     the bit map for characters < 256
5425
  uchardptr     points to the pointer for extra data
5426
  options       the options bits
5427
  xoptions      the extra options bits
5428
  cb            compile data
5429
  start         start of range character
5430
  end           end of range character
5431
5432
Returns:        the number of < 256 characters added
5433
                the pointer to extra data is updated
5434
*/
5435
5436
static unsigned int
5437
add_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
5438
  uint32_t xoptions, compile_block *cb, uint32_t start, uint32_t end)
5439
37.3k
{
5440
37.3k
cb->class_range_start = start;
5441
37.3k
cb->class_range_end = end;
5442
37.3k
return add_to_class_internal(classbits, uchardptr, options, xoptions, cb,
5443
37.3k
  start, end);
5444
37.3k
}
5445
5446
5447
/*************************************************
5448
*   External entry point for add list to class   *
5449
*************************************************/
5450
5451
/* This function is used for adding a list of horizontal or vertical whitespace
5452
characters to a class. The list must be in order so that ranges of characters
5453
can be detected and handled appropriately. This function sets the overall range
5454
so that the internal functions can try to avoid duplication when handling
5455
case-independence.
5456
5457
Arguments:
5458
  classbits     the bit map for characters < 256
5459
  uchardptr     points to the pointer for extra data
5460
  options       the options bits
5461
  xoptions      the extra options bits
5462
  cb            contains pointers to tables etc.
5463
  p             points to row of 32-bit values, terminated by NOTACHAR
5464
  except        character to omit; this is used when adding lists of
5465
                  case-equivalent characters to avoid including the one we
5466
                  already know about
5467
5468
Returns:        the number of < 256 characters added
5469
                the pointer to extra data is updated
5470
*/
5471
5472
static unsigned int
5473
add_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
5474
  uint32_t xoptions, compile_block *cb, const uint32_t *p, unsigned int except)
5475
4
{
5476
4
unsigned int n8 = 0;
5477
40
while (p[0] < NOTACHAR)
5478
36
  {
5479
36
  unsigned int n = 0;
5480
36
  if (p[0] != except)
5481
36
    {
5482
76
    while(p[n+1] == p[0] + n + 1) n++;
5483
36
    cb->class_range_start = p[0];
5484
36
    cb->class_range_end = p[n];
5485
36
    n8 += add_to_class_internal(classbits, uchardptr, options, xoptions, cb,
5486
36
      p[0], p[n]);
5487
36
    }
5488
36
  p += n + 1;
5489
36
  }
5490
4
return n8;
5491
4
}
5492
5493
5494
5495
/*************************************************
5496
*    Add characters not in a list to a class     *
5497
*************************************************/
5498
5499
/* This function is used for adding the complement of a list of horizontal or
5500
vertical whitespace to a class. The list must be in order.
5501
5502
Arguments:
5503
  classbits     the bit map for characters < 256
5504
  uchardptr     points to the pointer for extra data
5505
  options       the options bits
5506
  xoptions      the extra options bits
5507
  cb            contains pointers to tables etc.
5508
  p             points to row of 32-bit values, terminated by NOTACHAR
5509
5510
Returns:        the number of < 256 characters added
5511
                the pointer to extra data is updated
5512
*/
5513
5514
static unsigned int
5515
add_not_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
5516
  uint32_t options, uint32_t xoptions, compile_block *cb, const uint32_t *p)
5517
8
{
5518
8
BOOL utf = (options & PCRE2_UTF) != 0;
5519
8
unsigned int n8 = 0;
5520
8
if (p[0] > 0)
5521
8
  n8 += add_to_class(classbits, uchardptr, options, xoptions, cb, 0, p[0] - 1);
5522
80
while (p[0] < NOTACHAR)
5523
72
  {
5524
152
  while (p[1] == p[0] + 1) p++;
5525
72
  n8 += add_to_class(classbits, uchardptr, options, xoptions, cb, p[0] + 1,
5526
72
    (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
5527
72
  p++;
5528
72
  }
5529
8
return n8;
5530
8
}
5531
5532
5533
5534
/*************************************************
5535
*    Find details of duplicate group names       *
5536
*************************************************/
5537
5538
/* This is called from compile_branch() when it needs to know the index and
5539
count of duplicates in the names table when processing named backreferences,
5540
either directly, or as conditions.
5541
5542
Arguments:
5543
  name          points to the name
5544
  length        the length of the name
5545
  indexptr      where to put the index
5546
  countptr      where to put the count of duplicates
5547
  errorcodeptr  where to put an error code
5548
  cb            the compile block
5549
5550
Returns:        TRUE if OK, FALSE if not, error code set
5551
*/
5552
5553
static BOOL
5554
find_dupname_details(PCRE2_SPTR name, uint32_t length, int *indexptr,
5555
  int *countptr, int *errorcodeptr, compile_block *cb)
5556
0
{
5557
0
uint32_t i, groupnumber;
5558
0
int count;
5559
0
PCRE2_UCHAR *slot = cb->name_table;
5560
5561
/* Find the first entry in the table */
5562
5563
0
for (i = 0; i < cb->names_found; i++)
5564
0
  {
5565
0
  if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) == 0 &&
5566
0
      slot[IMM2_SIZE+length] == 0) break;
5567
0
  slot += cb->name_entry_size;
5568
0
  }
5569
5570
/* This should not occur, because this function is called only when we know we
5571
have duplicate names. Give an internal error. */
5572
5573
0
if (i >= cb->names_found)
5574
0
  {
5575
0
  *errorcodeptr = ERR53;
5576
0
  cb->erroroffset = name - cb->start_pattern;
5577
0
  return FALSE;
5578
0
  }
5579
5580
/* Record the index and then see how many duplicates there are, updating the
5581
backref map and maximum back reference as we do. */
5582
5583
0
*indexptr = i;
5584
0
count = 0;
5585
5586
0
for (;;)
5587
0
  {
5588
0
  count++;
5589
0
  groupnumber = GET2(slot,0);
5590
0
  cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
5591
0
  if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
5592
0
  if (++i >= cb->names_found) break;
5593
0
  slot += cb->name_entry_size;
5594
0
  if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) != 0 ||
5595
0
    (slot+IMM2_SIZE)[length] != 0) break;
5596
0
  }
5597
5598
0
*countptr = count;
5599
0
return TRUE;
5600
0
}
5601
5602
5603
5604
/*************************************************
5605
*           Compile one branch                   *
5606
*************************************************/
5607
5608
/* Scan the parsed pattern, compiling it into a vector of PCRE2_UCHAR. If
5609
the options are changed during the branch, the pointer is used to change the
5610
external options bits. This function is used during the pre-compile phase when
5611
we are trying to find out the amount of memory needed, as well as during the
5612
real compile phase. The value of lengthptr distinguishes the two phases.
5613
5614
Arguments:
5615
  optionsptr        pointer to the option bits
5616
  xoptionsptr       pointer to the extra option bits
5617
  codeptr           points to the pointer to the current code point
5618
  pptrptr           points to the current parsed pattern pointer
5619
  errorcodeptr      points to error code variable
5620
  firstcuptr        place to put the first required code unit
5621
  firstcuflagsptr   place to put the first code unit flags
5622
  reqcuptr          place to put the last required code unit
5623
  reqcuflagsptr     place to put the last required code unit flags
5624
  bcptr             points to current branch chain
5625
  open_caps         points to current capitem
5626
  cb                contains pointers to tables etc.
5627
  lengthptr         NULL during the real compile phase
5628
                    points to length accumulator during pre-compile phase
5629
5630
Returns:            0 There's been an error, *errorcodeptr is non-zero
5631
                   +1 Success, this branch must match at least one character
5632
                   -1 Success, this branch may match an empty string
5633
*/
5634
5635
static int
5636
compile_branch(uint32_t *optionsptr, uint32_t *xoptionsptr,
5637
  PCRE2_UCHAR **codeptr, uint32_t **pptrptr, int *errorcodeptr,
5638
  uint32_t *firstcuptr, uint32_t *firstcuflagsptr, uint32_t *reqcuptr,
5639
  uint32_t *reqcuflagsptr, branch_chain *bcptr, open_capitem *open_caps,
5640
  compile_block *cb, PCRE2_SIZE *lengthptr)
5641
6.59k
{
5642
6.59k
int bravalue = 0;
5643
6.59k
int okreturn = -1;
5644
6.59k
int group_return = 0;
5645
6.59k
uint32_t repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
5646
6.59k
uint32_t greedy_default, greedy_non_default;
5647
6.59k
uint32_t repeat_type, op_type;
5648
6.59k
uint32_t options = *optionsptr;               /* May change dynamically */
5649
6.59k
uint32_t xoptions = *xoptionsptr;             /* May change dynamically */
5650
6.59k
uint32_t firstcu, reqcu;
5651
6.59k
uint32_t zeroreqcu, zerofirstcu;
5652
6.59k
uint32_t escape;
5653
6.59k
uint32_t *pptr = *pptrptr;
5654
6.59k
uint32_t meta, meta_arg;
5655
6.59k
uint32_t firstcuflags, reqcuflags;
5656
6.59k
uint32_t zeroreqcuflags, zerofirstcuflags;
5657
6.59k
uint32_t req_caseopt, reqvary, tempreqvary;
5658
6.59k
PCRE2_SIZE offset = 0;
5659
6.59k
PCRE2_SIZE length_prevgroup = 0;
5660
6.59k
PCRE2_UCHAR *code = *codeptr;
5661
6.59k
PCRE2_UCHAR *last_code = code;
5662
6.59k
PCRE2_UCHAR *orig_code = code;
5663
6.59k
PCRE2_UCHAR *tempcode;
5664
6.59k
PCRE2_UCHAR *previous = NULL;
5665
6.59k
PCRE2_UCHAR op_previous;
5666
6.59k
BOOL groupsetfirstcu = FALSE;
5667
6.59k
BOOL had_accept = FALSE;
5668
6.59k
BOOL matched_char = FALSE;
5669
6.59k
BOOL previous_matched_char = FALSE;
5670
6.59k
BOOL reset_caseful = FALSE;
5671
6.59k
const uint8_t *cbits = cb->cbits;
5672
6.59k
uint8_t classbits[32];
5673
5674
/* We can fish out the UTF setting once and for all into a BOOL, but we must
5675
not do this for other options (e.g. PCRE2_EXTENDED) that may change dynamically
5676
as we process the pattern. */
5677
5678
6.59k
#ifdef SUPPORT_UNICODE
5679
6.59k
BOOL utf = (options & PCRE2_UTF) != 0;
5680
6.59k
BOOL ucp = (options & PCRE2_UCP) != 0;
5681
#else  /* No Unicode support */
5682
BOOL utf = FALSE;
5683
#endif
5684
5685
/* Helper variables for OP_XCLASS opcode (for characters > 255). We define
5686
class_uchardata always so that it can be passed to add_to_class() always,
5687
though it will not be used in non-UTF 8-bit cases. This avoids having to supply
5688
alternative calls for the different cases. */
5689
5690
6.59k
PCRE2_UCHAR *class_uchardata;
5691
6.59k
#ifdef SUPPORT_WIDE_CHARS
5692
6.59k
BOOL xclass;
5693
6.59k
PCRE2_UCHAR *class_uchardata_base;
5694
6.59k
#endif
5695
5696
/* Set up the default and non-default settings for greediness */
5697
5698
6.59k
greedy_default = ((options & PCRE2_UNGREEDY) != 0);
5699
6.59k
greedy_non_default = greedy_default ^ 1;
5700
5701
/* Initialize no first unit, no required unit. REQ_UNSET means "no char
5702
matching encountered yet". It gets changed to REQ_NONE if we hit something that
5703
matches a non-fixed first unit; reqcu just remains unset if we never find one.
5704
5705
When we hit a repeat whose minimum is zero, we may have to adjust these values
5706
to take the zero repeat into account. This is implemented by setting them to
5707
zerofirstcu and zeroreqcu when such a repeat is encountered. The individual
5708
item types that can be repeated set these backoff variables appropriately. */
5709
5710
6.59k
firstcu = reqcu = zerofirstcu = zeroreqcu = 0;
5711
6.59k
firstcuflags = reqcuflags = zerofirstcuflags = zeroreqcuflags = REQ_UNSET;
5712
5713
/* The variable req_caseopt contains either the REQ_CASELESS bit or zero,
5714
according to the current setting of the caseless flag. The REQ_CASELESS value
5715
leaves the lower 28 bits empty. It is added into the firstcu or reqcu variables
5716
to record the case status of the value. This is used only for ASCII characters.
5717
*/
5718
5719
6.59k
req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
5720
5721
/* Switch on next META item until the end of the branch */
5722
5723
134k
for (;; pptr++)
5724
140k
  {
5725
140k
#ifdef SUPPORT_WIDE_CHARS
5726
140k
  BOOL xclass_has_prop;
5727
140k
#endif
5728
140k
  BOOL negate_class;
5729
140k
  BOOL should_flip_negation;
5730
140k
  BOOL match_all_or_no_wide_chars;
5731
140k
  BOOL possessive_quantifier;
5732
140k
  BOOL note_group_empty;
5733
140k
  int class_has_8bitchar;
5734
140k
  uint32_t mclength;
5735
140k
  uint32_t skipunits;
5736
140k
  uint32_t subreqcu, subfirstcu;
5737
140k
  uint32_t groupnumber;
5738
140k
  uint32_t verbarglen, verbculen;
5739
140k
  uint32_t subreqcuflags, subfirstcuflags;
5740
140k
  open_capitem *oc;
5741
140k
  PCRE2_UCHAR mcbuffer[8];
5742
5743
  /* Get next META item in the pattern and its potential argument. */
5744
5745
140k
  meta = META_CODE(*pptr);
5746
140k
  meta_arg = META_DATA(*pptr);
5747
5748
  /* If we are in the pre-compile phase, accumulate the length used for the
5749
  previous cycle of this loop, unless the next item is a quantifier. */
5750
5751
140k
  if (lengthptr != NULL)
5752
75.6k
    {
5753
75.6k
    if (code > cb->start_workspace + cb->workspace_size -
5754
75.6k
        WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
5755
0
      {
5756
0
      *errorcodeptr = (code >= cb->start_workspace + cb->workspace_size)?
5757
0
        ERR52 : ERR86;
5758
0
      return 0;
5759
0
      }
5760
5761
    /* There is at least one situation where code goes backwards: this is the
5762
    case of a zero quantifier after a class (e.g. [ab]{0}). When the quantifier
5763
    is processed, the whole class is eliminated. However, it is created first,
5764
    so we have to allow memory for it. Therefore, don't ever reduce the length
5765
    at this point. */
5766
5767
75.6k
    if (code < last_code) code = last_code;
5768
5769
    /* If the next thing is not a quantifier, we add the length of the previous
5770
    item into the total, and reset the code pointer to the start of the
5771
    workspace. Otherwise leave the previous item available to be quantified. */
5772
5773
75.6k
    if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
5774
71.3k
      {
5775
71.3k
      if (OFLOW_MAX - *lengthptr < (PCRE2_SIZE)(code - orig_code))
5776
0
        {
5777
0
        *errorcodeptr = ERR20;   /* Integer overflow */
5778
0
        return 0;
5779
0
        }
5780
71.3k
      *lengthptr += (PCRE2_SIZE)(code - orig_code);
5781
71.3k
      if (*lengthptr > MAX_PATTERN_SIZE)
5782
0
        {
5783
0
        *errorcodeptr = ERR20;   /* Pattern is too large */
5784
0
        return 0;
5785
0
        }
5786
71.3k
      code = orig_code;
5787
71.3k
      }
5788
5789
    /* Remember where this code item starts so we can catch the "backwards"
5790
    case above next time round. */
5791
5792
75.6k
    last_code = code;
5793
75.6k
    }
5794
5795
  /* Process the next parsed pattern item. If it is not a quantifier, remember
5796
  where it starts so that it can be quantified when a quantifier follows.
5797
  Checking for the legality of quantifiers happens in parse_regex(), except for
5798
  a quantifier after an assertion that is a condition. */
5799
5800
140k
  if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
5801
132k
    {
5802
132k
    previous = code;
5803
132k
    if (matched_char && !had_accept) okreturn = 1;
5804
132k
    }
5805
5806
140k
  previous_matched_char = matched_char;
5807
140k
  matched_char = FALSE;
5808
140k
  note_group_empty = FALSE;
5809
140k
  skipunits = 0;         /* Default value for most subgroups */
5810
5811
140k
  switch(meta)
5812
140k
    {
5813
    /* ===================================================================*/
5814
    /* The branch terminates at pattern end or | or ) */
5815
5816
696
    case META_END:
5817
4.94k
    case META_ALT:
5818
6.57k
    case META_KET:
5819
6.57k
    *firstcuptr = firstcu;
5820
6.57k
    *firstcuflagsptr = firstcuflags;
5821
6.57k
    *reqcuptr = reqcu;
5822
6.57k
    *reqcuflagsptr = reqcuflags;
5823
6.57k
    *codeptr = code;
5824
6.57k
    *pptrptr = pptr;
5825
6.57k
    return okreturn;
5826
5827
5828
    /* ===================================================================*/
5829
    /* Handle single-character metacharacters. In multiline mode, ^ disables
5830
    the setting of any following char as a first character. */
5831
5832
1.68k
    case META_CIRCUMFLEX:
5833
1.68k
    if ((options & PCRE2_MULTILINE) != 0)
5834
54
      {
5835
54
      if (firstcuflags == REQ_UNSET)
5836
2
        zerofirstcuflags = firstcuflags = REQ_NONE;
5837
54
      *code++ = OP_CIRCM;
5838
54
      }
5839
1.63k
    else *code++ = OP_CIRC;
5840
1.68k
    break;
5841
5842
258
    case META_DOLLAR:
5843
258
    *code++ = ((options & PCRE2_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
5844
258
    break;
5845
5846
    /* There can never be a first char if '.' is first, whatever happens about
5847
    repeats. The value of reqcu doesn't change either. */
5848
5849
1.01k
    case META_DOT:
5850
1.01k
    matched_char = TRUE;
5851
1.01k
    if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5852
1.01k
    zerofirstcu = firstcu;
5853
1.01k
    zerofirstcuflags = firstcuflags;
5854
1.01k
    zeroreqcu = reqcu;
5855
1.01k
    zeroreqcuflags = reqcuflags;
5856
1.01k
    *code++ = ((options & PCRE2_DOTALL) != 0)? OP_ALLANY: OP_ANY;
5857
1.01k
    break;
5858
5859
5860
    /* ===================================================================*/
5861
    /* Empty character classes are allowed if PCRE2_ALLOW_EMPTY_CLASS is set.
5862
    Otherwise, an initial ']' is taken as a data character. When empty classes
5863
    are allowed, [] must always fail, so generate OP_FAIL, whereas [^] must
5864
    match any character, so generate OP_ALLANY. */
5865
5866
0
    case META_CLASS_EMPTY:
5867
0
    case META_CLASS_EMPTY_NOT:
5868
0
    matched_char = TRUE;
5869
0
    *code++ = (meta == META_CLASS_EMPTY_NOT)? OP_ALLANY : OP_FAIL;
5870
0
    if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5871
0
    zerofirstcu = firstcu;
5872
0
    zerofirstcuflags = firstcuflags;
5873
0
    break;
5874
5875
5876
    /* ===================================================================*/
5877
    /* Non-empty character class. If the included characters are all < 256, we
5878
    build a 32-byte bitmap of the permitted characters, except in the special
5879
    case where there is only one such character. For negated classes, we build
5880
    the map as usual, then invert it at the end. However, we use a different
5881
    opcode so that data characters > 255 can be handled correctly.
5882
5883
    If the class contains characters outside the 0-255 range, a different
5884
    opcode is compiled. It may optionally have a bit map for characters < 256,
5885
    but those above are explicitly listed afterwards. A flag code unit tells
5886
    whether the bitmap is present, and whether this is a negated class or
5887
    not. */
5888
5889
1.74k
    case META_CLASS_NOT:
5890
3.41k
    case META_CLASS:
5891
3.41k
    matched_char = TRUE;
5892
3.41k
    negate_class = meta == META_CLASS_NOT;
5893
5894
    /* We can optimize the case of a single character in a class by generating
5895
    OP_CHAR or OP_CHARI if it's positive, or OP_NOT or OP_NOTI if it's
5896
    negative. In the negative case there can be no first char if this item is
5897
    first, whatever repeat count may follow. In the case of reqcu, save the
5898
    previous value for reinstating. */
5899
5900
    /* NOTE: at present this optimization is not effective if the only
5901
    character in a class in 32-bit, non-UCP mode has its top bit set. */
5902
5903
3.41k
    if (pptr[1] < META_END && pptr[2] == META_CLASS_END)
5904
478
      {
5905
478
#ifdef SUPPORT_UNICODE
5906
478
      uint32_t d;
5907
478
#endif
5908
478
      uint32_t c = pptr[1];
5909
5910
478
      pptr += 2;                 /* Move on to class end */
5911
478
      if (meta == META_CLASS)    /* A positive one-char class can be */
5912
124
        {                        /* handled as a normal literal character. */
5913
124
        meta = c;                /* Set up the character */
5914
124
        goto NORMAL_CHAR_SET;
5915
124
        }
5916
5917
      /* Handle a negative one-character class */
5918
5919
354
      zeroreqcu = reqcu;
5920
354
      zeroreqcuflags = reqcuflags;
5921
354
      if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5922
354
      zerofirstcu = firstcu;
5923
354
      zerofirstcuflags = firstcuflags;
5924
5925
      /* For caseless UTF or UCP mode, check whether this character has more
5926
      than one other case. If so, generate a special OP_NOTPROP item instead of
5927
      OP_NOTI. When restricted by PCRE2_EXTRA_CASELESS_RESTRICT, ignore any
5928
      caseless set that starts with an ASCII character. */
5929
5930
354
#ifdef SUPPORT_UNICODE
5931
354
      if ((utf||ucp) && (options & PCRE2_CASELESS) != 0 &&
5932
120
          (d = UCD_CASESET(c)) != 0 &&
5933
8
          ((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) == 0 ||
5934
0
          PRIV(ucd_caseless_sets)[d] > 127))
5935
8
        {
5936
8
        *code++ = OP_NOTPROP;
5937
8
        *code++ = PT_CLIST;
5938
8
        *code++ = d;
5939
8
        break;   /* We are finished with this class */
5940
8
        }
5941
346
#endif
5942
      /* Char has only one other (usable) case, or UCP not available */
5943
5944
346
      *code++ = ((options & PCRE2_CASELESS) != 0)? OP_NOTI: OP_NOT;
5945
346
      code += PUTCHAR(c, code);
5946
346
      break;   /* We are finished with this class */
5947
354
      }        /* End of 1-char optimization */
5948
5949
    /* Handle character classes that contain more than just one literal
5950
    character. If there are exactly two characters in a positive class, see if
5951
    they are case partners. This can be optimized to generate a caseless single
5952
    character match (which also sets first/required code units if relevant).
5953
    When casing restrictions apply, ignore a caseless set if both characters
5954
    are ASCII. */
5955
5956
2.93k
    if (meta == META_CLASS && pptr[1] < META_END && pptr[2] < META_END &&
5957
1.50k
        pptr[3] == META_CLASS_END)
5958
132
      {
5959
132
      uint32_t c = pptr[1];
5960
5961
132
#ifdef SUPPORT_UNICODE
5962
132
      if (UCD_CASESET(c) == 0 ||
5963
8
         ((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0 &&
5964
0
         c < 128 && pptr[2] < 128))
5965
124
#endif
5966
124
        {
5967
124
        uint32_t d;
5968
5969
124
#ifdef SUPPORT_UNICODE
5970
124
        if ((utf || ucp) && c > 127) d = UCD_OTHERCASE(c); else
5971
124
#endif
5972
124
          {
5973
#if PCRE2_CODE_UNIT_WIDTH != 8
5974
          if (c > 255) d = c; else
5975
#endif
5976
124
          d = TABLE_GET(c, cb->fcc, c);
5977
124
          }
5978
5979
124
        if (c != d && pptr[2] == d)
5980
0
          {
5981
0
          pptr += 3;                 /* Move on to class end */
5982
0
          meta = c;
5983
0
          if ((options & PCRE2_CASELESS) == 0)
5984
0
            {
5985
0
            reset_caseful = TRUE;
5986
0
            options |= PCRE2_CASELESS;
5987
0
            req_caseopt = REQ_CASELESS;
5988
0
            }
5989
0
          goto CLASS_CASELESS_CHAR;
5990
0
          }
5991
124
        }
5992
132
      }
5993
5994
    /* If a non-extended class contains a negative special such as \S, we need
5995
    to flip the negation flag at the end, so that support for characters > 255
5996
    works correctly (they are all included in the class). An extended class may
5997
    need to insert specific matching or non-matching code for wide characters.
5998
    */
5999
6000
2.93k
    should_flip_negation = match_all_or_no_wide_chars = FALSE;
6001
6002
    /* Extended class (xclass) will be used when characters > 255
6003
    might match. */
6004
6005
2.93k
#ifdef SUPPORT_WIDE_CHARS
6006
2.93k
    xclass = FALSE;
6007
2.93k
    class_uchardata = code + LINK_SIZE + 2;   /* For XCLASS items */
6008
2.93k
    class_uchardata_base = class_uchardata;   /* Save the start */
6009
2.93k
#endif
6010
6011
    /* For optimization purposes, we track some properties of the class:
6012
    class_has_8bitchar will be non-zero if the class contains at least one
6013
    character with a code point less than 256; xclass_has_prop will be TRUE if
6014
    Unicode property checks are present in the class. */
6015
6016
2.93k
    class_has_8bitchar = 0;
6017
2.93k
#ifdef SUPPORT_WIDE_CHARS
6018
2.93k
    xclass_has_prop = FALSE;
6019
2.93k
#endif
6020
6021
    /* Initialize the 256-bit (32-byte) bit map to all zeros. We build the map
6022
    in a temporary bit of memory, in case the class contains fewer than two
6023
    8-bit characters because in that case the compiled code doesn't use the bit
6024
    map. */
6025
6026
2.93k
    memset(classbits, 0, 32 * sizeof(uint8_t));
6027
6028
    /* Process items until META_CLASS_END is reached. */
6029
6030
40.5k
    while ((meta = *(++pptr)) != META_CLASS_END)
6031
37.5k
      {
6032
      /* Handle POSIX classes such as [:alpha:] etc. */
6033
6034
37.5k
      if (meta == META_POSIX || meta == META_POSIX_NEG)
6035
0
        {
6036
0
        BOOL local_negate = (meta == META_POSIX_NEG);
6037
0
        int posix_class = *(++pptr);
6038
0
        int taboffset, tabopt;
6039
0
        uint8_t pbits[32];
6040
6041
0
        should_flip_negation = local_negate;  /* Note negative special */
6042
6043
        /* If matching is caseless, upper and lower are converted to alpha.
6044
        This relies on the fact that the class table starts with alpha,
6045
        lower, upper as the first 3 entries. */
6046
6047
0
        if ((options & PCRE2_CASELESS) != 0 && posix_class <= 2)
6048
0
          posix_class = 0;
6049
6050
        /* When PCRE2_UCP is set, some of the POSIX classes are converted to
6051
        different escape sequences that use Unicode properties \p or \P.
6052
        Others that are not available via \p or \P have to generate
6053
        XCL_PROP/XCL_NOTPROP directly, which is done here. */
6054
6055
0
#ifdef SUPPORT_UNICODE
6056
0
        if ((options & PCRE2_UCP) != 0 &&
6057
0
            (xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0)
6058
0
          {
6059
0
          switch(posix_class)
6060
0
            {
6061
0
            case PC_GRAPH:
6062
0
            case PC_PRINT:
6063
0
            case PC_PUNCT:
6064
0
            *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
6065
0
            *class_uchardata++ = (PCRE2_UCHAR)
6066
0
              ((posix_class == PC_GRAPH)? PT_PXGRAPH :
6067
0
               (posix_class == PC_PRINT)? PT_PXPRINT : PT_PXPUNCT);
6068
0
            *class_uchardata++ = 0;
6069
0
            xclass_has_prop = TRUE;
6070
0
            goto CONTINUE_CLASS;
6071
6072
            /* For the other POSIX classes (ex: ascii) we are going to
6073
            fall through to the non-UCP case and build a bit map for
6074
            characters with code points less than 256. However, if we are in
6075
            a negated POSIX class, characters with code points greater than
6076
            255 must either all match or all not match, depending on whether
6077
            the whole class is not or is negated. For example, for
6078
            [[:^ascii:]... they must all match, whereas for [^[:^ascii:]...
6079
            they must not.
6080
6081
            In the special case where there are no xclass items, this is
6082
            automatically handled by the use of OP_CLASS or OP_NCLASS, but an
6083
            explicit range is needed for OP_XCLASS. Setting a flag here
6084
            causes the range to be generated later when it is known that
6085
            OP_XCLASS is required. In the 8-bit library this is relevant only in
6086
            utf mode, since no wide characters can exist otherwise. */
6087
6088
0
            default:
6089
0
#if PCRE2_CODE_UNIT_WIDTH == 8
6090
0
            if (utf)
6091
0
#endif
6092
0
            match_all_or_no_wide_chars |= local_negate;
6093
0
            break;
6094
0
            }
6095
0
          }
6096
0
#endif  /* SUPPORT_UNICODE */
6097
6098
        /* In the non-UCP case, or when UCP makes no difference, we build the
6099
        bit map for the POSIX class in a chunk of local store because we may
6100
        be adding and subtracting from it, and we don't want to subtract bits
6101
        that may be in the main map already. At the end we or the result into
6102
        the bit map that is being built. */
6103
6104
0
        posix_class *= 3;
6105
6106
        /* Copy in the first table (always present) */
6107
6108
0
        memcpy(pbits, cbits + posix_class_maps[posix_class],
6109
0
          32 * sizeof(uint8_t));
6110
6111
        /* If there is a second table, add or remove it as required. */
6112
6113
0
        taboffset = posix_class_maps[posix_class + 1];
6114
0
        tabopt = posix_class_maps[posix_class + 2];
6115
6116
0
        if (taboffset >= 0)
6117
0
          {
6118
0
          if (tabopt >= 0)
6119
0
            for (int i = 0; i < 32; i++) pbits[i] |= cbits[(int)i + taboffset];
6120
0
          else
6121
0
            for (int i = 0; i < 32; i++) pbits[i] &= ~cbits[(int)i + taboffset];
6122
0
          }
6123
6124
        /* Now see if we need to remove any special characters. An option
6125
        value of 1 removes vertical space and 2 removes underscore. */
6126
6127
0
        if (tabopt < 0) tabopt = -tabopt;
6128
0
        if (tabopt == 1) pbits[1] &= ~0x3c;
6129
0
          else if (tabopt == 2) pbits[11] &= 0x7f;
6130
6131
        /* Add the POSIX table or its complement into the main table that is
6132
        being built and we are done. */
6133
6134
0
        if (local_negate)
6135
0
          for (int i = 0; i < 32; i++) classbits[i] |= (uint8_t)(~pbits[i]);
6136
0
        else
6137
0
          for (int i = 0; i < 32; i++) classbits[i] |= pbits[i];
6138
6139
        /* Every class contains at least one < 256 character. */
6140
6141
0
        class_has_8bitchar = 1;
6142
0
        goto CONTINUE_CLASS;    /* End of POSIX handling */
6143
0
        }
6144
6145
      /* Other than POSIX classes, the only items we should encounter are
6146
      \d-type escapes and literal characters (possibly as ranges). */
6147
6148
37.5k
      if (meta == META_BIGVALUE)
6149
0
        {
6150
0
        meta = *(++pptr);
6151
0
        goto CLASS_LITERAL;
6152
0
        }
6153
6154
      /* Any other non-literal must be an escape */
6155
6156
37.5k
      if (meta >= META_END)
6157
304
        {
6158
304
        if (META_CODE(meta) != META_ESCAPE)
6159
0
          {
6160
#ifdef DEBUG_SHOW_PARSED
6161
          fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x "
6162
                          "in character class\n", meta);
6163
#endif
6164
0
          *errorcodeptr = ERR89;  /* Internal error - unrecognized. */
6165
0
          return 0;
6166
0
          }
6167
304
        escape = META_DATA(meta);
6168
6169
        /* Every class contains at least one < 256 character. */
6170
6171
304
        class_has_8bitchar++;
6172
6173
304
        switch(escape)
6174
304
          {
6175
24
          case ESC_d:
6176
792
          for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_digit];
6177
24
          break;
6178
6179
4
          case ESC_D:
6180
4
          should_flip_negation = TRUE;
6181
132
          for (int i = 0; i < 32; i++)
6182
128
            classbits[i] |= (uint8_t)(~cbits[i+cbit_digit]);
6183
4
          break;
6184
6185
106
          case ESC_w:
6186
3.49k
          for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_word];
6187
106
          break;
6188
6189
0
          case ESC_W:
6190
0
          should_flip_negation = TRUE;
6191
0
          for (int i = 0; i < 32; i++)
6192
0
            classbits[i] |= (uint8_t)(~cbits[i+cbit_word]);
6193
0
          break;
6194
6195
          /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
6196
          5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
6197
          previously set by something earlier in the character class.
6198
          Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
6199
          we could just adjust the appropriate bit. From PCRE 8.34 we no
6200
          longer treat \s and \S specially. */
6201
6202
0
          case ESC_s:
6203
0
          for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_space];
6204
0
          break;
6205
6206
2
          case ESC_S:
6207
2
          should_flip_negation = TRUE;
6208
66
          for (int i = 0; i < 32; i++)
6209
64
            classbits[i] |= (uint8_t)(~cbits[i+cbit_space]);
6210
2
          break;
6211
6212
          /* When adding the horizontal or vertical space lists to a class, or
6213
          their complements, disable PCRE2_CASELESS, because it just wastes
6214
          time, and in the "not-x" UTF cases can create unwanted duplicates in
6215
          the XCLASS list (provoked by characters that have more than one other
6216
          case and by both cases being in the same "not-x" sublist). */
6217
6218
4
          case ESC_h:
6219
4
          (void)add_list_to_class(classbits, &class_uchardata,
6220
4
            options & ~PCRE2_CASELESS, xoptions, cb, PRIV(hspace_list),
6221
4
              NOTACHAR);
6222
4
          break;
6223
6224
8
          case ESC_H:
6225
8
          (void)add_not_list_to_class(classbits, &class_uchardata,
6226
8
            options & ~PCRE2_CASELESS, xoptions, cb, PRIV(hspace_list));
6227
8
          break;
6228
6229
0
          case ESC_v:
6230
0
          (void)add_list_to_class(classbits, &class_uchardata,
6231
0
            options & ~PCRE2_CASELESS, xoptions, cb, PRIV(vspace_list),
6232
0
              NOTACHAR);
6233
0
          break;
6234
6235
0
          case ESC_V:
6236
0
          (void)add_not_list_to_class(classbits, &class_uchardata,
6237
0
            options & ~PCRE2_CASELESS, xoptions, cb, PRIV(vspace_list));
6238
0
          break;
6239
6240
          /* If Unicode is not supported, \P and \p are not allowed and are
6241
          faulted at parse time, so will never appear here. */
6242
6243
0
#ifdef SUPPORT_UNICODE
6244
156
          case ESC_p:
6245
156
          case ESC_P:
6246
156
            {
6247
156
            uint32_t ptype = *(++pptr) >> 16;
6248
156
            uint32_t pdata = *pptr & 0xffff;
6249
156
            *class_uchardata++ = (escape == ESC_p)? XCL_PROP : XCL_NOTPROP;
6250
156
            *class_uchardata++ = ptype;
6251
156
            *class_uchardata++ = pdata;
6252
156
            xclass_has_prop = TRUE;
6253
156
            class_has_8bitchar--;                /* Undo! */
6254
156
            }
6255
156
          break;
6256
304
#endif
6257
304
          }
6258
6259
304
        goto CONTINUE_CLASS;
6260
304
        }  /* End handling \d-type escapes */
6261
6262
      /* A literal character may be followed by a range meta. At parse time
6263
      there are checks for out-of-order characters, for ranges where the two
6264
      characters are equal, and for hyphens that cannot indicate a range. At
6265
      this point, therefore, no checking is needed. */
6266
6267
37.2k
      else
6268
37.2k
        {
6269
37.2k
        uint32_t c, d;
6270
6271
37.2k
        CLASS_LITERAL:
6272
37.2k
        c = d = meta;
6273
6274
        /* Remember if \r or \n were explicitly used */
6275
6276
37.2k
        if (c == CHAR_CR || c == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
6277
6278
        /* Process a character range */
6279
6280
37.2k
        if (pptr[1] == META_RANGE_LITERAL || pptr[1] == META_RANGE_ESCAPED)
6281
384
          {
6282
#ifdef EBCDIC
6283
          BOOL range_is_literal = (pptr[1] == META_RANGE_LITERAL);
6284
#endif
6285
384
          pptr += 2;
6286
384
          d = *pptr;
6287
384
          if (d == META_BIGVALUE) d = *(++pptr);
6288
6289
          /* Remember an explicit \r or \n, and add the range to the class. */
6290
6291
384
          if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
6292
6293
          /* In an EBCDIC environment, Perl treats alphabetic ranges specially
6294
          because there are holes in the encoding, and simply using the range
6295
          A-Z (for example) would include the characters in the holes. This
6296
          applies only to literal ranges; [\xC1-\xE9] is different to [A-Z]. */
6297
6298
#ifdef EBCDIC
6299
          if (range_is_literal &&
6300
               (cb->ctypes[c] & ctype_letter) != 0 &&
6301
               (cb->ctypes[d] & ctype_letter) != 0 &&
6302
               (c <= CHAR_z) == (d <= CHAR_z))
6303
            {
6304
            uint32_t uc = (d <= CHAR_z)? 0 : 64;
6305
            uint32_t C = c - uc;
6306
            uint32_t D = d - uc;
6307
6308
            if (C <= CHAR_i)
6309
              {
6310
              class_has_8bitchar +=
6311
                add_to_class(classbits, &class_uchardata, options, xoptions,
6312
                  cb, C + uc, ((D < CHAR_i)? D : CHAR_i) + uc);
6313
              C = CHAR_j;
6314
              }
6315
6316
            if (C <= D && C <= CHAR_r)
6317
              {
6318
              class_has_8bitchar +=
6319
                add_to_class(classbits, &class_uchardata, options, xoptions,
6320
                  cb, C + uc, ((D < CHAR_r)? D : CHAR_r) + uc);
6321
              C = CHAR_s;
6322
              }
6323
6324
            if (C <= D)
6325
              {
6326
              class_has_8bitchar +=
6327
                add_to_class(classbits, &class_uchardata, options, xoptions,
6328
                  cb, C + uc, D + uc);
6329
              }
6330
            }
6331
          else
6332
#endif
6333
          /* Not an EBCDIC special range */
6334
6335
384
          class_has_8bitchar += add_to_class(classbits, &class_uchardata,
6336
384
            options, xoptions, cb, c, d);
6337
384
          goto CONTINUE_CLASS;   /* Go get the next char in the class */
6338
384
          }  /* End of range handling */
6339
6340
6341
        /* Handle a single character. */
6342
6343
36.8k
        class_has_8bitchar +=
6344
36.8k
          add_to_class(classbits, &class_uchardata, options, xoptions, cb,
6345
36.8k
            meta, meta);
6346
36.8k
        }
6347
6348
      /* Continue to the next item in the class. */
6349
6350
37.5k
      CONTINUE_CLASS:
6351
6352
37.5k
#ifdef SUPPORT_WIDE_CHARS
6353
      /* If any wide characters or Unicode properties have been encountered,
6354
      set xclass = TRUE. Then, in the pre-compile phase, accumulate the length
6355
      of the extra data and reset the pointer. This is so that very large
6356
      classes that contain a zillion wide characters or Unicode property tests
6357
      do not overwrite the workspace (which is on the stack). */
6358
6359
37.5k
      if (class_uchardata > class_uchardata_base)
6360
3.88k
        {
6361
3.88k
        xclass = TRUE;
6362
3.88k
        if (lengthptr != NULL)
6363
538
          {
6364
538
          *lengthptr += class_uchardata - class_uchardata_base;
6365
538
          class_uchardata = class_uchardata_base;
6366
538
          }
6367
3.88k
        }
6368
37.5k
#endif
6369
6370
37.5k
      continue;  /* Needed to avoid error when not supporting wide chars */
6371
37.5k
      }   /* End of main class-processing loop */
6372
6373
    /* If this class is the first thing in the branch, there can be no first
6374
    char setting, whatever the repeat count. Any reqcu setting must remain
6375
    unchanged after any kind of repeat. */
6376
6377
2.93k
    if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6378
2.93k
    zerofirstcu = firstcu;
6379
2.93k
    zerofirstcuflags = firstcuflags;
6380
2.93k
    zeroreqcu = reqcu;
6381
2.93k
    zeroreqcuflags = reqcuflags;
6382
6383
    /* If there are characters with values > 255, or Unicode property settings
6384
    (\p or \P), we have to compile an extended class, with its own opcode,
6385
    unless there were no property settings and there was a negated special such
6386
    as \S in the class, and PCRE2_UCP is not set, because in that case all
6387
    characters > 255 are in or not in the class, so any that were explicitly
6388
    given as well can be ignored.
6389
6390
    In the UCP case, if certain negated POSIX classes (ex: [:^ascii:]) were
6391
    present in a class, we either have to match or not match all wide
6392
    characters (depending on whether the whole class is or is not negated).
6393
    This requirement is indicated by match_all_or_no_wide_chars being true.
6394
    We do this by including an explicit range, which works in both cases.
6395
    This applies only in UTF and 16-bit and 32-bit non-UTF modes, since there
6396
    cannot be any wide characters in 8-bit non-UTF mode.
6397
6398
    When there *are* properties in a positive UTF-8 or any 16-bit or 32-bit
6399
    class where \S etc is present without PCRE2_UCP, causing an extended class
6400
    to be compiled, we make sure that all characters > 255 are included by
6401
    forcing match_all_or_no_wide_chars to be true.
6402
6403
    If, when generating an xclass, there are no characters < 256, we can omit
6404
    the bitmap in the actual compiled code. */
6405
6406
2.93k
#ifdef SUPPORT_WIDE_CHARS  /* Defined for 16/32 bits, or 8-bit with Unicode */
6407
2.93k
    if (xclass && (
6408
712
#ifdef SUPPORT_UNICODE
6409
712
        (options & PCRE2_UCP) != 0 ||
6410
26
#endif
6411
26
        xclass_has_prop || !should_flip_negation))
6412
712
      {
6413
712
      if (match_all_or_no_wide_chars || (
6414
712
#if PCRE2_CODE_UNIT_WIDTH == 8
6415
712
           utf &&
6416
686
#endif
6417
686
           should_flip_negation && !negate_class && (options & PCRE2_UCP) == 0))
6418
0
        {
6419
0
        *class_uchardata++ = XCL_RANGE;
6420
0
        if (utf)   /* Will always be utf in the 8-bit library */
6421
0
          {
6422
0
          class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
6423
0
          class_uchardata += PRIV(ord2utf)(MAX_UTF_CODE_POINT, class_uchardata);
6424
0
          }
6425
0
        else       /* Can only happen for the 16-bit & 32-bit libraries */
6426
0
          {
6427
#if PCRE2_CODE_UNIT_WIDTH == 16
6428
          *class_uchardata++ = 0x100;
6429
          *class_uchardata++ = 0xffffu;
6430
#elif PCRE2_CODE_UNIT_WIDTH == 32
6431
          *class_uchardata++ = 0x100;
6432
          *class_uchardata++ = 0xffffffffu;
6433
#endif
6434
0
          }
6435
0
        }
6436
712
      *class_uchardata++ = XCL_END;    /* Marks the end of extra data */
6437
712
      *code++ = OP_XCLASS;
6438
712
      code += LINK_SIZE;
6439
712
      *code = negate_class? XCL_NOT:0;
6440
712
      if (xclass_has_prop) *code |= XCL_HASPROP;
6441
6442
      /* If the map is required, move up the extra data to make room for it;
6443
      otherwise just move the code pointer to the end of the extra data. */
6444
6445
712
      if (class_has_8bitchar > 0)
6446
712
        {
6447
712
        *code++ |= XCL_MAP;
6448
712
        (void)memmove(code + (32 / sizeof(PCRE2_UCHAR)), code,
6449
712
          CU2BYTES(class_uchardata - code));
6450
712
        if (negate_class && !xclass_has_prop)
6451
370
          {
6452
          /* Using 255 ^ instead of ~ avoids clang sanitize warning. */
6453
12.2k
          for (int i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i];
6454
370
          }
6455
712
        memcpy(code, classbits, 32);
6456
712
        code = class_uchardata + (32 / sizeof(PCRE2_UCHAR));
6457
712
        }
6458
0
      else code = class_uchardata;
6459
6460
      /* Now fill in the complete length of the item */
6461
6462
712
      PUT(previous, 1, (int)(code - previous));
6463
712
      break;   /* End of class handling */
6464
712
      }
6465
2.22k
#endif  /* SUPPORT_WIDE_CHARS */
6466
6467
    /* If there are no characters > 255, or they are all to be included or
6468
    excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
6469
    whole class was negated and whether there were negative specials such as \S
6470
    (non-UCP) in the class. Then copy the 32-byte map into the code vector,
6471
    negating it if necessary. */
6472
6473
2.22k
    *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
6474
2.22k
    if (lengthptr == NULL)    /* Save time in the pre-compile phase */
6475
1.11k
      {
6476
1.11k
      if (negate_class)
6477
451
        {
6478
       /* Using 255 ^ instead of ~ avoids clang sanitize warning. */
6479
14.8k
       for (int i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i];
6480
451
       }
6481
1.11k
      memcpy(code, classbits, 32);
6482
1.11k
      }
6483
2.22k
    code += 32 / sizeof(PCRE2_UCHAR);
6484
2.22k
    break;  /* End of class processing */
6485
6486
6487
    /* ===================================================================*/
6488
    /* Deal with (*VERB)s. */
6489
6490
    /* Check for open captures before ACCEPT and close those that are within
6491
    the same assertion level, also converting ACCEPT to ASSERT_ACCEPT in an
6492
    assertion. In the first pass, just accumulate the length required;
6493
    otherwise hitting (*ACCEPT) inside many nested parentheses can cause
6494
    workspace overflow. Do not set firstcu after *ACCEPT. */
6495
6496
0
    case META_ACCEPT:
6497
0
    cb->had_accept = had_accept = TRUE;
6498
0
    for (oc = open_caps;
6499
0
         oc != NULL && oc->assert_depth >= cb->assert_depth;
6500
0
         oc = oc->next)
6501
0
      {
6502
0
      if (lengthptr != NULL)
6503
0
        {
6504
0
        *lengthptr += CU2BYTES(1) + IMM2_SIZE;
6505
0
        }
6506
0
      else
6507
0
        {
6508
0
        *code++ = OP_CLOSE;
6509
0
        PUT2INC(code, 0, oc->number);
6510
0
        }
6511
0
      }
6512
0
    *code++ = (cb->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
6513
0
    if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6514
0
    break;
6515
6516
0
    case META_PRUNE:
6517
0
    case META_SKIP:
6518
0
    cb->had_pruneorskip = TRUE;
6519
    /* Fall through */
6520
0
    case META_COMMIT:
6521
0
    case META_FAIL:
6522
0
    *code++ = verbops[(meta - META_MARK) >> 16];
6523
0
    break;
6524
6525
0
    case META_THEN:
6526
0
    cb->external_flags |= PCRE2_HASTHEN;
6527
0
    *code++ = OP_THEN;
6528
0
    break;
6529
6530
    /* Handle verbs with arguments. Arguments can be very long, especially in
6531
    16- and 32-bit modes, and can overflow the workspace in the first pass.
6532
    However, the argument length is constrained to be small enough to fit in
6533
    one code unit. This check happens in parse_regex(). In the first pass,
6534
    instead of putting the argument into memory, we just update the length
6535
    counter and set up an empty argument. */
6536
6537
0
    case META_THEN_ARG:
6538
0
    cb->external_flags |= PCRE2_HASTHEN;
6539
0
    goto VERB_ARG;
6540
6541
0
    case META_PRUNE_ARG:
6542
0
    case META_SKIP_ARG:
6543
0
    cb->had_pruneorskip = TRUE;
6544
    /* Fall through */
6545
0
    case META_MARK:
6546
0
    case META_COMMIT_ARG:
6547
0
    VERB_ARG:
6548
0
    *code++ = verbops[(meta - META_MARK) >> 16];
6549
    /* The length is in characters. */
6550
0
    verbarglen = *(++pptr);
6551
0
    verbculen = 0;
6552
0
    tempcode = code++;
6553
0
    for (int i = 0; i < (int)verbarglen; i++)
6554
0
      {
6555
0
      meta = *(++pptr);
6556
0
#ifdef SUPPORT_UNICODE
6557
0
      if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
6558
0
#endif
6559
0
        {
6560
0
        mclength = 1;
6561
0
        mcbuffer[0] = meta;
6562
0
        }
6563
0
      if (lengthptr != NULL) *lengthptr += mclength; else
6564
0
        {
6565
0
        memcpy(code, mcbuffer, CU2BYTES(mclength));
6566
0
        code += mclength;
6567
0
        verbculen += mclength;
6568
0
        }
6569
0
      }
6570
6571
0
    *tempcode = verbculen;   /* Fill in the code unit length */
6572
0
    *code++ = 0;             /* Terminating zero */
6573
0
    break;
6574
6575
6576
    /* ===================================================================*/
6577
    /* Handle options change. The new setting must be passed back for use in
6578
    subsequent branches. Reset the greedy defaults and the case value for
6579
    firstcu and reqcu. */
6580
6581
0
    case META_OPTIONS:
6582
0
    *optionsptr = options = *(++pptr);
6583
0
    *xoptionsptr = xoptions = *(++pptr);
6584
0
    greedy_default = ((options & PCRE2_UNGREEDY) != 0);
6585
0
    greedy_non_default = greedy_default ^ 1;
6586
0
    req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
6587
0
    break;
6588
6589
6590
    /* ===================================================================*/
6591
    /* Handle conditional subpatterns. The case of (?(Rdigits) is ambiguous
6592
    because it could be a numerical check on recursion, or a name check on a
6593
    group's being set. The pre-pass sets up META_COND_RNUMBER as a name so that
6594
    we can handle it either way. We first try for a name; if not found, process
6595
    the number. */
6596
6597
0
    case META_COND_RNUMBER:   /* (?(Rdigits) */
6598
0
    case META_COND_NAME:      /* (?(name) or (?'name') or ?(<name>) */
6599
0
    case META_COND_RNAME:     /* (?(R&name) - test for recursion */
6600
0
    bravalue = OP_COND;
6601
0
      {
6602
0
      int count, index;
6603
0
      unsigned int i;
6604
0
      PCRE2_SPTR name;
6605
0
      named_group *ng = cb->named_groups;
6606
0
      uint32_t length = *(++pptr);
6607
6608
0
      GETPLUSOFFSET(offset, pptr);
6609
0
      name = cb->start_pattern + offset;
6610
6611
      /* In the first pass, the names generated in the pre-pass are available,
6612
      but the main name table has not yet been created. Scan the list of names
6613
      generated in the pre-pass in order to get a number and whether or not
6614
      this name is duplicated. If it is not duplicated, we can handle it as a
6615
      numerical group. */
6616
6617
0
      for (i = 0; i < cb->names_found; i++, ng++)
6618
0
        {
6619
0
        if (length == ng->length &&
6620
0
            PRIV(strncmp)(name, ng->name, length) == 0)
6621
0
          {
6622
0
          if (!ng->isdup)
6623
0
            {
6624
0
            code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
6625
0
            PUT2(code, 2+LINK_SIZE, ng->number);
6626
0
            if (ng->number > cb->top_backref) cb->top_backref = ng->number;
6627
0
            skipunits = 1+IMM2_SIZE;
6628
0
            goto GROUP_PROCESS_NOTE_EMPTY;
6629
0
            }
6630
0
          break;  /* Found a duplicated name */
6631
0
          }
6632
0
        }
6633
6634
      /* If the name was not found we have a bad reference, unless we are
6635
      dealing with R<digits>, which is treated as a recursion test by number.
6636
      */
6637
6638
0
      if (i >= cb->names_found)
6639
0
        {
6640
0
        groupnumber = 0;
6641
0
        if (meta == META_COND_RNUMBER)
6642
0
          {
6643
0
          for (i = 1; i < length; i++)
6644
0
            {
6645
0
            groupnumber = groupnumber * 10 + name[i] - CHAR_0;
6646
0
            if (groupnumber > MAX_GROUP_NUMBER)
6647
0
              {
6648
0
              *errorcodeptr = ERR61;
6649
0
              cb->erroroffset = offset + i;
6650
0
              return 0;
6651
0
              }
6652
0
            }
6653
0
          }
6654
6655
0
        if (meta != META_COND_RNUMBER || groupnumber > cb->bracount)
6656
0
          {
6657
0
          *errorcodeptr = ERR15;
6658
0
          cb->erroroffset = offset;
6659
0
          return 0;
6660
0
          }
6661
6662
        /* (?(Rdigits) treated as a recursion test by number. A value of
6663
        zero (which is the result of both (?(R) and (?(R0)) means "any", and is
6664
        translated into RREF_ANY (which is 0xffff). */
6665
6666
0
        if (groupnumber == 0) groupnumber = RREF_ANY;
6667
0
        code[1+LINK_SIZE] = OP_RREF;
6668
0
        PUT2(code, 2+LINK_SIZE, groupnumber);
6669
0
        skipunits = 1+IMM2_SIZE;
6670
0
        goto GROUP_PROCESS_NOTE_EMPTY;
6671
0
        }
6672
6673
      /* A duplicated name was found. Note that if an R<digits> name is found
6674
      (META_COND_RNUMBER), it is a reference test, not a recursion test. */
6675
6676
0
      code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
6677
6678
      /* We have a duplicated name. In the compile pass we have to search the
6679
      main table in order to get the index and count values. */
6680
6681
0
      count = 0;  /* Values for first pass (avoids compiler warning) */
6682
0
      index = 0;
6683
0
      if (lengthptr == NULL && !find_dupname_details(name, length, &index,
6684
0
            &count, errorcodeptr, cb)) return 0;
6685
6686
      /* Add one to the opcode to change CREF/RREF into DNCREF/DNRREF and
6687
      insert appropriate data values. */
6688
6689
0
      code[1+LINK_SIZE]++;
6690
0
      skipunits = 1+2*IMM2_SIZE;
6691
0
      PUT2(code, 2+LINK_SIZE, index);
6692
0
      PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6693
0
      }
6694
0
    goto GROUP_PROCESS_NOTE_EMPTY;
6695
6696
    /* The DEFINE condition is always false. Its internal groups may never
6697
    be called, so matched_char must remain false, hence the jump to
6698
    GROUP_PROCESS rather than GROUP_PROCESS_NOTE_EMPTY. */
6699
6700
0
    case META_COND_DEFINE:
6701
0
    bravalue = OP_COND;
6702
0
    GETPLUSOFFSET(offset, pptr);
6703
0
    code[1+LINK_SIZE] = OP_DEFINE;
6704
0
    skipunits = 1;
6705
0
    goto GROUP_PROCESS;
6706
6707
    /* Conditional test of a group's being set. */
6708
6709
0
    case META_COND_NUMBER:
6710
0
    bravalue = OP_COND;
6711
0
    GETPLUSOFFSET(offset, pptr);
6712
0
    groupnumber = *(++pptr);
6713
0
    if (groupnumber > cb->bracount)
6714
0
      {
6715
0
      *errorcodeptr = ERR15;
6716
0
      cb->erroroffset = offset;
6717
0
      return 0;
6718
0
      }
6719
0
    if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
6720
0
    offset -= 2;   /* Point at initial ( for too many branches error */
6721
0
    code[1+LINK_SIZE] = OP_CREF;
6722
0
    skipunits = 1+IMM2_SIZE;
6723
0
    PUT2(code, 2+LINK_SIZE, groupnumber);
6724
0
    goto GROUP_PROCESS_NOTE_EMPTY;
6725
6726
    /* Test for the PCRE2 version. */
6727
6728
0
    case META_COND_VERSION:
6729
0
    bravalue = OP_COND;
6730
0
    if (pptr[1] > 0)
6731
0
      code[1+LINK_SIZE] = ((PCRE2_MAJOR > pptr[2]) ||
6732
0
        (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR >= pptr[3]))?
6733
0
          OP_TRUE : OP_FALSE;
6734
0
    else
6735
0
      code[1+LINK_SIZE] = (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR == pptr[3])?
6736
0
        OP_TRUE : OP_FALSE;
6737
0
    skipunits = 1;
6738
0
    pptr += 3;
6739
0
    goto GROUP_PROCESS_NOTE_EMPTY;
6740
6741
    /* The condition is an assertion, possibly preceded by a callout. */
6742
6743
0
    case META_COND_ASSERT:
6744
0
    bravalue = OP_COND;
6745
0
    goto GROUP_PROCESS_NOTE_EMPTY;
6746
6747
6748
    /* ===================================================================*/
6749
    /* Handle all kinds of nested bracketed groups. The non-capturing,
6750
    non-conditional cases are here; others come to GROUP_PROCESS via goto. */
6751
6752
8
    case META_LOOKAHEAD:
6753
8
    bravalue = OP_ASSERT;
6754
8
    cb->assert_depth += 1;
6755
8
    goto GROUP_PROCESS;
6756
6757
0
    case META_LOOKAHEAD_NA:
6758
0
    bravalue = OP_ASSERT_NA;
6759
0
    cb->assert_depth += 1;
6760
0
    goto GROUP_PROCESS;
6761
6762
    /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
6763
    thing to do, but Perl allows all assertions to be quantified, and when
6764
    they contain capturing parentheses there may be a potential use for
6765
    this feature. Not that that applies to a quantified (?!) but we allow
6766
    it for uniformity. */
6767
6768
2
    case META_LOOKAHEADNOT:
6769
2
    if (pptr[1] == META_KET &&
6770
0
         (pptr[2] < META_ASTERISK || pptr[2] > META_MINMAX_QUERY))
6771
0
      {
6772
0
      *code++ = OP_FAIL;
6773
0
      pptr++;
6774
0
      }
6775
2
    else
6776
2
      {
6777
2
      bravalue = OP_ASSERT_NOT;
6778
2
      cb->assert_depth += 1;
6779
2
      goto GROUP_PROCESS;
6780
2
      }
6781
0
    break;
6782
6783
2
    case META_LOOKBEHIND:
6784
2
    bravalue = OP_ASSERTBACK;
6785
2
    cb->assert_depth += 1;
6786
2
    goto GROUP_PROCESS;
6787
6788
0
    case META_LOOKBEHINDNOT:
6789
0
    bravalue = OP_ASSERTBACK_NOT;
6790
0
    cb->assert_depth += 1;
6791
0
    goto GROUP_PROCESS;
6792
6793
0
    case META_LOOKBEHIND_NA:
6794
0
    bravalue = OP_ASSERTBACK_NA;
6795
0
    cb->assert_depth += 1;
6796
0
    goto GROUP_PROCESS;
6797
6798
0
    case META_ATOMIC:
6799
0
    bravalue = OP_ONCE;
6800
0
    goto GROUP_PROCESS_NOTE_EMPTY;
6801
6802
0
    case META_SCRIPT_RUN:
6803
0
    bravalue = OP_SCRIPT_RUN;
6804
0
    goto GROUP_PROCESS_NOTE_EMPTY;
6805
6806
10
    case META_NOCAPTURE:
6807
10
    bravalue = OP_BRA;
6808
    /* Fall through */
6809
6810
    /* Process nested bracketed regex. The nesting depth is maintained for the
6811
    benefit of the stackguard function. The test for too deep nesting is now
6812
    done in parse_regex(). Assertion and DEFINE groups come to GROUP_PROCESS;
6813
    others come to GROUP_PROCESS_NOTE_EMPTY, to indicate that we need to take
6814
    note of whether or not they may match an empty string. */
6815
6816
1.61k
    GROUP_PROCESS_NOTE_EMPTY:
6817
1.61k
    note_group_empty = TRUE;
6818
6819
1.62k
    GROUP_PROCESS:
6820
1.62k
    cb->parens_depth += 1;
6821
1.62k
    *code = bravalue;
6822
1.62k
    pptr++;
6823
1.62k
    tempcode = code;
6824
1.62k
    tempreqvary = cb->req_varyopt;        /* Save value before group */
6825
1.62k
    length_prevgroup = 0;                 /* Initialize for pre-compile phase */
6826
6827
1.62k
    if ((group_return =
6828
1.62k
         compile_regex(
6829
1.62k
         options,                         /* The options state */
6830
1.62k
         xoptions,                        /* The extra options state */
6831
1.62k
         &tempcode,                       /* Where to put code (updated) */
6832
1.62k
         &pptr,                           /* Input pointer (updated) */
6833
1.62k
         errorcodeptr,                    /* Where to put an error message */
6834
1.62k
         skipunits,                       /* Skip over bracket number */
6835
1.62k
         &subfirstcu,                     /* For possible first char */
6836
1.62k
         &subfirstcuflags,
6837
1.62k
         &subreqcu,                       /* For possible last char */
6838
1.62k
         &subreqcuflags,
6839
1.62k
         bcptr,                           /* Current branch chain */
6840
1.62k
         open_caps,                       /* Pointer to capture stack */
6841
1.62k
         cb,                              /* Compile data block */
6842
1.62k
         (lengthptr == NULL)? NULL :      /* Actual compile phase */
6843
1.62k
           &length_prevgroup              /* Pre-compile phase */
6844
1.62k
         )) == 0)
6845
0
      return 0;  /* Error */
6846
6847
1.62k
    cb->parens_depth -= 1;
6848
6849
    /* If that was a non-conditional significant group (not an assertion, not a
6850
    DEFINE) that matches at least one character, then the current item matches
6851
    a character. Conditionals are handled below. */
6852
6853
1.62k
    if (note_group_empty && bravalue != OP_COND && group_return > 0)
6854
95
      matched_char = TRUE;
6855
6856
    /* If we've just compiled an assertion, pop the assert depth. */
6857
6858
1.62k
    if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NA)
6859
12
      cb->assert_depth -= 1;
6860
6861
    /* At the end of compiling, code is still pointing to the start of the
6862
    group, while tempcode has been updated to point past the end of the group.
6863
    The parsed pattern pointer (pptr) is on the closing META_KET.
6864
6865
    If this is a conditional bracket, check that there are no more than
6866
    two branches in the group, or just one if it's a DEFINE group. We do this
6867
    in the real compile phase, not in the pre-pass, where the whole group may
6868
    not be available. */
6869
6870
1.62k
    if (bravalue == OP_COND && lengthptr == NULL)
6871
0
      {
6872
0
      PCRE2_UCHAR *tc = code;
6873
0
      int condcount = 0;
6874
6875
0
      do {
6876
0
         condcount++;
6877
0
         tc += GET(tc,1);
6878
0
         }
6879
0
      while (*tc != OP_KET);
6880
6881
      /* A DEFINE group is never obeyed inline (the "condition" is always
6882
      false). It must have only one branch. Having checked this, change the
6883
      opcode to OP_FALSE. */
6884
6885
0
      if (code[LINK_SIZE+1] == OP_DEFINE)
6886
0
        {
6887
0
        if (condcount > 1)
6888
0
          {
6889
0
          cb->erroroffset = offset;
6890
0
          *errorcodeptr = ERR54;
6891
0
          return 0;
6892
0
          }
6893
0
        code[LINK_SIZE+1] = OP_FALSE;
6894
0
        bravalue = OP_DEFINE;   /* A flag to suppress char handling below */
6895
0
        }
6896
6897
      /* A "normal" conditional group. If there is just one branch, we must not
6898
      make use of its firstcu or reqcu, because this is equivalent to an
6899
      empty second branch. Also, it may match an empty string. If there are two
6900
      branches, this item must match a character if the group must. */
6901
6902
0
      else
6903
0
        {
6904
0
        if (condcount > 2)
6905
0
          {
6906
0
          cb->erroroffset = offset;
6907
0
          *errorcodeptr = ERR27;
6908
0
          return 0;
6909
0
          }
6910
0
        if (condcount == 1) subfirstcuflags = subreqcuflags = REQ_NONE;
6911
0
          else if (group_return > 0) matched_char = TRUE;
6912
0
        }
6913
0
      }
6914
6915
    /* In the pre-compile phase, update the length by the length of the group,
6916
    less the brackets at either end. Then reduce the compiled code to just a
6917
    set of non-capturing brackets so that it doesn't use much memory if it is
6918
    duplicated by a quantifier.*/
6919
6920
1.62k
    if (lengthptr != NULL)
6921
907
      {
6922
907
      if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
6923
0
        {
6924
0
        *errorcodeptr = ERR20;
6925
0
        return 0;
6926
0
        }
6927
907
      *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
6928
907
      code++;   /* This already contains bravalue */
6929
907
      PUTINC(code, 0, 1 + LINK_SIZE);
6930
907
      *code++ = OP_KET;
6931
907
      PUTINC(code, 0, 1 + LINK_SIZE);
6932
907
      break;    /* No need to waste time with special character handling */
6933
907
      }
6934
6935
    /* Otherwise update the main code pointer to the end of the group. */
6936
6937
715
    code = tempcode;
6938
6939
    /* For a DEFINE group, required and first character settings are not
6940
    relevant. */
6941
6942
715
    if (bravalue == OP_DEFINE) break;
6943
6944
    /* Handle updating of the required and first code units for other types of
6945
    group. Update for normal brackets of all kinds, and conditions with two
6946
    branches (see code above). If the bracket is followed by a quantifier with
6947
    zero repeat, we have to back off. Hence the definition of zeroreqcu and
6948
    zerofirstcu outside the main loop so that they can be accessed for the back
6949
    off. */
6950
6951
715
    zeroreqcu = reqcu;
6952
715
    zeroreqcuflags = reqcuflags;
6953
715
    zerofirstcu = firstcu;
6954
715
    zerofirstcuflags = firstcuflags;
6955
715
    groupsetfirstcu = FALSE;
6956
6957
715
    if (bravalue >= OP_ONCE)  /* Not an assertion */
6958
709
      {
6959
      /* If we have not yet set a firstcu in this branch, take it from the
6960
      subpattern, remembering that it was set here so that a repeat of more
6961
      than one can replicate it as reqcu if necessary. If the subpattern has
6962
      no firstcu, set "none" for the whole branch. In both cases, a zero
6963
      repeat forces firstcu to "none". */
6964
6965
709
      if (firstcuflags == REQ_UNSET && subfirstcuflags != REQ_UNSET)
6966
13
        {
6967
13
        if (subfirstcuflags < REQ_NONE)
6968
3
          {
6969
3
          firstcu = subfirstcu;
6970
3
          firstcuflags = subfirstcuflags;
6971
3
          groupsetfirstcu = TRUE;
6972
3
          }
6973
10
        else firstcuflags = REQ_NONE;
6974
13
        zerofirstcuflags = REQ_NONE;
6975
13
        }
6976
6977
      /* If firstcu was previously set, convert the subpattern's firstcu
6978
      into reqcu if there wasn't one, using the vary flag that was in
6979
      existence beforehand. */
6980
6981
696
      else if (subfirstcuflags < REQ_NONE && subreqcuflags >= REQ_NONE)
6982
12
        {
6983
12
        subreqcu = subfirstcu;
6984
12
        subreqcuflags = subfirstcuflags | tempreqvary;
6985
12
        }
6986
6987
      /* If the subpattern set a required code unit (or set a first code unit
6988
      that isn't really the first code unit - see above), set it. */
6989
6990
709
      if (subreqcuflags < REQ_NONE)
6991
23
        {
6992
23
        reqcu = subreqcu;
6993
23
        reqcuflags = subreqcuflags;
6994
23
        }
6995
709
      }
6996
6997
    /* For a forward assertion, we take the reqcu, if set, provided that the
6998
    group has also set a firstcu. This can be helpful if the pattern that
6999
    follows the assertion doesn't set a different char. For example, it's
7000
    useful for /(?=abcde).+/. We can't set firstcu for an assertion, however
7001
    because it leads to incorrect effect for patterns such as /(?=a)a.+/ when
7002
    the "real" "a" would then become a reqcu instead of a firstcu. This is
7003
    overcome by a scan at the end if there's no firstcu, looking for an
7004
    asserted first char. A similar effect for patterns like /(?=.*X)X$/ means
7005
    we must only take the reqcu when the group also set a firstcu. Otherwise,
7006
    in that example, 'X' ends up set for both. */
7007
7008
6
    else if ((bravalue == OP_ASSERT || bravalue == OP_ASSERT_NA) &&
7009
4
             subreqcuflags < REQ_NONE && subfirstcuflags < REQ_NONE)
7010
3
      {
7011
3
      reqcu = subreqcu;
7012
3
      reqcuflags = subreqcuflags;
7013
3
      }
7014
7015
715
    break;  /* End of nested group handling */
7016
7017
7018
    /* ===================================================================*/
7019
    /* Handle named backreferences and recursions. */
7020
7021
0
    case META_BACKREF_BYNAME:
7022
0
    case META_RECURSE_BYNAME:
7023
0
      {
7024
0
      int count, index;
7025
0
      PCRE2_SPTR name;
7026
0
      BOOL is_dupname = FALSE;
7027
0
      named_group *ng = cb->named_groups;
7028
0
      uint32_t length = *(++pptr);
7029
7030
0
      GETPLUSOFFSET(offset, pptr);
7031
0
      name = cb->start_pattern + offset;
7032
7033
      /* In the first pass, the names generated in the pre-pass are available,
7034
      but the main name table has not yet been created. Scan the list of names
7035
      generated in the pre-pass in order to get a number and whether or not
7036
      this name is duplicated. */
7037
7038
0
      groupnumber = 0;
7039
0
      for (unsigned int i = 0; i < cb->names_found; i++, ng++)
7040
0
        {
7041
0
        if (length == ng->length &&
7042
0
            PRIV(strncmp)(name, ng->name, length) == 0)
7043
0
          {
7044
0
          is_dupname = ng->isdup;
7045
0
          groupnumber = ng->number;
7046
7047
          /* For a recursion, that's all that is needed. We can now go to
7048
          the code that handles numerical recursion, applying it to the first
7049
          group with the given name. */
7050
7051
0
          if (meta == META_RECURSE_BYNAME)
7052
0
            {
7053
0
            meta_arg = groupnumber;
7054
0
            goto HANDLE_NUMERICAL_RECURSION;
7055
0
            }
7056
7057
          /* For a back reference, update the back reference map and the
7058
          maximum back reference. */
7059
7060
0
          cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
7061
0
          if (groupnumber > cb->top_backref)
7062
0
            cb->top_backref = groupnumber;
7063
0
          }
7064
0
        }
7065
7066
      /* If the name was not found we have a bad reference. */
7067
7068
0
      if (groupnumber == 0)
7069
0
        {
7070
0
        *errorcodeptr = ERR15;
7071
0
        cb->erroroffset = offset;
7072
0
        return 0;
7073
0
        }
7074
7075
      /* If a back reference name is not duplicated, we can handle it as
7076
      a numerical reference. */
7077
7078
0
      if (!is_dupname)
7079
0
        {
7080
0
        meta_arg = groupnumber;
7081
0
        goto HANDLE_SINGLE_REFERENCE;
7082
0
        }
7083
7084
      /* If a back reference name is duplicated, we generate a different
7085
      opcode to a numerical back reference. In the second pass we must
7086
      search for the index and count in the final name table. */
7087
7088
0
      count = 0;  /* Values for first pass (avoids compiler warning) */
7089
0
      index = 0;
7090
0
      if (lengthptr == NULL && !find_dupname_details(name, length, &index,
7091
0
            &count, errorcodeptr, cb)) return 0;
7092
7093
0
      if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
7094
0
      *code++ = ((options & PCRE2_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
7095
0
      PUT2INC(code, 0, index);
7096
0
      PUT2INC(code, 0, count);
7097
0
      }
7098
0
    break;
7099
7100
7101
    /* ===================================================================*/
7102
    /* Handle a numerical callout. */
7103
7104
0
    case META_CALLOUT_NUMBER:
7105
0
    code[0] = OP_CALLOUT;
7106
0
    PUT(code, 1, pptr[1]);               /* Offset to next pattern item */
7107
0
    PUT(code, 1 + LINK_SIZE, pptr[2]);   /* Length of next pattern item */
7108
0
    code[1 + 2*LINK_SIZE] = pptr[3];
7109
0
    pptr += 3;
7110
0
    code += PRIV(OP_lengths)[OP_CALLOUT];
7111
0
    break;
7112
7113
7114
    /* ===================================================================*/
7115
    /* Handle a callout with a string argument. In the pre-pass we just compute
7116
    the length without generating anything. The length in pptr[3] includes both
7117
    delimiters; in the actual compile only the first one is copied, but a
7118
    terminating zero is added. Any doubled delimiters within the string make
7119
    this an overestimate, but it is not worth bothering about. */
7120
7121
0
    case META_CALLOUT_STRING:
7122
0
    if (lengthptr != NULL)
7123
0
      {
7124
0
      *lengthptr += pptr[3] + (1 + 4*LINK_SIZE);
7125
0
      pptr += 3;
7126
0
      SKIPOFFSET(pptr);
7127
0
      }
7128
7129
    /* In the real compile we can copy the string. The starting delimiter is
7130
     included so that the client can discover it if they want. We also pass the
7131
     start offset to help a script language give better error messages. */
7132
7133
0
    else
7134
0
      {
7135
0
      PCRE2_SPTR pp;
7136
0
      uint32_t delimiter;
7137
0
      uint32_t length = pptr[3];
7138
0
      PCRE2_UCHAR *callout_string = code + (1 + 4*LINK_SIZE);
7139
7140
0
      code[0] = OP_CALLOUT_STR;
7141
0
      PUT(code, 1, pptr[1]);               /* Offset to next pattern item */
7142
0
      PUT(code, 1 + LINK_SIZE, pptr[2]);   /* Length of next pattern item */
7143
7144
0
      pptr += 3;
7145
0
      GETPLUSOFFSET(offset, pptr);         /* Offset to string in pattern */
7146
0
      pp = cb->start_pattern + offset;
7147
0
      delimiter = *callout_string++ = *pp++;
7148
0
      if (delimiter == CHAR_LEFT_CURLY_BRACKET)
7149
0
        delimiter = CHAR_RIGHT_CURLY_BRACKET;
7150
0
      PUT(code, 1 + 3*LINK_SIZE, (int)(offset + 1));  /* One after delimiter */
7151
7152
      /* The syntax of the pattern was checked in the parsing scan. The length
7153
      includes both delimiters, but we have passed the opening one just above,
7154
      so we reduce length before testing it. The test is for > 1 because we do
7155
      not want to copy the final delimiter. This also ensures that pp[1] is
7156
      accessible. */
7157
7158
0
      while (--length > 1)
7159
0
        {
7160
0
        if (*pp == delimiter && pp[1] == delimiter)
7161
0
          {
7162
0
          *callout_string++ = delimiter;
7163
0
          pp += 2;
7164
0
          length--;
7165
0
          }
7166
0
        else *callout_string++ = *pp++;
7167
0
        }
7168
0
      *callout_string++ = CHAR_NUL;
7169
7170
      /* Set the length of the entire item, then advance to its end. */
7171
7172
0
      PUT(code, 1 + 2*LINK_SIZE, (int)(callout_string - code));
7173
0
      code = callout_string;
7174
0
      }
7175
0
    break;
7176
7177
7178
    /* ===================================================================*/
7179
    /* Handle repetition. The different types are all sorted out in the parsing
7180
    pass. */
7181
7182
0
    case META_MINMAX_PLUS:
7183
0
    case META_MINMAX_QUERY:
7184
0
    case META_MINMAX:
7185
0
    repeat_min = *(++pptr);
7186
0
    repeat_max = *(++pptr);
7187
0
    goto REPEAT;
7188
7189
936
    case META_ASTERISK:
7190
1.02k
    case META_ASTERISK_PLUS:
7191
1.38k
    case META_ASTERISK_QUERY:
7192
1.38k
    repeat_min = 0;
7193
1.38k
    repeat_max = REPEAT_UNLIMITED;
7194
1.38k
    goto REPEAT;
7195
7196
2.60k
    case META_PLUS:
7197
2.93k
    case META_PLUS_PLUS:
7198
3.50k
    case META_PLUS_QUERY:
7199
3.50k
    repeat_min = 1;
7200
3.50k
    repeat_max = REPEAT_UNLIMITED;
7201
3.50k
    goto REPEAT;
7202
7203
2.60k
    case META_QUERY:
7204
2.80k
    case META_QUERY_PLUS:
7205
3.53k
    case META_QUERY_QUERY:
7206
3.53k
    repeat_min = 0;
7207
3.53k
    repeat_max = 1;
7208
7209
8.41k
    REPEAT:
7210
8.41k
    if (previous_matched_char && repeat_min > 0) matched_char = TRUE;
7211
7212
    /* Remember whether this is a variable length repeat, and default to
7213
    single-char opcodes. */
7214
7215
8.41k
    reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
7216
8.41k
    op_type = 0;
7217
7218
    /* Adjust first and required code units for a zero repeat. */
7219
7220
8.41k
    if (repeat_min == 0)
7221
4.91k
      {
7222
4.91k
      firstcu = zerofirstcu;
7223
4.91k
      firstcuflags = zerofirstcuflags;
7224
4.91k
      reqcu = zeroreqcu;
7225
4.91k
      reqcuflags = zeroreqcuflags;
7226
4.91k
      }
7227
7228
    /* Note the greediness and possessiveness. */
7229
7230
8.41k
    switch (meta)
7231
8.41k
      {
7232
0
      case META_MINMAX_PLUS:
7233
84
      case META_ASTERISK_PLUS:
7234
415
      case META_PLUS_PLUS:
7235
613
      case META_QUERY_PLUS:
7236
613
      repeat_type = 0;                  /* Force greedy */
7237
613
      possessive_quantifier = TRUE;
7238
613
      break;
7239
7240
0
      case META_MINMAX_QUERY:
7241
360
      case META_ASTERISK_QUERY:
7242
927
      case META_PLUS_QUERY:
7243
1.65k
      case META_QUERY_QUERY:
7244
1.65k
      repeat_type = greedy_non_default;
7245
1.65k
      possessive_quantifier = FALSE;
7246
1.65k
      break;
7247
7248
6.14k
      default:
7249
6.14k
      repeat_type = greedy_default;
7250
6.14k
      possessive_quantifier = FALSE;
7251
6.14k
      break;
7252
8.41k
      }
7253
7254
    /* Save start of previous item, in case we have to move it up in order to
7255
    insert something before it, and remember what it was. */
7256
7257
8.41k
    tempcode = previous;
7258
8.41k
    op_previous = *previous;
7259
7260
    /* Now handle repetition for the different types of item. If the repeat
7261
    minimum and the repeat maximum are both 1, we can ignore the quantifier for
7262
    non-parenthesized items, as they have only one alternative. For anything in
7263
    parentheses, we must not ignore if {1} is possessive. */
7264
7265
8.41k
    switch (op_previous)
7266
8.41k
      {
7267
      /* If previous was a character or negated character match, abolish the
7268
      item and generate a repeat item instead. If a char item has a minimum of
7269
      more than one, ensure that it is set in reqcu - it might not be if a
7270
      sequence such as x{3} is the first thing in a branch because the x will
7271
      have gone into firstcu instead.  */
7272
7273
2.60k
      case OP_CHAR:
7274
4.27k
      case OP_CHARI:
7275
4.33k
      case OP_NOT:
7276
4.55k
      case OP_NOTI:
7277
4.55k
      if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7278
4.55k
      op_type = chartypeoffset[op_previous - OP_CHAR];
7279
7280
      /* Deal with UTF characters that take up more than one code unit. */
7281
7282
4.55k
#ifdef MAYBE_UTF_MULTI
7283
4.55k
      if (utf && NOT_FIRSTCU(code[-1]))
7284
0
        {
7285
0
        PCRE2_UCHAR *lastchar = code - 1;
7286
0
        BACKCHAR(lastchar);
7287
0
        mclength = (uint32_t)(code - lastchar);   /* Length of UTF character */
7288
0
        memcpy(mcbuffer, lastchar, CU2BYTES(mclength));  /* Save the char */
7289
0
        }
7290
4.55k
      else
7291
4.55k
#endif  /* MAYBE_UTF_MULTI */
7292
7293
      /* Handle the case of a single code unit - either with no UTF support, or
7294
      with UTF disabled, or for a single-code-unit UTF character. In the latter
7295
      case, for a repeated positive match, get the caseless flag for the
7296
      required code unit from the previous character, because a class like [Aa]
7297
      sets a caseless A but by now the req_caseopt flag has been reset. */
7298
7299
4.55k
        {
7300
4.55k
        mcbuffer[0] = code[-1];
7301
4.55k
        mclength = 1;
7302
4.55k
        if (op_previous <= OP_CHARI && repeat_min > 1)
7303
0
          {
7304
0
          reqcu = mcbuffer[0];
7305
0
          reqcuflags = cb->req_varyopt;
7306
0
          if (op_previous == OP_CHARI) reqcuflags |= REQ_CASELESS;
7307
0
          }
7308
4.55k
        }
7309
4.55k
      goto OUTPUT_SINGLE_REPEAT;  /* Code shared with single character types */
7310
7311
      /* If previous was a character class or a back reference, we put the
7312
      repeat stuff after it, but just skip the item if the repeat was {0,0}. */
7313
7314
0
#ifdef SUPPORT_WIDE_CHARS
7315
324
      case OP_XCLASS:
7316
324
#endif
7317
1.24k
      case OP_CLASS:
7318
1.63k
      case OP_NCLASS:
7319
1.63k
      case OP_REF:
7320
1.63k
      case OP_REFI:
7321
1.63k
      case OP_DNREF:
7322
1.63k
      case OP_DNREFI:
7323
7324
1.63k
      if (repeat_max == 0)
7325
0
        {
7326
0
        code = previous;
7327
0
        goto END_REPEAT;
7328
0
        }
7329
1.63k
      if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7330
7331
1.63k
      if (repeat_min == 0 && repeat_max == REPEAT_UNLIMITED)
7332
611
        *code++ = OP_CRSTAR + repeat_type;
7333
1.02k
      else if (repeat_min == 1 && repeat_max == REPEAT_UNLIMITED)
7334
624
        *code++ = OP_CRPLUS + repeat_type;
7335
402
      else if (repeat_min == 0 && repeat_max == 1)
7336
402
        *code++ = OP_CRQUERY + repeat_type;
7337
0
      else
7338
0
        {
7339
0
        *code++ = OP_CRRANGE + repeat_type;
7340
0
        PUT2INC(code, 0, repeat_min);
7341
0
        if (repeat_max == REPEAT_UNLIMITED) repeat_max = 0;  /* 2-byte encoding for max */
7342
0
        PUT2INC(code, 0, repeat_max);
7343
0
        }
7344
1.63k
      break;
7345
7346
      /* If previous is OP_FAIL, it was generated by an empty class []
7347
      (PCRE2_ALLOW_EMPTY_CLASS is set). The other ways in which OP_FAIL can be
7348
      generated, that is by (*FAIL) or (?!), disallow a quantifier at parse
7349
      time. We can just ignore this repeat. */
7350
7351
0
      case OP_FAIL:
7352
0
      goto END_REPEAT;
7353
7354
      /* Prior to 10.30, repeated recursions were wrapped in OP_ONCE brackets
7355
      because pcre2_match() could not handle backtracking into recursively
7356
      called groups. Now that this backtracking is available, we no longer need
7357
      to do this. However, we still need to replicate recursions as we do for
7358
      groups so as to have independent backtracking points. We can replicate
7359
      for the minimum number of repeats directly. For optional repeats we now
7360
      wrap the recursion in OP_BRA brackets and make use of the bracket
7361
      repetition. */
7362
7363
0
      case OP_RECURSE:
7364
0
      if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
7365
0
        goto END_REPEAT;
7366
7367
      /* Generate unwrapped repeats for a non-zero minimum, except when the
7368
      minimum is 1 and the maximum unlimited, because that can be handled with
7369
      OP_BRA terminated by OP_KETRMAX/MIN. When the maximum is equal to the
7370
      minimum, we just need to generate the appropriate additional copies.
7371
      Otherwise we need to generate one more, to simulate the situation when
7372
      the minimum is zero. */
7373
7374
0
      if (repeat_min > 0 && (repeat_min != 1 || repeat_max != REPEAT_UNLIMITED))
7375
0
        {
7376
0
        int replicate = repeat_min;
7377
0
        if (repeat_min == repeat_max) replicate--;
7378
7379
        /* In the pre-compile phase, we don't actually do the replication. We
7380
        just adjust the length as if we had. Do some paranoid checks for
7381
        potential integer overflow. */
7382
7383
0
        if (lengthptr != NULL)
7384
0
          {
7385
0
          PCRE2_SIZE delta;
7386
0
          if (PRIV(ckd_smul)(&delta, replicate, 1 + LINK_SIZE) ||
7387
0
              OFLOW_MAX - *lengthptr < delta)
7388
0
            {
7389
0
            *errorcodeptr = ERR20;
7390
0
            return 0;
7391
0
            }
7392
0
          *lengthptr += delta;
7393
0
          }
7394
7395
0
        else for (int i = 0; i < replicate; i++)
7396
0
          {
7397
0
          memcpy(code, previous, CU2BYTES(1 + LINK_SIZE));
7398
0
          previous = code;
7399
0
          code += 1 + LINK_SIZE;
7400
0
          }
7401
7402
        /* If the number of repeats is fixed, we are done. Otherwise, adjust
7403
        the counts and fall through. */
7404
7405
0
        if (repeat_min == repeat_max) break;
7406
0
        if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7407
0
        repeat_min = 0;
7408
0
        }
7409
7410
      /* Wrap the recursion call in OP_BRA brackets. */
7411
7412
0
      (void)memmove(previous + 1 + LINK_SIZE, previous, CU2BYTES(1 + LINK_SIZE));
7413
0
      op_previous = *previous = OP_BRA;
7414
0
      PUT(previous, 1, 2 + 2*LINK_SIZE);
7415
0
      previous[2 + 2*LINK_SIZE] = OP_KET;
7416
0
      PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
7417
0
      code += 2 + 2 * LINK_SIZE;
7418
0
      length_prevgroup = 3 + 3*LINK_SIZE;
7419
0
      group_return = -1;  /* Set "may match empty string" */
7420
7421
      /* Now treat as a repeated OP_BRA. */
7422
      /* Fall through */
7423
7424
      /* If previous was a bracket group, we may have to replicate it in
7425
      certain cases. Note that at this point we can encounter only the "basic"
7426
      bracket opcodes such as BRA and CBRA, as this is the place where they get
7427
      converted into the more special varieties such as BRAPOS and SBRA.
7428
      Originally, PCRE did not allow repetition of assertions, but now it does,
7429
      for Perl compatibility. */
7430
7431
0
      case OP_ASSERT:
7432
0
      case OP_ASSERT_NOT:
7433
0
      case OP_ASSERT_NA:
7434
0
      case OP_ASSERTBACK:
7435
0
      case OP_ASSERTBACK_NOT:
7436
0
      case OP_ASSERTBACK_NA:
7437
0
      case OP_ONCE:
7438
0
      case OP_SCRIPT_RUN:
7439
0
      case OP_BRA:
7440
581
      case OP_CBRA:
7441
581
      case OP_COND:
7442
581
        {
7443
581
        int len = (int)(code - previous);
7444
581
        PCRE2_UCHAR *bralink = NULL;
7445
581
        PCRE2_UCHAR *brazeroptr = NULL;
7446
7447
581
        if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
7448
0
          goto END_REPEAT;
7449
7450
        /* Repeating a DEFINE group (or any group where the condition is always
7451
        FALSE and there is only one branch) is pointless, but Perl allows the
7452
        syntax, so we just ignore the repeat. */
7453
7454
581
        if (op_previous == OP_COND && previous[LINK_SIZE+1] == OP_FALSE &&
7455
0
            previous[GET(previous, 1)] != OP_ALT)
7456
0
          goto END_REPEAT;
7457
7458
        /* Perl allows all assertions to be quantified, and when they contain
7459
        capturing parentheses and/or are optional there are potential uses for
7460
        this feature. PCRE2 used to force the maximum quantifier to 1 on the
7461
        invalid grounds that further repetition was never useful. This was
7462
        always a bit pointless, since an assertion could be wrapped with a
7463
        repeated group to achieve the effect. General repetition is now
7464
        permitted, but if the maximum is unlimited it is set to one more than
7465
        the minimum. */
7466
7467
581
        if (op_previous < OP_ONCE)    /* Assertion */
7468
0
          {
7469
0
          if (repeat_max == REPEAT_UNLIMITED) repeat_max = repeat_min + 1;
7470
0
          }
7471
7472
        /* The case of a zero minimum is special because of the need to stick
7473
        OP_BRAZERO in front of it, and because the group appears once in the
7474
        data, whereas in other cases it appears the minimum number of times. For
7475
        this reason, it is simplest to treat this case separately, as otherwise
7476
        the code gets far too messy. There are several special subcases when the
7477
        minimum is zero. */
7478
7479
581
        if (repeat_min == 0)
7480
2
          {
7481
          /* If the maximum is also zero, we used to just omit the group from
7482
          the output altogether, like this:
7483
7484
          ** if (repeat_max == 0)
7485
          **   {
7486
          **   code = previous;
7487
          **   goto END_REPEAT;
7488
          **   }
7489
7490
          However, that fails when a group or a subgroup within it is
7491
          referenced as a subroutine from elsewhere in the pattern, so now we
7492
          stick in OP_SKIPZERO in front of it so that it is skipped on
7493
          execution. As we don't have a list of which groups are referenced, we
7494
          cannot do this selectively.
7495
7496
          If the maximum is 1 or unlimited, we just have to stick in the
7497
          BRAZERO and do no more at this point. */
7498
7499
2
          if (repeat_max <= 1 || repeat_max == REPEAT_UNLIMITED)
7500
2
            {
7501
2
            (void)memmove(previous + 1, previous, CU2BYTES(len));
7502
2
            code++;
7503
2
            if (repeat_max == 0)
7504
0
              {
7505
0
              *previous++ = OP_SKIPZERO;
7506
0
              goto END_REPEAT;
7507
0
              }
7508
2
            brazeroptr = previous;    /* Save for possessive optimizing */
7509
2
            *previous++ = OP_BRAZERO + repeat_type;
7510
2
            }
7511
7512
          /* If the maximum is greater than 1 and limited, we have to replicate
7513
          in a nested fashion, sticking OP_BRAZERO before each set of brackets.
7514
          The first one has to be handled carefully because it's the original
7515
          copy, which has to be moved up. The remainder can be handled by code
7516
          that is common with the non-zero minimum case below. We have to
7517
          adjust the value of repeat_max, since one less copy is required. */
7518
7519
0
          else
7520
0
            {
7521
0
            int linkoffset;
7522
0
            (void)memmove(previous + 2 + LINK_SIZE, previous, CU2BYTES(len));
7523
0
            code += 2 + LINK_SIZE;
7524
0
            *previous++ = OP_BRAZERO + repeat_type;
7525
0
            *previous++ = OP_BRA;
7526
7527
            /* We chain together the bracket link offset fields that have to be
7528
            filled in later when the ends of the brackets are reached. */
7529
7530
0
            linkoffset = (bralink == NULL)? 0 : (int)(previous - bralink);
7531
0
            bralink = previous;
7532
0
            PUTINC(previous, 0, linkoffset);
7533
0
            }
7534
7535
2
          if (repeat_max != REPEAT_UNLIMITED) repeat_max--;
7536
2
          }
7537
7538
        /* If the minimum is greater than zero, replicate the group as many
7539
        times as necessary, and adjust the maximum to the number of subsequent
7540
        copies that we need. */
7541
7542
579
        else
7543
579
          {
7544
579
          if (repeat_min > 1)
7545
0
            {
7546
            /* In the pre-compile phase, we don't actually do the replication.
7547
            We just adjust the length as if we had. Do some paranoid checks for
7548
            potential integer overflow. */
7549
7550
0
            if (lengthptr != NULL)
7551
0
              {
7552
0
              PCRE2_SIZE delta;
7553
0
              if (PRIV(ckd_smul)(&delta, repeat_min - 1,
7554
0
                                 (int)length_prevgroup) ||
7555
0
                  OFLOW_MAX - *lengthptr < delta)
7556
0
                {
7557
0
                *errorcodeptr = ERR20;
7558
0
                return 0;
7559
0
                }
7560
0
              *lengthptr += delta;
7561
0
              }
7562
7563
            /* This is compiling for real. If there is a set first code unit
7564
            for the group, and we have not yet set a "required code unit", set
7565
            it. */
7566
7567
0
            else
7568
0
              {
7569
0
              if (groupsetfirstcu && reqcuflags >= REQ_NONE)
7570
0
                {
7571
0
                reqcu = firstcu;
7572
0
                reqcuflags = firstcuflags;
7573
0
                }
7574
0
              for (uint32_t i = 1; i < repeat_min; i++)
7575
0
                {
7576
0
                memcpy(code, previous, CU2BYTES(len));
7577
0
                code += len;
7578
0
                }
7579
0
              }
7580
0
            }
7581
7582
579
          if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7583
579
          }
7584
7585
        /* This code is common to both the zero and non-zero minimum cases. If
7586
        the maximum is limited, it replicates the group in a nested fashion,
7587
        remembering the bracket starts on a stack. In the case of a zero
7588
        minimum, the first one was set up above. In all cases the repeat_max
7589
        now specifies the number of additional copies needed. Again, we must
7590
        remember to replicate entries on the forward reference list. */
7591
7592
581
        if (repeat_max != REPEAT_UNLIMITED)
7593
0
          {
7594
          /* In the pre-compile phase, we don't actually do the replication. We
7595
          just adjust the length as if we had. For each repetition we must add
7596
          1 to the length for BRAZERO and for all but the last repetition we
7597
          must add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
7598
          paranoid checks to avoid integer overflow. */
7599
7600
0
          if (lengthptr != NULL && repeat_max > 0)
7601
0
            {
7602
0
            PCRE2_SIZE delta;
7603
0
            if (PRIV(ckd_smul)(&delta, repeat_max,
7604
0
                               (int)length_prevgroup + 1 + 2 + 2*LINK_SIZE) ||
7605
0
                OFLOW_MAX + (2 + 2*LINK_SIZE) - *lengthptr < delta)
7606
0
              {
7607
0
              *errorcodeptr = ERR20;
7608
0
              return 0;
7609
0
              }
7610
0
            delta -= (2 + 2*LINK_SIZE);   /* Last one doesn't nest */
7611
0
            *lengthptr += delta;
7612
0
            }
7613
7614
          /* This is compiling for real */
7615
7616
0
          else for (uint32_t i = repeat_max; i >= 1; i--)
7617
0
            {
7618
0
            *code++ = OP_BRAZERO + repeat_type;
7619
7620
            /* All but the final copy start a new nesting, maintaining the
7621
            chain of brackets outstanding. */
7622
7623
0
            if (i != 1)
7624
0
              {
7625
0
              int linkoffset;
7626
0
              *code++ = OP_BRA;
7627
0
              linkoffset = (bralink == NULL)? 0 : (int)(code - bralink);
7628
0
              bralink = code;
7629
0
              PUTINC(code, 0, linkoffset);
7630
0
              }
7631
7632
0
            memcpy(code, previous, CU2BYTES(len));
7633
0
            code += len;
7634
0
            }
7635
7636
          /* Now chain through the pending brackets, and fill in their length
7637
          fields (which are holding the chain links pro tem). */
7638
7639
0
          while (bralink != NULL)
7640
0
            {
7641
0
            int oldlinkoffset;
7642
0
            int linkoffset = (int)(code - bralink + 1);
7643
0
            PCRE2_UCHAR *bra = code - linkoffset;
7644
0
            oldlinkoffset = GET(bra, 1);
7645
0
            bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
7646
0
            *code++ = OP_KET;
7647
0
            PUTINC(code, 0, linkoffset);
7648
0
            PUT(bra, 1, linkoffset);
7649
0
            }
7650
0
          }
7651
7652
        /* If the maximum is unlimited, set a repeater in the final copy. For
7653
        SCRIPT_RUN and ONCE brackets, that's all we need to do. However,
7654
        possessively repeated ONCE brackets can be converted into non-capturing
7655
        brackets, as the behaviour of (?:xx)++ is the same as (?>xx)++ and this
7656
        saves having to deal with possessive ONCEs specially.
7657
7658
        Otherwise, when we are doing the actual compile phase, check to see
7659
        whether this group is one that could match an empty string. If so,
7660
        convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
7661
        that runtime checking can be done. [This check is also applied to ONCE
7662
        and SCRIPT_RUN groups at runtime, but in a different way.]
7663
7664
        Then, if the quantifier was possessive and the bracket is not a
7665
        conditional, we convert the BRA code to the POS form, and the KET code
7666
        to KETRPOS. (It turns out to be convenient at runtime to detect this
7667
        kind of subpattern at both the start and at the end.) The use of
7668
        special opcodes makes it possible to reduce greatly the stack usage in
7669
        pcre2_match(). If the group is preceded by OP_BRAZERO, convert this to
7670
        OP_BRAPOSZERO.
7671
7672
        Then, if the minimum number of matches is 1 or 0, cancel the possessive
7673
        flag so that the default action below, of wrapping everything inside
7674
        atomic brackets, does not happen. When the minimum is greater than 1,
7675
        there will be earlier copies of the group, and so we still have to wrap
7676
        the whole thing. */
7677
7678
581
        else
7679
581
          {
7680
581
          PCRE2_UCHAR *ketcode = code - 1 - LINK_SIZE;
7681
581
          PCRE2_UCHAR *bracode = ketcode - GET(ketcode, 1);
7682
7683
          /* Convert possessive ONCE brackets to non-capturing */
7684
7685
581
          if (*bracode == OP_ONCE && possessive_quantifier) *bracode = OP_BRA;
7686
7687
          /* For non-possessive ONCE and for SCRIPT_RUN brackets, all we need
7688
          to do is to set the KET. */
7689
7690
581
          if (*bracode == OP_ONCE || *bracode == OP_SCRIPT_RUN)
7691
0
            *ketcode = OP_KETRMAX + repeat_type;
7692
7693
          /* Handle non-SCRIPT_RUN and non-ONCE brackets and possessive ONCEs
7694
          (which have been converted to non-capturing above). */
7695
7696
581
          else
7697
581
            {
7698
            /* In the compile phase, adjust the opcode if the group can match
7699
            an empty string. For a conditional group with only one branch, the
7700
            value of group_return will not show "could be empty", so we must
7701
            check that separately. */
7702
7703
581
            if (lengthptr == NULL)
7704
250
              {
7705
250
              if (group_return < 0) *bracode += OP_SBRA - OP_BRA;
7706
250
              if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
7707
0
                *bracode = OP_SCOND;
7708
250
              }
7709
7710
            /* Handle possessive quantifiers. */
7711
7712
581
            if (possessive_quantifier)
7713
233
              {
7714
              /* For COND brackets, we wrap the whole thing in a possessively
7715
              repeated non-capturing bracket, because we have not invented POS
7716
              versions of the COND opcodes. */
7717
7718
233
              if (*bracode == OP_COND || *bracode == OP_SCOND)
7719
0
                {
7720
0
                int nlen = (int)(code - bracode);
7721
0
                (void)memmove(bracode + 1 + LINK_SIZE, bracode, CU2BYTES(nlen));
7722
0
                code += 1 + LINK_SIZE;
7723
0
                nlen += 1 + LINK_SIZE;
7724
0
                *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
7725
0
                *code++ = OP_KETRPOS;
7726
0
                PUTINC(code, 0, nlen);
7727
0
                PUT(bracode, 1, nlen);
7728
0
                }
7729
7730
              /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
7731
7732
233
              else
7733
233
                {
7734
233
                *bracode += 1;              /* Switch to xxxPOS opcodes */
7735
233
                *ketcode = OP_KETRPOS;
7736
233
                }
7737
7738
              /* If the minimum is zero, mark it as possessive, then unset the
7739
              possessive flag when the minimum is 0 or 1. */
7740
7741
233
              if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
7742
233
              if (repeat_min < 2) possessive_quantifier = FALSE;
7743
233
              }
7744
7745
            /* Non-possessive quantifier */
7746
7747
348
            else *ketcode = OP_KETRMAX + repeat_type;
7748
581
            }
7749
581
          }
7750
581
        }
7751
581
      break;
7752
7753
      /* If previous was a character type match (\d or similar), abolish it and
7754
      create a suitable repeat item. The code is shared with single-character
7755
      repeats by setting op_type to add a suitable offset into repeat_type.
7756
      Note that the Unicode property types will be present only when
7757
      SUPPORT_UNICODE is defined, but we don't wrap the little bits of code
7758
      here because it just makes it horribly messy. */
7759
7760
1.65k
      default:
7761
1.65k
      if (op_previous >= OP_EODN)   /* Not a character type - internal error */
7762
0
        {
7763
0
        *errorcodeptr = ERR10;
7764
0
        return 0;
7765
0
        }
7766
1.65k
      else
7767
1.65k
        {
7768
1.65k
        int prop_type, prop_value;
7769
1.65k
        PCRE2_UCHAR *oldcode;
7770
7771
1.65k
        if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7772
7773
1.65k
        op_type = OP_TYPESTAR - OP_STAR;      /* Use type opcodes */
7774
1.65k
        mclength = 0;                         /* Not a character */
7775
7776
1.65k
        if (op_previous == OP_PROP || op_previous == OP_NOTPROP)
7777
330
          {
7778
330
          prop_type = previous[1];
7779
330
          prop_value = previous[2];
7780
330
          }
7781
1.32k
        else
7782
1.32k
          {
7783
          /* Come here from just above with a character in mcbuffer/mclength. */
7784
5.87k
          OUTPUT_SINGLE_REPEAT:
7785
5.87k
          prop_type = prop_value = -1;
7786
5.87k
          }
7787
7788
        /* At this point, if prop_type == prop_value == -1 we either have a
7789
        character in mcbuffer when mclength is greater than zero, or we have
7790
        mclength zero, in which case there is a non-property character type in
7791
        op_previous. If prop_type/value are not negative, we have a property
7792
        character type in op_previous. */
7793
7794
6.20k
        oldcode = code;                   /* Save where we were */
7795
6.20k
        code = previous;                  /* Usually overwrite previous item */
7796
7797
        /* If the maximum is zero then the minimum must also be zero; Perl allows
7798
        this case, so we do too - by simply omitting the item altogether. */
7799
7800
6.20k
        if (repeat_max == 0) goto END_REPEAT;
7801
7802
        /* Combine the op_type with the repeat_type */
7803
7804
6.20k
        repeat_type += op_type;
7805
7806
        /* A minimum of zero is handled either as the special case * or ?, or as
7807
        an UPTO, with the maximum given. */
7808
7809
6.20k
        if (repeat_min == 0)
7810
3.89k
          {
7811
3.89k
          if (repeat_max == REPEAT_UNLIMITED) *code++ = OP_STAR + repeat_type;
7812
3.13k
            else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
7813
0
          else
7814
0
            {
7815
0
            *code++ = OP_UPTO + repeat_type;
7816
0
            PUT2INC(code, 0, repeat_max);
7817
0
            }
7818
3.89k
          }
7819
7820
        /* A repeat minimum of 1 is optimized into some special cases. If the
7821
        maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
7822
        left in place and, if the maximum is greater than 1, we use OP_UPTO with
7823
        one less than the maximum. */
7824
7825
2.30k
        else if (repeat_min == 1)
7826
2.30k
          {
7827
2.30k
          if (repeat_max == REPEAT_UNLIMITED)
7828
2.30k
            *code++ = OP_PLUS + repeat_type;
7829
0
          else
7830
0
            {
7831
0
            code = oldcode;  /* Leave previous item in place */
7832
0
            if (repeat_max == 1) goto END_REPEAT;
7833
0
            *code++ = OP_UPTO + repeat_type;
7834
0
            PUT2INC(code, 0, repeat_max - 1);
7835
0
            }
7836
2.30k
          }
7837
7838
        /* The case {n,n} is just an EXACT, while the general case {n,m} is
7839
        handled as an EXACT followed by an UPTO or STAR or QUERY. */
7840
7841
0
        else
7842
0
          {
7843
0
          *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
7844
0
          PUT2INC(code, 0, repeat_min);
7845
7846
          /* Unless repeat_max equals repeat_min, fill in the data for EXACT,
7847
          and then generate the second opcode. For a repeated Unicode property
7848
          match, there are two extra values that define the required property,
7849
          and mclength is set zero to indicate this. */
7850
7851
0
          if (repeat_max != repeat_min)
7852
0
            {
7853
0
            if (mclength > 0)
7854
0
              {
7855
0
              memcpy(code, mcbuffer, CU2BYTES(mclength));
7856
0
              code += mclength;
7857
0
              }
7858
0
            else
7859
0
              {
7860
0
              *code++ = op_previous;
7861
0
              if (prop_type >= 0)
7862
0
                {
7863
0
                *code++ = prop_type;
7864
0
                *code++ = prop_value;
7865
0
                }
7866
0
              }
7867
7868
            /* Now set up the following opcode */
7869
7870
0
            if (repeat_max == REPEAT_UNLIMITED)
7871
0
              *code++ = OP_STAR + repeat_type;
7872
0
            else
7873
0
              {
7874
0
              repeat_max -= repeat_min;
7875
0
              if (repeat_max == 1)
7876
0
                {
7877
0
                *code++ = OP_QUERY + repeat_type;
7878
0
                }
7879
0
              else
7880
0
                {
7881
0
                *code++ = OP_UPTO + repeat_type;
7882
0
                PUT2INC(code, 0, repeat_max);
7883
0
                }
7884
0
              }
7885
0
            }
7886
0
          }
7887
7888
        /* Fill in the character or character type for the final opcode. */
7889
7890
6.20k
        if (mclength > 0)
7891
4.55k
          {
7892
4.55k
          memcpy(code, mcbuffer, CU2BYTES(mclength));
7893
4.55k
          code += mclength;
7894
4.55k
          }
7895
1.65k
        else
7896
1.65k
          {
7897
1.65k
          *code++ = op_previous;
7898
1.65k
          if (prop_type >= 0)
7899
330
            {
7900
330
            *code++ = prop_type;
7901
330
            *code++ = prop_value;
7902
330
            }
7903
1.65k
          }
7904
6.20k
        }
7905
6.20k
      break;
7906
8.41k
      }  /* End of switch on different op_previous values */
7907
7908
7909
    /* If the character following a repeat is '+', possessive_quantifier is
7910
    TRUE. For some opcodes, there are special alternative opcodes for this
7911
    case. For anything else, we wrap the entire repeated item inside OP_ONCE
7912
    brackets. Logically, the '+' notation is just syntactic sugar, taken from
7913
    Sun's Java package, but the special opcodes can optimize it.
7914
7915
    Some (but not all) possessively repeated subpatterns have already been
7916
    completely handled in the code just above. For them, possessive_quantifier
7917
    is always FALSE at this stage. Note that the repeated item starts at
7918
    tempcode, not at previous, which might be the first part of a string whose
7919
    (former) last char we repeated. */
7920
7921
8.41k
    if (possessive_quantifier)
7922
380
      {
7923
380
      int len;
7924
7925
      /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
7926
      However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
7927
      {5,}, or {5,10}). We skip over an EXACT item; if the length of what
7928
      remains is greater than zero, there's a further opcode that can be
7929
      handled. If not, do nothing, leaving the EXACT alone. */
7930
7931
380
      switch(*tempcode)
7932
380
        {
7933
0
        case OP_TYPEEXACT:
7934
0
        tempcode += PRIV(OP_lengths)[*tempcode] +
7935
0
          ((tempcode[1 + IMM2_SIZE] == OP_PROP
7936
0
          || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
7937
0
        break;
7938
7939
        /* CHAR opcodes are used for exacts whose count is 1. */
7940
7941
0
        case OP_CHAR:
7942
0
        case OP_CHARI:
7943
0
        case OP_NOT:
7944
0
        case OP_NOTI:
7945
0
        case OP_EXACT:
7946
0
        case OP_EXACTI:
7947
0
        case OP_NOTEXACT:
7948
0
        case OP_NOTEXACTI:
7949
0
        tempcode += PRIV(OP_lengths)[*tempcode];
7950
0
#ifdef SUPPORT_UNICODE
7951
0
        if (utf && HAS_EXTRALEN(tempcode[-1]))
7952
0
          tempcode += GET_EXTRALEN(tempcode[-1]);
7953
0
#endif
7954
0
        break;
7955
7956
        /* For the class opcodes, the repeat operator appears at the end;
7957
        adjust tempcode to point to it. */
7958
7959
72
        case OP_CLASS:
7960
72
        case OP_NCLASS:
7961
72
        tempcode += 1 + 32/sizeof(PCRE2_UCHAR);
7962
72
        break;
7963
7964
0
#ifdef SUPPORT_WIDE_CHARS
7965
0
        case OP_XCLASS:
7966
0
        tempcode += GET(tempcode, 1);
7967
0
        break;
7968
380
#endif
7969
380
        }
7970
7971
      /* If tempcode is equal to code (which points to the end of the repeated
7972
      item), it means we have skipped an EXACT item but there is no following
7973
      QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
7974
      all other cases, tempcode will be pointing to the repeat opcode, and will
7975
      be less than code, so the value of len will be greater than 0. */
7976
7977
380
      len = (int)(code - tempcode);
7978
380
      if (len > 0)
7979
380
        {
7980
380
        unsigned int repcode = *tempcode;
7981
7982
        /* There is a table for possessifying opcodes, all of which are less
7983
        than OP_CALLOUT. A zero entry means there is no possessified version.
7984
        */
7985
7986
380
        if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
7987
380
          *tempcode = opcode_possessify[repcode];
7988
7989
        /* For opcode without a special possessified version, wrap the item in
7990
        ONCE brackets. */
7991
7992
0
        else
7993
0
          {
7994
0
          (void)memmove(tempcode + 1 + LINK_SIZE, tempcode, CU2BYTES(len));
7995
0
          code += 1 + LINK_SIZE;
7996
0
          len += 1 + LINK_SIZE;
7997
0
          tempcode[0] = OP_ONCE;
7998
0
          *code++ = OP_KET;
7999
0
          PUTINC(code, 0, len);
8000
0
          PUT(tempcode, 1, len);
8001
0
          }
8002
380
        }
8003
380
      }
8004
8005
    /* We set the "follows varying string" flag for subsequently encountered
8006
    reqcus if it isn't already set and we have just passed a varying length
8007
    item. */
8008
8009
8.41k
    END_REPEAT:
8010
8.41k
    cb->req_varyopt |= reqvary;
8011
8.41k
    break;
8012
8013
8014
    /* ===================================================================*/
8015
    /* Handle a 32-bit data character with a value greater than META_END. */
8016
8017
0
    case META_BIGVALUE:
8018
0
    pptr++;
8019
0
    goto NORMAL_CHAR;
8020
8021
8022
    /* ===============================================================*/
8023
    /* Handle a back reference by number, which is the meta argument. The
8024
    pattern offsets for back references to group numbers less than 10 are held
8025
    in a special vector, to avoid using more than two parsed pattern elements
8026
    in 64-bit environments. We only need the offset to the first occurrence,
8027
    because if that doesn't fail, subsequent ones will also be OK. */
8028
8029
897
    case META_BACKREF:
8030
897
    if (meta_arg < 10) offset = cb->small_ref_offset[meta_arg];
8031
148
      else GETPLUSOFFSET(offset, pptr);
8032
8033
897
    if (meta_arg > cb->bracount)
8034
20
      {
8035
20
      cb->erroroffset = offset;
8036
20
      *errorcodeptr = ERR15;  /* Non-existent subpattern */
8037
20
      return 0;
8038
20
      }
8039
8040
    /* Come here from named backref handling when the reference is to a
8041
    single group (that is, not to a duplicated name). The back reference
8042
    data will have already been updated. We must disable firstcu if not
8043
    set, to cope with cases like (?=(\w+))\1: which would otherwise set ':'
8044
    later. */
8045
8046
877
    HANDLE_SINGLE_REFERENCE:
8047
877
    if (firstcuflags == REQ_UNSET) zerofirstcuflags = firstcuflags = REQ_NONE;
8048
877
    *code++ = ((options & PCRE2_CASELESS) != 0)? OP_REFI : OP_REF;
8049
877
    PUT2INC(code, 0, meta_arg);
8050
8051
    /* Update the map of back references, and keep the highest one. We
8052
    could do this in parse_regex() for numerical back references, but not
8053
    for named back references, because we don't know the numbers to which
8054
    named back references refer. So we do it all in this function. */
8055
8056
877
    cb->backref_map |= (meta_arg < 32)? (1u << meta_arg) : 1;
8057
877
    if (meta_arg > cb->top_backref) cb->top_backref = meta_arg;
8058
877
    break;
8059
8060
8061
    /* ===============================================================*/
8062
    /* Handle recursion by inserting the number of the called group (which is
8063
    the meta argument) after OP_RECURSE. At the end of compiling the pattern is
8064
    scanned and these numbers are replaced by offsets within the pattern. It is
8065
    done like this to avoid problems with forward references and adjusting
8066
    offsets when groups are duplicated and moved (as discovered in previous
8067
    implementations). Note that a recursion does not have a set first
8068
    character. */
8069
8070
0
    case META_RECURSE:
8071
0
    GETPLUSOFFSET(offset, pptr);
8072
0
    if (meta_arg > cb->bracount)
8073
0
      {
8074
0
      cb->erroroffset = offset;
8075
0
      *errorcodeptr = ERR15;  /* Non-existent subpattern */
8076
0
      return 0;
8077
0
      }
8078
0
    HANDLE_NUMERICAL_RECURSION:
8079
0
    *code = OP_RECURSE;
8080
0
    PUT(code, 1, meta_arg);
8081
0
    code += 1 + LINK_SIZE;
8082
0
    groupsetfirstcu = FALSE;
8083
0
    cb->had_recurse = TRUE;
8084
0
    if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
8085
0
    zerofirstcu = firstcu;
8086
0
    zerofirstcuflags = firstcuflags;
8087
0
    break;
8088
8089
8090
    /* ===============================================================*/
8091
    /* Handle capturing parentheses; the number is the meta argument. */
8092
8093
1.60k
    case META_CAPTURE:
8094
1.60k
    bravalue = OP_CBRA;
8095
1.60k
    skipunits = IMM2_SIZE;
8096
1.60k
    PUT2(code, 1+LINK_SIZE, meta_arg);
8097
1.60k
    cb->lastcapture = meta_arg;
8098
1.60k
    goto GROUP_PROCESS_NOTE_EMPTY;
8099
8100
8101
    /* ===============================================================*/
8102
    /* Handle escape sequence items. For ones like \d, the ESC_values are
8103
    arranged to be the same as the corresponding OP_values in the default case
8104
    when PCRE2_UCP is not set (which is the only case in which they will appear
8105
    here).
8106
8107
    Note: \Q and \E are never seen here, as they were dealt with in
8108
    parse_regex(). Neither are numerical back references or recursions, which
8109
    were turned into META_BACKREF or META_RECURSE items, respectively. \k and
8110
    \g, when followed by names, are turned into META_BACKREF_BYNAME or
8111
    META_RECURSE_BYNAME. */
8112
8113
2.41k
    case META_ESCAPE:
8114
8115
    /* We can test for escape sequences that consume a character because their
8116
    values lie between ESC_b and ESC_Z; this may have to change if any new ones
8117
    are ever created. For these sequences, we disable the setting of a first
8118
    character if it hasn't already been set. */
8119
8120
2.41k
    if (meta_arg > ESC_b && meta_arg < ESC_Z)
8121
2.24k
      {
8122
2.24k
      matched_char = TRUE;
8123
2.24k
      if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
8124
2.24k
      }
8125
8126
    /* Set values to reset to if this is followed by a zero repeat. */
8127
8128
2.41k
    zerofirstcu = firstcu;
8129
2.41k
    zerofirstcuflags = firstcuflags;
8130
2.41k
    zeroreqcu = reqcu;
8131
2.41k
    zeroreqcuflags = reqcuflags;
8132
8133
    /* If Unicode is not supported, \P and \p are not allowed and are
8134
    faulted at parse time, so will never appear here. */
8135
8136
2.41k
#ifdef SUPPORT_UNICODE
8137
2.41k
    if (meta_arg == ESC_P || meta_arg == ESC_p)
8138
288
      {
8139
288
      uint32_t ptype = *(++pptr) >> 16;
8140
288
      uint32_t pdata = *pptr & 0xffff;
8141
8142
      /* The special case of \p{Any} is compiled to OP_ALLANY so as to benefit
8143
      from the auto-anchoring code. */
8144
8145
288
      if (meta_arg == ESC_p && ptype == PT_ANY)
8146
0
        {
8147
0
        *code++ = OP_ALLANY;
8148
0
        }
8149
288
      else
8150
288
        {
8151
288
        *code++ = (meta_arg == ESC_p)? OP_PROP : OP_NOTPROP;
8152
288
        *code++ = ptype;
8153
288
        *code++ = pdata;
8154
288
        }
8155
288
      break;  /* End META_ESCAPE */
8156
288
      }
8157
2.13k
#endif
8158
8159
    /* \K is forbidden in lookarounds since 10.38 because that's what Perl has
8160
    done. However, there's an option, in case anyone was relying on it. */
8161
8162
2.13k
    if (cb->assert_depth > 0 && meta_arg == ESC_K &&
8163
0
        (xoptions & PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK) == 0)
8164
0
      {
8165
0
      *errorcodeptr = ERR99;
8166
0
      return 0;
8167
0
      }
8168
8169
    /* For the rest (including \X when Unicode is supported - if not it's
8170
    faulted at parse time), the OP value is the escape value when PCRE2_UCP is
8171
    not set; if it is set, most of them do not show up here because they are
8172
    converted into Unicode property tests in parse_regex().
8173
8174
    In non-UTF mode, and for both 32-bit modes, we turn \C into OP_ALLANY
8175
    instead of OP_ANYBYTE so that it works in DFA mode and in lookbehinds.
8176
    There are special UCP codes for \B and \b which are used in UCP mode unless
8177
    "word" matching is being forced to ASCII.
8178
8179
    Note that \b and \B do a one-character lookbehind, and \A also behaves as
8180
    if it does. */
8181
8182
2.13k
    switch(meta_arg)
8183
2.13k
      {
8184
57
      case ESC_C:
8185
57
      cb->external_flags |= PCRE2_HASBKC;  /* Record */
8186
#if PCRE2_CODE_UNIT_WIDTH == 32
8187
      meta_arg = OP_ALLANY;
8188
#else
8189
57
      if (!utf) meta_arg = OP_ALLANY;
8190
57
#endif
8191
57
      break;
8192
8193
101
      case ESC_B:
8194
133
      case ESC_b:
8195
133
      if ((options & PCRE2_UCP) != 0 && (xoptions & PCRE2_EXTRA_ASCII_BSW) == 0)
8196
87
        meta_arg = (meta_arg == ESC_B)? OP_NOT_UCP_WORD_BOUNDARY :
8197
87
          OP_UCP_WORD_BOUNDARY;
8198
      /* Fall through */
8199
8200
145
      case ESC_A:
8201
145
      if (cb->max_lookbehind == 0) cb->max_lookbehind = 1;
8202
145
      break;
8203
2.13k
      }
8204
8205
2.13k
    *code++ = meta_arg;
8206
2.13k
    break;  /* End META_ESCAPE */
8207
8208
8209
    /* ===================================================================*/
8210
    /* Handle an unrecognized meta value. A parsed pattern value less than
8211
    META_END is a literal. Otherwise we have a problem. */
8212
8213
114k
    default:
8214
114k
    if (meta >= META_END)
8215
0
      {
8216
#ifdef DEBUG_SHOW_PARSED
8217
      fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x\n", *pptr);
8218
#endif
8219
0
      *errorcodeptr = ERR89;  /* Internal error - unrecognized. */
8220
0
      return 0;
8221
0
      }
8222
8223
    /* Handle a literal character. We come here by goto in the case of a
8224
    32-bit, non-UTF character whose value is greater than META_END. */
8225
8226
114k
    NORMAL_CHAR:
8227
114k
    meta = *pptr;     /* Get the full 32 bits */
8228
114k
    NORMAL_CHAR_SET:  /* Character is already in meta */
8229
114k
    matched_char = TRUE;
8230
8231
    /* For caseless UTF or UCP mode, check whether this character has more than
8232
    one other case. If so, generate a special OP_PROP item instead of OP_CHARI.
8233
    When casing restrictions apply, ignore caseless sets that start with an
8234
    ASCII character. */
8235
8236
114k
#ifdef SUPPORT_UNICODE
8237
114k
    if ((utf||ucp) && (options & PCRE2_CASELESS) != 0)
8238
11.4k
      {
8239
11.4k
      uint32_t caseset = UCD_CASESET(meta);
8240
11.4k
      if (caseset != 0 &&
8241
650
           ((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) == 0 ||
8242
0
           PRIV(ucd_caseless_sets)[caseset] > 127))
8243
650
        {
8244
650
        *code++ = OP_PROP;
8245
650
        *code++ = PT_CLIST;
8246
650
        *code++ = caseset;
8247
650
        if (firstcuflags == REQ_UNSET)
8248
4
          firstcuflags = zerofirstcuflags = REQ_NONE;
8249
650
        break;  /* End handling this meta item */
8250
650
        }
8251
11.4k
      }
8252
113k
#endif
8253
8254
    /* Caseful matches, or caseless and not one of the multicase characters. We
8255
    come here by goto in the case of a positive class that contains only
8256
    case-partners of a character with just two cases; matched_char has already
8257
    been set TRUE and options fudged if necessary. */
8258
8259
113k
    CLASS_CASELESS_CHAR:
8260
8261
    /* Get the character's code units into mcbuffer, with the length in
8262
    mclength. When not in UTF mode, the length is always 1. */
8263
8264
113k
#ifdef SUPPORT_UNICODE
8265
113k
    if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
8266
99.7k
#endif
8267
99.7k
      {
8268
99.7k
      mclength = 1;
8269
99.7k
      mcbuffer[0] = meta;
8270
99.7k
      }
8271
8272
    /* Generate the appropriate code */
8273
8274
113k
    *code++ = ((options & PCRE2_CASELESS) != 0)? OP_CHARI : OP_CHAR;
8275
113k
    memcpy(code, mcbuffer, CU2BYTES(mclength));
8276
113k
    code += mclength;
8277
8278
    /* Remember if \r or \n were seen */
8279
8280
113k
    if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
8281
2.47k
      cb->external_flags |= PCRE2_HASCRORLF;
8282
8283
    /* Set the first and required code units appropriately. If no previous
8284
    first code unit, set it from this character, but revert to none on a zero
8285
    repeat. Otherwise, leave the firstcu value alone, and don't change it on
8286
    a zero repeat. */
8287
8288
113k
    if (firstcuflags == REQ_UNSET)
8289
2.98k
      {
8290
2.98k
      zerofirstcuflags = REQ_NONE;
8291
2.98k
      zeroreqcu = reqcu;
8292
2.98k
      zeroreqcuflags = reqcuflags;
8293
8294
      /* If the character is more than one code unit long, we can set a single
8295
      firstcu only if it is not to be matched caselessly. Multiple possible
8296
      starting code units may be picked up later in the studying code. */
8297
8298
2.98k
      if (mclength == 1 || req_caseopt == 0)
8299
2.97k
        {
8300
2.97k
        firstcu = mcbuffer[0];
8301
2.97k
        firstcuflags = req_caseopt;
8302
2.97k
        if (mclength != 1)
8303
0
          {
8304
0
          reqcu = code[-1];
8305
0
          reqcuflags = cb->req_varyopt;
8306
0
          }
8307
2.97k
        }
8308
8
      else firstcuflags = reqcuflags = REQ_NONE;
8309
2.98k
      }
8310
8311
    /* firstcu was previously set; we can set reqcu only if the length is
8312
    1 or the matching is caseful. */
8313
8314
110k
    else
8315
110k
      {
8316
110k
      zerofirstcu = firstcu;
8317
110k
      zerofirstcuflags = firstcuflags;
8318
110k
      zeroreqcu = reqcu;
8319
110k
      zeroreqcuflags = reqcuflags;
8320
110k
      if (mclength == 1 || req_caseopt == 0)
8321
110k
        {
8322
110k
        reqcu = code[-1];
8323
110k
        reqcuflags = req_caseopt | cb->req_varyopt;
8324
110k
        }
8325
110k
      }
8326
8327
    /* If caselessness was temporarily instated, reset it. */
8328
8329
113k
    if (reset_caseful)
8330
0
      {
8331
0
      options &= ~PCRE2_CASELESS;
8332
0
      req_caseopt = 0;
8333
0
      reset_caseful = FALSE;
8334
0
      }
8335
8336
113k
    break;    /* End literal character handling */
8337
140k
    }         /* End of big switch */
8338
140k
  }           /* End of big loop */
8339
8340
/* Control never reaches here. */
8341
6.59k
}
8342
8343
8344
8345
/*************************************************
8346
*   Compile regex: a sequence of alternatives    *
8347
*************************************************/
8348
8349
/* On entry, pptr is pointing past the bracket meta, but on return it points to
8350
the closing bracket or META_END. The code variable is pointing at the code unit
8351
into which the BRA operator has been stored. This function is used during the
8352
pre-compile phase when we are trying to find out the amount of memory needed,
8353
as well as during the real compile phase. The value of lengthptr distinguishes
8354
the two phases.
8355
8356
Arguments:
8357
  options           option bits, including any changes for this subpattern
8358
  xoptions          extra option bits, ditto
8359
  codeptr           -> the address of the current code pointer
8360
  pptrptr           -> the address of the current parsed pattern pointer
8361
  errorcodeptr      -> pointer to error code variable
8362
  skipunits         skip this many code units at start (for brackets and OP_COND)
8363
  firstcuptr        place to put the first required code unit
8364
  firstcuflagsptr   place to put the first code unit flags
8365
  reqcuptr          place to put the last required code unit
8366
  reqcuflagsptr     place to put the last required code unit flags
8367
  bcptr             pointer to the chain of currently open branches
8368
  cb                points to the data block with tables pointers etc.
8369
  lengthptr         NULL during the real compile phase
8370
                    points to length accumulator during pre-compile phase
8371
8372
Returns:            0 There has been an error
8373
                   +1 Success, this group must match at least one character
8374
                   -1 Success, this group may match an empty string
8375
*/
8376
8377
static int
8378
compile_regex(uint32_t options, uint32_t xoptions, PCRE2_UCHAR **codeptr,
8379
  uint32_t **pptrptr, int *errorcodeptr, uint32_t skipunits,
8380
  uint32_t *firstcuptr, uint32_t *firstcuflagsptr, uint32_t *reqcuptr,
8381
  uint32_t *reqcuflagsptr, branch_chain *bcptr, open_capitem *open_caps,
8382
  compile_block *cb, PCRE2_SIZE *lengthptr)
8383
2.33k
{
8384
2.33k
PCRE2_UCHAR *code = *codeptr;
8385
2.33k
PCRE2_UCHAR *last_branch = code;
8386
2.33k
PCRE2_UCHAR *start_bracket = code;
8387
2.33k
BOOL lookbehind;
8388
2.33k
open_capitem capitem;
8389
2.33k
int capnumber = 0;
8390
2.33k
int okreturn = 1;
8391
2.33k
uint32_t *pptr = *pptrptr;
8392
2.33k
uint32_t firstcu, reqcu;
8393
2.33k
uint32_t lookbehindlength;
8394
2.33k
uint32_t lookbehindminlength;
8395
2.33k
uint32_t firstcuflags, reqcuflags;
8396
2.33k
uint32_t branchfirstcu, branchreqcu;
8397
2.33k
uint32_t branchfirstcuflags, branchreqcuflags;
8398
2.33k
PCRE2_SIZE length;
8399
2.33k
branch_chain bc;
8400
8401
/* If set, call the external function that checks for stack availability. */
8402
8403
2.33k
if (cb->cx->stack_guard != NULL &&
8404
0
    cb->cx->stack_guard(cb->parens_depth, cb->cx->stack_guard_data))
8405
0
  {
8406
0
  *errorcodeptr= ERR33;
8407
0
  return 0;
8408
0
  }
8409
8410
/* Miscellaneous initialization */
8411
8412
2.33k
bc.outer = bcptr;
8413
2.33k
bc.current_branch = code;
8414
8415
2.33k
firstcu = reqcu = 0;
8416
2.33k
firstcuflags = reqcuflags = REQ_UNSET;
8417
8418
/* Accumulate the length for use in the pre-compile phase. Start with the
8419
length of the BRA and KET and any extra code units that are required at the
8420
beginning. We accumulate in a local variable to save frequent testing of
8421
lengthptr for NULL. We cannot do this by looking at the value of 'code' at the
8422
start and end of each alternative, because compiled items are discarded during
8423
the pre-compile phase so that the workspace is not exceeded. */
8424
8425
2.33k
length = 2 + 2*LINK_SIZE + skipunits;
8426
8427
/* Remember if this is a lookbehind assertion, and if it is, save its length
8428
and skip over the pattern offset. */
8429
8430
2.33k
lookbehind = *code == OP_ASSERTBACK ||
8431
2.33k
             *code == OP_ASSERTBACK_NOT ||
8432
2.33k
             *code == OP_ASSERTBACK_NA;
8433
8434
2.33k
if (lookbehind)
8435
2
  {
8436
2
  lookbehindlength = META_DATA(pptr[-1]);
8437
2
  lookbehindminlength = *pptr;
8438
2
  pptr += SIZEOFFSET;
8439
2
  }
8440
2.33k
else lookbehindlength = lookbehindminlength = 0;
8441
8442
/* If this is a capturing subpattern, add to the chain of open capturing items
8443
so that we can detect them if (*ACCEPT) is encountered. Note that only OP_CBRA
8444
need be tested here; changing this opcode to one of its variants, e.g.
8445
OP_SCBRAPOS, happens later, after the group has been compiled. */
8446
8447
2.33k
if (*code == OP_CBRA)
8448
1.60k
  {
8449
1.60k
  capnumber = GET2(code, 1 + LINK_SIZE);
8450
1.60k
  capitem.number = capnumber;
8451
1.60k
  capitem.next = open_caps;
8452
1.60k
  capitem.assert_depth = cb->assert_depth;
8453
1.60k
  open_caps = &capitem;
8454
1.60k
  }
8455
8456
/* Offset is set zero to mark that this bracket is still open */
8457
8458
2.33k
PUT(code, 1, 0);
8459
2.33k
code += 1 + LINK_SIZE + skipunits;
8460
8461
/* Loop for each alternative branch */
8462
8463
2.33k
for (;;)
8464
6.59k
  {
8465
6.59k
  int branch_return;
8466
8467
  /* Insert OP_REVERSE or OP_VREVERSE if this is a lookbehind assertion. There
8468
  is only a single mimimum length for the whole assertion. When the mimimum
8469
  length is LOOKBEHIND_MAX it means that all branches are of fixed length,
8470
  though not necessarily the same length. In this case, the original OP_REVERSE
8471
  can be used. It can also be used if a branch in a variable length lookbehind
8472
  has the same maximum and minimum. Otherwise, use OP_VREVERSE, which has both
8473
  maximum and minimum values. */
8474
8475
6.59k
  if (lookbehind && lookbehindlength > 0)
8476
2
    {
8477
2
    if (lookbehindminlength == LOOKBEHIND_MAX ||
8478
2
        lookbehindminlength == lookbehindlength)
8479
0
      {
8480
0
      *code++ = OP_REVERSE;
8481
0
      PUT2INC(code, 0, lookbehindlength);
8482
0
      length += 1 + IMM2_SIZE;
8483
0
      }
8484
2
    else
8485
2
      {
8486
2
      *code++ = OP_VREVERSE;
8487
2
      PUT2INC(code, 0, lookbehindminlength);
8488
2
      PUT2INC(code, 0, lookbehindlength);
8489
2
      length += 1 + 2*IMM2_SIZE;
8490
2
      }
8491
2
    }
8492
8493
  /* Now compile the branch; in the pre-compile phase its length gets added
8494
  into the length. */
8495
8496
6.59k
  if ((branch_return =
8497
6.59k
        compile_branch(&options, &xoptions, &code, &pptr, errorcodeptr,
8498
6.59k
          &branchfirstcu, &branchfirstcuflags, &branchreqcu, &branchreqcuflags,
8499
6.59k
          &bc, open_caps, cb, (lengthptr == NULL)? NULL : &length)) == 0)
8500
20
    return 0;
8501
8502
  /* If a branch can match an empty string, so can the whole group. */
8503
8504
6.57k
  if (branch_return < 0) okreturn = -1;
8505
8506
  /* In the real compile phase, there is some post-processing to be done. */
8507
8508
6.57k
  if (lengthptr == NULL)
8509
3.10k
    {
8510
    /* If this is the first branch, the firstcu and reqcu values for the
8511
    branch become the values for the regex. */
8512
8513
3.10k
    if (*last_branch != OP_ALT)
8514
1.06k
      {
8515
1.06k
      firstcu = branchfirstcu;
8516
1.06k
      firstcuflags = branchfirstcuflags;
8517
1.06k
      reqcu = branchreqcu;
8518
1.06k
      reqcuflags = branchreqcuflags;
8519
1.06k
      }
8520
8521
    /* If this is not the first branch, the first char and reqcu have to
8522
    match the values from all the previous branches, except that if the
8523
    previous value for reqcu didn't have REQ_VARY set, it can still match,
8524
    and we set REQ_VARY for the group from this branch's value. */
8525
8526
2.04k
    else
8527
2.04k
      {
8528
      /* If we previously had a firstcu, but it doesn't match the new branch,
8529
      we have to abandon the firstcu for the regex, but if there was
8530
      previously no reqcu, it takes on the value of the old firstcu. */
8531
8532
2.04k
      if (firstcuflags != branchfirstcuflags || firstcu != branchfirstcu)
8533
1.95k
        {
8534
1.95k
        if (firstcuflags < REQ_NONE)
8535
164
          {
8536
164
          if (reqcuflags >= REQ_NONE)
8537
4
            {
8538
4
            reqcu = firstcu;
8539
4
            reqcuflags = firstcuflags;
8540
4
            }
8541
164
          }
8542
1.95k
        firstcuflags = REQ_NONE;
8543
1.95k
        }
8544
8545
      /* If we (now or from before) have no firstcu, a firstcu from the
8546
      branch becomes a reqcu if there isn't a branch reqcu. */
8547
8548
2.04k
      if (firstcuflags >= REQ_NONE && branchfirstcuflags < REQ_NONE &&
8549
978
          branchreqcuflags >= REQ_NONE)
8550
19
        {
8551
19
        branchreqcu = branchfirstcu;
8552
19
        branchreqcuflags = branchfirstcuflags;
8553
19
        }
8554
8555
      /* Now ensure that the reqcus match */
8556
8557
2.04k
      if (((reqcuflags & ~REQ_VARY) != (branchreqcuflags & ~REQ_VARY)) ||
8558
350
          reqcu != branchreqcu)
8559
1.89k
        reqcuflags = REQ_NONE;
8560
148
      else
8561
148
        {
8562
148
        reqcu = branchreqcu;
8563
148
        reqcuflags |= branchreqcuflags; /* To "or" REQ_VARY if present */
8564
148
        }
8565
2.04k
      }
8566
3.10k
    }
8567
8568
  /* Handle reaching the end of the expression, either ')' or end of pattern.
8569
  In the real compile phase, go back through the alternative branches and
8570
  reverse the chain of offsets, with the field in the BRA item now becoming an
8571
  offset to the first alternative. If there are no alternatives, it points to
8572
  the end of the group. The length in the terminating ket is always the length
8573
  of the whole bracketed item. Return leaving the pointer at the terminating
8574
  char. */
8575
8576
6.57k
  if (META_CODE(*pptr) != META_ALT)
8577
2.31k
    {
8578
2.31k
    if (lengthptr == NULL)
8579
1.06k
      {
8580
1.06k
      PCRE2_SIZE branch_length = code - last_branch;
8581
1.06k
      do
8582
3.10k
        {
8583
3.10k
        PCRE2_SIZE prev_length = GET(last_branch, 1);
8584
3.10k
        PUT(last_branch, 1, branch_length);
8585
3.10k
        branch_length = prev_length;
8586
3.10k
        last_branch -= branch_length;
8587
3.10k
        }
8588
3.10k
      while (branch_length > 0);
8589
1.06k
      }
8590
8591
    /* Fill in the ket */
8592
8593
2.31k
    *code = OP_KET;
8594
2.31k
    PUT(code, 1, (int)(code - start_bracket));
8595
2.31k
    code += 1 + LINK_SIZE;
8596
8597
    /* Set values to pass back */
8598
8599
2.31k
    *codeptr = code;
8600
2.31k
    *pptrptr = pptr;
8601
2.31k
    *firstcuptr = firstcu;
8602
2.31k
    *firstcuflagsptr = firstcuflags;
8603
2.31k
    *reqcuptr = reqcu;
8604
2.31k
    *reqcuflagsptr = reqcuflags;
8605
2.31k
    if (lengthptr != NULL)
8606
1.25k
      {
8607
1.25k
      if (OFLOW_MAX - *lengthptr < length)
8608
0
        {
8609
0
        *errorcodeptr = ERR20;
8610
0
        return 0;
8611
0
        }
8612
1.25k
      *lengthptr += length;
8613
1.25k
      }
8614
2.31k
    return okreturn;
8615
2.31k
    }
8616
8617
  /* Another branch follows. In the pre-compile phase, we can move the code
8618
  pointer back to where it was for the start of the first branch. (That is,
8619
  pretend that each branch is the only one.)
8620
8621
  In the real compile phase, insert an ALT node. Its length field points back
8622
  to the previous branch while the bracket remains open. At the end the chain
8623
  is reversed. It's done like this so that the start of the bracket has a
8624
  zero offset until it is closed, making it possible to detect recursion. */
8625
8626
4.25k
  if (lengthptr != NULL)
8627
2.20k
    {
8628
2.20k
    code = *codeptr + 1 + LINK_SIZE + skipunits;
8629
2.20k
    length += 1 + LINK_SIZE;
8630
2.20k
    }
8631
2.04k
  else
8632
2.04k
    {
8633
2.04k
    *code = OP_ALT;
8634
2.04k
    PUT(code, 1, (int)(code - last_branch));
8635
2.04k
    bc.current_branch = last_branch = code;
8636
2.04k
    code += 1 + LINK_SIZE;
8637
2.04k
    }
8638
8639
  /* Set the maximum lookbehind length for the next branch (if not in a
8640
  lookbehind the value will be zero) and then advance past the vertical bar. */
8641
8642
4.25k
  lookbehindlength = META_DATA(*pptr);
8643
4.25k
  pptr++;
8644
4.25k
  }
8645
/* Control never reaches here */
8646
2.33k
}
8647
8648
8649
8650
/*************************************************
8651
*          Check for anchored pattern            *
8652
*************************************************/
8653
8654
/* Try to find out if this is an anchored regular expression. Consider each
8655
alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
8656
all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
8657
it's anchored. However, if this is a multiline pattern, then only OP_SOD will
8658
be found, because ^ generates OP_CIRCM in that mode.
8659
8660
We can also consider a regex to be anchored if OP_SOM starts all its branches.
8661
This is the code for \G, which means "match at start of match position, taking
8662
into account the match offset".
8663
8664
A branch is also implicitly anchored if it starts with .* and DOTALL is set,
8665
because that will try the rest of the pattern at all possible matching points,
8666
so there is no point trying again.... er ....
8667
8668
.... except when the .* appears inside capturing parentheses, and there is a
8669
subsequent back reference to those parentheses. We haven't enough information
8670
to catch that case precisely.
8671
8672
At first, the best we could do was to detect when .* was in capturing brackets
8673
and the highest back reference was greater than or equal to that level.
8674
However, by keeping a bitmap of the first 31 back references, we can catch some
8675
of the more common cases more precisely.
8676
8677
... A second exception is when the .* appears inside an atomic group, because
8678
this prevents the number of characters it matches from being adjusted.
8679
8680
Arguments:
8681
  code           points to start of the compiled pattern
8682
  bracket_map    a bitmap of which brackets we are inside while testing; this
8683
                   handles up to substring 31; after that we just have to take
8684
                   the less precise approach
8685
  cb             points to the compile data block
8686
  atomcount      atomic group level
8687
  inassert       TRUE if in an assertion
8688
8689
Returns:     TRUE or FALSE
8690
*/
8691
8692
static BOOL
8693
is_anchored(PCRE2_SPTR code, uint32_t bracket_map, compile_block *cb,
8694
  int atomcount, BOOL inassert)
8695
348
{
8696
349
do {
8697
349
   PCRE2_SPTR scode = first_significant_code(
8698
349
     code + PRIV(OP_lengths)[*code], FALSE);
8699
349
   int op = *scode;
8700
8701
   /* Non-capturing brackets */
8702
8703
349
   if (op == OP_BRA  || op == OP_BRAPOS ||
8704
349
       op == OP_SBRA || op == OP_SBRAPOS)
8705
0
     {
8706
0
     if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
8707
0
       return FALSE;
8708
0
     }
8709
8710
   /* Capturing brackets */
8711
8712
349
   else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8713
348
            op == OP_SCBRA || op == OP_SCBRAPOS)
8714
1
     {
8715
1
     int n = GET2(scode, 1+LINK_SIZE);
8716
1
     uint32_t new_map = bracket_map | ((n < 32)? (1u << n) : 1);
8717
1
     if (!is_anchored(scode, new_map, cb, atomcount, inassert)) return FALSE;
8718
1
     }
8719
8720
   /* Positive forward assertion */
8721
8722
348
   else if (op == OP_ASSERT || op == OP_ASSERT_NA)
8723
0
     {
8724
0
     if (!is_anchored(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
8725
0
     }
8726
8727
   /* Condition. If there is no second branch, it can't be anchored. */
8728
8729
348
   else if (op == OP_COND || op == OP_SCOND)
8730
0
     {
8731
0
     if (scode[GET(scode,1)] != OP_ALT) return FALSE;
8732
0
     if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
8733
0
       return FALSE;
8734
0
     }
8735
8736
   /* Atomic groups */
8737
8738
348
   else if (op == OP_ONCE)
8739
0
     {
8740
0
     if (!is_anchored(scode, bracket_map, cb, atomcount + 1, inassert))
8741
0
       return FALSE;
8742
0
     }
8743
8744
   /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
8745
   it isn't in brackets that are or may be referenced or inside an atomic
8746
   group or an assertion. Also the pattern must not contain *PRUNE or *SKIP,
8747
   because these break the feature. Consider, for example, /(?s).*?(*PRUNE)b/
8748
   with the subject "aab", which matches "b", i.e. not at the start of a line.
8749
   There is also an option that disables auto-anchoring. */
8750
8751
348
   else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
8752
345
             op == OP_TYPEPOSSTAR))
8753
4
     {
8754
4
     if (scode[1] != OP_ALLANY || (bracket_map & cb->backref_map) != 0 ||
8755
0
         atomcount > 0 || cb->had_pruneorskip || inassert ||
8756
0
         (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
8757
4
       return FALSE;
8758
4
     }
8759
8760
   /* Check for explicit anchoring */
8761
8762
344
   else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
8763
8764
1
   code += GET(code, 1);
8765
1
   }
8766
348
while (*code == OP_ALT);   /* Loop for each alternative */
8767
0
return TRUE;
8768
348
}
8769
8770
8771
8772
/*************************************************
8773
*         Check for starting with ^ or .*        *
8774
*************************************************/
8775
8776
/* This is called to find out if every branch starts with ^ or .* so that
8777
"first char" processing can be done to speed things up in multiline
8778
matching and for non-DOTALL patterns that start with .* (which must start at
8779
the beginning or after \n). As in the case of is_anchored() (see above), we
8780
have to take account of back references to capturing brackets that contain .*
8781
because in that case we can't make the assumption. Also, the appearance of .*
8782
inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE
8783
or *SKIP does not count, because once again the assumption no longer holds.
8784
8785
Arguments:
8786
  code           points to start of the compiled pattern or a group
8787
  bracket_map    a bitmap of which brackets we are inside while testing; this
8788
                   handles up to substring 31; after that we just have to take
8789
                   the less precise approach
8790
  cb             points to the compile data
8791
  atomcount      atomic group level
8792
  inassert       TRUE if in an assertion
8793
8794
Returns:         TRUE or FALSE
8795
*/
8796
8797
static BOOL
8798
is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
8799
  int atomcount, BOOL inassert)
8800
292
{
8801
293
do {
8802
293
   PCRE2_SPTR scode = first_significant_code(
8803
293
     code + PRIV(OP_lengths)[*code], FALSE);
8804
293
   int op = *scode;
8805
8806
   /* If we are at the start of a conditional assertion group, *both* the
8807
   conditional assertion *and* what follows the condition must satisfy the test
8808
   for start of line. Other kinds of condition fail. Note that there may be an
8809
   auto-callout at the start of a condition. */
8810
8811
293
   if (op == OP_COND)
8812
0
     {
8813
0
     scode += 1 + LINK_SIZE;
8814
8815
0
     if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
8816
0
       else if (*scode == OP_CALLOUT_STR) scode += GET(scode, 1 + 2*LINK_SIZE);
8817
8818
0
     switch (*scode)
8819
0
       {
8820
0
       case OP_CREF:
8821
0
       case OP_DNCREF:
8822
0
       case OP_RREF:
8823
0
       case OP_DNRREF:
8824
0
       case OP_FAIL:
8825
0
       case OP_FALSE:
8826
0
       case OP_TRUE:
8827
0
       return FALSE;
8828
8829
0
       default:     /* Assertion */
8830
0
       if (!is_startline(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
8831
0
       do scode += GET(scode, 1); while (*scode == OP_ALT);
8832
0
       scode += 1 + LINK_SIZE;
8833
0
       break;
8834
0
       }
8835
0
     scode = first_significant_code(scode, FALSE);
8836
0
     op = *scode;
8837
0
     }
8838
8839
   /* Non-capturing brackets */
8840
8841
293
   if (op == OP_BRA  || op == OP_BRAPOS ||
8842
293
       op == OP_SBRA || op == OP_SBRAPOS)
8843
0
     {
8844
0
     if (!is_startline(scode, bracket_map, cb, atomcount, inassert))
8845
0
       return FALSE;
8846
0
     }
8847
8848
   /* Capturing brackets */
8849
8850
293
   else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8851
292
            op == OP_SCBRA || op == OP_SCBRAPOS)
8852
1
     {
8853
1
     int n = GET2(scode, 1+LINK_SIZE);
8854
1
     unsigned int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
8855
1
     if (!is_startline(scode, new_map, cb, atomcount, inassert)) return FALSE;
8856
1
     }
8857
8858
   /* Positive forward assertions */
8859
8860
292
   else if (op == OP_ASSERT || op == OP_ASSERT_NA)
8861
0
     {
8862
0
     if (!is_startline(scode, bracket_map, cb, atomcount, TRUE))
8863
0
       return FALSE;
8864
0
     }
8865
8866
   /* Atomic brackets */
8867
8868
292
   else if (op == OP_ONCE)
8869
0
     {
8870
0
     if (!is_startline(scode, bracket_map, cb, atomcount + 1, inassert))
8871
0
       return FALSE;
8872
0
     }
8873
8874
   /* .* means "start at start or after \n" if it isn't in atomic brackets or
8875
   brackets that may be referenced or an assertion, and as long as the pattern
8876
   does not contain *PRUNE or *SKIP, because these break the feature. Consider,
8877
   for example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab",
8878
   i.e. not at the start of a line. There is also an option that disables this
8879
   optimization. */
8880
8881
292
   else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
8882
4
     {
8883
4
     if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 ||
8884
0
         atomcount > 0 || cb->had_pruneorskip || inassert ||
8885
0
         (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
8886
4
       return FALSE;
8887
4
     }
8888
8889
   /* Check for explicit circumflex; anything else gives a FALSE result. Note
8890
   in particular that this includes atomic brackets OP_ONCE because the number
8891
   of characters matched by .* cannot be adjusted inside them. */
8892
8893
288
   else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
8894
8895
   /* Move on to the next alternative */
8896
8897
2
   code += GET(code, 1);
8898
2
   }
8899
292
while (*code == OP_ALT);  /* Loop for each alternative */
8900
1
return TRUE;
8901
292
}
8902
8903
8904
8905
/*************************************************
8906
*   Scan compiled regex for recursion reference  *
8907
*************************************************/
8908
8909
/* This function scans through a compiled pattern until it finds an instance of
8910
OP_RECURSE.
8911
8912
Arguments:
8913
  code        points to start of expression
8914
  utf         TRUE in UTF mode
8915
8916
Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
8917
*/
8918
8919
static PCRE2_SPTR
8920
find_recurse(PCRE2_SPTR code, BOOL utf)
8921
0
{
8922
0
for (;;)
8923
0
  {
8924
0
  PCRE2_UCHAR c = *code;
8925
0
  if (c == OP_END) return NULL;
8926
0
  if (c == OP_RECURSE) return code;
8927
8928
  /* XCLASS is used for classes that cannot be represented just by a bit map.
8929
  This includes negated single high-valued characters. CALLOUT_STR is used for
8930
  callouts with string arguments. In both cases the length in the table is
8931
  zero; the actual length is stored in the compiled code. */
8932
8933
0
  if (c == OP_XCLASS) code += GET(code, 1);
8934
0
    else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE);
8935
8936
  /* Otherwise, we can get the item's length from the table, except that for
8937
  repeated character types, we have to test for \p and \P, which have an extra
8938
  two code units of parameters, and for MARK/PRUNE/SKIP/THEN with an argument,
8939
  we must add in its length. */
8940
8941
0
  else
8942
0
    {
8943
0
    switch(c)
8944
0
      {
8945
0
      case OP_TYPESTAR:
8946
0
      case OP_TYPEMINSTAR:
8947
0
      case OP_TYPEPLUS:
8948
0
      case OP_TYPEMINPLUS:
8949
0
      case OP_TYPEQUERY:
8950
0
      case OP_TYPEMINQUERY:
8951
0
      case OP_TYPEPOSSTAR:
8952
0
      case OP_TYPEPOSPLUS:
8953
0
      case OP_TYPEPOSQUERY:
8954
0
      if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
8955
0
      break;
8956
8957
0
      case OP_TYPEPOSUPTO:
8958
0
      case OP_TYPEUPTO:
8959
0
      case OP_TYPEMINUPTO:
8960
0
      case OP_TYPEEXACT:
8961
0
      if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
8962
0
        code += 2;
8963
0
      break;
8964
8965
0
      case OP_MARK:
8966
0
      case OP_COMMIT_ARG:
8967
0
      case OP_PRUNE_ARG:
8968
0
      case OP_SKIP_ARG:
8969
0
      case OP_THEN_ARG:
8970
0
      code += code[1];
8971
0
      break;
8972
0
      }
8973
8974
    /* Add in the fixed length from the table */
8975
8976
0
    code += PRIV(OP_lengths)[c];
8977
8978
    /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may
8979
    be followed by a multi-unit character. The length in the table is a
8980
    minimum, so we have to arrange to skip the extra units. */
8981
8982
0
#ifdef MAYBE_UTF_MULTI
8983
0
    if (utf) switch(c)
8984
0
      {
8985
0
      case OP_CHAR:
8986
0
      case OP_CHARI:
8987
0
      case OP_NOT:
8988
0
      case OP_NOTI:
8989
0
      case OP_EXACT:
8990
0
      case OP_EXACTI:
8991
0
      case OP_NOTEXACT:
8992
0
      case OP_NOTEXACTI:
8993
0
      case OP_UPTO:
8994
0
      case OP_UPTOI:
8995
0
      case OP_NOTUPTO:
8996
0
      case OP_NOTUPTOI:
8997
0
      case OP_MINUPTO:
8998
0
      case OP_MINUPTOI:
8999
0
      case OP_NOTMINUPTO:
9000
0
      case OP_NOTMINUPTOI:
9001
0
      case OP_POSUPTO:
9002
0
      case OP_POSUPTOI:
9003
0
      case OP_NOTPOSUPTO:
9004
0
      case OP_NOTPOSUPTOI:
9005
0
      case OP_STAR:
9006
0
      case OP_STARI:
9007
0
      case OP_NOTSTAR:
9008
0
      case OP_NOTSTARI:
9009
0
      case OP_MINSTAR:
9010
0
      case OP_MINSTARI:
9011
0
      case OP_NOTMINSTAR:
9012
0
      case OP_NOTMINSTARI:
9013
0
      case OP_POSSTAR:
9014
0
      case OP_POSSTARI:
9015
0
      case OP_NOTPOSSTAR:
9016
0
      case OP_NOTPOSSTARI:
9017
0
      case OP_PLUS:
9018
0
      case OP_PLUSI:
9019
0
      case OP_NOTPLUS:
9020
0
      case OP_NOTPLUSI:
9021
0
      case OP_MINPLUS:
9022
0
      case OP_MINPLUSI:
9023
0
      case OP_NOTMINPLUS:
9024
0
      case OP_NOTMINPLUSI:
9025
0
      case OP_POSPLUS:
9026
0
      case OP_POSPLUSI:
9027
0
      case OP_NOTPOSPLUS:
9028
0
      case OP_NOTPOSPLUSI:
9029
0
      case OP_QUERY:
9030
0
      case OP_QUERYI:
9031
0
      case OP_NOTQUERY:
9032
0
      case OP_NOTQUERYI:
9033
0
      case OP_MINQUERY:
9034
0
      case OP_MINQUERYI:
9035
0
      case OP_NOTMINQUERY:
9036
0
      case OP_NOTMINQUERYI:
9037
0
      case OP_POSQUERY:
9038
0
      case OP_POSQUERYI:
9039
0
      case OP_NOTPOSQUERY:
9040
0
      case OP_NOTPOSQUERYI:
9041
0
      if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
9042
0
      break;
9043
0
      }
9044
#else
9045
    (void)(utf);  /* Keep compiler happy by referencing function argument */
9046
#endif  /* MAYBE_UTF_MULTI */
9047
0
    }
9048
0
  }
9049
0
}
9050
9051
9052
9053
/*************************************************
9054
*    Check for asserted fixed first code unit    *
9055
*************************************************/
9056
9057
/* During compilation, the "first code unit" settings from forward assertions
9058
are discarded, because they can cause conflicts with actual literals that
9059
follow. However, if we end up without a first code unit setting for an
9060
unanchored pattern, it is worth scanning the regex to see if there is an
9061
initial asserted first code unit. If all branches start with the same asserted
9062
code unit, or with a non-conditional bracket all of whose alternatives start
9063
with the same asserted code unit (recurse ad lib), then we return that code
9064
unit, with the flags set to zero or REQ_CASELESS; otherwise return zero with
9065
REQ_NONE in the flags.
9066
9067
Arguments:
9068
  code       points to start of compiled pattern
9069
  flags      points to the first code unit flags
9070
  inassert   non-zero if in an assertion
9071
9072
Returns:     the fixed first code unit, or 0 with REQ_NONE in flags
9073
*/
9074
9075
static uint32_t
9076
find_firstassertedcu(PCRE2_SPTR code, uint32_t *flags, uint32_t inassert)
9077
292
{
9078
292
uint32_t c = 0;
9079
292
uint32_t cflags = REQ_NONE;
9080
9081
292
*flags = REQ_NONE;
9082
292
do {
9083
292
   uint32_t d;
9084
292
   uint32_t dflags;
9085
292
   int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
9086
291
             *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
9087
292
   PCRE2_SPTR scode = first_significant_code(code + 1+LINK_SIZE + xl, TRUE);
9088
292
   PCRE2_UCHAR op = *scode;
9089
9090
292
   switch(op)
9091
292
     {
9092
142
     default:
9093
142
     return 0;
9094
9095
0
     case OP_BRA:
9096
0
     case OP_BRAPOS:
9097
1
     case OP_CBRA:
9098
1
     case OP_SCBRA:
9099
1
     case OP_CBRAPOS:
9100
1
     case OP_SCBRAPOS:
9101
1
     case OP_ASSERT:
9102
1
     case OP_ASSERT_NA:
9103
1
     case OP_ONCE:
9104
1
     case OP_SCRIPT_RUN:
9105
1
     d = find_firstassertedcu(scode, &dflags, inassert +
9106
1
       ((op == OP_ASSERT || op == OP_ASSERT_NA)?1:0));
9107
1
     if (dflags >= REQ_NONE) return 0;
9108
0
     if (cflags >= REQ_NONE) { c = d; cflags = dflags; }
9109
0
       else if (c != d || cflags != dflags) return 0;
9110
0
     break;
9111
9112
0
     case OP_EXACT:
9113
0
     scode += IMM2_SIZE;
9114
     /* Fall through */
9115
9116
67
     case OP_CHAR:
9117
67
     case OP_PLUS:
9118
68
     case OP_MINPLUS:
9119
68
     case OP_POSPLUS:
9120
68
     if (inassert == 0) return 0;
9121
0
     if (cflags >= REQ_NONE) { c = scode[1]; cflags = 0; }
9122
0
       else if (c != scode[1]) return 0;
9123
0
     break;
9124
9125
0
     case OP_EXACTI:
9126
0
     scode += IMM2_SIZE;
9127
     /* Fall through */
9128
9129
81
     case OP_CHARI:
9130
81
     case OP_PLUSI:
9131
81
     case OP_MINPLUSI:
9132
81
     case OP_POSPLUSI:
9133
81
     if (inassert == 0) return 0;
9134
9135
     /* If the character is more than one code unit long, we cannot set its
9136
     first code unit when matching caselessly. Later scanning may pick up
9137
     multiple code units. */
9138
9139
0
#ifdef SUPPORT_UNICODE
9140
0
#if PCRE2_CODE_UNIT_WIDTH == 8
9141
0
     if (scode[1] >= 0x80) return 0;
9142
#elif PCRE2_CODE_UNIT_WIDTH == 16
9143
     if (scode[1] >= 0xd800 && scode[1] <= 0xdfff) return 0;
9144
#endif
9145
0
#endif
9146
9147
0
     if (cflags >= REQ_NONE) { c = scode[1]; cflags = REQ_CASELESS; }
9148
0
       else if (c != scode[1]) return 0;
9149
0
     break;
9150
292
     }
9151
9152
0
   code += GET(code, 1);
9153
0
   }
9154
292
while (*code == OP_ALT);
9155
9156
0
*flags = cflags;
9157
0
return c;
9158
292
}
9159
9160
9161
9162
/*************************************************
9163
*     Add an entry to the name/number table      *
9164
*************************************************/
9165
9166
/* This function is called between compiling passes to add an entry to the
9167
name/number table, maintaining alphabetical order. Checking for permitted
9168
and forbidden duplicates has already been done.
9169
9170
Arguments:
9171
  cb           the compile data block
9172
  name         the name to add
9173
  length       the length of the name
9174
  groupno      the group number
9175
  tablecount   the count of names in the table so far
9176
9177
Returns:       nothing
9178
*/
9179
9180
static void
9181
add_name_to_table(compile_block *cb, PCRE2_SPTR name, int length,
9182
  unsigned int groupno, uint32_t tablecount)
9183
0
{
9184
0
uint32_t i;
9185
0
PCRE2_UCHAR *slot = cb->name_table;
9186
9187
0
for (i = 0; i < tablecount; i++)
9188
0
  {
9189
0
  int crc = memcmp(name, slot+IMM2_SIZE, CU2BYTES(length));
9190
0
  if (crc == 0 && slot[IMM2_SIZE+length] != 0)
9191
0
    crc = -1; /* Current name is a substring */
9192
9193
  /* Make space in the table and break the loop for an earlier name. For a
9194
  duplicate or later name, carry on. We do this for duplicates so that in the
9195
  simple case (when ?(| is not used) they are in order of their numbers. In all
9196
  cases they are in the order in which they appear in the pattern. */
9197
9198
0
  if (crc < 0)
9199
0
    {
9200
0
    (void)memmove(slot + cb->name_entry_size, slot,
9201
0
      CU2BYTES((tablecount - i) * cb->name_entry_size));
9202
0
    break;
9203
0
    }
9204
9205
  /* Continue the loop for a later or duplicate name */
9206
9207
0
  slot += cb->name_entry_size;
9208
0
  }
9209
9210
0
PUT2(slot, 0, groupno);
9211
0
memcpy(slot + IMM2_SIZE, name, CU2BYTES(length));
9212
9213
/* Add a terminating zero and fill the rest of the slot with zeroes so that
9214
the memory is all initialized. Otherwise valgrind moans about uninitialized
9215
memory when saving serialized compiled patterns. */
9216
9217
0
memset(slot + IMM2_SIZE + length, 0,
9218
0
  CU2BYTES(cb->name_entry_size - length - IMM2_SIZE));
9219
0
}
9220
9221
9222
9223
/*************************************************
9224
*             Skip in parsed pattern             *
9225
*************************************************/
9226
9227
/* This function is called to skip parts of the parsed pattern when finding the
9228
length of a lookbehind branch. It is called after (*ACCEPT) and (*FAIL) to find
9229
the end of the branch, it is called to skip over an internal lookaround or
9230
(DEFINE) group, and it is also called to skip to the end of a class, during
9231
which it will never encounter nested groups (but there's no need to have
9232
special code for that).
9233
9234
When called to find the end of a branch or group, pptr must point to the first
9235
meta code inside the branch, not the branch-starting code. In other cases it
9236
can point to the item that causes the function to be called.
9237
9238
Arguments:
9239
  pptr       current pointer to skip from
9240
  skiptype   PSKIP_CLASS when skipping to end of class
9241
             PSKIP_ALT when META_ALT ends the skip
9242
             PSKIP_KET when only META_KET ends the skip
9243
9244
Returns:     new value of pptr
9245
             NULL if META_END is reached - should never occur
9246
               or for an unknown meta value - likewise
9247
*/
9248
9249
static uint32_t *
9250
parsed_skip(uint32_t *pptr, uint32_t skiptype)
9251
0
{
9252
0
uint32_t nestlevel = 0;
9253
9254
0
for (;; pptr++)
9255
0
  {
9256
0
  uint32_t meta = META_CODE(*pptr);
9257
9258
0
  switch(meta)
9259
0
    {
9260
0
    default:  /* Just skip over most items */
9261
0
    if (meta < META_END) continue;  /* Literal */
9262
0
    break;
9263
9264
    /* This should never occur. */
9265
9266
0
    case META_END:
9267
0
    return NULL;
9268
9269
    /* The data for these items is variable in length. */
9270
9271
0
    case META_BACKREF:  /* Offset is present only if group >= 10 */
9272
0
    if (META_DATA(*pptr) >= 10) pptr += SIZEOFFSET;
9273
0
    break;
9274
9275
0
    case META_ESCAPE:   /* A few escapes are followed by data items. */
9276
0
    switch (META_DATA(*pptr))
9277
0
      {
9278
0
      case ESC_P:
9279
0
      case ESC_p:
9280
0
      pptr += 1;
9281
0
      break;
9282
9283
0
      case ESC_g:
9284
0
      case ESC_k:
9285
0
      pptr += 1 + SIZEOFFSET;
9286
0
      break;
9287
0
      }
9288
0
    break;
9289
9290
0
    case META_MARK:     /* Add the length of the name. */
9291
0
    case META_COMMIT_ARG:
9292
0
    case META_PRUNE_ARG:
9293
0
    case META_SKIP_ARG:
9294
0
    case META_THEN_ARG:
9295
0
    pptr += pptr[1];
9296
0
    break;
9297
9298
    /* These are the "active" items in this loop. */
9299
9300
0
    case META_CLASS_END:
9301
0
    if (skiptype == PSKIP_CLASS) return pptr;
9302
0
    break;
9303
9304
0
    case META_ATOMIC:
9305
0
    case META_CAPTURE:
9306
0
    case META_COND_ASSERT:
9307
0
    case META_COND_DEFINE:
9308
0
    case META_COND_NAME:
9309
0
    case META_COND_NUMBER:
9310
0
    case META_COND_RNAME:
9311
0
    case META_COND_RNUMBER:
9312
0
    case META_COND_VERSION:
9313
0
    case META_LOOKAHEAD:
9314
0
    case META_LOOKAHEADNOT:
9315
0
    case META_LOOKAHEAD_NA:
9316
0
    case META_LOOKBEHIND:
9317
0
    case META_LOOKBEHINDNOT:
9318
0
    case META_LOOKBEHIND_NA:
9319
0
    case META_NOCAPTURE:
9320
0
    case META_SCRIPT_RUN:
9321
0
    nestlevel++;
9322
0
    break;
9323
9324
0
    case META_ALT:
9325
0
    if (nestlevel == 0 && skiptype == PSKIP_ALT) return pptr;
9326
0
    break;
9327
9328
0
    case META_KET:
9329
0
    if (nestlevel == 0) return pptr;
9330
0
    nestlevel--;
9331
0
    break;
9332
0
    }
9333
9334
  /* The extra data item length for each meta is in a table. */
9335
9336
0
  meta = (meta >> 16) & 0x7fff;
9337
0
  if (meta >= sizeof(meta_extra_lengths)) return NULL;
9338
0
  pptr += meta_extra_lengths[meta];
9339
0
  }
9340
/* Control never reaches here */
9341
0
return pptr;
9342
0
}
9343
9344
9345
9346
/*************************************************
9347
*       Find length of a parsed group            *
9348
*************************************************/
9349
9350
/* This is called for nested groups within a branch of a lookbehind whose
9351
length is being computed. On entry, the pointer must be at the first element
9352
after the group initializing code. On exit it points to OP_KET. Caching is used
9353
to improve processing speed when the same capturing group occurs many times.
9354
9355
Arguments:
9356
  pptrptr     pointer to pointer in the parsed pattern
9357
  minptr      where to return the minimum length
9358
  isinline    FALSE if a reference or recursion; TRUE for inline group
9359
  errcodeptr  pointer to the errorcode
9360
  lcptr       pointer to the loop counter
9361
  group       number of captured group or -1 for a non-capturing group
9362
  recurses    chain of recurse_check to catch mutual recursion
9363
  cb          pointer to the compile data
9364
9365
Returns:      the maximum group length or a negative number
9366
*/
9367
9368
static int
9369
get_grouplength(uint32_t **pptrptr, int *minptr, BOOL isinline, int *errcodeptr,
9370
  int *lcptr, int group, parsed_recurse_check *recurses, compile_block *cb)
9371
0
{
9372
0
uint32_t *gi = cb->groupinfo + 2 * group;
9373
0
int branchlength, branchminlength;
9374
0
int grouplength = -1;
9375
0
int groupminlength = INT_MAX;
9376
9377
/* The cache can be used only if there is no possibility of there being two
9378
groups with the same number. We do not need to set the end pointer for a group
9379
that is being processed as a back reference or recursion, but we must do so for
9380
an inline group. */
9381
9382
0
if (group > 0 && (cb->external_flags & PCRE2_DUPCAPUSED) == 0)
9383
0
  {
9384
0
  uint32_t groupinfo = gi[0];
9385
0
  if ((groupinfo & GI_NOT_FIXED_LENGTH) != 0) return -1;
9386
0
  if ((groupinfo & GI_SET_FIXED_LENGTH) != 0)
9387
0
    {
9388
0
    if (isinline) *pptrptr = parsed_skip(*pptrptr, PSKIP_KET);
9389
0
    *minptr = gi[1];
9390
0
    return groupinfo & GI_FIXED_LENGTH_MASK;
9391
0
    }
9392
0
  }
9393
9394
/* Scan the group. In this case we find the end pointer of necessity. */
9395
9396
0
for(;;)
9397
0
  {
9398
0
  branchlength = get_branchlength(pptrptr, &branchminlength, errcodeptr, lcptr,
9399
0
    recurses, cb);
9400
0
  if (branchlength < 0) goto ISNOTFIXED;
9401
0
  if (branchlength > grouplength) grouplength = branchlength;
9402
0
  if (branchminlength < groupminlength) groupminlength = branchminlength;
9403
0
  if (**pptrptr == META_KET) break;
9404
0
  *pptrptr += 1;   /* Skip META_ALT */
9405
0
  }
9406
9407
0
if (group > 0)
9408
0
  {
9409
0
  gi[0] |= (uint32_t)(GI_SET_FIXED_LENGTH | grouplength);
9410
0
  gi[1] = groupminlength;
9411
0
  }
9412
9413
0
*minptr = groupminlength;
9414
0
return grouplength;
9415
9416
0
ISNOTFIXED:
9417
0
if (group > 0) gi[0] |= GI_NOT_FIXED_LENGTH;
9418
0
return -1;
9419
0
}
9420
9421
9422
9423
/*************************************************
9424
*        Find length of a parsed branch          *
9425
*************************************************/
9426
9427
/* Return fixed maximum and minimum lengths for a branch in a lookbehind,
9428
giving an error if the length is not limited. On entry, *pptrptr points to the
9429
first element inside the branch. On exit it is set to point to the ALT or KET.
9430
9431
Arguments:
9432
  pptrptr     pointer to pointer in the parsed pattern
9433
  minptr      where to return the minimum length
9434
  errcodeptr  pointer to error code
9435
  lcptr       pointer to loop counter
9436
  recurses    chain of recurse_check to catch mutual recursion
9437
  cb          pointer to compile block
9438
9439
Returns:      the maximum length, or a negative value on error
9440
*/
9441
9442
static int
9443
get_branchlength(uint32_t **pptrptr, int *minptr, int *errcodeptr, int *lcptr,
9444
  parsed_recurse_check *recurses, compile_block *cb)
9445
1
{
9446
1
int branchlength = 0;
9447
1
int branchminlength = 0;
9448
1
int grouplength, groupminlength;
9449
1
uint32_t lastitemlength = 0;
9450
1
uint32_t lastitemminlength = 0;
9451
1
uint32_t *pptr = *pptrptr;
9452
1
PCRE2_SIZE offset;
9453
1
parsed_recurse_check this_recurse;
9454
9455
/* A large and/or complex regex can take too long to process. This can happen
9456
more often when (?| groups are present in the pattern because their length
9457
cannot be cached. */
9458
9459
1
if ((*lcptr)++ > 2000)
9460
0
  {
9461
0
  *errcodeptr = ERR35;  /* Lookbehind is too complicated */
9462
0
  return -1;
9463
0
  }
9464
9465
/* Scan the branch, accumulating the length. */
9466
9467
9
for (;; pptr++)
9468
10
  {
9469
10
  parsed_recurse_check *r;
9470
10
  uint32_t *gptr, *gptrend;
9471
10
  uint32_t escape;
9472
10
  uint32_t group = 0;
9473
10
  uint32_t itemlength = 0;
9474
10
  uint32_t itemminlength = 0;
9475
10
  uint32_t min, max;
9476
9477
10
  if (*pptr < META_END)
9478
8
    {
9479
8
    itemlength = itemminlength = 1;
9480
8
    }
9481
9482
2
  else switch (META_CODE(*pptr))
9483
2
    {
9484
1
    case META_KET:
9485
1
    case META_ALT:
9486
1
    goto EXIT;
9487
9488
    /* (*ACCEPT) and (*FAIL) terminate the branch, but we must skip to the
9489
    actual termination. */
9490
9491
0
    case META_ACCEPT:
9492
0
    case META_FAIL:
9493
0
    pptr = parsed_skip(pptr, PSKIP_ALT);
9494
0
    if (pptr == NULL) goto PARSED_SKIP_FAILED;
9495
0
    goto EXIT;
9496
9497
0
    case META_MARK:
9498
0
    case META_COMMIT_ARG:
9499
0
    case META_PRUNE_ARG:
9500
0
    case META_SKIP_ARG:
9501
0
    case META_THEN_ARG:
9502
0
    pptr += pptr[1] + 1;
9503
0
    break;
9504
9505
0
    case META_CIRCUMFLEX:
9506
0
    case META_COMMIT:
9507
0
    case META_DOLLAR:
9508
0
    case META_PRUNE:
9509
0
    case META_SKIP:
9510
0
    case META_THEN:
9511
0
    break;
9512
9513
0
    case META_OPTIONS:
9514
0
    pptr += 2;
9515
0
    break;
9516
9517
0
    case META_BIGVALUE:
9518
0
    itemlength = itemminlength = 1;
9519
0
    pptr += 1;
9520
0
    break;
9521
9522
0
    case META_CLASS:
9523
0
    case META_CLASS_NOT:
9524
0
    itemlength = itemminlength = 1;
9525
0
    pptr = parsed_skip(pptr, PSKIP_CLASS);
9526
0
    if (pptr == NULL) goto PARSED_SKIP_FAILED;
9527
0
    break;
9528
9529
0
    case META_CLASS_EMPTY_NOT:
9530
0
    case META_DOT:
9531
0
    itemlength = itemminlength = 1;
9532
0
    break;
9533
9534
0
    case META_CALLOUT_NUMBER:
9535
0
    pptr += 3;
9536
0
    break;
9537
9538
0
    case META_CALLOUT_STRING:
9539
0
    pptr += 3 + SIZEOFFSET;
9540
0
    break;
9541
9542
    /* Only some escapes consume a character. Of those, \R can match one or two
9543
    characters, but \X is never allowed because it matches an unknown number of
9544
    characters. \C is allowed only in 32-bit and non-UTF 8/16-bit modes. */
9545
9546
0
    case META_ESCAPE:
9547
0
    escape = META_DATA(*pptr);
9548
0
    if (escape == ESC_X) return -1;
9549
0
    if (escape == ESC_R)
9550
0
      {
9551
0
      itemminlength = 1;
9552
0
      itemlength = 2;
9553
0
      }
9554
0
    else if (escape > ESC_b && escape < ESC_Z)
9555
0
      {
9556
0
#if PCRE2_CODE_UNIT_WIDTH != 32
9557
0
      if ((cb->external_options & PCRE2_UTF) != 0 && escape == ESC_C)
9558
0
        {
9559
0
        *errcodeptr = ERR36;
9560
0
        return -1;
9561
0
        }
9562
0
#endif
9563
0
      itemlength = itemminlength = 1;
9564
0
      if (escape == ESC_p || escape == ESC_P) pptr++;  /* Skip prop data */
9565
0
      }
9566
0
    break;
9567
9568
    /* Lookaheads do not contribute to the length of this branch, but they may
9569
    contain lookbehinds within them whose lengths need to be set. */
9570
9571
0
    case META_LOOKAHEAD:
9572
0
    case META_LOOKAHEADNOT:
9573
0
    case META_LOOKAHEAD_NA:
9574
0
    *errcodeptr = check_lookbehinds(pptr + 1, &pptr, recurses, cb, lcptr);
9575
0
    if (*errcodeptr != 0) return -1;
9576
9577
    /* Ignore any qualifiers that follow a lookahead assertion. */
9578
9579
0
    switch (pptr[1])
9580
0
      {
9581
0
      case META_ASTERISK:
9582
0
      case META_ASTERISK_PLUS:
9583
0
      case META_ASTERISK_QUERY:
9584
0
      case META_PLUS:
9585
0
      case META_PLUS_PLUS:
9586
0
      case META_PLUS_QUERY:
9587
0
      case META_QUERY:
9588
0
      case META_QUERY_PLUS:
9589
0
      case META_QUERY_QUERY:
9590
0
      pptr++;
9591
0
      break;
9592
9593
0
      case META_MINMAX:
9594
0
      case META_MINMAX_PLUS:
9595
0
      case META_MINMAX_QUERY:
9596
0
      pptr += 3;
9597
0
      break;
9598
9599
0
      default:
9600
0
      break;
9601
0
      }
9602
0
    break;
9603
9604
    /* A nested lookbehind does not contribute any length to this lookbehind,
9605
    but must itself be checked and have its lengths set. */
9606
9607
0
    case META_LOOKBEHIND:
9608
0
    case META_LOOKBEHINDNOT:
9609
0
    case META_LOOKBEHIND_NA:
9610
0
    if (!set_lookbehind_lengths(&pptr, errcodeptr, lcptr, recurses, cb))
9611
0
      return -1;
9612
0
    break;
9613
9614
    /* Back references and recursions are handled by very similar code. At this
9615
    stage, the names generated in the parsing pass are available, but the main
9616
    name table has not yet been created. So for the named varieties, scan the
9617
    list of names in order to get the number of the first one in the pattern,
9618
    and whether or not this name is duplicated. */
9619
9620
0
    case META_BACKREF_BYNAME:
9621
0
    if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0)
9622
0
      goto ISNOTFIXED;
9623
    /* Fall through */
9624
9625
0
    case META_RECURSE_BYNAME:
9626
0
      {
9627
0
      int i;
9628
0
      PCRE2_SPTR name;
9629
0
      BOOL is_dupname = FALSE;
9630
0
      named_group *ng = cb->named_groups;
9631
0
      uint32_t meta_code = META_CODE(*pptr);
9632
0
      uint32_t length = *(++pptr);
9633
9634
0
      GETPLUSOFFSET(offset, pptr);
9635
0
      name = cb->start_pattern + offset;
9636
0
      for (i = 0; i < cb->names_found; i++, ng++)
9637
0
        {
9638
0
        if (length == ng->length && PRIV(strncmp)(name, ng->name, length) == 0)
9639
0
          {
9640
0
          group = ng->number;
9641
0
          is_dupname = ng->isdup;
9642
0
          break;
9643
0
          }
9644
0
        }
9645
9646
0
      if (group == 0)
9647
0
        {
9648
0
        *errcodeptr = ERR15;  /* Non-existent subpattern */
9649
0
        cb->erroroffset = offset;
9650
0
        return -1;
9651
0
        }
9652
9653
      /* A numerical back reference can be fixed length if duplicate capturing
9654
      groups are not being used. A non-duplicate named back reference can also
9655
      be handled. */
9656
9657
0
      if (meta_code == META_RECURSE_BYNAME ||
9658
0
          (!is_dupname && (cb->external_flags & PCRE2_DUPCAPUSED) == 0))
9659
0
        goto RECURSE_OR_BACKREF_LENGTH;  /* Handle as a numbered version. */
9660
0
      }
9661
0
    goto ISNOTFIXED;                     /* Duplicate name or number */
9662
9663
    /* The offset values for back references < 10 are in a separate vector
9664
    because otherwise they would use more than two parsed pattern elements on
9665
    64-bit systems. */
9666
9667
0
    case META_BACKREF:
9668
0
    if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0 ||
9669
0
        (cb->external_flags & PCRE2_DUPCAPUSED) != 0)
9670
0
      goto ISNOTFIXED;
9671
0
    group = META_DATA(*pptr);
9672
0
    if (group < 10)
9673
0
      {
9674
0
      offset = cb->small_ref_offset[group];
9675
0
      goto RECURSE_OR_BACKREF_LENGTH;
9676
0
      }
9677
9678
    /* Fall through */
9679
    /* For groups >= 10 - picking up group twice does no harm. */
9680
9681
    /* A true recursion implies not fixed length, but a subroutine call may
9682
    be OK. Back reference "recursions" are also failed. */
9683
9684
0
    case META_RECURSE:
9685
0
    group = META_DATA(*pptr);
9686
0
    GETPLUSOFFSET(offset, pptr);
9687
9688
0
    RECURSE_OR_BACKREF_LENGTH:
9689
0
    if (group > cb->bracount)
9690
0
      {
9691
0
      cb->erroroffset = offset;
9692
0
      *errcodeptr = ERR15;  /* Non-existent subpattern */
9693
0
      return -1;
9694
0
      }
9695
0
    if (group == 0) goto ISNOTFIXED;  /* Local recursion */
9696
0
    for (gptr = cb->parsed_pattern; *gptr != META_END; gptr++)
9697
0
      {
9698
0
      if (META_CODE(*gptr) == META_BIGVALUE) gptr++;
9699
0
        else if (*gptr == (META_CAPTURE | group)) break;
9700
0
      }
9701
9702
    /* We must start the search for the end of the group at the first meta code
9703
    inside the group. Otherwise it will be treated as an enclosed group. */
9704
9705
0
    gptrend = parsed_skip(gptr + 1, PSKIP_KET);
9706
0
    if (gptrend == NULL) goto PARSED_SKIP_FAILED;
9707
0
    if (pptr > gptr && pptr < gptrend) goto ISNOTFIXED;  /* Local recursion */
9708
0
    for (r = recurses; r != NULL; r = r->prev) if (r->groupptr == gptr) break;
9709
0
    if (r != NULL) goto ISNOTFIXED;   /* Mutual recursion */
9710
0
    this_recurse.prev = recurses;
9711
0
    this_recurse.groupptr = gptr;
9712
9713
    /* We do not need to know the position of the end of the group, that is,
9714
    gptr is not used after the call to get_grouplength(). Setting the second
9715
    argument FALSE stops it scanning for the end when the length can be found
9716
    in the cache. */
9717
9718
0
    gptr++;
9719
0
    grouplength = get_grouplength(&gptr, &groupminlength, FALSE, errcodeptr,
9720
0
      lcptr, group, &this_recurse, cb);
9721
0
    if (grouplength < 0)
9722
0
      {
9723
0
      if (*errcodeptr == 0) goto ISNOTFIXED;
9724
0
      return -1;  /* Error already set */
9725
0
      }
9726
0
    itemlength = grouplength;
9727
0
    itemminlength = groupminlength;
9728
0
    break;
9729
9730
    /* A (DEFINE) group is never obeyed inline and so it does not contribute to
9731
    the length of this branch. Skip from the following item to the next
9732
    unpaired ket. */
9733
9734
0
    case META_COND_DEFINE:
9735
0
    pptr = parsed_skip(pptr + 1, PSKIP_KET);
9736
0
    break;
9737
9738
    /* Check other nested groups - advance past the initial data for each type
9739
    and then seek a fixed length with get_grouplength(). */
9740
9741
0
    case META_COND_NAME:
9742
0
    case META_COND_NUMBER:
9743
0
    case META_COND_RNAME:
9744
0
    case META_COND_RNUMBER:
9745
0
    pptr += 2 + SIZEOFFSET;
9746
0
    goto CHECK_GROUP;
9747
9748
0
    case META_COND_ASSERT:
9749
0
    pptr += 1;
9750
0
    goto CHECK_GROUP;
9751
9752
0
    case META_COND_VERSION:
9753
0
    pptr += 4;
9754
0
    goto CHECK_GROUP;
9755
9756
0
    case META_CAPTURE:
9757
0
    group = META_DATA(*pptr);
9758
    /* Fall through */
9759
9760
0
    case META_ATOMIC:
9761
0
    case META_NOCAPTURE:
9762
0
    case META_SCRIPT_RUN:
9763
0
    pptr++;
9764
0
    CHECK_GROUP:
9765
0
    grouplength = get_grouplength(&pptr, &groupminlength, TRUE, errcodeptr,
9766
0
      lcptr, group, recurses, cb);
9767
0
    if (grouplength < 0) return -1;
9768
0
    itemlength = grouplength;
9769
0
    itemminlength = groupminlength;
9770
0
    break;
9771
9772
1
    case META_QUERY:
9773
1
    case META_QUERY_PLUS:
9774
1
    case META_QUERY_QUERY:
9775
1
    min = 0;
9776
1
    max = 1;
9777
1
    goto REPETITION;
9778
9779
    /* Exact repetition is OK; variable repetition is not. A repetition of zero
9780
    must subtract the length that has already been added. */
9781
9782
0
    case META_MINMAX:
9783
0
    case META_MINMAX_PLUS:
9784
0
    case META_MINMAX_QUERY:
9785
0
    min = pptr[1];
9786
0
    max = pptr[2];
9787
0
    pptr += 2;
9788
9789
1
    REPETITION:
9790
1
    if (max != REPEAT_UNLIMITED)
9791
1
      {
9792
1
      if (lastitemlength != 0 &&  /* Should not occur, but just in case */
9793
1
          max != 0 &&
9794
1
          (INT_MAX - branchlength)/lastitemlength < max - 1)
9795
0
        {
9796
0
        *errcodeptr = ERR87;  /* Integer overflow; lookbehind too big */
9797
0
        return -1;
9798
0
        }
9799
1
      if (min == 0) branchminlength -= lastitemminlength;
9800
0
        else itemminlength = (min - 1) * lastitemminlength;
9801
1
      if (max == 0) branchlength -= lastitemlength;
9802
1
        else itemlength = (max - 1) * lastitemlength;
9803
1
      break;
9804
1
      }
9805
    /* Fall through */
9806
9807
    /* Any other item means this branch does not have a fixed length. */
9808
9809
0
    default:
9810
0
    ISNOTFIXED:
9811
0
    *errcodeptr = ERR25;   /* Not fixed length */
9812
0
    return -1;
9813
2
    }
9814
9815
  /* Add the item length to the branchlength, checking for integer overflow and
9816
  for the branch length exceeding the overall limit. Later, if there is at
9817
  least one variable-length branch in the group, there is a test for the
9818
  (smaller) variable-length branch length limit. */
9819
9820
9
  if (INT_MAX - branchlength < (int)itemlength ||
9821
9
      (branchlength += itemlength) > LOOKBEHIND_MAX)
9822
0
    {
9823
0
    *errcodeptr = ERR87;
9824
0
    return -1;
9825
0
    }
9826
9827
9
  branchminlength += itemminlength;
9828
9829
  /* Save this item length for use if the next item is a quantifier. */
9830
9831
9
  lastitemlength = itemlength;
9832
9
  lastitemminlength = itemminlength;
9833
9
  }
9834
9835
1
EXIT:
9836
1
*pptrptr = pptr;
9837
1
*minptr = branchminlength;
9838
1
return branchlength;
9839
9840
0
PARSED_SKIP_FAILED:
9841
0
*errcodeptr = ERR90;
9842
0
return -1;
9843
1
}
9844
9845
9846
9847
/*************************************************
9848
*        Set lengths in a lookbehind             *
9849
*************************************************/
9850
9851
/* This function is called for each lookbehind, to set the lengths in its
9852
branches. An error occurs if any branch does not have a limited maximum length
9853
that is less than the limit (65535). On exit, the pointer must be left on the
9854
final ket.
9855
9856
The function also maintains the max_lookbehind value. Any lookbehind branch
9857
that contains a nested lookbehind may actually look further back than the
9858
length of the branch. The additional amount is passed back from
9859
get_branchlength() as an "extra" value.
9860
9861
Arguments:
9862
  pptrptr     pointer to pointer in the parsed pattern
9863
  errcodeptr  pointer to error code
9864
  lcptr       pointer to loop counter
9865
  recurses    chain of recurse_check to catch mutual recursion
9866
  cb          pointer to compile block
9867
9868
Returns:      TRUE if all is well
9869
              FALSE otherwise, with error code and offset set
9870
*/
9871
9872
static BOOL
9873
set_lookbehind_lengths(uint32_t **pptrptr, int *errcodeptr, int *lcptr,
9874
  parsed_recurse_check *recurses, compile_block *cb)
9875
1
{
9876
1
PCRE2_SIZE offset;
9877
1
uint32_t *bptr = *pptrptr;
9878
1
uint32_t *gbptr = bptr;
9879
1
int maxlength = 0;
9880
1
int minlength = INT_MAX;
9881
1
BOOL variable = FALSE;
9882
9883
1
READPLUSOFFSET(offset, bptr);  /* Offset for error messages */
9884
1
*pptrptr += SIZEOFFSET;
9885
9886
/* Each branch can have a different maximum length, but we can keep only a
9887
single minimum for the whole group, because there's nowhere to save individual
9888
values in the META_ALT item. */
9889
9890
1
do
9891
1
  {
9892
1
  int branchlength, branchminlength;
9893
9894
1
  *pptrptr += 1;
9895
1
  branchlength = get_branchlength(pptrptr, &branchminlength, errcodeptr, lcptr,
9896
1
    recurses, cb);
9897
9898
1
  if (branchlength < 0)
9899
0
    {
9900
    /* The errorcode and offset may already be set from a nested lookbehind. */
9901
0
    if (*errcodeptr == 0) *errcodeptr = ERR25;
9902
0
    if (cb->erroroffset == PCRE2_UNSET) cb->erroroffset = offset;
9903
0
    return FALSE;
9904
0
    }
9905
9906
1
  if (branchlength != branchminlength) variable = TRUE;
9907
1
  if (branchminlength < minlength) minlength = branchminlength;
9908
1
  if (branchlength > maxlength) maxlength = branchlength;
9909
1
  if (branchlength > cb->max_lookbehind) cb->max_lookbehind = branchlength;
9910
1
  *bptr |= branchlength;  /* branchlength never more than 65535 */
9911
1
  bptr = *pptrptr;
9912
1
  }
9913
1
while (META_CODE(*bptr) == META_ALT);
9914
9915
/* If any branch is of variable length, the whole lookbehind is of variable
9916
length. If the maximum length of any branch exceeds the maximum for variable
9917
lookbehinds, give an error. Otherwise, the minimum length is set in the word
9918
that follows the original group META value. For a fixed-length lookbehind, this
9919
is set to LOOKBEHIND_MAX, to indicate that each branch is of a fixed (but
9920
possibly different) length. */
9921
9922
1
if (variable)
9923
1
  {
9924
1
  gbptr[1] = minlength;
9925
1
  if ((uint32_t)maxlength > cb->max_varlookbehind)
9926
0
    {
9927
0
    *errcodeptr = ERR100;
9928
0
    cb->erroroffset = offset;
9929
0
    return FALSE;
9930
0
    }
9931
1
  }
9932
0
else gbptr[1] = LOOKBEHIND_MAX;
9933
9934
9935
1
gbptr[1] = variable? minlength : LOOKBEHIND_MAX;
9936
1
return TRUE;
9937
1
}
9938
9939
9940
9941
/*************************************************
9942
*         Check parsed pattern lookbehinds       *
9943
*************************************************/
9944
9945
/* This function is called at the end of parsing a pattern if any lookbehinds
9946
were encountered. It scans the parsed pattern for them, calling
9947
set_lookbehind_lengths() for each one. At the start, the errorcode is zero and
9948
the error offset is marked unset. This enables the functions above not to
9949
override settings from deeper nestings.
9950
9951
This function is called recursively from get_branchlength() for lookaheads in
9952
order to process any lookbehinds that they may contain. It stops when it hits a
9953
non-nested closing parenthesis in this case, returning a pointer to it.
9954
9955
Arguments
9956
  pptr      points to where to start (start of pattern or start of lookahead)
9957
  retptr    if not NULL, return the ket pointer here
9958
  recurses  chain of recurse_check to catch mutual recursion
9959
  cb        points to the compile block
9960
  lcptr     points to loop counter
9961
9962
Returns:    0 on success, or an errorcode (cb->erroroffset will be set)
9963
*/
9964
9965
static int
9966
check_lookbehinds(uint32_t *pptr, uint32_t **retptr,
9967
  parsed_recurse_check *recurses, compile_block *cb, int *lcptr)
9968
1
{
9969
1
int errorcode = 0;
9970
1
int nestlevel = 0;
9971
9972
1
cb->erroroffset = PCRE2_UNSET;
9973
9974
65
for (; *pptr != META_END; pptr++)
9975
64
  {
9976
64
  if (*pptr < META_END) continue;  /* Literal */
9977
9978
14
  switch (META_CODE(*pptr))
9979
14
    {
9980
0
    default:
9981
0
    return ERR70;  /* Unrecognized meta code */
9982
9983
0
    case META_ESCAPE:
9984
0
    if (*pptr - META_ESCAPE == ESC_P || *pptr - META_ESCAPE == ESC_p)
9985
0
      pptr += 1;
9986
0
    break;
9987
9988
2
    case META_KET:
9989
2
    if (--nestlevel < 0)
9990
0
      {
9991
0
      if (retptr != NULL) *retptr = pptr;
9992
0
      return 0;
9993
0
      }
9994
2
    break;
9995
9996
2
    case META_ATOMIC:
9997
0
    case META_CAPTURE:
9998
0
    case META_COND_ASSERT:
9999
1
    case META_LOOKAHEAD:
10000
1
    case META_LOOKAHEADNOT:
10001
1
    case META_LOOKAHEAD_NA:
10002
2
    case META_NOCAPTURE:
10003
2
    case META_SCRIPT_RUN:
10004
2
    nestlevel++;
10005
2
    break;
10006
10007
0
    case META_ACCEPT:
10008
3
    case META_ALT:
10009
3
    case META_ASTERISK:
10010
3
    case META_ASTERISK_PLUS:
10011
3
    case META_ASTERISK_QUERY:
10012
3
    case META_BACKREF:
10013
4
    case META_CIRCUMFLEX:
10014
4
    case META_CLASS:
10015
4
    case META_CLASS_EMPTY:
10016
4
    case META_CLASS_EMPTY_NOT:
10017
4
    case META_CLASS_END:
10018
4
    case META_CLASS_NOT:
10019
4
    case META_COMMIT:
10020
5
    case META_DOLLAR:
10021
5
    case META_DOT:
10022
5
    case META_FAIL:
10023
7
    case META_PLUS:
10024
7
    case META_PLUS_PLUS:
10025
7
    case META_PLUS_QUERY:
10026
7
    case META_PRUNE:
10027
9
    case META_QUERY:
10028
9
    case META_QUERY_PLUS:
10029
9
    case META_QUERY_QUERY:
10030
9
    case META_RANGE_ESCAPED:
10031
9
    case META_RANGE_LITERAL:
10032
9
    case META_SKIP:
10033
9
    case META_THEN:
10034
9
    break;
10035
10036
0
    case META_RECURSE:
10037
0
    pptr += SIZEOFFSET;
10038
0
    break;
10039
10040
0
    case META_BACKREF_BYNAME:
10041
0
    case META_RECURSE_BYNAME:
10042
0
    pptr += 1 + SIZEOFFSET;
10043
0
    break;
10044
10045
0
    case META_COND_DEFINE:
10046
0
    pptr += SIZEOFFSET;
10047
0
    nestlevel++;
10048
0
    break;
10049
10050
0
    case META_COND_NAME:
10051
0
    case META_COND_NUMBER:
10052
0
    case META_COND_RNAME:
10053
0
    case META_COND_RNUMBER:
10054
0
    pptr += 1 + SIZEOFFSET;
10055
0
    nestlevel++;
10056
0
    break;
10057
10058
0
    case META_COND_VERSION:
10059
0
    pptr += 3;
10060
0
    nestlevel++;
10061
0
    break;
10062
10063
0
    case META_CALLOUT_STRING:
10064
0
    pptr += 3 + SIZEOFFSET;
10065
0
    break;
10066
10067
0
    case META_BIGVALUE:
10068
0
    case META_POSIX:
10069
0
    case META_POSIX_NEG:
10070
0
    pptr += 1;
10071
0
    break;
10072
10073
0
    case META_MINMAX:
10074
0
    case META_MINMAX_QUERY:
10075
0
    case META_MINMAX_PLUS:
10076
0
    case META_OPTIONS:
10077
0
    pptr += 2;
10078
0
    break;
10079
10080
0
    case META_CALLOUT_NUMBER:
10081
0
    pptr += 3;
10082
0
    break;
10083
10084
0
    case META_MARK:
10085
0
    case META_COMMIT_ARG:
10086
0
    case META_PRUNE_ARG:
10087
0
    case META_SKIP_ARG:
10088
0
    case META_THEN_ARG:
10089
0
    pptr += 1 + pptr[1];
10090
0
    break;
10091
10092
1
    case META_LOOKBEHIND:
10093
1
    case META_LOOKBEHINDNOT:
10094
1
    case META_LOOKBEHIND_NA:
10095
1
    if (!set_lookbehind_lengths(&pptr, &errorcode, lcptr, recurses, cb))
10096
0
      return errorcode;
10097
1
    break;
10098
14
    }
10099
14
  }
10100
10101
1
return 0;
10102
1
}
10103
10104
10105
10106
/*************************************************
10107
*     External function to compile a pattern     *
10108
*************************************************/
10109
10110
/* This function reads a regular expression in the form of a string and returns
10111
a pointer to a block of store holding a compiled version of the expression.
10112
10113
Arguments:
10114
  pattern       the regular expression
10115
  patlen        the length of the pattern, or PCRE2_ZERO_TERMINATED
10116
  options       option bits
10117
  errorptr      pointer to errorcode
10118
  erroroffset   pointer to error offset
10119
  ccontext      points to a compile context or is NULL
10120
10121
Returns:        pointer to compiled data block, or NULL on error,
10122
                with errorcode and erroroffset set
10123
*/
10124
10125
PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
10126
pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options,
10127
   int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext)
10128
708
{
10129
708
BOOL utf;                             /* Set TRUE for UTF mode */
10130
708
BOOL ucp;                             /* Set TRUE for UCP mode */
10131
708
BOOL has_lookbehind = FALSE;          /* Set TRUE if a lookbehind is found */
10132
708
BOOL zero_terminated;                 /* Set TRUE for zero-terminated pattern */
10133
708
pcre2_real_code *re = NULL;           /* What we will return */
10134
708
compile_block cb;                     /* "Static" compile-time data */
10135
708
const uint8_t *tables;                /* Char tables base pointer */
10136
10137
708
PCRE2_UCHAR *code;                    /* Current pointer in compiled code */
10138
708
PCRE2_SPTR codestart;                 /* Start of compiled code */
10139
708
PCRE2_SPTR ptr;                       /* Current pointer in pattern */
10140
708
uint32_t *pptr;                       /* Current pointer in parsed pattern */
10141
10142
708
PCRE2_SIZE length = 1;                /* Allow for final END opcode */
10143
708
PCRE2_SIZE usedlength;                /* Actual length used */
10144
708
PCRE2_SIZE re_blocksize;              /* Size of memory block */
10145
708
PCRE2_SIZE big32count = 0;            /* 32-bit literals >= 0x80000000 */
10146
708
PCRE2_SIZE parsed_size_needed;        /* Needed for parsed pattern */
10147
10148
708
uint32_t firstcuflags, reqcuflags;    /* Type of first/req code unit */
10149
708
uint32_t firstcu, reqcu;              /* Value of first/req code unit */
10150
708
uint32_t setflags = 0;                /* NL and BSR set flags */
10151
10152
708
uint32_t skipatstart;                 /* When checking (*UTF) etc */
10153
708
uint32_t limit_heap  = UINT32_MAX;
10154
708
uint32_t limit_match = UINT32_MAX;    /* Unset match limits */
10155
708
uint32_t limit_depth = UINT32_MAX;
10156
10157
708
int newline = 0;                      /* Unset; can be set by the pattern */
10158
708
int bsr = 0;                          /* Unset; can be set by the pattern */
10159
708
int errorcode = 0;                    /* Initialize to avoid compiler warn */
10160
708
int regexrc;                          /* Return from compile */
10161
10162
708
uint32_t i;                           /* Local loop counter */
10163
10164
/* Comments at the head of this file explain about these variables. */
10165
10166
708
uint32_t stack_groupinfo[GROUPINFO_DEFAULT_SIZE];
10167
708
uint32_t stack_parsed_pattern[PARSED_PATTERN_DEFAULT_SIZE];
10168
708
named_group named_groups[NAMED_GROUP_LIST_SIZE];
10169
10170
/* The workspace is used in different ways in the different compiling phases.
10171
It needs to be 16-bit aligned for the preliminary parsing scan. */
10172
10173
708
uint32_t c16workspace[C16_WORK_SIZE];
10174
708
PCRE2_UCHAR *cworkspace = (PCRE2_UCHAR *)c16workspace;
10175
10176
10177
/* -------------- Check arguments and set up the pattern ----------------- */
10178
10179
/* There must be error code and offset pointers. */
10180
10181
708
if (errorptr == NULL || erroroffset == NULL) return NULL;
10182
708
*errorptr = ERR0;
10183
708
*erroroffset = 0;
10184
10185
/* There must be a pattern, but NULL is allowed with zero length. */
10186
10187
708
if (pattern == NULL)
10188
0
  {
10189
0
  if (patlen == 0) pattern = (PCRE2_SPTR)""; else
10190
0
    {
10191
0
    *errorptr = ERR16;
10192
0
    return NULL;
10193
0
    }
10194
0
  }
10195
10196
/* A NULL compile context means "use a default context" */
10197
10198
708
if (ccontext == NULL)
10199
0
  ccontext = (pcre2_compile_context *)(&PRIV(default_compile_context));
10200
10201
/* PCRE2_MATCH_INVALID_UTF implies UTF */
10202
10203
708
if ((options & PCRE2_MATCH_INVALID_UTF) != 0) options |= PCRE2_UTF;
10204
10205
/* Check that all undefined public option bits are zero. */
10206
10207
708
if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0 ||
10208
708
    (ccontext->extra_options & ~PUBLIC_COMPILE_EXTRA_OPTIONS) != 0)
10209
0
  {
10210
0
  *errorptr = ERR17;
10211
0
  return NULL;
10212
0
  }
10213
10214
708
if ((options & PCRE2_LITERAL) != 0 &&
10215
0
    ((options & ~PUBLIC_LITERAL_COMPILE_OPTIONS) != 0 ||
10216
0
     (ccontext->extra_options & ~PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS) != 0))
10217
0
  {
10218
0
  *errorptr = ERR92;
10219
0
  return NULL;
10220
0
  }
10221
10222
/* A zero-terminated pattern is indicated by the special length value
10223
PCRE2_ZERO_TERMINATED. Check for an overlong pattern. */
10224
10225
708
if ((zero_terminated = (patlen == PCRE2_ZERO_TERMINATED)))
10226
0
  patlen = PRIV(strlen)(pattern);
10227
10228
708
if (patlen > ccontext->max_pattern_length)
10229
0
  {
10230
0
  *errorptr = ERR88;
10231
0
  return NULL;
10232
0
  }
10233
10234
/* From here on, all returns from this function should end up going via the
10235
EXIT label. */
10236
10237
10238
/* ------------ Initialize the "static" compile data -------------- */
10239
10240
708
tables = (ccontext->tables != NULL)? ccontext->tables : PRIV(default_tables);
10241
10242
708
cb.lcc = tables + lcc_offset;          /* Individual */
10243
708
cb.fcc = tables + fcc_offset;          /*   character */
10244
708
cb.cbits = tables + cbits_offset;      /*      tables */
10245
708
cb.ctypes = tables + ctypes_offset;
10246
10247
708
cb.assert_depth = 0;
10248
708
cb.bracount = 0;
10249
708
cb.cx = ccontext;
10250
708
cb.dupnames = FALSE;
10251
708
cb.end_pattern = pattern + patlen;
10252
708
cb.erroroffset = 0;
10253
708
cb.external_flags = 0;
10254
708
cb.external_options = options;
10255
708
cb.groupinfo = stack_groupinfo;
10256
708
cb.had_recurse = FALSE;
10257
708
cb.lastcapture = 0;
10258
708
cb.max_lookbehind = 0;                               /* Max encountered */
10259
708
cb.max_varlookbehind = ccontext->max_varlookbehind;  /* Limit */
10260
708
cb.name_entry_size = 0;
10261
708
cb.name_table = NULL;
10262
708
cb.named_groups = named_groups;
10263
708
cb.named_group_list_size = NAMED_GROUP_LIST_SIZE;
10264
708
cb.names_found = 0;
10265
708
cb.parens_depth = 0;
10266
708
cb.parsed_pattern = stack_parsed_pattern;
10267
708
cb.req_varyopt = 0;
10268
708
cb.start_code = cworkspace;
10269
708
cb.start_pattern = pattern;
10270
708
cb.start_workspace = cworkspace;
10271
708
cb.workspace_size = COMPILE_WORK_SIZE;
10272
10273
/* Maximum back reference and backref bitmap. The bitmap records up to 31 back
10274
references to help in deciding whether (.*) can be treated as anchored or not.
10275
*/
10276
10277
708
cb.top_backref = 0;
10278
708
cb.backref_map = 0;
10279
10280
/* Escape sequences \1 to \9 are always back references, but as they are only
10281
two characters long, only two elements can be used in the parsed_pattern
10282
vector. The first contains the reference, and we'd like to use the second to
10283
record the offset in the pattern, so that forward references to non-existent
10284
groups can be diagnosed later with an offset. However, on 64-bit systems,
10285
PCRE2_SIZE won't fit. Instead, we have a vector of offsets for the first
10286
occurrence of \1 to \9, indexed by the second parsed_pattern value. All other
10287
references have enough space for the offset to be put into the parsed pattern.
10288
*/
10289
10290
7.78k
for (i = 0; i < 10; i++) cb.small_ref_offset[i] = PCRE2_UNSET;
10291
10292
10293
/* --------------- Start looking at the pattern --------------- */
10294
10295
/* Unless PCRE2_LITERAL is set, check for global one-time option settings at
10296
the start of the pattern, and remember the offset to the actual regex. With
10297
valgrind support, make the terminator of a zero-terminated pattern
10298
inaccessible. This catches bugs that would otherwise only show up for
10299
non-zero-terminated patterns. */
10300
10301
#ifdef SUPPORT_VALGRIND
10302
if (zero_terminated) VALGRIND_MAKE_MEM_NOACCESS(pattern + patlen, CU2BYTES(1));
10303
#endif
10304
10305
708
ptr = pattern;
10306
708
skipatstart = 0;
10307
10308
708
if ((options & PCRE2_LITERAL) == 0)
10309
708
  {
10310
708
  while (patlen - skipatstart >= 2 &&
10311
707
         ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
10312
1
         ptr[skipatstart+1] == CHAR_ASTERISK)
10313
0
    {
10314
0
    for (i = 0; i < sizeof(pso_list)/sizeof(pso); i++)
10315
0
      {
10316
0
      uint32_t c, pp;
10317
0
      const pso *p = pso_list + i;
10318
10319
0
      if (patlen - skipatstart - 2 >= p->length &&
10320
0
          PRIV(strncmp_c8)(ptr + skipatstart + 2, (char *)(p->name),
10321
0
            p->length) == 0)
10322
0
        {
10323
0
        skipatstart += p->length + 2;
10324
0
        switch(p->type)
10325
0
          {
10326
0
          case PSO_OPT:
10327
0
          cb.external_options |= p->value;
10328
0
          break;
10329
10330
0
          case PSO_FLG:
10331
0
          setflags |= p->value;
10332
0
          break;
10333
10334
0
          case PSO_NL:
10335
0
          newline = p->value;
10336
0
          setflags |= PCRE2_NL_SET;
10337
0
          break;
10338
10339
0
          case PSO_BSR:
10340
0
          bsr = p->value;
10341
0
          setflags |= PCRE2_BSR_SET;
10342
0
          break;
10343
10344
0
          case PSO_LIMM:
10345
0
          case PSO_LIMD:
10346
0
          case PSO_LIMH:
10347
0
          c = 0;
10348
0
          pp = skipatstart;
10349
0
          if (!IS_DIGIT(ptr[pp]))
10350
0
            {
10351
0
            errorcode = ERR60;
10352
0
            ptr += pp;
10353
0
            goto HAD_EARLY_ERROR;
10354
0
            }
10355
0
          while (IS_DIGIT(ptr[pp]))
10356
0
            {
10357
0
            if (c > UINT32_MAX / 10 - 1) break;   /* Integer overflow */
10358
0
            c = c*10 + (ptr[pp++] - CHAR_0);
10359
0
            }
10360
0
          if (ptr[pp++] != CHAR_RIGHT_PARENTHESIS)
10361
0
            {
10362
0
            errorcode = ERR60;
10363
0
            ptr += pp;
10364
0
            goto HAD_EARLY_ERROR;
10365
0
            }
10366
0
          if (p->type == PSO_LIMH) limit_heap = c;
10367
0
            else if (p->type == PSO_LIMM) limit_match = c;
10368
0
            else limit_depth = c;
10369
0
          skipatstart += pp - skipatstart;
10370
0
          break;
10371
0
          }
10372
0
        break;   /* Out of the table scan loop */
10373
0
        }
10374
0
      }
10375
0
    if (i >= sizeof(pso_list)/sizeof(pso)) break;   /* Out of pso loop */
10376
0
    }
10377
708
  }
10378
10379
/* End of pattern-start options; advance to start of real regex. */
10380
10381
708
ptr += skipatstart;
10382
10383
/* Can't support UTF or UCP if PCRE2 was built without Unicode support. */
10384
10385
#ifndef SUPPORT_UNICODE
10386
if ((cb.external_options & (PCRE2_UTF|PCRE2_UCP)) != 0)
10387
  {
10388
  errorcode = ERR32;
10389
  goto HAD_EARLY_ERROR;
10390
  }
10391
#endif
10392
10393
/* Check UTF. We have the original options in 'options', with that value as
10394
modified by (*UTF) etc in cb->external_options. The extra option
10395
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not permitted in UTF-16 mode because the
10396
surrogate code points cannot be represented in UTF-16. */
10397
10398
708
utf = (cb.external_options & PCRE2_UTF) != 0;
10399
708
if (utf)
10400
154
  {
10401
154
  if ((options & PCRE2_NEVER_UTF) != 0)
10402
0
    {
10403
0
    errorcode = ERR74;
10404
0
    goto HAD_EARLY_ERROR;
10405
0
    }
10406
154
  if ((options & PCRE2_NO_UTF_CHECK) == 0 &&
10407
154
       (errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset)) != 0)
10408
4
    goto HAD_ERROR;  /* Offset was set by valid_utf() */
10409
10410
#if PCRE2_CODE_UNIT_WIDTH == 16
10411
  if ((ccontext->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0)
10412
    {
10413
    errorcode = ERR91;
10414
    goto HAD_EARLY_ERROR;
10415
    }
10416
#endif
10417
154
  }
10418
10419
/* Check UCP lockout. */
10420
10421
704
ucp = (cb.external_options & PCRE2_UCP) != 0;
10422
704
if (ucp && (cb.external_options & PCRE2_NEVER_UCP) != 0)
10423
0
  {
10424
0
  errorcode = ERR75;
10425
0
  goto HAD_EARLY_ERROR;
10426
0
  }
10427
10428
/* Process the BSR setting. */
10429
10430
704
if (bsr == 0) bsr = ccontext->bsr_convention;
10431
10432
/* Process the newline setting. */
10433
10434
704
if (newline == 0) newline = ccontext->newline_convention;
10435
704
cb.nltype = NLTYPE_FIXED;
10436
704
switch(newline)
10437
704
  {
10438
0
  case PCRE2_NEWLINE_CR:
10439
0
  cb.nllen = 1;
10440
0
  cb.nl[0] = CHAR_CR;
10441
0
  break;
10442
10443
704
  case PCRE2_NEWLINE_LF:
10444
704
  cb.nllen = 1;
10445
704
  cb.nl[0] = CHAR_NL;
10446
704
  break;
10447
10448
0
  case PCRE2_NEWLINE_NUL:
10449
0
  cb.nllen = 1;
10450
0
  cb.nl[0] = CHAR_NUL;
10451
0
  break;
10452
10453
0
  case PCRE2_NEWLINE_CRLF:
10454
0
  cb.nllen = 2;
10455
0
  cb.nl[0] = CHAR_CR;
10456
0
  cb.nl[1] = CHAR_NL;
10457
0
  break;
10458
10459
0
  case PCRE2_NEWLINE_ANY:
10460
0
  cb.nltype = NLTYPE_ANY;
10461
0
  break;
10462
10463
0
  case PCRE2_NEWLINE_ANYCRLF:
10464
0
  cb.nltype = NLTYPE_ANYCRLF;
10465
0
  break;
10466
10467
0
  default:
10468
0
  errorcode = ERR56;
10469
0
  goto HAD_EARLY_ERROR;
10470
704
  }
10471
10472
/* Pre-scan the pattern to do two things: (1) Discover the named groups and
10473
their numerical equivalents, so that this information is always available for
10474
the remaining processing. (2) At the same time, parse the pattern and put a
10475
processed version into the parsed_pattern vector. This has escapes interpreted
10476
and comments removed (amongst other things).
10477
10478
In all but one case, when PCRE2_AUTO_CALLOUT is not set, the number of unsigned
10479
32-bit ints in the parsed pattern is bounded by the length of the pattern plus
10480
one (for the terminator) plus four if PCRE2_EXTRA_WORD or PCRE2_EXTRA_LINE is
10481
set. The exceptional case is when running in 32-bit, non-UTF mode, when literal
10482
characters greater than META_END (0x80000000) have to be coded as two units. In
10483
this case, therefore, we scan the pattern to check for such values. */
10484
10485
#if PCRE2_CODE_UNIT_WIDTH == 32
10486
if (!utf)
10487
  {
10488
  PCRE2_SPTR p;
10489
  for (p = ptr; p < cb.end_pattern; p++) if (*p >= META_END) big32count++;
10490
  }
10491
#endif
10492
10493
/* Ensure that the parsed pattern buffer is big enough. When PCRE2_AUTO_CALLOUT
10494
is set we have to assume a numerical callout (4 elements) for each character
10495
plus one at the end. This is overkill, but memory is plentiful these days. For
10496
many smaller patterns the vector on the stack (which was set up above) can be
10497
used. */
10498
10499
704
parsed_size_needed = patlen - skipatstart + big32count;
10500
10501
704
if ((ccontext->extra_options &
10502
704
     (PCRE2_EXTRA_MATCH_WORD|PCRE2_EXTRA_MATCH_LINE)) != 0)
10503
0
  parsed_size_needed += 4;
10504
10505
704
if ((options & PCRE2_AUTO_CALLOUT) != 0)
10506
0
  parsed_size_needed = (parsed_size_needed + 1) * 5;
10507
10508
704
if (parsed_size_needed >= PARSED_PATTERN_DEFAULT_SIZE)
10509
93
  {
10510
93
  uint32_t *heap_parsed_pattern = ccontext->memctl.malloc(
10511
93
    (parsed_size_needed + 1) * sizeof(uint32_t), ccontext->memctl.memory_data);
10512
93
  if (heap_parsed_pattern == NULL)
10513
0
    {
10514
0
    *errorptr = ERR21;
10515
0
    goto EXIT;
10516
0
    }
10517
93
  cb.parsed_pattern = heap_parsed_pattern;
10518
93
  }
10519
704
cb.parsed_pattern_end = cb.parsed_pattern + parsed_size_needed + 1;
10520
10521
/* Do the parsing scan. */
10522
10523
704
errorcode = parse_regex(ptr, cb.external_options, &has_lookbehind, &cb);
10524
704
if (errorcode != 0) goto HAD_CB_ERROR;
10525
10526
/* If there are any lookbehinds, scan the parsed pattern to figure out their
10527
lengths. Workspace is needed to remember whether numbered groups are or are not
10528
of limited length, and if limited, what the minimum and maximum lengths are.
10529
This caching saves re-computing the length of any group that is referenced more
10530
than once, which is particularly relevant when recursion is involved.
10531
Unnumbered groups do not have this exposure because they cannot be referenced.
10532
If there are sufficiently few groups, the default index vector on the stack, as
10533
set up above, can be used. Otherwise we have to get/free some heap memory. The
10534
vector must be initialized to zero. */
10535
10536
368
if (has_lookbehind)
10537
1
  {
10538
1
  int loopcount = 0;
10539
1
  if (cb.bracount >= GROUPINFO_DEFAULT_SIZE/2)
10540
0
    {
10541
0
    cb.groupinfo = ccontext->memctl.malloc(
10542
0
      (2 * (cb.bracount + 1))*sizeof(uint32_t), ccontext->memctl.memory_data);
10543
0
    if (cb.groupinfo == NULL)
10544
0
      {
10545
0
      errorcode = ERR21;
10546
0
      cb.erroroffset = 0;
10547
0
      goto HAD_CB_ERROR;
10548
0
      }
10549
0
    }
10550
1
  memset(cb.groupinfo, 0, (2 * cb.bracount + 1) * sizeof(uint32_t));
10551
1
  errorcode = check_lookbehinds(cb.parsed_pattern, NULL, NULL, &cb, &loopcount);
10552
1
  if (errorcode != 0) goto HAD_CB_ERROR;
10553
1
  }
10554
10555
/* For debugging, there is a function that shows the parsed pattern vector. */
10556
10557
#ifdef DEBUG_SHOW_PARSED
10558
fprintf(stderr, "+++ Pre-scan complete:\n");
10559
show_parsed(&cb);
10560
#endif
10561
10562
/* For debugging capturing information this code can be enabled. */
10563
10564
#ifdef DEBUG_SHOW_CAPTURES
10565
  {
10566
  named_group *ng = cb.named_groups;
10567
  fprintf(stderr, "+++Captures: %d\n", cb.bracount);
10568
  for (i = 0; i < cb.names_found; i++, ng++)
10569
    {
10570
    fprintf(stderr, "+++%3d %.*s\n", ng->number, ng->length, ng->name);
10571
    }
10572
  }
10573
#endif
10574
10575
/* Pretend to compile the pattern while actually just accumulating the amount
10576
of memory required in the 'length' variable. This behaviour is triggered by
10577
passing a non-NULL final argument to compile_regex(). We pass a block of
10578
workspace (cworkspace) for it to compile parts of the pattern into; the
10579
compiled code is discarded when it is no longer needed, so hopefully this
10580
workspace will never overflow, though there is a test for its doing so.
10581
10582
On error, errorcode will be set non-zero, so we don't need to look at the
10583
result of the function. The initial options have been put into the cb block,
10584
but we still have to pass a separate options variable (the first argument)
10585
because the options may change as the pattern is processed. */
10586
10587
368
cb.erroroffset = patlen;   /* For any subsequent errors that do not set it */
10588
368
pptr = cb.parsed_pattern;
10589
368
code = cworkspace;
10590
368
*code = OP_BRA;
10591
10592
368
(void)compile_regex(cb.external_options, ccontext->extra_options, &code, &pptr,
10593
368
   &errorcode, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, NULL,
10594
368
   &cb, &length);
10595
10596
368
if (errorcode != 0) goto HAD_CB_ERROR;  /* Offset is in cb.erroroffset */
10597
10598
/* This should be caught in compile_regex(), but just in case... */
10599
10600
348
if (length > MAX_PATTERN_SIZE)
10601
0
  {
10602
0
  errorcode = ERR20;
10603
0
  goto HAD_CB_ERROR;
10604
0
  }
10605
10606
/* Compute the size of, then, if not too large, get and initialize the data
10607
block for storing the compiled pattern and names table. Integer overflow should
10608
no longer be possible because nowadays we limit the maximum value of
10609
cb.names_found and cb.name_entry_size. */
10610
10611
348
re_blocksize = sizeof(pcre2_real_code) +
10612
348
  CU2BYTES(length +
10613
348
  (PCRE2_SIZE)cb.names_found * (PCRE2_SIZE)cb.name_entry_size);
10614
10615
348
if (re_blocksize > ccontext->max_pattern_compiled_length)
10616
0
  {
10617
0
  errorcode = ERR101;
10618
0
  goto HAD_CB_ERROR;
10619
0
  }
10620
10621
348
re = (pcre2_real_code *)
10622
348
  ccontext->memctl.malloc(re_blocksize, ccontext->memctl.memory_data);
10623
348
if (re == NULL)
10624
0
  {
10625
0
  errorcode = ERR21;
10626
0
  goto HAD_CB_ERROR;
10627
0
  }
10628
10629
/* The compiler may put padding at the end of the pcre2_real_code structure in
10630
order to round it up to a multiple of 4 or 8 bytes. This means that when a
10631
compiled pattern is copied (for example, when serialized) undefined bytes are
10632
read, and this annoys debuggers such as valgrind. To avoid this, we explicitly
10633
write to the last 8 bytes of the structure before setting the fields. */
10634
10635
348
memset((char *)re + sizeof(pcre2_real_code) - 8, 0, 8);
10636
348
re->memctl = ccontext->memctl;
10637
348
re->tables = tables;
10638
348
re->executable_jit = NULL;
10639
348
memset(re->start_bitmap, 0, 32 * sizeof(uint8_t));
10640
348
re->blocksize = re_blocksize;
10641
348
re->magic_number = MAGIC_NUMBER;
10642
348
re->compile_options = options;
10643
348
re->overall_options = cb.external_options;
10644
348
re->extra_options = ccontext->extra_options;
10645
348
re->flags = PCRE2_CODE_UNIT_WIDTH/8 | cb.external_flags | setflags;
10646
348
re->limit_heap = limit_heap;
10647
348
re->limit_match = limit_match;
10648
348
re->limit_depth = limit_depth;
10649
348
re->first_codeunit = 0;
10650
348
re->last_codeunit = 0;
10651
348
re->bsr_convention = bsr;
10652
348
re->newline_convention = newline;
10653
348
re->max_lookbehind = 0;
10654
348
re->minlength = 0;
10655
348
re->top_bracket = 0;
10656
348
re->top_backref = 0;
10657
348
re->name_entry_size = cb.name_entry_size;
10658
348
re->name_count = cb.names_found;
10659
10660
/* The basic block is immediately followed by the name table, and the compiled
10661
code follows after that. */
10662
10663
348
codestart = (PCRE2_SPTR)((uint8_t *)re + sizeof(pcre2_real_code)) +
10664
348
  re->name_entry_size * re->name_count;
10665
10666
/* Update the compile data block for the actual compile. The starting points of
10667
the name/number translation table and of the code are passed around in the
10668
compile data block. The start/end pattern and initial options are already set
10669
from the pre-compile phase, as is the name_entry_size field. */
10670
10671
348
cb.parens_depth = 0;
10672
348
cb.assert_depth = 0;
10673
348
cb.lastcapture = 0;
10674
348
cb.name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code));
10675
348
cb.start_code = codestart;
10676
348
cb.req_varyopt = 0;
10677
348
cb.had_accept = FALSE;
10678
348
cb.had_pruneorskip = FALSE;
10679
10680
/* If any named groups were found, create the name/number table from the list
10681
created in the pre-pass. */
10682
10683
348
if (cb.names_found > 0)
10684
0
  {
10685
0
  named_group *ng = cb.named_groups;
10686
0
  for (i = 0; i < cb.names_found; i++, ng++)
10687
0
    add_name_to_table(&cb, ng->name, ng->length, ng->number, i);
10688
0
  }
10689
10690
/* Set up a starting, non-extracting bracket, then compile the expression. On
10691
error, errorcode will be set non-zero, so we don't need to look at the result
10692
of the function here. */
10693
10694
348
pptr = cb.parsed_pattern;
10695
348
code = (PCRE2_UCHAR *)codestart;
10696
348
*code = OP_BRA;
10697
348
regexrc = compile_regex(re->overall_options, ccontext->extra_options, &code,
10698
348
  &pptr, &errorcode, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL,
10699
348
  NULL, &cb, NULL);
10700
348
if (regexrc < 0) re->flags |= PCRE2_MATCH_EMPTY;
10701
348
re->top_bracket = cb.bracount;
10702
348
re->top_backref = cb.top_backref;
10703
348
re->max_lookbehind = cb.max_lookbehind;
10704
10705
348
if (cb.had_accept)
10706
0
  {
10707
0
  reqcu = 0;                     /* Must disable after (*ACCEPT) */
10708
0
  reqcuflags = REQ_NONE;
10709
0
  re->flags |= PCRE2_HASACCEPT;  /* Disables minimum length */
10710
0
  }
10711
10712
/* Fill in the final opcode and check for disastrous overflow. If no overflow,
10713
but the estimated length exceeds the really used length, adjust the value of
10714
re->blocksize, and if valgrind support is configured, mark the extra allocated
10715
memory as unaddressable, so that any out-of-bound reads can be detected. */
10716
10717
348
*code++ = OP_END;
10718
348
usedlength = code - codestart;
10719
348
if (usedlength > length) errorcode = ERR23; else
10720
348
  {
10721
348
  re->blocksize -= CU2BYTES(length - usedlength);
10722
#ifdef SUPPORT_VALGRIND
10723
  VALGRIND_MAKE_MEM_NOACCESS(code, CU2BYTES(length - usedlength));
10724
#endif
10725
348
  }
10726
10727
/* Scan the pattern for recursion/subroutine calls and convert the group
10728
numbers into offsets. Maintain a small cache so that repeated groups containing
10729
recursions are efficiently handled. */
10730
10731
348
#define RSCAN_CACHE_SIZE 8
10732
10733
348
if (errorcode == 0 && cb.had_recurse)
10734
0
  {
10735
0
  PCRE2_UCHAR *rcode;
10736
0
  PCRE2_SPTR rgroup;
10737
0
  unsigned int ccount = 0;
10738
0
  int start = RSCAN_CACHE_SIZE;
10739
0
  recurse_cache rc[RSCAN_CACHE_SIZE];
10740
10741
0
  for (rcode = (PCRE2_UCHAR *)find_recurse(codestart, utf);
10742
0
       rcode != NULL;
10743
0
       rcode = (PCRE2_UCHAR *)find_recurse(rcode + 1 + LINK_SIZE, utf))
10744
0
    {
10745
0
    int p, groupnumber;
10746
10747
0
    groupnumber = (int)GET(rcode, 1);
10748
0
    if (groupnumber == 0) rgroup = codestart; else
10749
0
      {
10750
0
      PCRE2_SPTR search_from = codestart;
10751
0
      rgroup = NULL;
10752
0
      for (i = 0, p = start; i < ccount; i++, p = (p + 1) & 7)
10753
0
        {
10754
0
        if (groupnumber == rc[p].groupnumber)
10755
0
          {
10756
0
          rgroup = rc[p].group;
10757
0
          break;
10758
0
          }
10759
10760
        /* Group n+1 must always start to the right of group n, so we can save
10761
        search time below when the new group number is greater than any of the
10762
        previously found groups. */
10763
10764
0
        if (groupnumber > rc[p].groupnumber) search_from = rc[p].group;
10765
0
        }
10766
10767
0
      if (rgroup == NULL)
10768
0
        {
10769
0
        rgroup = PRIV(find_bracket)(search_from, utf, groupnumber);
10770
0
        if (rgroup == NULL)
10771
0
          {
10772
0
          errorcode = ERR53;
10773
0
          break;
10774
0
          }
10775
0
        if (--start < 0) start = RSCAN_CACHE_SIZE - 1;
10776
0
        rc[start].groupnumber = groupnumber;
10777
0
        rc[start].group = rgroup;
10778
0
        if (ccount < RSCAN_CACHE_SIZE) ccount++;
10779
0
        }
10780
0
      }
10781
10782
0
    PUT(rcode, 1, rgroup - codestart);
10783
0
    }
10784
0
  }
10785
10786
/* In rare debugging situations we sometimes need to look at the compiled code
10787
at this stage. */
10788
10789
#ifdef DEBUG_CALL_PRINTINT
10790
pcre2_printint(re, stderr, TRUE);
10791
fprintf(stderr, "Length=%lu Used=%lu\n", length, usedlength);
10792
#endif
10793
10794
/* Unless disabled, check whether any single character iterators can be
10795
auto-possessified. The function overwrites the appropriate opcode values, so
10796
the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
10797
used in this code because at least one compiler gives a warning about loss of
10798
"const" attribute if the cast (PCRE2_UCHAR *)codestart is used directly in the
10799
function call. */
10800
10801
348
if (errorcode == 0 && (re->overall_options & PCRE2_NO_AUTO_POSSESS) == 0)
10802
348
  {
10803
348
  PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;
10804
348
  if (PRIV(auto_possessify)(temp, &cb) != 0) errorcode = ERR80;
10805
348
  }
10806
10807
/* Failed to compile, or error while post-processing. */
10808
10809
348
if (errorcode != 0) goto HAD_CB_ERROR;
10810
10811
/* Successful compile. If the anchored option was not passed, set it if
10812
we can determine that the pattern is anchored by virtue of ^ characters or \A
10813
or anything else, such as starting with non-atomic .* when DOTALL is set and
10814
there are no occurrences of *PRUNE or *SKIP (though there is an option to
10815
disable this case). */
10816
10817
348
if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
10818
347
     is_anchored(codestart, 0, &cb, 0, FALSE))
10819
0
  re->overall_options |= PCRE2_ANCHORED;
10820
10821
/* Set up the first code unit or startline flag, the required code unit, and
10822
then study the pattern. This code need not be obeyed if PCRE2_NO_START_OPTIMIZE
10823
is set, as the data it would create will not be used. Note that a first code
10824
unit (but not the startline flag) is useful for anchored patterns because it
10825
can still give a quick "no match" and also avoid searching for a last code
10826
unit. */
10827
10828
348
if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
10829
348
  {
10830
348
  int minminlength = 0;  /* For minimal minlength from first/required CU */
10831
10832
  /* If we do not have a first code unit, see if there is one that is asserted
10833
  (these are not saved during the compile because they can cause conflicts with
10834
  actual literals that follow). */
10835
10836
348
  if (firstcuflags >= REQ_NONE)
10837
291
    firstcu = find_firstassertedcu(codestart, &firstcuflags, 0);
10838
10839
  /* Save the data for a first code unit. The existence of one means the
10840
  minimum length must be at least 1. */
10841
10842
348
  if (firstcuflags < REQ_NONE)
10843
57
    {
10844
57
    re->first_codeunit = firstcu;
10845
57
    re->flags |= PCRE2_FIRSTSET;
10846
57
    minminlength++;
10847
10848
    /* Handle caseless first code units. */
10849
10850
57
    if ((firstcuflags & REQ_CASELESS) != 0)
10851
37
      {
10852
37
      if (firstcu < 128 || (!utf && !ucp && firstcu < 255))
10853
37
        {
10854
37
        if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS;
10855
37
        }
10856
10857
      /* The first code unit is >= 128 in UTF or UCP mode, or >= 255 otherwise.
10858
      In 8-bit UTF mode, codepoints in the range 128-255 are introductory code
10859
      points and cannot have another case, but if UCP is set they may do. */
10860
10861
0
#ifdef SUPPORT_UNICODE
10862
0
#if PCRE2_CODE_UNIT_WIDTH == 8
10863
0
      else if (ucp && !utf && UCD_OTHERCASE(firstcu) != firstcu)
10864
0
        re->flags |= PCRE2_FIRSTCASELESS;
10865
#else
10866
      else if ((utf || ucp) && firstcu <= MAX_UTF_CODE_POINT &&
10867
               UCD_OTHERCASE(firstcu) != firstcu)
10868
        re->flags |= PCRE2_FIRSTCASELESS;
10869
#endif
10870
37
#endif  /* SUPPORT_UNICODE */
10871
37
      }
10872
57
    }
10873
10874
  /* When there is no first code unit, for non-anchored patterns, see if we can
10875
  set the PCRE2_STARTLINE flag. This is helpful for multiline matches when all
10876
  branches start with ^ and also when all branches start with non-atomic .* for
10877
  non-DOTALL matches when *PRUNE and *SKIP are not present. (There is an option
10878
  that disables this case.) */
10879
10880
291
  else if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
10881
291
           is_startline(codestart, 0, &cb, 0, FALSE))
10882
1
    re->flags |= PCRE2_STARTLINE;
10883
10884
  /* Handle the "required code unit", if one is set. In the UTF case we can
10885
  increment the minimum minimum length only if we are sure this really is a
10886
  different character and not a non-starting code unit of the first character,
10887
  because the minimum length count is in characters, not code units. */
10888
10889
348
  if (reqcuflags < REQ_NONE)
10890
141
    {
10891
#if PCRE2_CODE_UNIT_WIDTH == 16
10892
    if ((re->overall_options & PCRE2_UTF) == 0 ||   /* Not UTF */
10893
        firstcuflags >= REQ_NONE ||                 /* First not set */
10894
        (firstcu & 0xf800) != 0xd800 ||             /* First not surrogate */
10895
        (reqcu & 0xfc00) != 0xdc00)                 /* Req not low surrogate */
10896
#elif PCRE2_CODE_UNIT_WIDTH == 8
10897
141
    if ((re->overall_options & PCRE2_UTF) == 0 ||   /* Not UTF */
10898
78
        firstcuflags >= REQ_NONE ||                 /* First not set */
10899
16
        (firstcu & 0x80) == 0 ||                    /* First is ASCII */
10900
0
        (reqcu & 0x80) == 0)                        /* Req is ASCII */
10901
141
#endif
10902
141
      {
10903
141
      minminlength++;
10904
141
      }
10905
10906
    /* In the case of an anchored pattern, set up the value only if it follows
10907
    a variable length item in the pattern. */
10908
10909
141
    if ((re->overall_options & PCRE2_ANCHORED) == 0 ||
10910
1
        (reqcuflags & REQ_VARY) != 0)
10911
141
      {
10912
141
      re->last_codeunit = reqcu;
10913
141
      re->flags |= PCRE2_LASTSET;
10914
10915
      /* Handle caseless required code units as for first code units (above). */
10916
10917
141
      if ((reqcuflags & REQ_CASELESS) != 0)
10918
60
        {
10919
60
        if (reqcu < 128 || (!utf && !ucp && reqcu < 255))
10920
59
          {
10921
59
          if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
10922
59
          }
10923
1
#ifdef SUPPORT_UNICODE
10924
1
#if PCRE2_CODE_UNIT_WIDTH == 8
10925
1
      else if (ucp && !utf && UCD_OTHERCASE(reqcu) != reqcu)
10926
0
        re->flags |= PCRE2_LASTCASELESS;
10927
#else
10928
      else if ((utf || ucp) && reqcu <= MAX_UTF_CODE_POINT &&
10929
               UCD_OTHERCASE(reqcu) != reqcu)
10930
        re->flags |= PCRE2_LASTCASELESS;
10931
#endif
10932
60
#endif  /* SUPPORT_UNICODE */
10933
60
        }
10934
141
      }
10935
141
    }
10936
10937
  /* Study the compiled pattern to set up information such as a bitmap of
10938
  starting code units and a minimum matching length. */
10939
10940
348
  if (PRIV(study)(re) != 0)
10941
0
    {
10942
0
    errorcode = ERR31;
10943
0
    goto HAD_CB_ERROR;
10944
0
    }
10945
10946
  /* If study() set a bitmap of starting code units, it implies a minimum
10947
  length of at least one. */
10948
10949
348
  if ((re->flags & PCRE2_FIRSTMAPSET) != 0 && minminlength == 0)
10950
106
    minminlength = 1;
10951
10952
  /* If the minimum length set (or not set) by study() is less than the minimum
10953
  implied by required code units, override it. */
10954
10955
348
  if (re->minlength < minminlength) re->minlength = minminlength;
10956
348
  }   /* End of start-of-match optimizations. */
10957
10958
/* Control ends up here in all cases. When running under valgrind, make a
10959
pattern's terminating zero defined again. If memory was obtained for the parsed
10960
version of the pattern, free it before returning. Also free the list of named
10961
groups if a larger one had to be obtained, and likewise the group information
10962
vector. */
10963
10964
708
EXIT:
10965
#ifdef SUPPORT_VALGRIND
10966
if (zero_terminated) VALGRIND_MAKE_MEM_DEFINED(pattern + patlen, CU2BYTES(1));
10967
#endif
10968
708
if (cb.parsed_pattern != stack_parsed_pattern)
10969
93
  ccontext->memctl.free(cb.parsed_pattern, ccontext->memctl.memory_data);
10970
708
if (cb.named_group_list_size > NAMED_GROUP_LIST_SIZE)
10971
0
  ccontext->memctl.free((void *)cb.named_groups, ccontext->memctl.memory_data);
10972
708
if (cb.groupinfo != stack_groupinfo)
10973
0
  ccontext->memctl.free((void *)cb.groupinfo, ccontext->memctl.memory_data);
10974
708
return re;    /* Will be NULL after an error */
10975
10976
/* Errors discovered in parse_regex() set the offset value in the compile
10977
block. Errors discovered before it is called must compute it from the ptr
10978
value. After parse_regex() is called, the offset in the compile block is set to
10979
the end of the pattern, but certain errors in compile_regex() may reset it if
10980
an offset is available in the parsed pattern. */
10981
10982
356
HAD_CB_ERROR:
10983
356
ptr = pattern + cb.erroroffset;
10984
10985
356
HAD_EARLY_ERROR:
10986
356
*erroroffset = ptr - pattern;
10987
10988
360
HAD_ERROR:
10989
360
*errorptr = errorcode;
10990
360
pcre2_code_free(re);
10991
re = NULL;
10992
360
goto EXIT;
10993
356
}
10994
10995
/* These #undefs are here to enable unity builds with CMake. */
10996
10997
#undef NLBLOCK /* Block containing newline information */
10998
#undef PSSTART /* Field containing processed string start */
10999
#undef PSEND   /* Field containing processed string end */
11000
11001
/* End of pcre2_compile.c */