Coverage Report

Created: 2025-06-13 06:43

/src/php-src/ext/pcre/pcre2lib/pcre2_compile.c
Line
Count
Source (jump to first uncovered line)
1
/*************************************************
2
*      Perl-Compatible Regular Expressions       *
3
*************************************************/
4
5
/* PCRE is a library of functions to support regular expressions whose syntax
6
and semantics are as close as possible to those of the Perl 5 language.
7
8
                       Written by Philip Hazel
9
     Original API code Copyright (c) 1997-2012 University of Cambridge
10
          New API code Copyright (c) 2016-2024 University of Cambridge
11
12
-----------------------------------------------------------------------------
13
Redistribution and use in source and binary forms, with or without
14
modification, are permitted provided that the following conditions are met:
15
16
    * Redistributions of source code must retain the above copyright notice,
17
      this list of conditions and the following disclaimer.
18
19
    * Redistributions in binary form must reproduce the above copyright
20
      notice, this list of conditions and the following disclaimer in the
21
      documentation and/or other materials provided with the distribution.
22
23
    * Neither the name of the University of Cambridge nor the names of its
24
      contributors may be used to endorse or promote products derived from
25
      this software without specific prior written permission.
26
27
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37
POSSIBILITY OF SUCH DAMAGE.
38
-----------------------------------------------------------------------------
39
*/
40
41
42
#ifdef HAVE_CONFIG_H
43
#include "config.h"
44
#endif
45
46
0
#define NLBLOCK cb             /* Block containing newline information */
47
#define PSSTART start_pattern  /* Field containing processed string start */
48
0
#define PSEND   end_pattern    /* Field containing processed string end */
49
50
#include "pcre2_compile.h"
51
52
/* In rare error cases debugging might require calling pcre2_printint(). */
53
54
#if 0
55
#ifdef EBCDIC
56
#define PRINTABLE(c) ((c) >= 64 && (c) < 255)
57
#else
58
#define PRINTABLE(c) ((c) >= 32 && (c) < 127)
59
#endif
60
#include "pcre2_printint.c"
61
#define DEBUG_CALL_PRINTINT
62
#endif
63
64
/* Other debugging code can be enabled by these defines. */
65
66
/* #define DEBUG_SHOW_CAPTURES */
67
/* #define DEBUG_SHOW_PARSED */
68
69
/* There are a few things that vary with different code unit sizes. Handle them
70
by defining macros in order to minimize #if usage. */
71
72
#if PCRE2_CODE_UNIT_WIDTH == 8
73
#define STRING_UTFn_RIGHTPAR     STRING_UTF8_RIGHTPAR, 5
74
3
#define XDIGIT(c)                xdigitab[c]
75
76
#else  /* Either 16-bit or 32-bit */
77
#define XDIGIT(c)                (MAX_255(c)? xdigitab[c] : 0xff)
78
79
#if PCRE2_CODE_UNIT_WIDTH == 16
80
#define STRING_UTFn_RIGHTPAR     STRING_UTF16_RIGHTPAR, 6
81
82
#else  /* 32-bit */
83
#define STRING_UTFn_RIGHTPAR     STRING_UTF32_RIGHTPAR, 6
84
#endif
85
#endif
86
87
/* Macros to store and retrieve a PCRE2_SIZE value in the parsed pattern, which
88
consists of uint32_t elements. Assume that if uint32_t can't hold it, two of
89
them will be able to (i.e. assume a 64-bit world). */
90
91
/* Offset accessors for the parsed pattern (a vector of uint32_t elements).
SIZEOFFSET is the number of elements one stored PCRE2_SIZE value occupies.

  PUTOFFSET(s,p)       store s at p and advance p past it
  GETOFFSET(s,p)       load s from p and advance p past it
  GETPLUSOFFSET(s,p)   load s from the element(s) following p, leaving p
                       pointing at the last element of the offset
  READPLUSOFFSET(s,p)  load s from the element(s) following p; p is unchanged
  SKIPOFFSET(p)        advance p by SIZEOFFSET elements

The arguments are not parenthesized and may be expanded more than once, so
pass plain variables only (no expressions with side effects). */

/* One-element form: a PCRE2_SIZE value fits in a single uint32_t. */
#if PCRE2_SIZE_MAX <= UINT32_MAX
92
#define PUTOFFSET(s,p) *p++ = s
93
#define GETOFFSET(s,p) s = *p++
94
#define GETPLUSOFFSET(s,p) s = *(++p)
95
#define READPLUSOFFSET(s,p) s = p[1]
96
#define SKIPOFFSET(p) p++
97
#define SIZEOFFSET 1
98
/* Two-element form: the most significant 32 bits are stored first. Apart from
SKIPOFFSET, these variants expand to brace-enclosed blocks, so they can be
used only as statements, not inside expressions. */
#else
99
#define PUTOFFSET(s,p) \
100
1.12k
  { *p++ = (uint32_t)(s >> 32); *p++ = (uint32_t)(s & 0xffffffff); }
101
#define GETOFFSET(s,p) \
102
  { s = ((PCRE2_SIZE)p[0] << 32) | (PCRE2_SIZE)p[1]; p += 2; }
103
#define GETPLUSOFFSET(s,p) \
104
1.22k
  { s = ((PCRE2_SIZE)p[1] << 32) | (PCRE2_SIZE)p[2]; p += 2; }
105
#define READPLUSOFFSET(s,p) \
106
91
  { s = ((PCRE2_SIZE)p[1] << 32) | (PCRE2_SIZE)p[2]; }
107
0
#define SKIPOFFSET(p) p += 2
108
268
#define SIZEOFFSET 2
109
#endif
110
111
/* Function definitions to allow mutual recursion */
112
113
static int
114
  compile_regex(uint32_t, uint32_t, PCRE2_UCHAR **, uint32_t **, int *,
115
    uint32_t, uint32_t *, uint32_t *, uint32_t *, uint32_t *, branch_chain *,
116
    open_capitem *, compile_block *, PCRE2_SIZE *);
117
118
static int
119
  get_branchlength(uint32_t **, int *, int *, int *, parsed_recurse_check *,
120
    compile_block *);
121
122
static BOOL
123
  set_lookbehind_lengths(uint32_t **, int *, int *, parsed_recurse_check *,
124
    compile_block *);
125
126
static int
127
  check_lookbehinds(uint32_t *, uint32_t **, parsed_recurse_check *,
128
    compile_block *, int *);
129
130
131
/*************************************************
132
*      Code parameters and static tables         *
133
*************************************************/
134
135
11.8k
#define MAX_GROUP_NUMBER   65535u
136
131k
#define MAX_REPEAT_COUNT   65535u
137
131k
#define REPEAT_UNLIMITED   (MAX_REPEAT_COUNT+1)
138
139
/* COMPILE_WORK_SIZE specifies the size of stack workspace, which is used in
140
different ways in the different pattern scans. The parsing and group-
141
identifying pre-scan uses it to handle nesting, and needs it to be 16-bit
142
aligned for this. Having defined the size in code units, we set up
143
C16_WORK_SIZE as the number of elements in the 16-bit vector.
144
145
During the first compiling phase, when determining how much memory is required,
146
the regex is partly compiled into this space, but the compiled parts are
147
discarded as soon as they can be, so that hopefully there will never be an
148
overrun. The code does, however, check for an overrun, which can occur for
149
pathological patterns. The size of the workspace depends on LINK_SIZE because
150
the length of compiled items varies with this.
151
152
In the real compile phase, this workspace is not currently used. */
153
154
2.39k
#define COMPILE_WORK_SIZE (3000*LINK_SIZE)   /* Size in code units */
155
156
#define C16_WORK_SIZE \
157
  ((COMPILE_WORK_SIZE * sizeof(PCRE2_UCHAR))/sizeof(uint16_t))
158
159
/* A uint32_t vector is used for caching information about the size of
160
capturing groups, to improve performance. A default is created on the stack of
161
this size. */
162
163
21
#define GROUPINFO_DEFAULT_SIZE 256
164
165
/* The overrun tests check for a slightly smaller size so that they detect the
166
overrun before it actually does run off the end of the data block. */
167
168
597k
#define WORK_SIZE_SAFETY_MARGIN (100)
169
170
/* This value determines the size of the initial vector that is used for
171
remembering named groups during the pre-compile. It is allocated on the stack,
172
but if it is too small, it is expanded, in a similar way to the workspace. The
173
value is the number of slots in the list. */
174
175
4.78k
#define NAMED_GROUP_LIST_SIZE  20
176
177
/* The pre-compiling pass over the pattern creates a parsed pattern in a vector
178
of uint32_t. For short patterns this lives on the stack, with this size. Heap
179
memory is used for longer patterns. */
180
181
2.35k
#define PARSED_PATTERN_DEFAULT_SIZE 1024
182
183
/* Maximum length value to check against when making sure that the variable
184
that holds the compiled pattern length does not overflow. We make it a bit less
185
than INT_MAX to allow for adding in group terminating code units, so that we
186
don't have to check them every time. */
187
188
558k
#define OFLOW_MAX (INT_MAX - 20)
189
190
/* Table of extra lengths for each of the meta codes. Must be kept in step with
191
the definitions above. For some items these values are a basic length to which
192
a variable amount has to be added. */
193
194
static unsigned char meta_extra_lengths[] = {
195
  0,             /* META_END */
196
  0,             /* META_ALT */
197
  0,             /* META_ATOMIC */
198
  0,             /* META_BACKREF - more if group is >= 10 */
199
  1+SIZEOFFSET,  /* META_BACKREF_BYNAME */
200
  1,             /* META_BIGVALUE */
201
  3,             /* META_CALLOUT_NUMBER */
202
  3+SIZEOFFSET,  /* META_CALLOUT_STRING */
203
  0,             /* META_CAPTURE */
204
  0,             /* META_CIRCUMFLEX */
205
  0,             /* META_CLASS */
206
  0,             /* META_CLASS_EMPTY */
207
  0,             /* META_CLASS_EMPTY_NOT */
208
  0,             /* META_CLASS_END */
209
  0,             /* META_CLASS_NOT */
210
  0,             /* META_COND_ASSERT */
211
  SIZEOFFSET,    /* META_COND_DEFINE */
212
  1+SIZEOFFSET,  /* META_COND_NAME */
213
  1+SIZEOFFSET,  /* META_COND_NUMBER */
214
  1+SIZEOFFSET,  /* META_COND_RNAME */
215
  1+SIZEOFFSET,  /* META_COND_RNUMBER */
216
  3,             /* META_COND_VERSION */
217
  SIZEOFFSET,    /* META_OFFSET */
218
  0,             /* META_SCS */
219
  1,             /* META_SCS_NAME */
220
  1,             /* META_SCS_NUMBER */
221
  0,             /* META_DOLLAR */
222
  0,             /* META_DOT */
223
  0,             /* META_ESCAPE - one more for ESC_P and ESC_p */
224
  0,             /* META_KET */
225
  0,             /* META_NOCAPTURE */
226
  2,             /* META_OPTIONS */
227
  1,             /* META_POSIX */
228
  1,             /* META_POSIX_NEG */
229
  0,             /* META_RANGE_ESCAPED */
230
  0,             /* META_RANGE_LITERAL */
231
  SIZEOFFSET,    /* META_RECURSE */
232
  1+SIZEOFFSET,  /* META_RECURSE_BYNAME */
233
  0,             /* META_SCRIPT_RUN */
234
  0,             /* META_LOOKAHEAD */
235
  0,             /* META_LOOKAHEADNOT */
236
  SIZEOFFSET,    /* META_LOOKBEHIND */
237
  SIZEOFFSET,    /* META_LOOKBEHINDNOT */
238
  0,             /* META_LOOKAHEAD_NA */
239
  SIZEOFFSET,    /* META_LOOKBEHIND_NA */
240
  1,             /* META_MARK - plus the string length */
241
  0,             /* META_ACCEPT */
242
  0,             /* META_FAIL */
243
  0,             /* META_COMMIT */
244
  1,             /* META_COMMIT_ARG - plus the string length */
245
  0,             /* META_PRUNE */
246
  1,             /* META_PRUNE_ARG - plus the string length */
247
  0,             /* META_SKIP */
248
  1,             /* META_SKIP_ARG - plus the string length */
249
  0,             /* META_THEN */
250
  1,             /* META_THEN_ARG - plus the string length */
251
  0,             /* META_ASTERISK */
252
  0,             /* META_ASTERISK_PLUS */
253
  0,             /* META_ASTERISK_QUERY */
254
  0,             /* META_PLUS */
255
  0,             /* META_PLUS_PLUS */
256
  0,             /* META_PLUS_QUERY */
257
  0,             /* META_QUERY */
258
  0,             /* META_QUERY_PLUS */
259
  0,             /* META_QUERY_QUERY */
260
  2,             /* META_MINMAX */
261
  2,             /* META_MINMAX_PLUS */
262
  2,             /* META_MINMAX_QUERY */
263
  0,             /* META_ECLASS_AND */
264
  0,             /* META_ECLASS_OR */
265
  0,             /* META_ECLASS_SUB */
266
  0,             /* META_ECLASS_XOR */
267
  0              /* META_ECLASS_NOT */
268
};
269
270
/* Types for skipping parts of a parsed pattern. */
271
272
enum { PSKIP_ALT, PSKIP_CLASS, PSKIP_KET };
273
274
/* Values and flags for the unsigned xxcuflags variables that accompany xxcu
275
variables, which are concerned with first and required code units. A value
276
greater than or equal to REQ_NONE means "no code unit set"; otherwise the
277
matching xxcu variable is set, and the low valued bits are relevant. */
278
279
1.16M
#define REQ_UNSET     0xffffffffu  /* Not yet found anything */
280
175k
#define REQ_NONE      0xfffffffeu  /* Found not fixed character */
281
10.2k
#define REQ_CASELESS  0x00000001u  /* Code unit in xxcu is caseless */
282
220k
#define REQ_VARY      0x00000002u  /* Code unit is followed by non-literal */
283
284
/* These flags are used in the groupinfo vector. */
285
286
0
#define GI_SET_FIXED_LENGTH    0x80000000u
287
0
#define GI_NOT_FIXED_LENGTH    0x40000000u
288
0
#define GI_FIXED_LENGTH_MASK   0x0000ffffu
289
290
/* This simple test for a decimal digit works for both ASCII/Unicode and EBCDIC
291
and is fast (a good compiler can turn it into a subtraction and unsigned
292
comparison). */
293
294
20.3k
#define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
295
296
/* Table to identify hex digits. The tables in chartables are dependent on the
297
locale, and may mark arbitrary characters as digits. We want to recognize only
298
0-9, a-f, and A-F as hex digits, which is why we have a private table here. It
299
costs 256 bytes, but it is a lot faster than doing character value tests (at
300
least in some simple cases I timed), and in some applications one wants PCRE2
301
to compile efficiently as well as match efficiently. The value in the table is
302
the binary hex digit value, or 0xff for non-hex digits. */
303
304
/* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
305
UTF-8 mode. */
306
307
#ifndef EBCDIC
308
/* Indexed by code unit value (0-255, eight entries per row). Each entry is
the numeric value of the hex digit (0x00-0x0f), or 0xff if the code unit is
not a hex digit. Accessed through the XDIGIT() macro. */
static const uint8_t xdigitab[] =
309
  {
310
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   0-  7 */
311
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   8- 15 */
312
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  16- 23 */
313
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  24- 31 */
314
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*    - '  */
315
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  ( - /  */
316
  0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /*  0 - 7  */
317
  0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff, /*  8 - ?  */
318
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  @ - G  */
319
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  H - O  */
320
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  P - W  */
321
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  X - _  */
322
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  ` - g  */
323
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  h - o  */
324
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  p - w  */
325
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  x -127 */
326
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 128-135 */
327
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 136-143 */
328
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144-151 */
329
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 152-159 */
330
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160-167 */
331
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 168-175 */
332
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 176-183 */
333
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191 */
334
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 192-199 */
335
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 200-207 */
336
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 208-215 */
337
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 216-223 */
338
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 224-231 */
339
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 232-239 */
340
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 240-247 */
341
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};/* 248-255 */
342
343
#else
344
345
/* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
346
347
static const uint8_t xdigitab[] =
348
  {
349
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   0-  7  0 */
350
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   8- 15    */
351
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  16- 23 10 */
352
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  24- 31    */
353
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  32- 39 20 */
354
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  40- 47    */
355
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  48- 55 30 */
356
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  56- 63    */
357
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*    - 71 40 */
358
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  72- |     */
359
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  & - 87 50 */
360
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  88- 95    */
361
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  - -103 60 */
362
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 104- ?     */
363
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 112-119 70 */
364
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 120- "     */
365
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* 128- g  80 */
366
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  h -143    */
367
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144- p  90 */
368
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  q -159    */
369
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160- x  A0 */
370
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  y -175    */
371
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  ^ -183 B0 */
372
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191    */
373
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  { - G  C0 */
374
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  H -207    */
375
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  } - P  D0 */
376
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  Q -223    */
377
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  \ - X  E0 */
378
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  Y -239    */
379
  0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /*  0 - 7  F0 */
380
  0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff};/*  8 -255    */
381
#endif  /* EBCDIC */
382
383
384
/* Table for handling alphanumeric escaped characters. Positive returns are
385
simple data values; negative values are for special things like \d and so on.
386
Zero means further processing is needed (for things like \x), or the escape is
387
invalid. */
388
389
/* This is the "normal" table for ASCII systems or for EBCDIC systems running
390
in UTF-8 mode. It runs from '0' to 'z'. */
391
392
#ifndef EBCDIC
393
109k
#define ESCAPES_FIRST       CHAR_0
394
54.8k
#define ESCAPES_LAST        CHAR_z
395
30
#define UPPER_CASE(c)       (c-32)
396
397
static const short int escapes[] = {
398
    /* 0 */ 0,                       /* 1 */ 0,
399
    /* 2 */ 0,                       /* 3 */ 0,
400
    /* 4 */ 0,                       /* 5 */ 0,
401
    /* 6 */ 0,                       /* 7 */ 0,
402
    /* 8 */ 0,                       /* 9 */ 0,
403
    /* : */ CHAR_COLON,              /* ; */ CHAR_SEMICOLON,
404
    /* < */ CHAR_LESS_THAN_SIGN,     /* = */ CHAR_EQUALS_SIGN,
405
    /* > */ CHAR_GREATER_THAN_SIGN,  /* ? */ CHAR_QUESTION_MARK,
406
    /* @ */ CHAR_COMMERCIAL_AT,      /* A */ -ESC_A,
407
    /* B */ -ESC_B,                  /* C */ -ESC_C,
408
    /* D */ -ESC_D,                  /* E */ -ESC_E,
409
    /* F */ 0,                       /* G */ -ESC_G,
410
    /* H */ -ESC_H,                  /* I */ 0,
411
    /* J */ 0,                       /* K */ -ESC_K,
412
    /* L */ 0,                       /* M */ 0,
413
    /* N */ -ESC_N,                  /* O */ 0,
414
    /* P */ -ESC_P,                  /* Q */ -ESC_Q,
415
    /* R */ -ESC_R,                  /* S */ -ESC_S,
416
    /* T */ 0,                       /* U */ 0,
417
    /* V */ -ESC_V,                  /* W */ -ESC_W,
418
    /* X */ -ESC_X,                  /* Y */ 0,
419
    /* Z */ -ESC_Z,                  /* [ */ CHAR_LEFT_SQUARE_BRACKET,
420
    /* \ */ CHAR_BACKSLASH,          /* ] */ CHAR_RIGHT_SQUARE_BRACKET,
421
    /* ^ */ CHAR_CIRCUMFLEX_ACCENT,  /* _ */ CHAR_UNDERSCORE,
422
    /* ` */ CHAR_GRAVE_ACCENT,       /* a */ CHAR_BEL,
423
    /* b */ -ESC_b,                  /* c */ 0,
424
    /* d */ -ESC_d,                  /* e */ CHAR_ESC,
425
    /* f */ CHAR_FF,                 /* g */ 0,
426
    /* h */ -ESC_h,                  /* i */ 0,
427
    /* j */ 0,                       /* k */ -ESC_k,
428
    /* l */ 0,                       /* m */ 0,
429
    /* n */ CHAR_LF,                 /* o */ 0,
430
    /* p */ -ESC_p,                  /* q */ 0,
431
    /* r */ CHAR_CR,                 /* s */ -ESC_s,
432
    /* t */ CHAR_HT,                 /* u */ 0,
433
    /* v */ -ESC_v,                  /* w */ -ESC_w,
434
    /* x */ 0,                       /* y */ 0,
435
    /* z */ -ESC_z
436
};
437
438
#else
439
440
/* This is the "abnormal" table for EBCDIC systems without UTF-8 support.
441
It runs from 'a' to '9'. For some minimal testing of EBCDIC features, the code
442
is sometimes compiled on an ASCII system. In this case, we must not use CHAR_a
443
because it is defined as 'a', which of course picks up the ASCII value. */
444
445
#if 'a' == 0x81                    /* Check for a real EBCDIC environment */
446
#define ESCAPES_FIRST       CHAR_a
447
#define ESCAPES_LAST        CHAR_9
448
#define UPPER_CASE(c)       (c+64)
449
#else                              /* Testing in an ASCII environment */
450
#define ESCAPES_FIRST  ((unsigned char)'\x81')   /* EBCDIC 'a' */
451
#define ESCAPES_LAST   ((unsigned char)'\xf9')   /* EBCDIC '9' */
452
#define UPPER_CASE(c)  (c-32)
453
#endif
454
455
static const short int escapes[] = {
456
/*  80 */         CHAR_BEL, -ESC_b,       0, -ESC_d, CHAR_ESC, CHAR_FF,      0,
457
/*  88 */ -ESC_h,        0,      0,     '{',      0,        0,       0,      0,
458
/*  90 */      0,        0, -ESC_k,       0,      0,  CHAR_LF,       0, -ESC_p,
459
/*  98 */      0,  CHAR_CR,      0,     '}',      0,        0,       0,      0,
460
/*  A0 */      0,      '~', -ESC_s, CHAR_HT,      0,   -ESC_v,  -ESC_w,      0,
461
/*  A8 */      0,   -ESC_z,      0,       0,      0,      '[',       0,      0,
462
/*  B0 */      0,        0,      0,       0,      0,        0,       0,      0,
463
/*  B8 */      0,        0,      0,       0,      0,      ']',     '=',    '-',
464
/*  C0 */    '{',   -ESC_A, -ESC_B,  -ESC_C, -ESC_D,   -ESC_E,       0, -ESC_G,
465
/*  C8 */ -ESC_H,        0,      0,       0,      0,        0,       0,      0,
466
/*  D0 */    '}',        0, -ESC_K,       0,      0,   -ESC_N,       0, -ESC_P,
467
/*  D8 */ -ESC_Q,   -ESC_R,      0,       0,      0,        0,       0,      0,
468
/*  E0 */   '\\',        0, -ESC_S,       0,      0,   -ESC_V,  -ESC_W, -ESC_X,
469
/*  E8 */      0,   -ESC_Z,      0,       0,      0,        0,       0,      0,
470
/*  F0 */      0,        0,      0,       0,      0,        0,       0,      0,
471
/*  F8 */      0,        0
472
};
473
474
/* We also need a table of characters that may follow \c in an EBCDIC
475
environment for characters 0-31. */
476
477
static unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";
478
479
#endif   /* EBCDIC */
480
481
482
/* Table of special "verbs" like (*PRUNE). This is a short table, so it is
483
searched linearly. Put all the names into a single string, in order to reduce
484
the number of relocations when a shared library is dynamically linked. The
485
string is built from string macros so that it works in UTF-8 mode on EBCDIC
486
platforms. */
487
488
/* Describes one entry in the table of (*VERB) names. */
typedef struct verbitem {
489
  unsigned int len;          /* Length of verb name */
490
  uint32_t meta;             /* Base META_ code */
491
  int has_arg;               /* Argument requirement: > 0 => argument is
                                mandatory; < 0 => argument is optional and is
                                converted to a preceding MARK; 0 => argument
                                is optional and bumps the META code if found */
492
} verbitem;
493
494
static const char verbnames[] =
495
  "\0"                       /* Empty name is a shorthand for MARK */
496
  STRING_MARK0
497
  STRING_ACCEPT0
498
  STRING_F0
499
  STRING_FAIL0
500
  STRING_COMMIT0
501
  STRING_PRUNE0
502
  STRING_SKIP0
503
  STRING_THEN;
504
505
/* One entry per name in verbnames, in the same order: the name length, the
base META_ code, and the argument requirement (has_arg). */
static const verbitem verbs[] = {
506
  { 0, META_MARK,   +1 },  /* > 0 => must have an argument */
507
  { 4, META_MARK,   +1 },
508
  { 6, META_ACCEPT, -1 },  /* < 0 => Optional argument, convert to pre-MARK */
509
  { 1, META_FAIL,   -1 },
510
  { 4, META_FAIL,   -1 },
511
  { 6, META_COMMIT,  0 },  /* 0 => Optional argument; bump META code if found */
512
  { 5, META_PRUNE,   0 },
513
  { 4, META_SKIP,    0 },
514
  { 4, META_THEN,    0 }
515
};
516
517
static const int verbcount = sizeof(verbs)/sizeof(verbitem);
518
519
/* Verb opcodes, indexed by their META code offset from META_MARK. */
520
521
static const uint32_t verbops[] = {
522
  OP_MARK, OP_ACCEPT, OP_FAIL, OP_COMMIT, OP_COMMIT_ARG, OP_PRUNE,
523
  OP_PRUNE_ARG, OP_SKIP, OP_SKIP_ARG, OP_THEN, OP_THEN_ARG };
524
525
/* Table of "alpha assertions" like (*pla:...), similar to the (*VERB) table. */
526
527
typedef struct alasitem {
528
  unsigned int len;          /* Length of name */
529
  uint32_t meta;             /* Base META_ code */
530
} alasitem;
531
532
static const char alasnames[] =
533
  STRING_pla0
534
  STRING_plb0
535
  STRING_napla0
536
  STRING_naplb0
537
  STRING_nla0
538
  STRING_nlb0
539
  STRING_positive_lookahead0
540
  STRING_positive_lookbehind0
541
  STRING_non_atomic_positive_lookahead0
542
  STRING_non_atomic_positive_lookbehind0
543
  STRING_negative_lookahead0
544
  STRING_negative_lookbehind0
545
  STRING_scs0
546
  STRING_scan_substring0
547
  STRING_atomic0
548
  STRING_sr0
549
  STRING_asr0
550
  STRING_script_run0
551
  STRING_atomic_script_run;
552
553
static const alasitem alasmeta[] = {
554
  {  3, META_LOOKAHEAD         },
555
  {  3, META_LOOKBEHIND        },
556
  {  5, META_LOOKAHEAD_NA      },
557
  {  5, META_LOOKBEHIND_NA     },
558
  {  3, META_LOOKAHEADNOT      },
559
  {  3, META_LOOKBEHINDNOT     },
560
  { 18, META_LOOKAHEAD         },
561
  { 19, META_LOOKBEHIND        },
562
  { 29, META_LOOKAHEAD_NA      },
563
  { 30, META_LOOKBEHIND_NA     },
564
  { 18, META_LOOKAHEADNOT      },
565
  { 19, META_LOOKBEHINDNOT     },
566
  {  3, META_SCS               },
567
  { 14, META_SCS               },
568
  {  6, META_ATOMIC            },
569
  {  2, META_SCRIPT_RUN        }, /* sr = script run */
570
  {  3, META_ATOMIC_SCRIPT_RUN }, /* asr = atomic script run */
571
  { 10, META_SCRIPT_RUN        }, /* script run */
572
  { 17, META_ATOMIC_SCRIPT_RUN }  /* atomic script run */
573
};
574
575
static const int alascount = sizeof(alasmeta)/sizeof(alasitem);
576
577
/* Offsets from OP_STAR for case-independent and negative repeat opcodes. */
578
579
static uint32_t chartypeoffset[] = {
580
  OP_STAR - OP_STAR,    OP_STARI - OP_STAR,
581
  OP_NOTSTAR - OP_STAR, OP_NOTSTARI - OP_STAR };
582
583
/* Tables of names of POSIX character classes and their lengths. The names are
584
now all in a single string, to reduce the number of relocations when a shared
585
library is dynamically loaded. The list of lengths is terminated by a zero
586
length entry. The first three must be alpha, lower, upper, as this is assumed
587
for handling case independence.
588
589
The indices for several classes are stored in pcre2_compile.h - these must
590
be kept in sync with posix_names, posix_name_lengths, posix_class_maps,
591
and posix_substitutes. */
592
593
static const char posix_names[] =
594
  STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
595
  STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
596
  STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
597
  STRING_word0  STRING_xdigit;
598
599
static const uint8_t posix_name_lengths[] = {
600
  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
601
602
/* Table of class bit maps for each POSIX class. Each class is formed from a
603
base map, with an optional addition or removal of another map. Then, for some
604
classes, there is some additional tweaking: for [:blank:] the vertical space
605
characters are removed, and for [:alpha:] and [:alnum:] the underscore
606
character is removed. The triples in the table consist of the base map offset,
607
second map offset or -1 if no second map, and a non-negative value for map
608
addition or a negative value for map subtraction (if there are two maps). The
609
absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
610
remove vertical space characters, 2 => remove underscore. */
611
612
const int PRIV(posix_class_maps)[] = {
613
  cbit_word,   cbit_digit, -2,            /* alpha */
614
  cbit_lower,  -1,          0,            /* lower */
615
  cbit_upper,  -1,          0,            /* upper */
616
  cbit_word,   -1,          2,            /* alnum - word without underscore */
617
  cbit_print,  cbit_cntrl,  0,            /* ascii */
618
  cbit_space,  -1,          1,            /* blank - a GNU extension */
619
  cbit_cntrl,  -1,          0,            /* cntrl */
620
  cbit_digit,  -1,          0,            /* digit */
621
  cbit_graph,  -1,          0,            /* graph */
622
  cbit_print,  -1,          0,            /* print */
623
  cbit_punct,  -1,          0,            /* punct */
624
  cbit_space,  -1,          0,            /* space */
625
  cbit_word,   -1,          0,            /* word - a Perl extension */
626
  cbit_xdigit, -1,          0             /* xdigit */
627
};
628
629
#ifdef SUPPORT_UNICODE
630
631
/* The POSIX class Unicode property substitutes that are used in UCP mode must
632
be in the order of the POSIX class names, defined above. */
633
634
static int posix_substitutes[] = {
635
  PT_GC, ucp_L,     /* alpha */
636
  PT_PC, ucp_Ll,    /* lower */
637
  PT_PC, ucp_Lu,    /* upper */
638
  PT_ALNUM, 0,      /* alnum */
639
  -1, 0,            /* ascii, treat as non-UCP */
640
  -1, 1,            /* blank, treat as \h */
641
  PT_PC, ucp_Cc,    /* cntrl */
642
  PT_PC, ucp_Nd,    /* digit */
643
  PT_PXGRAPH, 0,    /* graph */
644
  PT_PXPRINT, 0,    /* print */
645
  PT_PXPUNCT, 0,    /* punct */
646
  PT_PXSPACE, 0,    /* space */   /* Xps is POSIX space, but from 8.34 */
647
  PT_WORD, 0,       /* word  */   /* Perl and POSIX space are the same */
648
  PT_PXXDIGIT, 0    /* xdigit */  /* Perl has additional hex digits */
649
};
650
#endif  /* SUPPORT_UNICODE */
651
652
/* Masks for checking option settings. When PCRE2_LITERAL is set, only a subset
653
are allowed. */
654
655
#define PUBLIC_LITERAL_COMPILE_OPTIONS \
656
2.39k
  (PCRE2_ANCHORED|PCRE2_AUTO_CALLOUT|PCRE2_CASELESS|PCRE2_ENDANCHORED| \
657
2.39k
   PCRE2_FIRSTLINE|PCRE2_LITERAL|PCRE2_MATCH_INVALID_UTF| \
658
2.39k
   PCRE2_NO_START_OPTIMIZE|PCRE2_NO_UTF_CHECK|PCRE2_USE_OFFSET_LIMIT|PCRE2_UTF)
659
660
#define PUBLIC_COMPILE_OPTIONS \
661
2.39k
  (PUBLIC_LITERAL_COMPILE_OPTIONS| \
662
2.39k
   PCRE2_ALLOW_EMPTY_CLASS|PCRE2_ALT_BSUX|PCRE2_ALT_CIRCUMFLEX| \
663
2.39k
   PCRE2_ALT_VERBNAMES|PCRE2_DOLLAR_ENDONLY|PCRE2_DOTALL|PCRE2_DUPNAMES| \
664
2.39k
   PCRE2_EXTENDED|PCRE2_EXTENDED_MORE|PCRE2_MATCH_UNSET_BACKREF| \
665
2.39k
   PCRE2_MULTILINE|PCRE2_NEVER_BACKSLASH_C|PCRE2_NEVER_UCP| \
666
2.39k
   PCRE2_NEVER_UTF|PCRE2_NO_AUTO_CAPTURE|PCRE2_NO_AUTO_POSSESS| \
667
2.39k
   PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_UCP|PCRE2_UNGREEDY|PCRE2_ALT_EXTENDED_CLASS)
668
669
#define PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS \
670
2.39k
   (PCRE2_EXTRA_MATCH_LINE|PCRE2_EXTRA_MATCH_WORD| \
671
2.39k
    PCRE2_EXTRA_CASELESS_RESTRICT|PCRE2_EXTRA_TURKISH_CASING)
672
673
#define PUBLIC_COMPILE_EXTRA_OPTIONS \
674
2.39k
   (PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS| \
675
2.39k
    PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES|PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL| \
676
2.39k
    PCRE2_EXTRA_ESCAPED_CR_IS_LF|PCRE2_EXTRA_ALT_BSUX| \
677
2.39k
    PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK|PCRE2_EXTRA_ASCII_BSD| \
678
2.39k
    PCRE2_EXTRA_ASCII_BSS|PCRE2_EXTRA_ASCII_BSW|PCRE2_EXTRA_ASCII_POSIX| \
679
2.39k
    PCRE2_EXTRA_ASCII_DIGIT|PCRE2_EXTRA_PYTHON_OCTAL|PCRE2_EXTRA_NO_BS0| \
680
2.39k
    PCRE2_EXTRA_NEVER_CALLOUT)
681
682
/* This is a table of start-of-pattern options such as (*UTF) and settings such
683
as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
684
compatibility, (*UTFn) is supported in the relevant libraries, but (*UTF) is
685
generic and always supported. */
686
687
/* Entry types for the start-of-pattern option table. Each comment says what an
entry's value means, or which limit is read from the pattern instead. */

enum
  {
  PSO_OPT,      /* value is an option bit */
  PSO_XOPT,     /* value is an xoption bit */
  PSO_FLG,      /* value is a flag bit */
  PSO_NL,       /* value is a newline type */
  PSO_BSR,      /* value is a \R type */
  PSO_LIMH,     /* an integer is read for the heap limit */
  PSO_LIMM,     /* an integer is read for the match limit */
  PSO_LIMD,     /* an integer is read for the depth limit */
  PSO_OPTMZ     /* value is an optimization bit */
  };
697
698
/* One entry of the start-of-pattern option table. The field order matters:
the table below is initialized positionally. */

struct pso
  {
  const char *name;     /* text of the item */
  uint16_t length;      /* length of name */
  uint16_t type;        /* one of the PSO_ entry types */
  uint32_t value;       /* interpretation depends on type */
  };

typedef struct pso pso;
704
705
/* NB: STRING_UTFn_RIGHTPAR contains the length as well */
706
707
static const pso pso_list[] = {
708
  { STRING_UTFn_RIGHTPAR,                  PSO_OPT, PCRE2_UTF },
709
  { STRING_UTF_RIGHTPAR,                4, PSO_OPT, PCRE2_UTF },
710
  { STRING_UCP_RIGHTPAR,                4, PSO_OPT, PCRE2_UCP },
711
  { STRING_NOTEMPTY_RIGHTPAR,           9, PSO_FLG, PCRE2_NOTEMPTY_SET },
712
  { STRING_NOTEMPTY_ATSTART_RIGHTPAR,  17, PSO_FLG, PCRE2_NE_ATST_SET },
713
  { STRING_NO_AUTO_POSSESS_RIGHTPAR,   16, PSO_OPTMZ, PCRE2_OPTIM_AUTO_POSSESS },
714
  { STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR, 18, PSO_OPTMZ, PCRE2_OPTIM_DOTSTAR_ANCHOR },
715
  { STRING_NO_JIT_RIGHTPAR,             7, PSO_FLG, PCRE2_NOJIT },
716
  { STRING_NO_START_OPT_RIGHTPAR,      13, PSO_OPTMZ, PCRE2_OPTIM_START_OPTIMIZE },
717
  { STRING_CASELESS_RESTRICT_RIGHTPAR, 18, PSO_XOPT, PCRE2_EXTRA_CASELESS_RESTRICT },
718
  { STRING_TURKISH_CASING_RIGHTPAR,    15, PSO_XOPT, PCRE2_EXTRA_TURKISH_CASING },
719
  { STRING_LIMIT_HEAP_EQ,              11, PSO_LIMH, 0 },
720
  { STRING_LIMIT_MATCH_EQ,             12, PSO_LIMM, 0 },
721
  { STRING_LIMIT_DEPTH_EQ,             12, PSO_LIMD, 0 },
722
  { STRING_LIMIT_RECURSION_EQ,         16, PSO_LIMD, 0 },
723
  { STRING_CR_RIGHTPAR,                 3, PSO_NL,  PCRE2_NEWLINE_CR },
724
  { STRING_LF_RIGHTPAR,                 3, PSO_NL,  PCRE2_NEWLINE_LF },
725
  { STRING_CRLF_RIGHTPAR,               5, PSO_NL,  PCRE2_NEWLINE_CRLF },
726
  { STRING_ANY_RIGHTPAR,                4, PSO_NL,  PCRE2_NEWLINE_ANY },
727
  { STRING_NUL_RIGHTPAR,                4, PSO_NL,  PCRE2_NEWLINE_NUL },
728
  { STRING_ANYCRLF_RIGHTPAR,            8, PSO_NL,  PCRE2_NEWLINE_ANYCRLF },
729
  { STRING_BSR_ANYCRLF_RIGHTPAR,       12, PSO_BSR, PCRE2_BSR_ANYCRLF },
730
  { STRING_BSR_UNICODE_RIGHTPAR,       12, PSO_BSR, PCRE2_BSR_UNICODE }
731
};
732
733
/* This table is used when converting repeating opcodes into possessified
734
versions as a result of an explicit possessive quantifier such as ++. A zero
735
value means there is no possessified version - in those cases the item in
736
question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
737
because all relevant opcodes are less than that. */
738
739
static const uint8_t opcode_possessify[] = {
740
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0 - 15  */
741
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 16 - 31 */
742
743
  0,                       /* NOTI */
744
  OP_POSSTAR, 0,           /* STAR, MINSTAR */
745
  OP_POSPLUS, 0,           /* PLUS, MINPLUS */
746
  OP_POSQUERY, 0,          /* QUERY, MINQUERY */
747
  OP_POSUPTO, 0,           /* UPTO, MINUPTO */
748
  0,                       /* EXACT */
749
  0, 0, 0, 0,              /* POS{STAR,PLUS,QUERY,UPTO} */
750
751
  OP_POSSTARI, 0,          /* STARI, MINSTARI */
752
  OP_POSPLUSI, 0,          /* PLUSI, MINPLUSI */
753
  OP_POSQUERYI, 0,         /* QUERYI, MINQUERYI */
754
  OP_POSUPTOI, 0,          /* UPTOI, MINUPTOI */
755
  0,                       /* EXACTI */
756
  0, 0, 0, 0,              /* POS{STARI,PLUSI,QUERYI,UPTOI} */
757
758
  OP_NOTPOSSTAR, 0,        /* NOTSTAR, NOTMINSTAR */
759
  OP_NOTPOSPLUS, 0,        /* NOTPLUS, NOTMINPLUS */
760
  OP_NOTPOSQUERY, 0,       /* NOTQUERY, NOTMINQUERY */
761
  OP_NOTPOSUPTO, 0,        /* NOTUPTO, NOTMINUPTO */
762
  0,                       /* NOTEXACT */
763
  0, 0, 0, 0,              /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
764
765
  OP_NOTPOSSTARI, 0,       /* NOTSTARI, NOTMINSTARI */
766
  OP_NOTPOSPLUSI, 0,       /* NOTPLUSI, NOTMINPLUSI */
767
  OP_NOTPOSQUERYI, 0,      /* NOTQUERYI, NOTMINQUERYI */
768
  OP_NOTPOSUPTOI, 0,       /* NOTUPTOI, NOTMINUPTOI */
769
  0,                       /* NOTEXACTI */
770
  0, 0, 0, 0,              /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
771
772
  OP_TYPEPOSSTAR, 0,       /* TYPESTAR, TYPEMINSTAR */
773
  OP_TYPEPOSPLUS, 0,       /* TYPEPLUS, TYPEMINPLUS */
774
  OP_TYPEPOSQUERY, 0,      /* TYPEQUERY, TYPEMINQUERY */
775
  OP_TYPEPOSUPTO, 0,       /* TYPEUPTO, TYPEMINUPTO */
776
  0,                       /* TYPEEXACT */
777
  0, 0, 0, 0,              /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
778
779
  OP_CRPOSSTAR, 0,         /* CRSTAR, CRMINSTAR */
780
  OP_CRPOSPLUS, 0,         /* CRPLUS, CRMINPLUS */
781
  OP_CRPOSQUERY, 0,        /* CRQUERY, CRMINQUERY */
782
  OP_CRPOSRANGE, 0,        /* CRRANGE, CRMINRANGE */
783
  0, 0, 0, 0,              /* CRPOS{STAR,PLUS,QUERY,RANGE} */
784
785
  0, 0, 0, 0,              /* CLASS, NCLASS, XCLASS, ECLASS */
786
  0, 0,                    /* REF, REFI */
787
  0, 0,                    /* DNREF, DNREFI */
788
  0, 0,                    /* RECURSE, CALLOUT */
789
};
790
791
/* Compile-time check that the table has the correct size. */
792
STATIC_ASSERT(sizeof(opcode_possessify) == OP_CALLOUT+1, opcode_possessify);
793
794
795
#ifdef DEBUG_SHOW_PARSED
/*************************************************
*     Show the parsed pattern for debugging      *
*************************************************/

/* Debugging aid for the pre-scan, compiled only when DEBUG_SHOW_PARSED is
defined. It writes the parsed pattern vector to stderr, one item per line,
stopping after META_END or when an unrecognized META code is found. Items that
carry extra data words consume them while being displayed.

Argument:   cb   the compile block whose parsed_pattern is shown
Returns:    nothing
*/

static void show_parsed(compile_block *cb)
{
uint32_t *cur = cb->parsed_pattern;

for (;;)
  {
  int hi, lo;
  PCRE2_SIZE offset;
  uint32_t k;
  uint32_t arglen;
  uint32_t item = *cur;
  uint32_t meta_arg = META_DATA(item);

  /* Every line starts with the item's index and its raw value. */

  fprintf(stderr, "+++ %02d %.8x ", (int)(cur - cb->parsed_pattern), item);
  cur++;

  /* A value below META_END is a literal; only printable ASCII is echoed. */

  if (item < META_END)
    {
    if (item > 32 && item < 128) fprintf(stderr, "%c", item);
    fputs("\n", stderr);
    continue;
    }

  switch (META_CODE(item))
    {
    case META_END:
    fputs("META_END\n", stderr);
    return;

    case META_CAPTURE:
    fprintf(stderr, "META_CAPTURE %d", meta_arg);
    break;

    case META_RECURSE:
    GETOFFSET(offset, cur);
    fprintf(stderr, "META_RECURSE %d %zd", meta_arg, offset);
    break;

    /* For group numbers below 10 the offset is taken from the compile block;
    otherwise it follows in the vector. */

    case META_BACKREF:
    if (meta_arg < 10)
      {
      offset = cb->small_ref_offset[meta_arg];
      }
    else
      {
      GETOFFSET(offset, cur);
      }
    fprintf(stderr, "META_BACKREF %d %zd", meta_arg, offset);
    break;

    case META_ESCAPE:
    if (meta_arg == ESC_P || meta_arg == ESC_p)
      {
      uint32_t pdata = *cur++;
      uint32_t ptype = pdata >> 16;
      uint32_t pvalue = pdata & 0xffff;
      fprintf(stderr, "META \\%c %d %d", (meta_arg == ESC_P)? CHAR_P:CHAR_p,
        ptype, pvalue);
      }
    else
      {
      uint32_t cc;

      /* \g is the one escape that can turn up here without having a negated
      entry in the escapes table; anything else is looked up there, and shown
      as a question mark if not found. */

      if (meta_arg == ESC_g) cc = CHAR_g;
      else
        {
        cc = ESCAPES_FIRST;
        while (cc <= ESCAPES_LAST &&
               meta_arg != (uint32_t)(-escapes[cc - ESCAPES_FIRST])) cc++;
        }
      if (cc > ESCAPES_LAST) cc = CHAR_QUESTION_MARK;
      fprintf(stderr, "META \\%c", cc);
      }
    break;

    /* General repeats: two data words hold the minimum and the maximum. */

    case META_MINMAX:
    lo = *cur++;
    hi = *cur++;
    if (hi == REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,}", lo);
    else
      fprintf(stderr, "META {%d,%d}", lo, hi);
    break;

    case META_MINMAX_QUERY:
    lo = *cur++;
    hi = *cur++;
    if (hi == REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,}?", lo);
    else
      fprintf(stderr, "META {%d,%d}?", lo, hi);
    break;

    case META_MINMAX_PLUS:
    lo = *cur++;
    hi = *cur++;
    if (hi == REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,}+", lo);
    else
      fprintf(stderr, "META {%d,%d}+", lo, hi);
    break;

    case META_BIGVALUE:
    fprintf(stderr, "META_BIGVALUE %.8x", *cur++);
    break;

    /* Items with no data, or with data only in the META word itself */

    case META_CIRCUMFLEX:   fputs("META_CIRCUMFLEX", stderr); break;
    case META_COND_ASSERT:  fputs("META_COND_ASSERT", stderr); break;
    case META_DOLLAR:       fputs("META_DOLLAR", stderr); break;
    case META_DOT:          fputs("META_DOT", stderr); break;

    case META_ASTERISK:       fputs("META *", stderr); break;
    case META_ASTERISK_QUERY: fputs("META *?", stderr); break;
    case META_ASTERISK_PLUS:  fputs("META *+", stderr); break;
    case META_PLUS:           fputs("META +", stderr); break;
    case META_PLUS_QUERY:     fputs("META +?", stderr); break;
    case META_PLUS_PLUS:      fputs("META ++", stderr); break;
    case META_QUERY:          fputs("META ?", stderr); break;
    case META_QUERY_QUERY:    fputs("META ??", stderr); break;
    case META_QUERY_PLUS:     fputs("META ?+", stderr); break;

    case META_ATOMIC:       fputs("META (?>", stderr); break;
    case META_NOCAPTURE:    fputs("META (?:", stderr); break;
    case META_LOOKAHEAD:    fputs("META (?=", stderr); break;
    case META_LOOKAHEADNOT: fputs("META (?!", stderr); break;
    case META_LOOKAHEAD_NA: fputs("META (*napla:", stderr); break;
    case META_SCRIPT_RUN:   fputs("META (*sr:", stderr); break;
    case META_KET:          fputs("META )", stderr); break;

    case META_ALT:
    fprintf(stderr, "META | %d", meta_arg);
    break;

    case META_CLASS:           fputs("META [", stderr); break;
    case META_CLASS_NOT:       fputs("META [^", stderr); break;
    case META_CLASS_END:       fputs("META ]", stderr); break;
    case META_CLASS_EMPTY:     fputs("META []", stderr); break;
    case META_CLASS_EMPTY_NOT: fputs("META [^]", stderr); break;

    case META_RANGE_LITERAL: fputs("META - (literal)", stderr); break;
    case META_RANGE_ESCAPED: fputs("META - (escaped)", stderr); break;

    case META_POSIX:
    fprintf(stderr, "META_POSIX %d", *cur++);
    break;

    case META_POSIX_NEG:
    fprintf(stderr, "META_POSIX_NEG %d", *cur++);
    break;

    case META_ACCEPT: fputs("META (*ACCEPT)", stderr); break;
    case META_FAIL:   fputs("META (*FAIL)", stderr); break;
    case META_COMMIT: fputs("META (*COMMIT)", stderr); break;
    case META_PRUNE:  fputs("META (*PRUNE)", stderr); break;
    case META_SKIP:   fputs("META (*SKIP)", stderr); break;
    case META_THEN:   fputs("META (*THEN)", stderr); break;

    case META_OPTIONS:
    fprintf(stderr, "META_OPTIONS 0x%08x 0x%08x", cur[0], cur[1]);
    cur += 2;
    break;

    /* Lookbehinds are followed by two data words, of which the first is
    shown. */

    case META_LOOKBEHIND:
    fprintf(stderr, "META (?<= %d %d", meta_arg, *cur);
    cur += 2;
    break;

    case META_LOOKBEHIND_NA:
    fprintf(stderr, "META (*naplb: %d %d", meta_arg, *cur);
    cur += 2;
    break;

    case META_LOOKBEHINDNOT:
    fprintf(stderr, "META (?<! %d %d", meta_arg, *cur);
    cur += 2;
    break;

    case META_CALLOUT_NUMBER:
    fprintf(stderr, "META (?C%d) next=%d/%d", cur[2], cur[0],
       cur[1]);
    cur += 3;
    break;

    case META_CALLOUT_STRING:
      {
      uint32_t patoffset = *cur++;    /* Offset of next pattern item */
      uint32_t patlength = *cur++;    /* Length of next pattern item */
      fprintf(stderr, "META (?Cstring) length=%d offset=", *cur++);
      GETOFFSET(offset, cur);
      fprintf(stderr, "%zd next=%d/%d", offset, patoffset, patlength);
      }
    break;

    /* Named items: a length word, then an offset */

    case META_RECURSE_BYNAME:
    fprintf(stderr, "META (?(&name) length=%d offset=", *cur++);
    GETOFFSET(offset, cur);
    fprintf(stderr, "%zd", offset);
    break;

    case META_BACKREF_BYNAME:
    fprintf(stderr, "META_BACKREF_BYNAME length=%d offset=", *cur++);
    GETOFFSET(offset, cur);
    fprintf(stderr, "%zd", offset);
    break;

    /* The number is stored after the offset, so it is picked up first by
    indexing and stepped over afterwards. */

    case META_COND_NUMBER:
    fprintf(stderr, "META_COND_NUMBER %d offset=", cur[SIZEOFFSET]);
    GETOFFSET(offset, cur);
    fprintf(stderr, "%zd", offset);
    cur++;
    break;

    case META_COND_DEFINE:
    fputs("META (?(DEFINE) offset=", stderr);
    GETOFFSET(offset, cur);
    fprintf(stderr, "%zd", offset);
    break;

    /* Three data words: the comparison kind, then major and minor version */

    case META_COND_VERSION:
    fprintf(stderr, "META (?(VERSION%s", (*cur++ == 0)? "=" : ">=");
    fprintf(stderr, "%d.", *cur++);
    fprintf(stderr, "%d)", *cur++);
    break;

    case META_COND_NAME:
    fprintf(stderr, "META (?(<name>) length=%d offset=", *cur++);
    GETOFFSET(offset, cur);
    fprintf(stderr, "%zd", offset);
    break;

    case META_COND_RNAME:
    fprintf(stderr, "META (?(R&name) length=%d offset=", *cur++);
    GETOFFSET(offset, cur);
    fprintf(stderr, "%zd", offset);
    break;

    /* This is kept as a name, because it might be. */

    case META_COND_RNUMBER:
    fprintf(stderr, "META (?(Rnumber) length=%d offset=", *cur++);
    GETOFFSET(offset, cur);
    fprintf(stderr, "%zd", offset);
    break;

    case META_OFFSET:
    fputs("META_OFFSET offset=", stderr);
    GETOFFSET(offset, cur);
    fprintf(stderr, "%zd", offset);
    break;

    case META_SCS:
    fputs("META (*scan_substring:", stderr);
    break;

    case META_SCS_NAME:
    fprintf(stderr, "META_SCS_NAME length=%d relative_offset=%d", *cur++, (int)meta_arg);
    break;

    case META_SCS_NUMBER:
    fprintf(stderr, "META_SCS_NUMBER %d relative_offset=%d", *cur++, (int)meta_arg);
    break;

    /* Verbs with an argument share the code that displays the argument: a
    length word followed by that many code units. */

    case META_MARK:
    fputs("META (*MARK:", stderr);
    goto SHOWARG;

    case META_COMMIT_ARG:
    fputs("META (*COMMIT:", stderr);
    goto SHOWARG;

    case META_PRUNE_ARG:
    fputs("META (*PRUNE:", stderr);
    goto SHOWARG;

    case META_SKIP_ARG:
    fputs("META (*SKIP:", stderr);
    goto SHOWARG;

    case META_THEN_ARG:
    fputs("META (*THEN:", stderr);
    SHOWARG:
    arglen = *cur++;
    for (k = 0; k < arglen; k++)
      {
      uint32_t cc = *cur++;
      if (cc > 32 && cc < 128) fprintf(stderr, "%c", cc);
        else fprintf(stderr, "\\x{%x}", cc);
      }
    fprintf(stderr, ") length=%u", arglen);
    break;

    case META_ECLASS_AND: fputs("META_ECLASS_AND", stderr); break;
    case META_ECLASS_OR:  fputs("META_ECLASS_OR", stderr); break;
    case META_ECLASS_SUB: fputs("META_ECLASS_SUB", stderr); break;
    case META_ECLASS_XOR: fputs("META_ECLASS_XOR", stderr); break;
    case META_ECLASS_NOT: fputs("META_ECLASS_NOT", stderr); break;

    default:
    fputs("**** OOPS - unknown META value - giving up ****\n", stderr);
    return;
    }

  fputs("\n", stderr);
  }
}
#endif  /* DEBUG_SHOW_PARSED */
1088
1089
1090
1091
/*************************************************
1092
*               Copy compiled code               *
1093
*************************************************/
1094
1095
/* Compiled JIT code cannot be copied, so the new compiled block has no
1096
associated JIT data. */
1097
1098
PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
1099
pcre2_code_copy(const pcre2_code *code)
1100
0
{
1101
0
PCRE2_SIZE *ref_count;
1102
0
pcre2_code *newcode;
1103
1104
0
if (code == NULL) return NULL;
1105
0
newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
1106
0
if (newcode == NULL) return NULL;
1107
0
memcpy(newcode, code, code->blocksize);
1108
0
newcode->executable_jit = NULL;
1109
1110
/* If the code is one that has been deserialized, increment the reference count
1111
in the decoded tables. */
1112
1113
0
if ((code->flags & PCRE2_DEREF_TABLES) != 0)
1114
0
  {
1115
0
  ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
1116
0
  (*ref_count)++;
1117
0
  }
1118
1119
0
return newcode;
1120
0
}
1121
1122
1123
1124
/*************************************************
1125
*     Copy compiled code and character tables    *
1126
*************************************************/
1127
1128
/* Compiled JIT code cannot be copied, so the new compiled block has no
1129
associated JIT data. This version of code_copy also makes a separate copy of
1130
the character tables. */
1131
1132
PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
1133
pcre2_code_copy_with_tables(const pcre2_code *code)
1134
0
{
1135
0
PCRE2_SIZE* ref_count;
1136
0
pcre2_code *newcode;
1137
0
uint8_t *newtables;
1138
1139
0
if (code == NULL) return NULL;
1140
0
newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
1141
0
if (newcode == NULL) return NULL;
1142
0
memcpy(newcode, code, code->blocksize);
1143
0
newcode->executable_jit = NULL;
1144
1145
0
newtables = code->memctl.malloc(TABLES_LENGTH + sizeof(PCRE2_SIZE),
1146
0
  code->memctl.memory_data);
1147
0
if (newtables == NULL)
1148
0
  {
1149
0
  code->memctl.free((void *)newcode, code->memctl.memory_data);
1150
0
  return NULL;
1151
0
  }
1152
0
memcpy(newtables, code->tables, TABLES_LENGTH);
1153
0
ref_count = (PCRE2_SIZE *)(newtables + TABLES_LENGTH);
1154
0
*ref_count = 1;
1155
1156
0
newcode->tables = newtables;
1157
0
newcode->flags |= PCRE2_DEREF_TABLES;
1158
0
return newcode;
1159
0
}
1160
1161
1162
1163
/*************************************************
1164
*               Free compiled code               *
1165
*************************************************/
1166
1167
PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
1168
pcre2_code_free(pcre2_code *code)
1169
1.22k
{
1170
1.22k
PCRE2_SIZE* ref_count;
1171
1172
1.22k
if (code != NULL)
1173
0
  {
1174
#ifdef SUPPORT_JIT
1175
  if (code->executable_jit != NULL)
1176
    PRIV(jit_free)(code->executable_jit, &code->memctl);
1177
#endif
1178
1179
0
  if ((code->flags & PCRE2_DEREF_TABLES) != 0)
1180
0
    {
1181
    /* Decoded tables belong to the codes after deserialization, and they must
1182
    be freed when there are no more references to them. The *ref_count should
1183
    always be > 0. */
1184
1185
0
    ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
1186
0
    if (*ref_count > 0)
1187
0
      {
1188
0
      (*ref_count)--;
1189
0
      if (*ref_count == 0)
1190
0
        code->memctl.free((void *)code->tables, code->memctl.memory_data);
1191
0
      }
1192
0
    }
1193
1194
0
  code->memctl.free(code, code->memctl.memory_data);
1195
0
  }
1196
1.22k
}
1197
1198
1199
1200
/*************************************************
1201
*         Read a number, possibly signed         *
1202
*************************************************/
1203
1204
/* This function is used to read numbers in the pattern. The initial pointer
1205
must be at the sign or first digit of the number. When relative values
1206
(introduced by + or -) are allowed, they are relative group numbers, and the
1207
result must be greater than zero.
1208
1209
Arguments:
1210
  ptrptr      points to the character pointer variable
1211
  ptrend      points to the end of the input string
1212
  allow_sign  if < 0, sign not allowed; if >= 0, sign is relative to this
1213
  max_value   the largest number allowed;
1214
              you must not pass a value for max_value larger than
1215
              INT_MAX/10 - 1 because this function relies on max_value to
1216
              avoid integer overflow
1217
  max_error   the error to give for an over-large number
1218
  intptr      where to put the result
1219
  errcodeptr  where to put an error code
1220
1221
Returns:      TRUE  - a number was read
1222
              FALSE - errorcode == 0 => no number was found
1223
                      errorcode != 0 => an error occurred
1224
*/
1225
1226
static BOOL
1227
read_number(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, int32_t allow_sign,
1228
  uint32_t max_value, uint32_t max_error, int *intptr, int *errorcodeptr)
1229
3.01k
{
1230
3.01k
int sign = 0;
1231
3.01k
uint32_t n = 0;
1232
3.01k
PCRE2_SPTR ptr = *ptrptr;
1233
3.01k
BOOL yield = FALSE;
1234
1235
3.01k
PCRE2_ASSERT(max_value <= INT_MAX/10 - 1);
1236
1237
3.01k
*errorcodeptr = 0;
1238
1239
3.01k
if (allow_sign >= 0 && ptr < ptrend)
1240
14
  {
1241
14
  if (*ptr == CHAR_PLUS)
1242
3
    {
1243
3
    sign = +1;
1244
3
    max_value -= allow_sign;
1245
3
    ptr++;
1246
3
    }
1247
11
  else if (*ptr == CHAR_MINUS)
1248
0
    {
1249
0
    sign = -1;
1250
0
    ptr++;
1251
0
    }
1252
14
  }
1253
1254
3.01k
if (ptr >= ptrend || !IS_DIGIT(*ptr)) return FALSE;
1255
7.55k
while (ptr < ptrend && IS_DIGIT(*ptr))
1256
4.63k
  {
1257
4.63k
  n = n * 10 + (*ptr++ - CHAR_0);
1258
4.63k
  if (n > max_value)
1259
84
    {
1260
84
    *errorcodeptr = max_error;
1261
1.45k
    while (ptr < ptrend && IS_DIGIT(*ptr)) ptr++;
1262
84
    goto EXIT;
1263
84
    }
1264
4.63k
  }
1265
1266
2.92k
if (allow_sign >= 0 && sign != 0)
1267
0
  {
1268
0
  if (n == 0)
1269
0
    {
1270
0
    *errorcodeptr = ERR26;  /* +0 and -0 are not allowed */
1271
0
    goto EXIT;
1272
0
    }
1273
1274
0
  if (sign > 0) n += allow_sign;
1275
0
  else if (n > (uint32_t)allow_sign)
1276
0
    {
1277
0
    *errorcodeptr = ERR15;  /* Non-existent subpattern */
1278
0
    goto EXIT;
1279
0
    }
1280
0
  else n = allow_sign + 1 - n;
1281
0
  }
1282
1283
2.92k
yield = TRUE;
1284
1285
3.00k
EXIT:
1286
3.00k
*intptr = n;
1287
3.00k
*ptrptr = ptr;
1288
3.00k
return yield;
1289
2.92k
}
1290
1291
1292
1293
/*************************************************
1294
*         Read repeat counts                     *
1295
*************************************************/
1296
1297
/* Read an item of the form {n,m} and return the values when non-NULL pointers
1298
are supplied. Repeat counts must be less than 65536 (MAX_REPEAT_COUNT); a
1299
larger value is used for "unlimited". We have to use signed arguments for
1300
read_number() because it is capable of returning a signed value. As of Perl
1301
5.34.0 either n or m may be absent, but not both. Perl also allows spaces and
1302
tabs after { and before } and between the numbers and the comma, so we do too.
1303
1304
Arguments:
1305
  ptrptr         points to pointer to character after '{'
1306
  ptrend         pointer to end of input
1307
  minp           if not NULL, pointer to int for min
1308
  maxp           if not NULL, pointer to int for max
1309
  errorcodeptr   points to error code variable
1310
1311
Returns:         FALSE if not a repeat quantifier, errorcode set zero
1312
                 FALSE on error, with errorcode set non-zero
1313
                 TRUE on success, with pointer updated to point after '}'
1314
*/
1315
1316
static BOOL
1317
read_repeat_counts(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *minp,
1318
  uint32_t *maxp, int *errorcodeptr)
1319
5.05k
{
1320
5.05k
PCRE2_SPTR p = *ptrptr;
1321
5.05k
PCRE2_SPTR pp;
1322
5.05k
BOOL yield = FALSE;
1323
5.05k
BOOL had_minimum = FALSE;
1324
5.05k
int32_t min = 0;
1325
5.05k
int32_t max = REPEAT_UNLIMITED; /* This value is larger than MAX_REPEAT_COUNT */
1326
1327
5.05k
*errorcodeptr = 0;
1328
5.17k
while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1329
1330
/* Check the syntax before interpreting. Otherwise, a non-quantifier sequence
1331
such as "X{123456ABC" would incorrectly give a "number too big in quantifier"
1332
error. */
1333
1334
5.05k
pp = p;
1335
5.05k
if (pp < ptrend && IS_DIGIT(*pp))
1336
416
  {
1337
416
  had_minimum = TRUE;
1338
2.46k
  while (++pp < ptrend && IS_DIGIT(*pp)) {}
1339
416
  }
1340
1341
5.12k
while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;
1342
5.05k
if (pp >= ptrend) return FALSE;
1343
1344
5.04k
if (*pp == CHAR_RIGHT_CURLY_BRACKET)
1345
1.90k
  {
1346
1.90k
  if (!had_minimum) return FALSE;
1347
1.90k
  }
1348
3.13k
else
1349
3.13k
  {
1350
3.13k
  if (*pp++ != CHAR_COMMA) return FALSE;
1351
4
  while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;
1352
4
  if (pp >= ptrend) return FALSE;
1353
4
  if (IS_DIGIT(*pp))
1354
0
    {
1355
0
    while (++pp < ptrend && IS_DIGIT(*pp)) {}
1356
0
    }
1357
4
  else if (!had_minimum) return FALSE;
1358
0
  while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;
1359
0
  if (pp >= ptrend || *pp != CHAR_RIGHT_CURLY_BRACKET) return FALSE;
1360
0
  }
1361
1362
/* Now process the quantifier for real. We know it must be {n} or {n,} or {,m}
1363
or {n,m}. The only error that read_number() can return is for a number that is
1364
too big. If *errorcodeptr is returned as zero it means no number was found. */
1365
1366
/* Deal with {,m} or n too big. If we successfully read m there is no need to
1367
check m >= n because n defaults to zero. */
1368
1369
12
if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &min, errorcodeptr))
1370
0
  {
1371
0
  if (*errorcodeptr != 0) goto EXIT;    /* n too big */
1372
0
  p++;  /* Skip comma and subsequent spaces */
1373
0
  while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1374
0
  if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max, errorcodeptr))
1375
0
    {
1376
0
    if (*errorcodeptr != 0) goto EXIT;  /* m too big */
1377
0
    }
1378
0
  }
1379
1380
/* Have read one number. Deal with {n} or {n,} or {n,m} */
1381
1382
12
else
1383
12
  {
1384
12
  while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1385
12
  if (*p == CHAR_RIGHT_CURLY_BRACKET)
1386
12
    {
1387
12
    max = min;
1388
12
    }
1389
0
  else   /* Handle {n,} or {n,m} */
1390
0
    {
1391
0
    p++;    /* Skip comma and subsequent spaces */
1392
0
    while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1393
0
    if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max, errorcodeptr))
1394
0
      {
1395
0
      if (*errorcodeptr != 0) goto EXIT;   /* m too big */
1396
0
      }
1397
1398
0
    if (max < min)
1399
0
      {
1400
0
      *errorcodeptr = ERR4;
1401
0
      goto EXIT;
1402
0
      }
1403
0
    }
1404
12
  }
1405
1406
/* Valid quantifier exists */
1407
1408
12
while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1409
12
p++;
1410
12
yield = TRUE;
1411
12
if (minp != NULL) *minp = (uint32_t)min;
1412
12
if (maxp != NULL) *maxp = (uint32_t)max;
1413
1414
/* Update the pattern pointer */
1415
1416
12
EXIT:
1417
12
*ptrptr = p;
1418
12
return yield;
1419
12
}
1420
1421
1422
1423
/*************************************************
1424
*            Handle escapes                      *
1425
*************************************************/
1426
1427
/* This function is called when a \ has been encountered. It either returns a
1428
positive value for a simple escape such as \d, or 0 for a data character, which
1429
is placed in chptr. A backreference to group n is returned as -(n+1). On
1430
entry, ptr is pointing at the character after \. On exit, it points after the
1431
final code unit of the escape sequence.
1432
1433
This function is also called from pcre2_substitute() to handle escape sequences
1434
in replacement strings. In this case, the cb argument is NULL, and in the case
1435
of escapes that have further processing, only sequences that define a data
1436
character are recognised. The options argument is the final value of the
1437
compiled pattern's options.
1438
1439
Arguments:
1440
  ptrptr         points to the input position pointer
1441
  ptrend         points to the end of the input
1442
  chptr          points to a returned data character
1443
  errorcodeptr   points to the errorcode variable (containing zero)
1444
  options        the current options bits
1445
  xoptions       the current extra options bits
1446
  bracount       the number of capturing parentheses encountered so far
1447
  isclass        TRUE if in a character class
1448
  cb             compile data block or NULL when called from pcre2_substitute()
1449
1450
Returns:         zero => a data character
1451
                 positive => a special escape sequence
1452
                 negative => a numerical back reference
1453
                 on error, errorcodeptr is set non-zero
1454
*/
1455
1456
int
1457
PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr,
1458
  int *errorcodeptr, uint32_t options, uint32_t xoptions, uint32_t bracount,
1459
  BOOL isclass, compile_block *cb)
1460
62.2k
{
1461
62.2k
BOOL utf = (options & PCRE2_UTF) != 0;
1462
62.2k
BOOL alt_bsux =
1463
62.2k
  ((options & PCRE2_ALT_BSUX) | (xoptions & PCRE2_EXTRA_ALT_BSUX)) != 0;
1464
62.2k
PCRE2_SPTR ptr = *ptrptr;
1465
62.2k
uint32_t c, cc;
1466
62.2k
int escape = 0;
1467
62.2k
int i;
1468
1469
/* If backslash is at the end of the string, it's an error. */
1470
1471
62.2k
if (ptr >= ptrend)
1472
0
  {
1473
0
  *errorcodeptr = ERR1;
1474
0
  return 0;
1475
0
  }
1476
1477
62.2k
GETCHARINCTEST(c, ptr);         /* Get character value, increment pointer */
1478
62.2k
*errorcodeptr = 0;              /* Be optimistic */
1479
1480
/* Non-alphanumerics are literals, so we just leave the value in c. An initial
1481
value test saves a memory lookup for code points outside the alphanumeric
1482
range. */
1483
1484
62.2k
if (c < ESCAPES_FIRST || c > ESCAPES_LAST) {}  /* Definitely literal */
1485
1486
/* Otherwise, do a table lookup. Non-zero values need little processing here. A
1487
positive value is a literal value for something like \n. A negative value is
1488
the negation of one of the ESC_ macros that is passed back for handling by the
1489
calling function. Some extra checking is needed for \N because only \N{U+dddd}
1490
is supported. If the value is zero, further processing is handled below. */
1491
1492
47.6k
else if ((i = escapes[c - ESCAPES_FIRST]) != 0)
1493
44.4k
  {
1494
44.4k
  if (i > 0)
1495
9.16k
    {
1496
9.16k
    c = (uint32_t)i;
1497
9.16k
    if (c == CHAR_CR && (xoptions & PCRE2_EXTRA_ESCAPED_CR_IS_LF) != 0)
1498
0
      c = CHAR_LF;
1499
9.16k
    }
1500
35.2k
  else  /* Negative table entry */
1501
35.2k
    {
1502
35.2k
    escape = -i;                    /* Else return a special escape */
1503
35.2k
    if (cb != NULL && (escape == ESC_P || escape == ESC_p || escape == ESC_X))
1504
2.37k
      cb->external_flags |= PCRE2_HASBKPORX;   /* Note \P, \p, or \X */
1505
1506
    /* Perl supports \N{name} for character names and \N{U+dddd} for numerical
1507
    Unicode code points, as well as plain \N for "not newline". PCRE does not
1508
    support \N{name}. However, it does support quantification such as \N{2,3},
1509
    so if \N{ is not followed by U+dddd we check for a quantifier. */
1510
1511
35.2k
    if (escape == ESC_N && ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
1512
0
      {
1513
0
      PCRE2_SPTR p = ptr + 1;
1514
1515
      /* Perl ignores spaces and tabs after { */
1516
1517
0
      while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1518
1519
      /* \N{U+ can be handled by the \x{ code. However, this construction is
1520
      not valid in EBCDIC environments because it specifies a Unicode
1521
      character, not a codepoint in the local code. For example \N{U+0041}
1522
      must be "A" in all environments. Also, in Perl, \N{U+ forces Unicode
1523
      casing semantics for the entire pattern, so allow it only in UTF (i.e.
1524
      Unicode) mode. */
1525
1526
0
      if (ptrend - p > 1 && *p == CHAR_U && p[1] == CHAR_PLUS)
1527
0
        {
1528
0
#ifndef EBCDIC
1529
0
        if (utf)
1530
0
          {
1531
0
          ptr = p + 2;
1532
0
          escape = 0;   /* Not a fancy escape after all */
1533
0
          goto COME_FROM_NU;
1534
0
          }
1535
0
#endif
1536
0
        *errorcodeptr = ERR93;
1537
0
        }
1538
1539
      /* Give an error in contexts where quantifiers are not allowed
1540
      (character classes; substitution strings). */
1541
1542
0
      else if (isclass || cb == NULL)
1543
0
        {
1544
0
        *errorcodeptr = ERR37;
1545
0
        }
1546
1547
      /* Give an error if what follows is not a quantifier, but don't override
1548
      an error set by the quantifier reader (e.g. number overflow). */
1549
1550
0
      else
1551
0
        {
1552
0
        if (!read_repeat_counts(&p, ptrend, NULL, NULL, errorcodeptr) &&
1553
0
             *errorcodeptr == 0)
1554
0
          *errorcodeptr = ERR37;
1555
0
        }
1556
0
      }
1557
35.2k
    }
1558
44.4k
  }
1559
1560
/* Escapes that need further processing, including those that are unknown, have
1561
a zero entry in the lookup table. When called from pcre2_substitute(), only \c,
1562
\g, \o, \x, and the digits are recognized (\u and \U can never appear as they are used for case
1563
forcing). */
1564
1565
3.23k
else
1566
3.23k
  {
1567
3.23k
  int s;
1568
3.23k
  PCRE2_SPTR oldptr;
1569
3.23k
  BOOL overflow;
1570
1571
  /* Filter calls from pcre2_substitute(). */
1572
1573
3.23k
  if (cb == NULL)
1574
0
    {
1575
0
    if (c < CHAR_0 ||
1576
0
       (c > CHAR_9 && (c != CHAR_c && c != CHAR_o && c != CHAR_x && c != CHAR_g)))
1577
0
      {
1578
0
      *errorcodeptr = ERR3;
1579
0
      return 0;
1580
0
      }
1581
0
    alt_bsux = FALSE;   /* Do not modify \x handling */
1582
0
    }
1583
1584
3.23k
  switch (c)
1585
3.23k
    {
1586
    /* A number of Perl escapes are not handled by PCRE. We give an explicit
1587
    error. */
1588
1589
5
    case CHAR_F:
1590
7
    case CHAR_l:
1591
80
    case CHAR_L:
1592
80
    *errorcodeptr = ERR37;
1593
80
    break;
1594
1595
    /* \u is unrecognized when neither PCRE2_ALT_BSUX nor PCRE2_EXTRA_ALT_BSUX
1596
    is set. Otherwise, \u must be followed by exactly four hex digits or, if
1597
    PCRE2_EXTRA_ALT_BSUX is set, by any number of hex digits in braces.
1598
    Otherwise it is a lowercase u letter. This gives some compatibility with
1599
    ECMAScript (aka JavaScript). Unlike other braced items, white space is NOT
1600
    allowed. When \u{ is not followed by hex digits, a special return is given
1601
    because otherwise \u{ 12} (for example) would be treated as u{12}. */
1602
1603
5
    case CHAR_u:
1604
5
    if (!alt_bsux) *errorcodeptr = ERR37; else
1605
0
      {
1606
0
      uint32_t xc;
1607
1608
0
      if (ptr >= ptrend) break;
1609
0
      if (*ptr == CHAR_LEFT_CURLY_BRACKET &&
1610
0
          (xoptions & PCRE2_EXTRA_ALT_BSUX) != 0)
1611
0
        {
1612
0
        PCRE2_SPTR hptr = ptr + 1;
1613
1614
0
        cc = 0;
1615
0
        while (hptr < ptrend && (xc = XDIGIT(*hptr)) != 0xff)
1616
0
          {
1617
0
          if ((cc & 0xf0000000) != 0)  /* Test for 32-bit overflow */
1618
0
            {
1619
0
            *errorcodeptr = ERR77;
1620
0
            ptr = hptr;   /* Show where */
1621
0
            break;        /* *hptr != } will cause another break below */
1622
0
            }
1623
0
          cc = (cc << 4) | xc;
1624
0
          hptr++;
1625
0
          }
1626
1627
0
        if (hptr == ptr + 1 ||   /* No hex digits */
1628
0
            hptr >= ptrend ||    /* Hit end of input */
1629
0
            *hptr != CHAR_RIGHT_CURLY_BRACKET)  /* No } terminator */
1630
0
          {
1631
0
          if (isclass) break; /* In a class, just treat as '\u' literal */
1632
0
          escape = ESC_ub;    /* Special return */
1633
0
          ptr++;              /* Skip { */
1634
0
          break;              /* Hex escape not recognized */
1635
0
          }
1636
1637
0
        c = cc;          /* Accept the code point */
1638
0
        ptr = hptr + 1;
1639
0
        }
1640
1641
0
      else  /* Must be exactly 4 hex digits */
1642
0
        {
1643
0
        if (ptrend - ptr < 4) break;               /* Less than 4 chars */
1644
0
        if ((cc = XDIGIT(ptr[0])) == 0xff) break;  /* Not a hex digit */
1645
0
        if ((xc = XDIGIT(ptr[1])) == 0xff) break;  /* Not a hex digit */
1646
0
        cc = (cc << 4) | xc;
1647
0
        if ((xc = XDIGIT(ptr[2])) == 0xff) break;  /* Not a hex digit */
1648
0
        cc = (cc << 4) | xc;
1649
0
        if ((xc = XDIGIT(ptr[3])) == 0xff) break;  /* Not a hex digit */
1650
0
        c = (cc << 4) | xc;
1651
0
        ptr += 4;
1652
0
        }
1653
1654
0
      if (utf)
1655
0
        {
1656
0
        if (c > 0x10ffffU) *errorcodeptr = ERR77;
1657
0
        else
1658
0
          if (c >= 0xd800 && c <= 0xdfff &&
1659
0
              (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1660
0
                *errorcodeptr = ERR73;
1661
0
        }
1662
0
      else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77;
1663
0
      }
1664
5
    break;
1665
1666
    /* \U is unrecognized unless PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set,
1667
    in which case it is an upper case letter. */
1668
1669
5
    case CHAR_U:
1670
0
    if (!alt_bsux) *errorcodeptr = ERR37;
1671
0
    break;
1672
1673
    /* In a character class, \g is just a literal "g". Outside a character
1674
    class, \g must be followed by one of a number of specific things:
1675
1676
    (1) A number, either plain or braced. If positive, it is an absolute
1677
    backreference. If negative, it is a relative backreference. This is a Perl
1678
    5.10 feature.
1679
1680
    (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1681
    is part of Perl's movement towards a unified syntax for back references. As
1682
    this is synonymous with \k{name}, we fudge it up by pretending it really
1683
    was \k{name}.
1684
1685
    (3) For Oniguruma compatibility we also support \g followed by a name or a
1686
    number either in angle brackets or in single quotes. However, these are
1687
    (possibly recursive) subroutine calls, _not_ backreferences. We return
1688
    the ESC_g code.
1689
1690
    Summary: Return a negative number for a numerical back reference (offset
1691
    by 1), ESC_k for a named back reference, and ESC_g for a named or
1692
    numbered subroutine call.
1693
1694
    The above describes the \g behaviour inside patterns. Inside replacement
1695
    strings (pcre2_substitute) we support only \g<nameornum> for Python
1696
    compatibility. Return ESC_g for the named case, and -(num+1) for the
1697
    numbered case.
1698
    */
1699
1700
11
    case CHAR_g:
1701
11
    if (isclass) break;
1702
1703
11
    if (ptr >= ptrend)
1704
0
      {
1705
0
      *errorcodeptr = ERR57;
1706
0
      break;
1707
0
      }
1708
1709
11
    if (cb == NULL)
1710
0
      {
1711
0
      PCRE2_SPTR p;
1712
      /* Substitution strings */
1713
0
      if (*ptr != CHAR_LESS_THAN_SIGN)
1714
0
        {
1715
0
        *errorcodeptr = ERR57;
1716
0
        break;
1717
0
        }
1718
1719
0
      p = ptr + 1;
1720
1721
0
      if (!read_number(&p, ptrend, -1, MAX_GROUP_NUMBER, ERR61, &s,
1722
0
          errorcodeptr))
1723
0
        {
1724
0
        if (*errorcodeptr == 0) escape = ESC_g;  /* No number found */
1725
0
        break;
1726
0
        }
1727
1728
0
      if (p >= ptrend || *p != CHAR_GREATER_THAN_SIGN)
1729
0
        {
1730
        /* not advancing ptr; report error at the \g character */
1731
0
        *errorcodeptr = ERR57;
1732
0
        break;
1733
0
        }
1734
1735
      /* This is the reason that back references are returned as -(s+1) rather
1736
      than just -s. In a pattern, \0 is not a back reference, but \g<0> is
1737
      valid in a substitution string, so this must be representable. */
1738
0
      ptr = p + 1;
1739
0
      escape = -(s+1);
1740
0
      break;
1741
0
      }
1742
1743
11
    if (*ptr == CHAR_LESS_THAN_SIGN || *ptr == CHAR_APOSTROPHE)
1744
3
      {
1745
3
      escape = ESC_g;
1746
3
      break;
1747
3
      }
1748
1749
    /* If there is a brace delimiter, try to read a numerical reference. If
1750
    there isn't one, assume we have a name and treat it as \k. */
1751
1752
8
    if (*ptr == CHAR_LEFT_CURLY_BRACKET)
1753
2
      {
1754
2
      PCRE2_SPTR p = ptr + 1;
1755
1756
2
      while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1757
2
      if (!read_number(&p, ptrend, bracount, MAX_GROUP_NUMBER, ERR61, &s,
1758
2
          errorcodeptr))
1759
2
        {
1760
2
        if (*errorcodeptr == 0) escape = ESC_k;  /* No number found */
1761
2
        break;
1762
2
        }
1763
0
      while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1764
1765
0
      if (p >= ptrend || *p != CHAR_RIGHT_CURLY_BRACKET)
1766
0
        {
1767
        /* not advancing ptr; report error at the \g character */
1768
0
        *errorcodeptr = ERR57;
1769
0
        break;
1770
0
        }
1771
0
      ptr = p + 1;
1772
0
      }
1773
1774
    /* Read an undelimited number */
1775
1776
6
    else
1777
6
      {
1778
6
      if (!read_number(&ptr, ptrend, bracount, MAX_GROUP_NUMBER, ERR61, &s,
1779
6
          errorcodeptr))
1780
6
        {
1781
6
        if (*errorcodeptr == 0) *errorcodeptr = ERR57;  /* No number found */
1782
6
        break;
1783
6
        }
1784
6
      }
1785
1786
0
    if (s <= 0)
1787
0
      {
1788
0
      *errorcodeptr = ERR15;
1789
0
      break;
1790
0
      }
1791
1792
0
    escape = -(s+1);
1793
0
    break;
1794
1795
    /* The handling of escape sequences consisting of a string of digits
1796
    starting with one that is not zero is not straightforward. Perl has changed
1797
    over the years. Nowadays \g{} for backreferences and \o{} for octal are
1798
    recommended to avoid the ambiguities in the old syntax.
1799
1800
    Outside a character class, the digits are read as a decimal number. If the
1801
    number is less than 10, or if there are that many previous extracting left
1802
    brackets, it is a back reference. Otherwise, up to three octal digits are
1803
    read to form an escaped character code. Thus \123 is likely to be octal 123
1804
    (cf \0123, which is octal 012 followed by the literal 3). This is the "Perl
1805
    style" of handling ambiguous octal/backreferences such as \12.
1806
1807
    There is an alternative disambiguation strategy, selected by
1808
    PCRE2_EXTRA_PYTHON_OCTAL, which follows Python's behaviour. An octal must
1809
    have either a leading zero, or exactly three octal digits; otherwise it's
1810
    a backreference. The disambiguation is stable, and does not depend on how
1811
    many capture groups are defined (it's simply an invalid backreference if
1812
    there is no corresponding capture group). Additionally, octal values above
1813
    \377 (\xff) are rejected.
1814
1815
    Inside a character class, \ followed by a digit is always either a literal
1816
    8 or 9 or an octal number. */
1817
1818
1.61k
    case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1819
2.35k
    case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
1820
1821
2.35k
    if (isclass)
1822
144
      {
1823
      /* Fall through to octal handling; never a backreference inside a class. */
1824
144
      }
1825
2.21k
    else if ((xoptions & PCRE2_EXTRA_PYTHON_OCTAL) != 0)
1826
0
      {
1827
      /* Python-style disambiguation. */
1828
0
      if (ptr[-1] <= CHAR_7 && ptr + 1 < ptrend && ptr[0] >= CHAR_0 &&
1829
0
          ptr[0] <= CHAR_7 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
1830
0
        {
1831
        /* We peeked a three-digit octal, so fall through */
1832
0
        }
1833
0
      else
1834
0
        {
1835
        /* We are at a digit, so the only possible error from read_number() is
1836
        a number that is too large. */
1837
0
        ptr--;   /* Back to the digit */
1838
1839
0
        if (!read_number(&ptr, ptrend, -1, MAX_GROUP_NUMBER, 0, &s, errorcodeptr))
1840
0
          {
1841
0
          *errorcodeptr = ERR61;
1842
0
          break;
1843
0
          }
1844
1845
0
        escape = -(s+1);
1846
0
        break;
1847
0
        }
1848
0
      }
1849
2.21k
    else
1850
2.21k
      {
1851
      /* Perl-style disambiguation. */
1852
2.21k
      oldptr = ptr;
1853
2.21k
      ptr--;   /* Back to the digit */
1854
1855
      /* As we know we are at a digit, the only possible error from
1856
      read_number() is a number that is too large to be a group number. Because
1857
      that number might be still valid if read as an octal, errorcodeptr is not
1858
      set on failure and therefore a sentinel value of INT_MAX is used instead
1859
      of the original value, and will be used later to properly set the error,
1860
      if not falling through. */
1861
1862
2.21k
      if (!read_number(&ptr, ptrend, -1, MAX_GROUP_NUMBER, 0, &s, errorcodeptr))
1863
84
        s = INT_MAX;
1864
1865
      /* \1 to \9 are always back references. \8x and \9x are too; \1x to \7x
1866
      are octal escapes if there are not that many previous captures. */
1867
1868
2.21k
      if (s < 10 || c >= CHAR_8 || (unsigned)s <= bracount)
1869
1.59k
        {
1870
        /* s > MAX_GROUP_NUMBER should not be possible because of read_number(),
1871
        but we keep it just to be safe and because it will also catch the
1872
        sentinel value that was set on failure by that function. */
1873
1874
1.59k
        if ((unsigned)s > MAX_GROUP_NUMBER)
1875
4
          {
1876
4
          PCRE2_ASSERT(s == INT_MAX);
1877
4
          *errorcodeptr = ERR61;
1878
4
          }
1879
1.58k
        else escape = -(s+1);     /* Indicates a back reference */
1880
1.59k
        break;
1881
1.59k
        }
1882
1883
624
      ptr = oldptr;      /* Put the pointer back and fall through */
1884
624
      }
1885
1886
    /* Handle a digit following \ when the number is not a back reference, or
1887
    we are within a character class. If the first digit is 8 or 9, Perl used to
1888
    generate a binary zero and then treat the digit as a following literal. At
1889
    least by Perl 5.18 this changed so as not to insert the binary zero. */
1890
1891
768
    if (c >= CHAR_8) break;
1892
1893
    /* Fall through */
1894
1895
    /* \0 always starts an octal number, but we may drop through to here with a
1896
    larger first octal digit. The original code used just to take the least
1897
    significant 8 bits of octal numbers (I think this is what early Perls used
1898
    to do). Nowadays we allow for larger numbers in UTF-8 mode and 16/32-bit mode,
1899
    but no more than 3 octal digits. */
1900
1901
1.47k
    case CHAR_0:
1902
1.47k
    c -= CHAR_0;
1903
3.37k
    while(i++ < 2 && ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1904
1.90k
        c = c * 8 + *ptr++ - CHAR_0;
1905
1.47k
    if (c > 0xff)
1906
9
      {
1907
9
      if ((xoptions & PCRE2_EXTRA_PYTHON_OCTAL) != 0) *errorcodeptr = ERR102;
1908
9
#if PCRE2_CODE_UNIT_WIDTH == 8
1909
9
      else if (!utf) *errorcodeptr = ERR51;
1910
9
#endif
1911
9
      }
1912
1913
    /* PCRE2_EXTRA_NO_BS0 disables the NUL escape '\0' but doesn't affect
1914
    two- or three-character octal escapes \00 and \000, nor \x00. */
1915
1916
1.47k
    if ((xoptions & PCRE2_EXTRA_NO_BS0) != 0 && c == 0 && i == 1)
1917
0
        *errorcodeptr = ERR98;
1918
1.47k
    break;
1919
1920
    /* \o is a relatively new Perl feature, supporting a more general way of
1921
    specifying character codes in octal. The only supported form is \o{ddd},
1922
    with optional spaces or tabs after { and before }. */
1923
1924
3
    case CHAR_o:
1925
3
    if (ptr >= ptrend || *ptr++ != CHAR_LEFT_CURLY_BRACKET)
1926
3
      {
1927
3
      ptr--;
1928
3
      *errorcodeptr = ERR55;
1929
3
      break;
1930
3
      }
1931
1932
0
    while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
1933
0
    if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
1934
0
      {
1935
0
      *errorcodeptr = ERR78;
1936
0
      break;
1937
0
      }
1938
1939
0
    c = 0;
1940
0
    overflow = FALSE;
1941
0
    while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1942
0
      {
1943
0
      cc = *ptr++;
1944
0
      if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1945
#if PCRE2_CODE_UNIT_WIDTH == 32
1946
      if (c >= 0x20000000u) { overflow = TRUE; break; }
1947
#endif
1948
0
      c = (c << 3) + (cc - CHAR_0);
1949
0
#if PCRE2_CODE_UNIT_WIDTH == 8
1950
0
      if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1951
#elif PCRE2_CODE_UNIT_WIDTH == 16
1952
      if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1953
#elif PCRE2_CODE_UNIT_WIDTH == 32
1954
      if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1955
#endif
1956
0
      }
1957
1958
0
    while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
1959
1960
0
    if (overflow)
1961
0
      {
1962
0
      while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1963
0
      *errorcodeptr = ERR34;
1964
0
      }
1965
0
    else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
1966
0
      {
1967
0
      if (utf && c >= 0xd800 && c <= 0xdfff &&
1968
0
          (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1969
0
        {
1970
0
        ptr--;
1971
0
        *errorcodeptr = ERR73;
1972
0
        }
1973
0
      }
1974
0
    else
1975
0
      {
1976
0
      ptr--;
1977
0
      *errorcodeptr = ERR64;
1978
0
      }
1979
0
    break;
1980
1981
    /* When PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set, \x must be followed
1982
    by two hexadecimal digits. Otherwise it is a lowercase x letter. */
1983
1984
3
    case CHAR_x:
1985
3
    if (alt_bsux)
1986
0
      {
1987
0
      uint32_t xc;
1988
0
      if (ptrend - ptr < 2) break;               /* Less than 2 characters */
1989
0
      if ((cc = XDIGIT(ptr[0])) == 0xff) break;  /* Not a hex digit */
1990
0
      if ((xc = XDIGIT(ptr[1])) == 0xff) break;  /* Not a hex digit */
1991
0
      c = (cc << 4) | xc;
1992
0
      ptr += 2;
1993
0
      }
1994
1995
    /* Handle \x in Perl's style. \x{ddd} is a character code which can be
1996
    greater than 0xff in UTF-8 or non-8bit mode, but only if the ddd are hex
1997
    digits. If not, { used to be treated as a data character. However, Perl
1998
    seems to read hex digits up to the first non-such, and ignore the rest, so
1999
    that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
2000
    now gives an error. */
2001
2002
3
    else
2003
3
      {
2004
3
      if (ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
2005
0
        {
2006
0
        ptr++;
2007
0
        while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2008
2009
0
#ifndef EBCDIC
2010
0
        COME_FROM_NU:
2011
0
#endif
2012
0
        if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
2013
0
          {
2014
0
          *errorcodeptr = ERR78;
2015
0
          break;
2016
0
          }
2017
0
        c = 0;
2018
0
        overflow = FALSE;
2019
2020
0
        while (ptr < ptrend && (cc = XDIGIT(*ptr)) != 0xff)
2021
0
          {
2022
0
          ptr++;
2023
0
          if (c == 0 && cc == 0) continue;   /* Leading zeroes */
2024
#if PCRE2_CODE_UNIT_WIDTH == 32
2025
          if (c >= 0x10000000l) { overflow = TRUE; break; }
2026
#endif
2027
0
          c = (c << 4) | cc;
2028
0
          if ((utf && c > 0x10ffffU) || (!utf && c > MAX_NON_UTF_CHAR))
2029
0
            {
2030
0
            overflow = TRUE;
2031
0
            break;
2032
0
            }
2033
0
          }
2034
2035
        /* Perl ignores spaces and tabs before } */
2036
2037
0
        while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2038
2039
        /* On overflow, skip remaining hex digits */
2040
2041
0
        if (overflow)
2042
0
          {
2043
0
          while (ptr < ptrend && XDIGIT(*ptr) != 0xff) ptr++;
2044
0
          *errorcodeptr = ERR34;
2045
0
          }
2046
0
        else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
2047
0
          {
2048
0
          if (utf && c >= 0xd800 && c <= 0xdfff &&
2049
0
              (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
2050
0
            {
2051
0
            ptr--;
2052
0
            *errorcodeptr = ERR73;
2053
0
            }
2054
0
          }
2055
2056
        /* If the sequence of hex digits (followed by optional space) does not
2057
        end with '}', give an error. We used just to recognize this construct
2058
        and fall through to the normal \x handling, but nowadays Perl gives an
2059
        error, which seems much more sensible, so we do too. */
2060
2061
0
        else
2062
0
          {
2063
0
          ptr--;
2064
0
          *errorcodeptr = ERR67;
2065
0
          }
2066
0
        }   /* End of \x{} processing */
2067
2068
      /* Read up to two hex digits after \x */
2069
2070
3
      else
2071
3
        {
2072
        /* Perl has the surprising/broken behaviour that \x without following
2073
        hex digits is treated as an escape for NUL. Their source code laments
2074
        this but keeps it for backwards compatibility. A warning is printed
2075
        when "use warnings" is enabled. Because we don't have warnings, we
2076
        simply forbid it. */
2077
3
        if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff)
2078
3
          {
2079
          /* Not a hex digit */
2080
3
          *errorcodeptr = ERR78;
2081
3
          break;
2082
3
          }
2083
0
        ptr++;
2084
0
        c = cc;
2085
2086
        /* With "use re 'strict'" Perl actually requires exactly two digits (error
2087
        for \x, \xA and \xAAA). While \x was already rejected, this seems overly
2088
        strict, and there seems little incentive to align with that, given the
2089
        backwards-compatibility cost.
2090
2091
        For comparison, note that other engines disagree. For example:
2092
          - Java allows 1 or 2 hex digits. Error if 0 digits. No error if >2 digits
2093
          - .NET requires 2 hex digits. Error if 0, 1 digits. No error if >2 digits.
2094
        */
2095
0
        if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break;  /* Not a hex digit */
2096
0
        ptr++;
2097
0
        c = (c << 4) | cc;
2098
0
        }     /* End of \xdd handling */
2099
3
      }       /* End of Perl-style \x handling */
2100
0
    break;
2101
2102
    /* The handling of \c is different in ASCII and EBCDIC environments. In an
2103
    ASCII (or Unicode) environment, an error is given if the character
2104
    following \c is not a printable ASCII character. Otherwise, the following
2105
    character is upper-cased if it is a letter, and after that the 0x40 bit is
2106
    flipped. The result is the value of the escape.
2107
2108
    In an EBCDIC environment the handling of \c is compatible with the
2109
    specification in the perlebcdic document. The following character must be
2110
    a letter or one of a small number of special characters. These provide a
2111
    means of defining the character values 0-31.
2112
2113
    For testing the EBCDIC handling of \c in an ASCII environment, recognize
2114
    the EBCDIC value of 'c' explicitly. */
2115
2116
#if defined EBCDIC && 'a' != 0x81
2117
    case 0x83:
2118
#else
2119
33
    case CHAR_c:
2120
33
#endif
2121
33
    if (ptr >= ptrend)
2122
0
      {
2123
0
      *errorcodeptr = ERR2;
2124
0
      break;
2125
0
      }
2126
33
    c = *ptr;
2127
33
    if (c >= CHAR_a && c <= CHAR_z) c = UPPER_CASE(c);
2128
2129
    /* Handle \c in an ASCII/Unicode environment. */
2130
2131
33
#ifndef EBCDIC    /* ASCII/UTF-8 coding */
2132
33
    if (c < 32 || c > 126)  /* Excludes all non-printable ASCII */
2133
2
      {
2134
2
      *errorcodeptr = ERR68;
2135
2
      break;
2136
2
      }
2137
31
    c ^= 0x40;
2138
2139
    /* Handle \c in an EBCDIC environment. The special case \c? is converted to
2140
    255 (0xff) or 95 (0x5f) if other characters suggest we are using the
2141
    POSIX-BC encoding. (This is the way Perl indicates that it handles \c?.)
2142
    The other valid sequences correspond to a list of specific characters. */
2143
2144
#else
2145
    if (c == CHAR_QUESTION_MARK)
2146
      c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;
2147
    else
2148
      {
2149
      for (i = 0; i < 32; i++)
2150
        {
2151
        if (c == ebcdic_escape_c[i]) break;
2152
        }
2153
      if (i < 32) c = i; else *errorcodeptr = ERR68;
2154
      }
2155
#endif  /* EBCDIC */
2156
2157
31
    ptr++;
2158
31
    break;
2159
2160
    /* Any other alphanumeric following \ is an error. Perl gives an error only
2161
    if in warning mode, but PCRE doesn't have a warning mode. */
2162
2163
14
    default:
2164
14
    *errorcodeptr = ERR3;
2165
14
    *ptrptr = ptr - 1;     /* Point to the character at fault */
2166
14
    return 0;
2167
3.23k
    }
2168
3.23k
  }
2169
2170
/* Set the pointer to the next character before returning. */
2171
2172
62.1k
*ptrptr = ptr;
2173
62.1k
*chptr = c;
2174
62.1k
return escape;
2175
62.2k
}
2176
2177
2178
2179
#ifdef SUPPORT_UNICODE
2180
/*************************************************
2181
*               Handle \P and \p                 *
2182
*************************************************/
2183
2184
/* This function is called after \P or \p has been encountered, provided that
2185
PCRE2 is compiled with support for UTF and Unicode properties. On entry, the
2186
contents of ptrptr are pointing after the P or p. On exit, it is left pointing
2187
after the final code unit of the escape sequence.
2188
2189
Arguments:
2190
  ptrptr         the pattern position pointer
2191
  negptr         a boolean that is set TRUE for negation else FALSE
2192
  ptypeptr       an unsigned int that is set to the type value
2193
  pdataptr       an unsigned int that is set to the detailed property value
2194
  errorcodeptr   the error code variable
2195
  cb             the compile data
2196
2197
Returns:         TRUE if the type value was found, or FALSE for an invalid type
2198
*/
2199
2200
static BOOL
2201
get_ucp(PCRE2_SPTR *ptrptr, BOOL *negptr, uint16_t *ptypeptr,
2202
  uint16_t *pdataptr, int *errorcodeptr, compile_block *cb)
2203
1.77k
{
/* Outline: read the property name that follows \p or \P, normalize it into
name[], then look it up in the PRIV(utt) table by binary search. On success the
property type and value are stored through ptypeptr and pdataptr and TRUE is
returned. On failure *errorcodeptr is set to ERR46 (malformed item) or ERR47
(unrecognized property) and FALSE is returned. */
2204
1.77k
PCRE2_UCHAR c;
2205
1.77k
PCRE2_SIZE i, bot, top;
2206
1.77k
PCRE2_SPTR ptr = *ptrptr;
2207
1.77k
/* name[] receives the normalized property name: at most 49 code units plus a
terminating zero (see the bound on the collecting loop below). */
PCRE2_UCHAR name[50];
2208
1.77k
/* vptr points at the first ':' or '=' stored in name[], or stays NULL when the
name has no separator. */
PCRE2_UCHAR *vptr = NULL;
2209
1.77k
uint16_t ptscript = PT_NOTSCRIPT;
2210
2211
1.77k
/* Nothing follows the P or p. Note that *negptr has not been assigned when
this early exit is taken. */
if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2212
1.77k
c = *ptr++;
2213
1.77k
*negptr = FALSE;
2214
2215
/* \P or \p can be followed by a name in {}, optionally preceded by ^ for
2216
negation. We must be handling Unicode encoding here, though we may be compiling
2217
for UTF-8 input in an EBCDIC environment. (PCRE2 does not support both EBCDIC
2218
input and Unicode input in the same build.) In accordance with Unicode's "loose
2219
matching" rules, ASCII white space, hyphens, and underscores are ignored. We
2220
don't use isspace() or tolower() because (a) code points may be greater than
2221
255, and (b) they wouldn't work when compiling for Unicode in an EBCDIC
2222
environment. */
2223
2224
1.77k
if (c == CHAR_LEFT_CURLY_BRACKET)
2225
0
  {
2226
0
  if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2227
2228
0
  /* i is the number of code units stored in name[] so far; the bound leaves
  one slot free for the terminating zero. */
  for (i = 0; i < (int)(sizeof(name) / sizeof(PCRE2_UCHAR)) - 1; i++)
2229
0
    {
2230
0
    /* REDO is re-entered by the goto below after a leading circumflex, so the
    circumflex is not stored and i stays at zero. */
    REDO:
2231
2232
0
    if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2233
0
    c = *ptr++;
2234
2235
    /* Skip ignorable Unicode characters. */
2236
2237
0
    while (c == CHAR_UNDERSCORE || c == CHAR_MINUS || c == CHAR_SPACE ||
2238
0
          (c >= CHAR_HT && c <= CHAR_CR))
2239
0
      {
2240
0
      if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2241
0
      c = *ptr++;
2242
0
      }
2243
2244
    /* The first significant character being circumflex negates the meaning of
2245
    the item. */
2246
2247
0
    /* The !*negptr test means that only one circumflex is taken as negation; a
    second one is handled like any other name character below. */
    if (i == 0 && !*negptr && c == CHAR_CIRCUMFLEX_ACCENT)
2248
0
      {
2249
0
      *negptr = TRUE;
2250
0
      goto REDO;
2251
0
      }
2252
2253
0
    if (c == CHAR_RIGHT_CURLY_BRACKET) break;
2254
2255
    /* Names consist of ASCII letters and digits, but equals and colon may also
2256
    occur as a name/value separator. We must also allow for \p{L&}. A simple
2257
    check for a value between '&' and 'z' suffices because anything else in a
2258
    name or value will cause an "unknown property" error anyway. */
2259
2260
0
    if (c < CHAR_AMPERSAND || c > CHAR_z) goto ERROR_RETURN;
2261
2262
    /* Lower case a capital letter or remember where the name/value separator
2263
    is. */
2264
2265
0
    if (c >= CHAR_A && c <= CHAR_Z) c |= 0x20;
2266
0
    /* Only the first separator is recorded, because of the vptr == NULL
    test. */
    else if ((c == CHAR_COLON || c == CHAR_EQUALS_SIGN) && vptr == NULL)
2267
0
      vptr = name + i;
2268
2269
0
    name[i] = c;
2270
0
    }
2271
2272
  /* Error if the loop didn't end with '}' - either we hit the end of the
2273
  pattern or the name was longer than any legal property name. */
2274
2275
0
  if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
2276
0
  name[i] = 0;
2277
0
  }
2278
2279
/* If { doesn't follow \p or \P there is just one following character, which
2280
must be an ASCII letter. */
2281
2282
1.77k
else if (c >= CHAR_A && c <= CHAR_Z)
2283
855
  {
2284
855
  name[0] = c | 0x20;  /* Lower case */
2285
855
  name[1] = 0;
2286
855
  }
2287
916
else if (c >= CHAR_a && c <= CHAR_z)
2288
901
  {
2289
901
  name[0] = c;
2290
901
  name[1] = 0;
2291
901
  }
2292
15
else goto ERROR_RETURN;
2293
2294
1.75k
*ptrptr = ptr;   /* Update pattern pointer */
2295
2296
/* If the property contains ':' or '=' we have class name and value separately
2297
specified. The following are supported:
2298
2299
  . Bidi_Class (synonym bc), for which the property names are "bidi<name>".
2300
  . Script (synonym sc) for which the property name is the script name
2301
  . Script_Extensions (synonym scx), ditto
2302
2303
As this is a small number, we currently just check the names directly. If this
2304
grows, a sorted table and a switch will be neater.
2305
2306
For both the script properties, set a PT_xxx value so that (1) they can be
2307
distinguished and (2) invalid script names that happen to be the name of
2308
another property can be diagnosed. */
2309
2310
1.75k
if (vptr != NULL)
2311
0
  {
2312
0
  /* offset is the length of the prefix held in sname[] that is to be placed in
  front of the value; it stays zero when no prefix is needed. */
  int offset = 0;
2313
0
  PCRE2_UCHAR sname[8];
2314
2315
0
  *vptr = 0;   /* Terminate property name */
2316
0
  if (PRIV(strcmp_c8)(name, STRING_bidiclass) == 0 ||
2317
0
      PRIV(strcmp_c8)(name, STRING_bc) == 0)
2318
0
    {
2319
0
    offset = 4;
2320
0
    sname[0] = CHAR_b;
2321
0
    sname[1] = CHAR_i;  /* There is no strcpy_c8 function */
2322
0
    sname[2] = CHAR_d;
2323
0
    sname[3] = CHAR_i;
2324
0
    }
2325
2326
0
  else if (PRIV(strcmp_c8)(name, STRING_script) == 0 ||
2327
0
           PRIV(strcmp_c8)(name, STRING_sc) == 0)
2328
0
    ptscript = PT_SC;
2329
2330
0
  else if (PRIV(strcmp_c8)(name, STRING_scriptextensions) == 0 ||
2331
0
           PRIV(strcmp_c8)(name, STRING_scx) == 0)
2332
0
    ptscript = PT_SCX;
2333
2334
0
  else
2335
0
    {
2336
0
    *errorcodeptr = ERR47;
2337
0
    return FALSE;
2338
0
    }
2339
2340
  /* Adjust the string in name[] as needed */
2341
2342
0
  /* The value, including its terminating zero at name[i], is moved to
  name + offset. The count (name + i - vptr) is the number of code units from
  vptr + 1 up to and including name[i]. Source and destination can overlap,
  hence memmove. The prefix, if any, is then copied in front of the value. */
  memmove(name + offset, vptr + 1, (name + i - vptr)*sizeof(PCRE2_UCHAR));
2343
0
  if (offset != 0) memmove(name, sname, offset*sizeof(PCRE2_UCHAR));
2344
0
  }
2345
2346
/* Search for a recognized property using binary chop. */
2347
2348
1.75k
/* The candidate indexes are those with bot <= index < top. */
bot = 0;
2349
1.75k
top = PRIV(utt_size);
2350
2351
14.0k
while (bot < top)
2352
14.0k
  {
2353
14.0k
  int r;
2354
14.0k
  i = (bot + top) >> 1;
2355
14.0k
  r = PRIV(strcmp_c8)(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
2356
2357
  /* When a matching property is found, some extra checking is needed when the
2358
  \p{xx:yy} syntax is used and xx is either sc or scx. */
2359
2360
14.0k
  if (r == 0)
2361
1.74k
    {
2362
1.74k
    *pdataptr = PRIV(utt)[i].value;
2363
1.74k
    if (vptr == NULL || ptscript == PT_NOTSCRIPT)
2364
1.74k
      {
2365
1.74k
      *ptypeptr = PRIV(utt)[i].type;
2366
1.74k
      return TRUE;
2367
1.74k
      }
2368
2369
0
    /* Reached only for the sc: and scx: forms. A table entry of type PT_SC is
    always reported as PT_SC; an entry of type PT_SCX is reported as whichever
    of PT_SC or PT_SCX was requested (held in ptscript). */
    switch (PRIV(utt)[i].type)
2370
0
      {
2371
0
      case PT_SC:
2372
0
      *ptypeptr = PT_SC;
2373
0
      return TRUE;
2374
2375
0
      case PT_SCX:
2376
0
      *ptypeptr = ptscript;
2377
0
      return TRUE;
2378
0
      }
2379
2380
0
    /* This break leaves the while loop, so control reaches the ERR47 return
    below. */
    break;  /* Non-script found */
2381
0
    }
2382
2383
12.3k
  if (r > 0) bot = i + 1; else top = i;
2384
12.3k
  }
2385
2386
7
*errorcodeptr = ERR47;   /* Unrecognized property */
2387
7
return FALSE;
2388
2389
15
ERROR_RETURN:            /* Malformed \P or \p */
2390
15
*errorcodeptr = ERR46;
2391
15
/* *ptrptr is set to the position that the scan had reached. */
*ptrptr = ptr;
2392
15
return FALSE;
2393
1.75k
}
2394
#endif
2395
2396
2397
2398
/*************************************************
2399
*           Check for POSIX class syntax         *
2400
*************************************************/
2401
2402
/* This function is called when the sequence "[:" or "[." or "[=" is
2403
encountered in a character class. It checks whether this is followed by a
2404
sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2405
reach an unescaped ']' without the special preceding character, return FALSE.
2406
2407
Originally, this function only recognized a sequence of letters between the
2408
terminators, but it seems that Perl recognizes any sequence of characters,
2409
though of course unknown POSIX names are subsequently rejected. Perl gives an
2410
"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2411
didn't consider this to be a POSIX class. Likewise for [:1234:].
2412
2413
The problem in trying to be exactly like Perl is in the handling of escapes. We
2414
have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2415
class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2416
below handles the special cases \\ and \], but does not try to do any other
2417
escape processing. This makes it different from Perl for cases such as
2418
[:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
2419
not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
2420
when Perl does, I think.
2421
2422
A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2423
It seems that the appearance of a nested POSIX class supersedes an apparent
2424
external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2425
a digit. This is handled by returning FALSE if the start of a new group with
2426
the same terminator is encountered, since the next closing sequence must close
2427
the nested group, not the outer one.
2428
2429
In Perl, unescaped square brackets may also appear as part of class names. For
2430
example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2431
[:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2432
seem right at all. PCRE does not allow closing square brackets in POSIX class
2433
names.
2434
2435
Arguments:
2436
  ptr      pointer to the character after the initial [ (colon, dot, equals)
2437
  ptrend   pointer to the end of the pattern
2438
  endptr   where to return a pointer to the terminating ':', '.', or '='
2439
2440
Returns:   TRUE or FALSE
2441
*/
2442
2443
static BOOL
2444
check_posix_syntax(PCRE2_SPTR ptr, PCRE2_SPTR ptrend, PCRE2_SPTR *endptr)
2445
333
{
2446
333
PCRE2_UCHAR terminator;  /* Don't combine these lines; the Solaris cc */
2447
333
terminator = *ptr++;     /* compiler warns about "non-constant" initializer. */
2448
2449
27.1k
for (; ptrend - ptr >= 2; ptr++)
2450
27.1k
  {
2451
27.1k
  if (*ptr == CHAR_BACKSLASH &&
2452
27.1k
      (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET || ptr[1] == CHAR_BACKSLASH))
2453
256
    ptr++;
2454
2455
26.8k
  else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
2456
26.8k
            *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2457
2458
26.6k
  else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2459
0
    {
2460
0
    *endptr = ptr;
2461
0
    return TRUE;
2462
0
    }
2463
27.1k
  }
2464
2465
40
return FALSE;
2466
333
}
2467
2468
2469
2470
/*************************************************
2471
*          Check POSIX class name                *
2472
*************************************************/
2473
2474
/* This function is called to check the name given in a POSIX-style class entry
2475
such as [:alnum:].
2476
2477
Arguments:
2478
  ptr        points to the first letter
2479
  len        the length of the name
2480
2481
Returns:     a value representing the name, or -1 if unknown
2482
*/
2483
2484
static int
2485
check_posix_name(PCRE2_SPTR ptr, int len)
2486
0
{
2487
0
const char *pn = posix_names;
2488
0
int yield = 0;
2489
0
while (posix_name_lengths[yield] != 0)
2490
0
  {
2491
0
  if (len == posix_name_lengths[yield] &&
2492
0
    PRIV(strncmp_c8)(ptr, pn, (unsigned int)len) == 0) return yield;
2493
0
  pn += posix_name_lengths[yield] + 1;
2494
0
  yield++;
2495
0
  }
2496
0
return -1;
2497
0
}
2498
2499
2500
2501
/*************************************************
2502
*       Read a subpattern or VERB name           *
2503
*************************************************/
2504
2505
/* This function is called from parse_regex() below whenever it needs to read
2506
the name of a subpattern or a (*VERB) or an (*alpha_assertion). The initial
2507
pointer must be to the preceding character. If that character is '*' we are
2508
reading a verb or alpha assertion name. The pointer is updated to point after
2509
the name, for a VERB or alpha assertion name, or after the name's terminator
2510
for a subpattern name. Returning both the offset and the name pointer is
2511
redundant information, but some callers use one and some the other, so it is
2512
simplest just to return both. When the name is in braces, spaces and tabs are
2513
allowed (and ignored) at either end.
2514
2515
Arguments:
2516
  ptrptr      points to the character pointer variable
2517
  ptrend      points to the end of the input string
2518
  utf         true if the input is UTF-encoded
2519
  terminator  the terminator of a subpattern name must be this
2520
  offsetptr   where to put the offset from the start of the pattern
2521
  nameptr     where to put a pointer to the name in the input
2522
  namelenptr  where to put the length of the name
2523
  errcodeptr  where to put an error code
2524
  cb          pointer to the compile data block
2525
2526
Returns:    TRUE if a name was read
2527
            FALSE otherwise, with error code set
2528
*/
2529
2530
static BOOL
2531
read_name(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, BOOL utf, uint32_t terminator,
2532
  PCRE2_SIZE *offsetptr, PCRE2_SPTR *nameptr, uint32_t *namelenptr,
2533
  int *errorcodeptr, compile_block *cb)
2534
74
{
2535
74
PCRE2_SPTR ptr = *ptrptr;
2536
74
BOOL is_group = (*ptr++ != CHAR_ASTERISK);
2537
74
BOOL is_braced = terminator == CHAR_RIGHT_CURLY_BRACKET;
2538
2539
74
if (is_braced)
2540
2
  while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2541
2542
74
if (ptr >= ptrend)                 /* No characters in name */
2543
0
  {
2544
0
  *errorcodeptr = is_group? ERR62: /* Subpattern name expected */
2545
0
                            ERR60; /* Verb not recognized or malformed */
2546
0
  goto FAILED;
2547
0
  }
2548
2549
74
*nameptr = ptr;
2550
74
*offsetptr = (PCRE2_SIZE)(ptr - cb->start_pattern);
2551
2552
/* If this logic were ever to change, the matching function in pcre2_substitute.c
2553
ought to be updated to match. */
2554
2555
/* In UTF mode, a group name may contain letters and decimal digits as defined
2556
by Unicode properties, and underscores, but must not start with a digit. */
2557
2558
74
#ifdef SUPPORT_UNICODE
2559
74
if (utf && is_group)
2560
3
  {
2561
3
  uint32_t c, type;
2562
2563
3
  GETCHAR(c, ptr);
2564
3
  type = UCD_CHARTYPE(c);
2565
2566
3
  if (type == ucp_Nd)
2567
0
    {
2568
0
    *errorcodeptr = ERR44;
2569
0
    goto FAILED;
2570
0
    }
2571
2572
3
  for(;;)
2573
3
    {
2574
3
    if (type != ucp_Nd && PRIV(ucp_gentype)[type] != ucp_L &&
2575
3
        c != CHAR_UNDERSCORE) break;
2576
0
    ptr++;
2577
0
    FORWARDCHARTEST(ptr, ptrend);
2578
0
    if (ptr >= ptrend) break;
2579
0
    GETCHAR(c, ptr);
2580
0
    type = UCD_CHARTYPE(c);
2581
0
    }
2582
3
  }
2583
71
else
2584
#else
2585
(void)utf;  /* Avoid compiler warning */
2586
#endif      /* SUPPORT_UNICODE */
2587
2588
/* Handle non-group names and group names in non-UTF modes. A group name must
2589
not start with a digit. If either of the others start with a digit it just
2590
won't be recognized. */
2591
2592
71
  {
2593
71
  if (is_group && IS_DIGIT(*ptr))
2594
0
    {
2595
0
    *errorcodeptr = ERR44;
2596
0
    goto FAILED;
2597
0
    }
2598
2599
74
  while (ptr < ptrend && MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_word) != 0)
2600
3
    {
2601
3
    ptr++;
2602
3
    }
2603
71
  }
2604
2605
/* Check name length */
2606
2607
74
if (ptr > *nameptr + MAX_NAME_SIZE)
2608
0
  {
2609
0
  *errorcodeptr = ERR48;
2610
0
  goto FAILED;
2611
0
  }
2612
74
*namelenptr = (uint32_t)(ptr - *nameptr);
2613
2614
/* Subpattern names must not be empty, and their terminator is checked here.
2615
(What follows a verb or alpha assertion name is checked separately.) */
2616
2617
74
if (is_group)
2618
20
  {
2619
20
  if (ptr == *nameptr)
2620
17
    {
2621
17
    *errorcodeptr = ERR62;   /* Subpattern name expected */
2622
17
    goto FAILED;
2623
17
    }
2624
3
  if (is_braced)
2625
0
    while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2626
3
  if (ptr >= ptrend || *ptr != (PCRE2_UCHAR)terminator)
2627
3
    {
2628
3
    *errorcodeptr = ERR42;
2629
3
    goto FAILED;
2630
3
    }
2631
0
  ptr++;
2632
0
  }
2633
2634
54
*ptrptr = ptr;
2635
54
return TRUE;
2636
2637
20
FAILED:
2638
20
*ptrptr = ptr;
2639
20
return FALSE;
2640
74
}
2641
2642
2643
2644
/*************************************************
2645
*          Manage callouts at start of cycle     *
2646
*************************************************/
2647
2648
/* At the start of a new item in parse_regex() we are able to record the
2649
details of the previous item in a prior callout, and also to set up an
2650
automatic callout if enabled. Avoid having two adjacent automatic callouts,
2651
which would otherwise happen for items such as \Q that contribute nothing to
2652
the parsed pattern.
2653
2654
Arguments:
2655
  ptr              current pattern pointer
2656
  pcalloutptr      points to a pointer to previous callout, or NULL
2657
  auto_callout     TRUE if auto_callouts are enabled
2658
  parsed_pattern   the parsed pattern pointer
2659
  cb               compile block
2660
2661
Returns: possibly updated parsed_pattern pointer.
2662
*/
2663
2664
static uint32_t *
2665
manage_callouts(PCRE2_SPTR ptr, uint32_t **pcalloutptr, BOOL auto_callout,
2666
  uint32_t *parsed_pattern, compile_block *cb)
2667
871k
{
2668
871k
uint32_t *previous_callout = *pcalloutptr;
2669
2670
871k
if (previous_callout != NULL) previous_callout[2] = (uint32_t)(ptr -
2671
0
  cb->start_pattern - (PCRE2_SIZE)previous_callout[1]);
2672
2673
871k
if (!auto_callout) previous_callout = NULL; else
2674
0
  {
2675
0
  if (previous_callout == NULL ||
2676
0
      previous_callout != parsed_pattern - 4 ||
2677
0
      previous_callout[3] != 255)
2678
0
    {
2679
0
    previous_callout = parsed_pattern;  /* Set up new automatic callout */
2680
0
    parsed_pattern += 4;
2681
0
    previous_callout[0] = META_CALLOUT_NUMBER;
2682
0
    previous_callout[2] = 0;
2683
0
    previous_callout[3] = 255;
2684
0
    }
2685
0
  previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
2686
0
  }
2687
2688
871k
*pcalloutptr = previous_callout;
2689
871k
return parsed_pattern;
2690
871k
}
2691
2692
2693
2694
/*************************************************
2695
*          Handle \d, \D, \s, \S, \w, \W         *
2696
*************************************************/
2697
2698
/* This function is called from parse_regex() below, both for freestanding
2699
escapes, and those within classes, to handle those escapes that may change when
2700
Unicode property support is requested. Note that PCRE2_UCP will never be set
2701
without Unicode support because that is checked when pcre2_compile() is called.
2702
2703
Arguments:
2704
  escape          the ESC_... value
2705
  parsed_pattern  where to add the code
2706
  options         options bits
2707
  xoptions        extra options bits
2708
2709
Returns:          updated value of parsed_pattern
2710
*/
2711
static uint32_t *
2712
handle_escdsw(int escape, uint32_t *parsed_pattern, uint32_t options,
2713
  uint32_t xoptions)
2714
19.0k
{
2715
19.0k
uint32_t ascii_option = 0;
2716
19.0k
uint32_t prop = ESC_p;
2717
2718
19.0k
switch(escape)
2719
19.0k
  {
2720
1.62k
  case ESC_D:
2721
1.62k
  prop = ESC_P;
2722
  /* Fall through */
2723
5.12k
  case ESC_d:
2724
5.12k
  ascii_option = PCRE2_EXTRA_ASCII_BSD;
2725
5.12k
  break;
2726
2727
376
  case ESC_S:
2728
376
  prop = ESC_P;
2729
  /* Fall through */
2730
1.79k
  case ESC_s:
2731
1.79k
  ascii_option = PCRE2_EXTRA_ASCII_BSS;
2732
1.79k
  break;
2733
2734
1.72k
  case ESC_W:
2735
1.72k
  prop = ESC_P;
2736
  /* Fall through */
2737
12.1k
  case ESC_w:
2738
12.1k
  ascii_option = PCRE2_EXTRA_ASCII_BSW;
2739
12.1k
  break;
2740
19.0k
  }
2741
2742
19.0k
if ((options & PCRE2_UCP) == 0 || (xoptions & ascii_option) != 0)
2743
17.9k
  {
2744
17.9k
  *parsed_pattern++ = META_ESCAPE + escape;
2745
17.9k
  }
2746
1.14k
else
2747
1.14k
  {
2748
1.14k
  *parsed_pattern++ = META_ESCAPE + prop;
2749
1.14k
  switch(escape)
2750
1.14k
    {
2751
128
    case ESC_d:
2752
129
    case ESC_D:
2753
129
    *parsed_pattern++ = (PT_PC << 16) | ucp_Nd;
2754
129
    break;
2755
2756
17
    case ESC_s:
2757
177
    case ESC_S:
2758
177
    *parsed_pattern++ = PT_SPACE << 16;
2759
177
    break;
2760
2761
827
    case ESC_w:
2762
839
    case ESC_W:
2763
839
    *parsed_pattern++ = PT_WORD << 16;
2764
839
    break;
2765
1.14k
    }
2766
1.14k
  }
2767
2768
19.0k
return parsed_pattern;
2769
19.0k
}
2770
2771
2772
2773
/*************************************************
2774
* Maximum size of parsed_pattern for given input *
2775
*************************************************/
2776
2777
/* This function is called from parse_regex() below, to determine the amount
2778
of memory to allocate for parsed_pattern. It is also called to check whether
2779
the amount of data written respects the amount of memory allocated.
2780
2781
Arguments:
2782
  ptr             points to the start of the pattern
2783
  ptrend          points to the end of the pattern
2784
  utf             TRUE in UTF mode
2785
  options         the options bits
2786
2787
Returns:          the number of uint32_t units for parsed_pattern
2788
*/
2789
static ptrdiff_t
2790
max_parsed_pattern(PCRE2_SPTR ptr, PCRE2_SPTR ptrend, BOOL utf,
2791
  uint32_t options)
2792
2.35k
{
2793
2.35k
PCRE2_SIZE big32count = 0;
2794
2.35k
ptrdiff_t parsed_size_needed;
2795
2796
/* When PCRE2_AUTO_CALLOUT is not set, in all but one case the number of
2797
unsigned 32-bit ints written out to the parsed pattern is bounded by the length
2798
of the pattern. The exceptional case is when running in 32-bit, non-UTF mode,
2799
when literal characters greater than META_END (0x80000000) have to be coded as
2800
two units. In this case, therefore, we scan the pattern to check for such
2801
values. */
2802
2803
#if PCRE2_CODE_UNIT_WIDTH == 32
2804
if (!utf)
2805
  {
2806
  PCRE2_SPTR p;
2807
  for (p = ptr; p < ptrend; p++) if (*p >= META_END) big32count++;
2808
  }
2809
#else
2810
2.35k
(void)utf;  /* Avoid compiler warning */
2811
2.35k
#endif
2812
2813
2.35k
parsed_size_needed = (ptrend - ptr) + big32count;
2814
2815
/* When PCRE2_AUTO_CALLOUT is set we have to assume a numerical callout (4
2816
elements) for each character. This is overkill, but memory is plentiful these
2817
days. */
2818
2819
2.35k
if ((options & PCRE2_AUTO_CALLOUT) != 0)
2820
0
  parsed_size_needed += (ptrend - ptr) * 4;
2821
2822
2.35k
return parsed_size_needed;
2823
2.35k
}
2824
2825
2826
2827
/*************************************************
2828
*      Parse regex and identify named groups     *
2829
*************************************************/
2830
2831
/* This function is called first of all. It scans the pattern and does two
2832
things: (1) It identifies capturing groups and makes a table of named capturing
2833
groups so that information about them is fully available to both the compiling
2834
scans. (2) It writes a parsed version of the pattern with comments omitted and
2835
escapes processed into the parsed_pattern vector.
2836
2837
Arguments:
2838
  ptr             points to the start of the pattern
2839
  options         compiling dynamic options (may change during the scan)
2840
  has_lookbehind  points to a boolean, set TRUE if a lookbehind is found
2841
  cb              pointer to the compile data block
2842
2843
Returns:   zero on success or a non-zero error code, with the
2844
             error offset placed in the cb field
2845
*/
2846
2847
/* A structure and some flags for dealing with nested groups. */
2848
2849
/* One saved record per nested group. NOTE(review): the field descriptions
below are inferred from the field names only; confirm against their use in
parse_regex(). */

typedef struct nest_save {
  uint16_t  nest_depth;    /* presumably the nesting depth of the record */
  uint16_t  reset_group;   /* presumably a saved group number */
  uint16_t  max_group;     /* presumably the highest group number noted */
  uint16_t  flags;         /* NSF_xxx bits, defined below */
  uint32_t  options;       /* saved options bits */
  uint32_t  xoptions;      /* saved extra options bits */
} nest_save;

/* Bits for the flags field of nest_save. */

#define NSF_RESET          (1u << 0)
#define NSF_CONDASSERT     (1u << 1)
#define NSF_ATOMICSR       (1u << 2)
2861
2862
/* Options that are changeable within the pattern must be tracked during
2863
parsing. Some (e.g. PCRE2_EXTENDED) are implemented entirely during parsing,
2864
but all must be tracked so that META_OPTIONS items set the correct values for
2865
the main compiling phase. */
2866
2867
766
/* Main options that are tracked while parsing. */

#define PARSE_TRACKED_OPTIONS ( \
  PCRE2_CASELESS        | \
  PCRE2_DOTALL          | \
  PCRE2_DUPNAMES        | \
  PCRE2_EXTENDED        | \
  PCRE2_EXTENDED_MORE   | \
  PCRE2_MULTILINE       | \
  PCRE2_NO_AUTO_CAPTURE | \
  PCRE2_UNGREEDY)

/* Extra options that are tracked while parsing. */

#define PARSE_TRACKED_EXTRA_OPTIONS ( \
  PCRE2_EXTRA_CASELESS_RESTRICT | \
  PCRE2_EXTRA_ASCII_BSD         | \
  PCRE2_EXTRA_ASCII_BSS         | \
  PCRE2_EXTRA_ASCII_BSW         | \
  PCRE2_EXTRA_ASCII_DIGIT       | \
  PCRE2_EXTRA_ASCII_POSIX)
2874
2875
/* States used for analyzing ranges in character classes. The two OK values
2876
must be last. */
2877
2878
enum {
  /* State after '[' (initial), or '[a-z'; hyphen is literal */
  RANGE_NO = 0,
  /* State after '[1-'; last-emitted code is META_RANGE_XYZ */
  RANGE_STARTED = 1,
  /* State after '[\d'; '-]' is allowed but not '-1]' */
  RANGE_FORBID_NO = 2,
  /* State after '[\d-' */
  RANGE_FORBID_STARTED = 3,
  /* State after '[\1'; hyphen may be a range */
  RANGE_OK_ESCAPED = 4,
  /* State after '[1'; hyphen may be a range */
  RANGE_OK_LITERAL = 5
};
2886
2887
/* States used for analyzing operators and operands in extended character
2888
classes. */
2889
2890
enum {
  /* At start of an expression; empty previous contents */
  CLASS_OP_EMPTY = 0,
  /* Have preceding operand; after "z" a "--" can follow */
  CLASS_OP_OPERAND = 1,
  /* Have preceding operator; after "--" operand must follow */
  CLASS_OP_OPERATOR = 2
};
2895
2896
/* States used for determining the parse mode in character classes. The two
2897
PERL_EXT values must be last. */
2898
2899
enum {
  /* Ordinary PCRE2 '[...]' class. */
  CLASS_MODE_NORMAL = 0,
  /* UTS#18-style extended '[...]' class. */
  CLASS_MODE_ALT_EXT = 1,
  /* Perl extended '(?[...])' class. */
  CLASS_MODE_PERL_EXT = 2,
  /* Leaf within extended '(?[ [...] ])' class. */
  CLASS_MODE_PERL_EXT_LEAF = 3
};
2905
2906
/* Only in 32-bit mode can there be literals > META_END. A macro encapsulates
2907
the storing of literal values in the main parsed pattern, where they can always
2908
be quantified. */
2909
2910
/* PARSED_LITERAL(c, p) stores the literal c at *p, advances p, and sets
okquantifier to TRUE. In 32-bit mode a value of META_END or more is preceded
by META_BIGVALUE, so c is evaluated twice there and must have no side effects.

Both forms are wrapped in do { } while (0) so that the macro expands to exactly
one statement. It is then safe as the body of an unbraced "if" or "else", and
must always be followed by a semicolon. The arguments are parenthesized so
that expression arguments expand as intended. */

#if PCRE2_CODE_UNIT_WIDTH == 32
#define PARSED_LITERAL(c, p) \
  do { \
  if ((c) >= META_END) *(p)++ = META_BIGVALUE; \
  *(p)++ = (c); \
  okquantifier = TRUE; \
  } while (0)
#else
#define PARSED_LITERAL(c, p) \
  do { \
  *(p)++ = (c); \
  okquantifier = TRUE; \
  } while (0)
#endif
2920
2921
/* Here's the actual function. */
2922
2923
static int parse_regex(PCRE2_SPTR ptr, uint32_t options, uint32_t xoptions,
2924
  BOOL *has_lookbehind, compile_block *cb)
2925
2.35k
{
2926
2.35k
uint32_t c;
2927
2.35k
uint32_t delimiter;
2928
2.35k
uint32_t namelen;
2929
2.35k
uint32_t class_range_state;
2930
2.35k
uint32_t class_op_state;
2931
2.35k
uint32_t class_mode_state;
2932
2.35k
uint32_t *class_start;
2933
2.35k
uint32_t *verblengthptr = NULL;     /* Value avoids compiler warning */
2934
2.35k
uint32_t *verbstartptr = NULL;
2935
2.35k
uint32_t *previous_callout = NULL;
2936
2.35k
uint32_t *parsed_pattern = cb->parsed_pattern;
2937
2.35k
uint32_t *parsed_pattern_end = cb->parsed_pattern_end;
2938
2.35k
uint32_t *this_parsed_item = NULL;
2939
2.35k
uint32_t *prev_parsed_item = NULL;
2940
2.35k
uint32_t meta_quantifier = 0;
2941
2.35k
uint32_t add_after_mark = 0;
2942
2.35k
uint16_t nest_depth = 0;
2943
2.35k
int16_t class_depth_m1 = -1; /* The m1 means minus 1. */
2944
2.35k
int16_t class_maxdepth_m1 = -1;
2945
2.35k
int after_manual_callout = 0;
2946
2.35k
int expect_cond_assert = 0;
2947
2.35k
int errorcode = 0;
2948
2.35k
int escape;
2949
2.35k
int i;
2950
2.35k
BOOL inescq = FALSE;
2951
2.35k
BOOL inverbname = FALSE;
2952
2.35k
BOOL utf = (options & PCRE2_UTF) != 0;
2953
2.35k
BOOL auto_callout = (options & PCRE2_AUTO_CALLOUT) != 0;
2954
2.35k
BOOL isdupname;
2955
2.35k
BOOL negate_class;
2956
2.35k
BOOL okquantifier = FALSE;
2957
2.35k
PCRE2_SPTR thisptr;
2958
2.35k
PCRE2_SPTR name;
2959
2.35k
PCRE2_SPTR ptrend = cb->end_pattern;
2960
2.35k
PCRE2_SPTR verbnamestart = NULL;    /* Value avoids compiler warning */
2961
2.35k
PCRE2_SPTR class_range_forbid_ptr = NULL;
2962
2.35k
named_group *ng;
2963
2.35k
nest_save *top_nest, *end_nests;
2964
#ifdef PCRE2_DEBUG
2965
uint32_t *parsed_pattern_check;
2966
ptrdiff_t parsed_pattern_extra = 0;
2967
ptrdiff_t parsed_pattern_extra_check = 0;
2968
PCRE2_SPTR ptr_check;
2969
#endif
2970
2971
2.35k
PCRE2_ASSERT(parsed_pattern != NULL);
2972
2973
/* Insert leading items for word and line matching (features provided for the
2974
benefit of pcre2grep). */
2975
2976
2.35k
if ((xoptions & PCRE2_EXTRA_MATCH_LINE) != 0)
2977
0
  {
2978
0
  *parsed_pattern++ = META_CIRCUMFLEX;
2979
0
  *parsed_pattern++ = META_NOCAPTURE;
2980
0
  }
2981
2.35k
else if ((xoptions & PCRE2_EXTRA_MATCH_WORD) != 0)
2982
0
  {
2983
0
  *parsed_pattern++ = META_ESCAPE + ESC_b;
2984
0
  *parsed_pattern++ = META_NOCAPTURE;
2985
0
  }
2986
2987
#ifdef PCRE2_DEBUG
2988
parsed_pattern_check = parsed_pattern;
2989
ptr_check = ptr;
2990
#endif
2991
2992
/* If the pattern is actually a literal string, process it separately to avoid
2993
cluttering up the main loop. */
2994
2995
2.35k
if ((options & PCRE2_LITERAL) != 0)
2996
0
  {
2997
0
  while (ptr < ptrend)
2998
0
    {
2999
0
    if (parsed_pattern >= parsed_pattern_end)
3000
0
      {
3001
0
      PCRE2_DEBUG_UNREACHABLE();
3002
0
      errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
3003
0
      goto FAILED;
3004
0
      }
3005
0
    thisptr = ptr;
3006
0
    GETCHARINCTEST(c, ptr);
3007
0
    if (auto_callout)
3008
0
      parsed_pattern = manage_callouts(thisptr, &previous_callout,
3009
0
        auto_callout, parsed_pattern, cb);
3010
0
    PARSED_LITERAL(c, parsed_pattern);
3011
0
    }
3012
0
  goto PARSED_END;
3013
0
  }
3014
3015
/* Process a real regex which may contain meta-characters. */
3016
3017
2.35k
top_nest = NULL;
3018
2.35k
end_nests = (nest_save *)(cb->start_workspace + cb->workspace_size);
3019
3020
/* The size of the nest_save structure might not be a factor of the size of the
3021
workspace. Therefore we must round down end_nests so as to correctly avoid
3022
creating a nest_save that spans the end of the workspace. */
3023
3024
2.35k
end_nests = (nest_save *)((char *)end_nests -
3025
2.35k
  ((cb->workspace_size * sizeof(PCRE2_UCHAR)) % sizeof(nest_save)));
3026
3027
/* PCRE2_EXTENDED_MORE implies PCRE2_EXTENDED */
3028
3029
2.35k
if ((options & PCRE2_EXTENDED_MORE) != 0) options |= PCRE2_EXTENDED;
3030
3031
/* Now scan the pattern */
3032
3033
964k
while (ptr < ptrend)
3034
963k
  {
3035
963k
  int prev_expect_cond_assert;
3036
963k
  uint32_t min_repeat = 0, max_repeat = 0;
3037
963k
  uint32_t set, unset, *optset;
3038
963k
  uint32_t xset, xunset, *xoptset;
3039
963k
  uint32_t terminator;
3040
963k
  uint32_t prev_meta_quantifier;
3041
963k
  BOOL prev_okquantifier;
3042
963k
  PCRE2_SPTR tempptr;
3043
963k
  PCRE2_SIZE offset;
3044
3045
963k
  if (nest_depth > cb->cx->parens_nest_limit)
3046
4
    {
3047
4
    errorcode = ERR19;
3048
4
    goto FAILED;        /* Parentheses too deeply nested */
3049
4
    }
3050
3051
  /* Check that we haven't emitted too much into parsed_pattern. We allocate
3052
  a suitably-sized buffer upfront, then do unchecked writes to it. If we only
3053
  write a little bit too much, everything will appear to be OK, because the
3054
  upfront size is an overestimate... but a malicious pattern could end up
3055
  forcing a write past the buffer end. We must catch this during
3056
  development. */
3057
3058
#ifdef PCRE2_DEBUG
3059
  /* Strong post-write check. Won't help in release builds - at this point
3060
  the write has already occurred so it's too late. However, should stop us
3061
  committing unsafe code. */
3062
  PCRE2_ASSERT((parsed_pattern - parsed_pattern_check) +
3063
               (parsed_pattern_extra - parsed_pattern_extra_check) <=
3064
                 max_parsed_pattern(ptr_check, ptr, utf, options));
3065
  parsed_pattern_check = parsed_pattern;
3066
  parsed_pattern_extra_check = parsed_pattern_extra;
3067
  ptr_check = ptr;
3068
#endif
3069
3070
963k
  if (parsed_pattern >= parsed_pattern_end)
3071
0
    {
3072
    /* Weak pre-write check; only ensures parsed_pattern[0] is writeable
3073
    (but the code below can write many chars). Better than nothing. */
3074
0
    PCRE2_DEBUG_UNREACHABLE();
3075
0
    errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
3076
0
    goto FAILED;
3077
0
    }
3078
3079
  /* If the last time round this loop something was added, parsed_pattern will
3080
  no longer be equal to this_parsed_item. Remember where the previous item
3081
  started and reset for the next item. Note that sometimes round the loop,
3082
  nothing gets added (e.g. for ignored white space). */
3083
3084
963k
  if (this_parsed_item != parsed_pattern)
3085
946k
    {
3086
946k
    prev_parsed_item = this_parsed_item;
3087
946k
    this_parsed_item = parsed_pattern;
3088
946k
    }
3089
3090
  /* Get next input character, save its position for callout handling. */
3091
3092
963k
  thisptr = ptr;
3093
963k
  GETCHARINCTEST(c, ptr);
3094
3095
  /* Copy quoted literals until \E, allowing for the possibility of automatic
3096
  callouts, except when processing a (*VERB) "name".  */
3097
3098
963k
  if (inescq)
3099
7.47k
    {
3100
7.47k
    if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
3101
0
      {
3102
0
      inescq = FALSE;
3103
0
      ptr++;   /* Skip E */
3104
0
      }
3105
7.47k
    else
3106
7.47k
      {
3107
7.47k
      if (expect_cond_assert > 0)   /* A literal is not allowed if we are */
3108
0
        {                           /* expecting a conditional assertion, */
3109
0
        ptr--;                      /* but an empty \Q\E sequence is OK.  */
3110
0
        errorcode = ERR28;
3111
0
        goto FAILED;
3112
0
        }
3113
7.47k
      if (inverbname)
3114
0
        {                          /* Don't use PARSED_LITERAL() because it */
3115
#if PCRE2_CODE_UNIT_WIDTH == 32    /* sets okquantifier. */
3116
        if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
3117
#endif
3118
0
        *parsed_pattern++ = c;
3119
0
        }
3120
7.47k
      else
3121
7.47k
        {
3122
7.47k
        if (after_manual_callout-- <= 0)
3123
7.47k
          parsed_pattern = manage_callouts(thisptr, &previous_callout,
3124
7.47k
            auto_callout, parsed_pattern, cb);
3125
7.47k
        PARSED_LITERAL(c, parsed_pattern);
3126
7.47k
        }
3127
7.47k
      meta_quantifier = 0;
3128
7.47k
      }
3129
7.47k
    continue;  /* Next character */
3130
7.47k
    }
3131
3132
  /* If we are processing the "name" part of a (*VERB:NAME) item, all
3133
  characters up to the closing parenthesis are literals except when
3134
  PCRE2_ALT_VERBNAMES is set. That causes backslash interpretation, but only \Q
3135
  and \E and escaped characters are allowed (no character types such as \d). If
3136
  PCRE2_EXTENDED is also set, we must ignore white space and # comments. Do
3137
  this by not entering the special (*VERB:NAME) processing - they are then
3138
  picked up below. Note that c is a character, not a code unit, so we must not
3139
  use MAX_255 to test its size because MAX_255 tests code units and is assumed
3140
  TRUE in 8-bit mode. */
3141
3142
955k
  if (inverbname &&
3143
955k
       (
3144
        /* EITHER: not both options set */
3145
6.22k
        ((options & (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) !=
3146
6.22k
                    (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) ||
3147
6.22k
#ifdef SUPPORT_UNICODE
3148
        /* OR: character > 255 AND not Unicode Pattern White Space */
3149
6.22k
        (c > 255 && (c|1) != 0x200f && (c|1) != 0x2029) ||
3150
6.22k
#endif
3151
        /* OR: not a # comment or isspace() white space */
3152
6.22k
        (c < 256 && c != CHAR_NUMBER_SIGN && (cb->ctypes[c] & ctype_space) == 0
3153
0
#ifdef SUPPORT_UNICODE
3154
        /* and not CHAR_NEL when Unicode is supported */
3155
0
          && c != CHAR_NEL
3156
0
#endif
3157
0
       )))
3158
6.22k
    {
3159
6.22k
    PCRE2_SIZE verbnamelength;
3160
3161
6.22k
    switch(c)
3162
6.22k
      {
3163
6.17k
      default:                     /* Don't use PARSED_LITERAL() because it */
3164
#if PCRE2_CODE_UNIT_WIDTH == 32    /* sets okquantifier. */
3165
      if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
3166
#endif
3167
6.17k
      *parsed_pattern++ = c;
3168
6.17k
      break;
3169
3170
49
      case CHAR_RIGHT_PARENTHESIS:
3171
49
      inverbname = FALSE;
3172
      /* This is the length in characters */
3173
49
      verbnamelength = (PCRE2_SIZE)(parsed_pattern - verblengthptr - 1);
3174
      /* But the limit on the length is in code units */
3175
49
      if (ptr - verbnamestart - 1 > (int)MAX_MARK)
3176
0
        {
3177
0
        ptr--;
3178
0
        errorcode = ERR76;
3179
0
        goto FAILED;
3180
0
        }
3181
49
      *verblengthptr = (uint32_t)verbnamelength;
3182
3183
      /* If this name was on a verb such as (*ACCEPT) which does not continue,
3184
      a (*MARK) was generated for the name. We now add the original verb as the
3185
      next item. */
3186
3187
49
      if (add_after_mark != 0)
3188
0
        {
3189
0
        *parsed_pattern++ = add_after_mark;
3190
0
        add_after_mark = 0;
3191
0
        }
3192
49
      break;
3193
3194
0
      case CHAR_BACKSLASH:
3195
0
      if ((options & PCRE2_ALT_VERBNAMES) != 0)
3196
0
        {
3197
0
        escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3198
0
          xoptions, cb->bracount, FALSE, cb);
3199
0
        if (errorcode != 0) goto FAILED;
3200
0
        }
3201
0
      else escape = 0;   /* Treat all as literal */
3202
3203
0
      switch(escape)
3204
0
        {
3205
0
        case 0:                    /* Don't use PARSED_LITERAL() because it */
3206
#if PCRE2_CODE_UNIT_WIDTH == 32    /* sets okquantifier. */
3207
        if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
3208
#endif
3209
0
        *parsed_pattern++ = c;
3210
0
        break;
3211
3212
0
        case ESC_ub:
3213
0
        *parsed_pattern++ = CHAR_u;
3214
0
        PARSED_LITERAL(CHAR_LEFT_CURLY_BRACKET, parsed_pattern);
3215
0
        break;
3216
3217
0
        case ESC_Q:
3218
0
        inescq = TRUE;
3219
0
        break;
3220
3221
0
        case ESC_E:           /* Ignore */
3222
0
        break;
3223
3224
0
        default:
3225
0
        errorcode = ERR40;    /* Invalid in verb name */
3226
0
        goto FAILED;
3227
0
        }
3228
6.22k
      }
3229
6.22k
    continue;   /* Next character in pattern */
3230
6.22k
    }
3231
3232
  /* Not a verb name character. At this point we must process everything that
3233
  must not change the quantification state. This is mainly comments, but we
3234
  handle \Q and \E here as well, so that an item such as A\Q\E+ is treated as
3235
  A+, as in Perl. An isolated \E is ignored. */
3236
3237
949k
  if (c == CHAR_BACKSLASH && ptr < ptrend)
3238
51.7k
    {
3239
51.7k
    if (*ptr == CHAR_Q || *ptr == CHAR_E)
3240
69
      {
3241
69
      inescq = *ptr == CHAR_Q;
3242
69
      ptr++;
3243
69
      continue;
3244
69
      }
3245
51.7k
    }
3246
3247
  /* Skip over whitespace and # comments in extended mode. Note that c is a
3248
  character, not a code unit, so we must not use MAX_255 to test its size
3249
  because MAX_255 tests code units and is assumed TRUE in 8-bit mode. The
3250
  whitespace characters are those designated as "Pattern White Space" by
3251
  Unicode, which are the isspace() characters plus CHAR_NEL (newline), which is
3252
  U+0085 in Unicode, plus U+200E, U+200F, U+2028, and U+2029. These are a
3253
  subset of space characters that match \h and \v. */
3254
3255
949k
  if ((options & PCRE2_EXTENDED) != 0)
3256
0
    {
3257
0
    if (c < 256 && (cb->ctypes[c] & ctype_space) != 0) continue;
3258
0
#ifdef SUPPORT_UNICODE
3259
0
    if (c == CHAR_NEL || (c|1) == 0x200f || (c|1) == 0x2029) continue;
3260
0
#endif
3261
0
    if (c == CHAR_NUMBER_SIGN)
3262
0
      {
3263
0
      while (ptr < ptrend)
3264
0
        {
3265
0
        if (IS_NEWLINE(ptr))      /* For non-fixed-length newline cases, */
3266
0
          {                       /* IS_NEWLINE sets cb->nllen. */
3267
0
          ptr += cb->nllen;
3268
0
          break;
3269
0
          }
3270
0
        ptr++;
3271
0
#ifdef SUPPORT_UNICODE
3272
0
        if (utf) FORWARDCHARTEST(ptr, ptrend);
3273
0
#endif
3274
0
        }
3275
0
      continue;  /* Next character in pattern */
3276
0
      }
3277
0
    }
3278
3279
  /* Skip over bracketed comments */
3280
3281
949k
  if (c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 2 &&
3282
949k
      ptr[0] == CHAR_QUESTION_MARK && ptr[1] == CHAR_NUMBER_SIGN)
3283
0
    {
3284
0
    while (++ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS);
3285
0
    if (ptr >= ptrend)
3286
0
      {
3287
0
      errorcode = ERR18;  /* A special error for missing ) in a comment */
3288
0
      goto FAILED;        /* to make it easier to debug. */
3289
0
      }
3290
0
    ptr++;
3291
0
    continue;  /* Next character in pattern */
3292
0
    }
3293
3294
  /* If the next item is not a quantifier, fill in length of any previous
3295
  callout and create an auto callout if required. */
3296
3297
949k
  if (c != CHAR_ASTERISK && c != CHAR_PLUS && c != CHAR_QUESTION_MARK &&
3298
949k
       (c != CHAR_LEFT_CURLY_BRACKET ||
3299
863k
         (tempptr = ptr,
3300
2.52k
         !read_repeat_counts(&tempptr, ptrend, NULL, NULL, &errorcode))))
3301
863k
    {
3302
863k
    if (after_manual_callout-- <= 0)
3303
863k
      {
3304
863k
      parsed_pattern = manage_callouts(thisptr, &previous_callout, auto_callout,
3305
863k
        parsed_pattern, cb);
3306
863k
      this_parsed_item = parsed_pattern;  /* New start for current item */
3307
863k
      }
3308
863k
    }
3309
3310
  /* If expect_cond_assert is 2, we have just passed (?( and are expecting an
3311
  assertion, possibly preceded by a callout. If the value is 1, we have just
3312
  had the callout and expect an assertion. There must be at least 3 more
3313
  characters in all cases. When expect_cond_assert is 2, we know that the
3314
  current character is an opening parenthesis, as otherwise we wouldn't be
3315
  here. However, when it is 1, we need to check, and it's easiest just to check
3316
  always. Note that expect_cond_assert may be negative, since all callouts just
3317
  decrement it. */
3318
3319
949k
  if (expect_cond_assert > 0)
3320
0
    {
3321
0
    BOOL ok = c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 3 &&
3322
0
              (ptr[0] == CHAR_QUESTION_MARK || ptr[0] == CHAR_ASTERISK);
3323
0
    if (ok)
3324
0
      {
3325
0
      if (ptr[0] == CHAR_ASTERISK)  /* New alpha assertion format, possibly */
3326
0
        {
3327
0
        ok = MAX_255(ptr[1]) && (cb->ctypes[ptr[1]] & ctype_lcletter) != 0;
3328
0
        }
3329
0
      else switch(ptr[1])  /* Traditional symbolic format */
3330
0
        {
3331
0
        case CHAR_C:
3332
0
        ok = expect_cond_assert == 2;
3333
0
        break;
3334
3335
0
        case CHAR_EQUALS_SIGN:
3336
0
        case CHAR_EXCLAMATION_MARK:
3337
0
        break;
3338
3339
0
        case CHAR_LESS_THAN_SIGN:
3340
0
        ok = ptr[2] == CHAR_EQUALS_SIGN || ptr[2] == CHAR_EXCLAMATION_MARK;
3341
0
        break;
3342
3343
0
        default:
3344
0
        ok = FALSE;
3345
0
        }
3346
0
      }
3347
3348
0
    if (!ok)
3349
0
      {
3350
0
      ptr--;   /* Adjust error offset */
3351
0
      errorcode = ERR28;
3352
0
      goto FAILED;
3353
0
      }
3354
0
    }
3355
3356
  /* Remember whether we are expecting a conditional assertion, and set the
3357
  default for this item. */
3358
3359
949k
  prev_expect_cond_assert = expect_cond_assert;
3360
949k
  expect_cond_assert = 0;
3361
3362
  /* Remember quantification status for the previous significant item, then set
3363
  default for this item. */
3364
3365
949k
  prev_okquantifier = okquantifier;
3366
949k
  prev_meta_quantifier = meta_quantifier;
3367
949k
  okquantifier = FALSE;
3368
949k
  meta_quantifier = 0;
3369
3370
  /* If the previous significant item was a quantifier, adjust the parsed code
3371
  if there is a following modifier. The base meta value is always followed by
3372
  the PLUS and QUERY values, in that order. We do this here rather than after
3373
  reading a quantifier so that intervening comments and /x whitespace can be
3374
  ignored without having to replicate code. */
3375
3376
949k
  if (prev_meta_quantifier != 0 && (c == CHAR_QUESTION_MARK || c == CHAR_PLUS))
3377
16.0k
    {
3378
16.0k
    parsed_pattern[(prev_meta_quantifier == META_MINMAX)? -3 : -1] =
3379
16.0k
      prev_meta_quantifier + ((c == CHAR_QUESTION_MARK)?
3380
13.5k
        0x00020000u : 0x00010000u);
3381
16.0k
    continue;  /* Next character in pattern */
3382
16.0k
    }
3383
3384
  /* Process the next item in the main part of a pattern. */
3385
3386
933k
  switch(c)
3387
933k
    {
3388
724k
    default:              /* Non-special character */
3389
724k
    PARSED_LITERAL(c, parsed_pattern);
3390
724k
    break;
3391
3392
3393
    /* ---- Escape sequence ---- */
3394
3395
51.7k
    case CHAR_BACKSLASH:
3396
51.7k
    tempptr = ptr;
3397
51.7k
    escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3398
51.7k
      xoptions, cb->bracount, FALSE, cb);
3399
51.7k
    if (errorcode != 0)
3400
114
      {
3401
146
      ESCAPE_FAILED:
3402
146
      if ((xoptions & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
3403
146
        goto FAILED;
3404
0
      ptr = tempptr;
3405
0
      if (ptr >= ptrend) c = CHAR_BACKSLASH; else
3406
0
        {
3407
0
        GETCHARINCTEST(c, ptr);   /* Get character value, increment pointer */
3408
0
        }
3409
0
      escape = 0;                 /* Treat as literal character */
3410
0
      }
3411
3412
    /* The escape was a data escape or literal character. */
3413
3414
51.6k
    if (escape == 0)
3415
19.9k
      {
3416
19.9k
      PARSED_LITERAL(c, parsed_pattern);
3417
19.9k
      }
3418
3419
    /* The escape was a back (or forward) reference. We keep the offset in
3420
    order to give a more useful diagnostic for a bad forward reference. For
3421
    references to groups numbered less than 10 we can't use more than two items
3422
    in parsed_pattern because they may be just two characters in the input (and
3423
    in a 64-bit world an offset may need two elements). So for them, the offset
3424
    of the first occurrence is held in a special vector. */
3425
3426
31.6k
    else if (escape < 0)
3427
1.58k
      {
3428
1.58k
      offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 1);
3429
1.58k
      escape = -escape - 1;
3430
1.58k
      *parsed_pattern++ = META_BACKREF | (uint32_t)escape;
3431
1.58k
      if (escape < 10)
3432
1.39k
        {
3433
1.39k
        if (cb->small_ref_offset[escape] == PCRE2_UNSET)
3434
341
          cb->small_ref_offset[escape] = offset;
3435
1.39k
        }
3436
195
      else
3437
195
        {
3438
195
        PUTOFFSET(offset, parsed_pattern);
3439
195
        }
3440
1.58k
      okquantifier = TRUE;
3441
1.58k
      }
3442
3443
    /* The escape was a character class such as \d etc. or other special
3444
    escape indicator such as \A or \X. Most of them generate just a single
3445
    parsed item, but \P and \p are followed by a 16-bit type and a 16-bit
3446
    value. They are supported only when Unicode is available. The type and
3447
    value are packed into a single 32-bit value so that the whole sequence
3448
    uses only two elements in the parsed_vector. This is because the same
3449
    coding is used if \d (for example) is turned into \p{Nd} when PCRE2_UCP is
3450
    set.
3451
3452
    There are also some cases where the escape sequence is followed by a name:
3453
    \k{name}, \k<name>, and \k'name' are backreferences by name, and \g<name>
3454
    and \g'name' are subroutine calls by name; \g{name} is a synonym for
3455
    \k{name}. Note that \g<number> and \g'number' are handled by check_escape()
3456
    and returned as a negative value (handled above). A name is coded as an
3457
    offset into the pattern and a length. */
3458
3459
30.0k
    else switch (escape)
3460
30.0k
      {
3461
149
      case ESC_C:
3462
#ifdef NEVER_BACKSLASH_C
3463
      errorcode = ERR85;
3464
      goto ESCAPE_FAILED;
3465
#else
3466
149
      if ((options & PCRE2_NEVER_BACKSLASH_C) != 0)
3467
0
        {
3468
0
        errorcode = ERR83;
3469
0
        goto ESCAPE_FAILED;
3470
0
        }
3471
149
#endif
3472
149
      okquantifier = TRUE;
3473
149
      *parsed_pattern++ = META_ESCAPE + escape;
3474
149
      break;
3475
3476
      /* This is a special return that happens only in EXTRA_ALT_BSUX mode,
3477
      when \u{ is not followed by hex digits and }. It requests two literal
3478
      characters, u and { and we need this, as otherwise \u{ 12} (for example)
3479
      would be treated as u{12} now that spaces are allowed in quantifiers. */
3480
3481
0
      case ESC_ub:
3482
0
      *parsed_pattern++ = CHAR_u;
3483
0
      PARSED_LITERAL(CHAR_LEFT_CURLY_BRACKET, parsed_pattern);
3484
0
      break;
3485
3486
602
      case ESC_X:
3487
#ifndef SUPPORT_UNICODE
3488
      errorcode = ERR45;   /* Supported only with Unicode support */
3489
      goto ESCAPE_FAILED;
3490
#endif
3491
1.28k
      case ESC_H:
3492
1.39k
      case ESC_h:
3493
5.67k
      case ESC_N:
3494
11.3k
      case ESC_R:
3495
12.2k
      case ESC_V:
3496
12.7k
      case ESC_v:
3497
12.7k
      okquantifier = TRUE;
3498
12.7k
      *parsed_pattern++ = META_ESCAPE + escape;
3499
12.7k
      break;
3500
3501
1.10k
      default:  /* \A, \B, \b, \G, \K, \Z, \z cannot be quantified. */
3502
1.10k
      *parsed_pattern++ = META_ESCAPE + escape;
3503
1.10k
      break;
3504
3505
      /* Escapes that may change in UCP mode. */
3506
3507
2.40k
      case ESC_d:
3508
3.36k
      case ESC_D:
3509
4.74k
      case ESC_s:
3510
5.08k
      case ESC_S:
3511
15.1k
      case ESC_w:
3512
15.4k
      case ESC_W:
3513
15.4k
      okquantifier = TRUE;
3514
15.4k
      parsed_pattern = handle_escdsw(escape, parsed_pattern, options,
3515
15.4k
        xoptions);
3516
15.4k
      break;
3517
3518
      /* Unicode property matching */
3519
3520
397
      case ESC_P:
3521
526
      case ESC_p:
3522
526
#ifdef SUPPORT_UNICODE
3523
526
        {
3524
526
        BOOL negated;
3525
526
        uint16_t ptype = 0, pdata = 0;
3526
526
        if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
3527
22
          goto ESCAPE_FAILED;
3528
504
        if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
3529
504
        *parsed_pattern++ = META_ESCAPE + escape;
3530
504
        *parsed_pattern++ = (ptype << 16) | pdata;
3531
504
        okquantifier = TRUE;
3532
504
        }
3533
#else
3534
      errorcode = ERR45;
3535
      goto ESCAPE_FAILED;
3536
#endif
3537
0
      break;  /* End \P and \p */
3538
3539
      /* When \g is used with quotes or angle brackets as delimiters, it is a
3540
      numerical or named subroutine call, and control comes here. When used
3541
      with brace delimiters it is a numerical back reference and does not come
3542
      here because check_escape() returns it directly as a reference. \k is
3543
      always a named back reference. */
3544
3545
3
      case ESC_g:
3546
10
      case ESC_k:
3547
10
      if (ptr >= ptrend || (*ptr != CHAR_LEFT_CURLY_BRACKET &&
3548
10
          *ptr != CHAR_LESS_THAN_SIGN && *ptr != CHAR_APOSTROPHE))
3549
5
        {
3550
5
        errorcode = (escape == ESC_g)? ERR57 : ERR69;
3551
5
        goto ESCAPE_FAILED;
3552
5
        }
3553
5
      terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
3554
3
        CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
3555
2
        CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
3556
3557
      /* For a non-braced \g, check for a numerical recursion. */
3558
3559
5
      if (escape == ESC_g && terminator != CHAR_RIGHT_CURLY_BRACKET)
3560
3
        {
3561
3
        PCRE2_SPTR p = ptr + 1;
3562
3563
3
        if (read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
3564
3
            &errorcode))
3565
0
          {
3566
0
          if (p >= ptrend || *p != terminator)
3567
0
            {
3568
0
            errorcode = ERR57;
3569
0
            goto ESCAPE_FAILED;
3570
0
            }
3571
0
          ptr = p;
3572
0
          goto SET_RECURSION;
3573
0
          }
3574
3
        if (errorcode != 0) goto ESCAPE_FAILED;
3575
3
        }
3576
3577
      /* Not a numerical recursion. Perl allows spaces and tabs after { and
3578
      before } but not for other delimiters. */
3579
3580
5
      if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
3581
5
          &errorcode, cb)) goto ESCAPE_FAILED;
3582
3583
      /* \k and \g when used with braces are back references, whereas \g used
3584
      with quotes or angle brackets is a recursion */
3585
3586
0
      *parsed_pattern++ =
3587
0
        (escape == ESC_k || terminator == CHAR_RIGHT_CURLY_BRACKET)?
3588
0
          META_BACKREF_BYNAME : META_RECURSE_BYNAME;
3589
0
      *parsed_pattern++ = namelen;
3590
3591
0
      PUTOFFSET(offset, parsed_pattern);
3592
0
      okquantifier = TRUE;
3593
0
      break;  /* End special escape processing */
3594
30.0k
      }
3595
51.5k
    break;    /* End escape sequence processing */
3596
3597
3598
    /* ---- Single-character special items ---- */
3599
3600
51.5k
    case CHAR_CIRCUMFLEX_ACCENT:
3601
6.33k
    *parsed_pattern++ = META_CIRCUMFLEX;
3602
6.33k
    break;
3603
3604
2.20k
    case CHAR_DOLLAR_SIGN:
3605
2.20k
    *parsed_pattern++ = META_DOLLAR;
3606
2.20k
    break;
3607
3608
11.9k
    case CHAR_DOT:
3609
11.9k
    *parsed_pattern++ = META_DOT;
3610
11.9k
    okquantifier = TRUE;
3611
11.9k
    break;
3612
3613
3614
    /* ---- Single-character quantifiers ---- */
3615
3616
11.8k
    case CHAR_ASTERISK:
3617
11.8k
    meta_quantifier = META_ASTERISK;
3618
11.8k
    goto CHECK_QUANTIFIER;
3619
3620
14.9k
    case CHAR_PLUS:
3621
14.9k
    meta_quantifier = META_PLUS;
3622
14.9k
    goto CHECK_QUANTIFIER;
3623
3624
43.2k
    case CHAR_QUESTION_MARK:
3625
43.2k
    meta_quantifier = META_QUERY;
3626
43.2k
    goto CHECK_QUANTIFIER;
3627
3628
3629
    /* ---- Potential {n,m} quantifier ---- */
3630
3631
2.52k
    case CHAR_LEFT_CURLY_BRACKET:
3632
2.52k
    if (!read_repeat_counts(&ptr, ptrend, &min_repeat, &max_repeat,
3633
2.52k
        &errorcode))
3634
2.51k
      {
3635
2.51k
      if (errorcode != 0) goto FAILED;     /* Error in quantifier. */
3636
2.51k
      PARSED_LITERAL(c, parsed_pattern);   /* Not a quantifier */
3637
2.51k
      break;                               /* No more quantifier processing */
3638
2.51k
      }
3639
6
    meta_quantifier = META_MINMAX;
3640
    /* Fall through */
3641
3642
3643
    /* ---- Quantifier post-processing ---- */
3644
3645
    /* Check that a quantifier is allowed after the previous item. This
3646
    guarantees that there is a previous item. */
3647
3648
70.0k
    CHECK_QUANTIFIER:
3649
70.0k
    if (!prev_okquantifier)
3650
277
      {
3651
277
      errorcode = ERR9;
3652
277
      goto FAILED_BACK;  // TODO https://github.com/PCRE2Project/pcre2/issues/549
3653
277
      }
3654
3655
    /* Most (*VERB)s are not allowed to be quantified, but an ungreedy
3656
    quantifier can be useful for (*ACCEPT) - meaning "succeed on backtrack", a
3657
    sort of negated (*COMMIT). We therefore allow (*ACCEPT) to be quantified by
3658
    wrapping it in non-capturing brackets, but we have to allow for a preceding
3659
    (*MARK) for when (*ACCEPT) has an argument. */
3660
3661
69.8k
    if (*prev_parsed_item == META_ACCEPT)
3662
0
      {
3663
0
      uint32_t *p;
3664
0
      for (p = parsed_pattern - 1; p >= verbstartptr; p--) p[1] = p[0];
3665
0
      *verbstartptr = META_NOCAPTURE;
3666
0
      parsed_pattern[1] = META_KET;
3667
0
      parsed_pattern += 2;
3668
3669
#ifdef PCRE2_DEBUG
3670
      PCRE2_ASSERT(parsed_pattern_extra >= 2);
3671
      parsed_pattern_extra -= 2;
3672
#endif
3673
0
      }
3674
3675
    /* Now we can put the quantifier into the parsed pattern vector. At this
3676
    stage, we have only the basic quantifier. The check for a following + or ?
3677
    modifier happens at the top of the loop, after any intervening comments
3678
    have been removed. */
3679
3680
69.8k
    *parsed_pattern++ = meta_quantifier;
3681
69.8k
    if (c == CHAR_LEFT_CURLY_BRACKET)
3682
6
      {
3683
6
      *parsed_pattern++ = min_repeat;
3684
6
      *parsed_pattern++ = max_repeat;
3685
6
      }
3686
69.8k
    break;
3687
3688
3689
    /* ---- Character class ---- */
3690
3691
18.4k
    case CHAR_LEFT_SQUARE_BRACKET:
3692
3693
    /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
3694
    used for "start of word" and "end of word". As these are otherwise illegal
3695
    sequences, we don't break anything by recognizing them. They are replaced
3696
    by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
3697
    erroneous and are handled by the normal code below. */
3698
3699
18.4k
    if (ptrend - ptr >= 6 &&
3700
18.4k
         (PRIV(strncmp_c8)(ptr, STRING_WEIRD_STARTWORD, 6) == 0 ||
3701
18.4k
          PRIV(strncmp_c8)(ptr, STRING_WEIRD_ENDWORD, 6) == 0))
3702
0
      {
3703
0
      *parsed_pattern++ = META_ESCAPE + ESC_b;
3704
3705
0
      if (ptr[2] == CHAR_LESS_THAN_SIGN)
3706
0
        {
3707
0
        *parsed_pattern++ = META_LOOKAHEAD;
3708
0
        }
3709
0
      else
3710
0
        {
3711
0
        *parsed_pattern++ = META_LOOKBEHIND;
3712
0
        *has_lookbehind = TRUE;
3713
3714
        /* The offset is used only for the "non-fixed length" error; this won't
3715
        occur here, so just store zero. */
3716
3717
0
        PUTOFFSET((PCRE2_SIZE)0, parsed_pattern);
3718
0
        }
3719
3720
0
      if ((options & PCRE2_UCP) == 0)
3721
0
        *parsed_pattern++ = META_ESCAPE + ESC_w;
3722
0
      else
3723
0
        {
3724
0
        *parsed_pattern++ = META_ESCAPE + ESC_p;
3725
0
        *parsed_pattern++ = PT_WORD << 16;
3726
0
        }
3727
0
      *parsed_pattern++ = META_KET;
3728
0
      ptr += 6;
3729
0
      okquantifier = TRUE;
3730
0
      break;
3731
0
      }
3732
3733
    /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3734
    they are encountered at the top level, so we'll do that too. */
3735
3736
18.4k
    if (ptr < ptrend && (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
3737
18.4k
         *ptr == CHAR_EQUALS_SIGN) &&
3738
18.4k
        check_posix_syntax(ptr, ptrend, &tempptr))
3739
0
      {
3740
0
      errorcode = (*ptr-- == CHAR_COLON)? ERR12 : ERR13;
3741
0
      goto FAILED;
3742
0
      }
3743
3744
18.4k
    class_mode_state = ((options & PCRE2_ALT_EXTENDED_CLASS) != 0)?
3745
18.4k
        CLASS_MODE_ALT_EXT : CLASS_MODE_NORMAL;
3746
3747
    /* Jump here from '(?[...])'. That jump must initialize class_mode_state,
3748
    set c to the '[' character, and ptr to just after the '['. */
3749
3750
18.4k
    FROM_PERL_EXTENDED_CLASS:
3751
18.4k
    okquantifier = TRUE;
3752
3753
    /* In an EBCDIC environment, Perl treats alphabetic ranges specially
3754
    because there are holes in the encoding, and simply using the range A-Z
3755
    (for example) would include the characters in the holes. This applies only
3756
    to ranges where both values are literal; [\xC1-\xE9] is different to [A-Z]
3757
    in this respect. In order to accommodate this, we keep track of whether
3758
    character values are literal or not, and a state variable for handling
3759
    ranges. */
3760
3761
    /* Loop for the contents of the class. Classes may be nested, if
3762
    PCRE2_ALT_EXTENDED_CLASS is set, or the class is of the form (?[...]). */
3763
3764
    /* c is still set to '[' so the loop will handle the start of the class. */
3765
3766
18.4k
    class_depth_m1 = -1;
3767
18.4k
    class_maxdepth_m1 = -1;
3768
18.4k
    class_range_state = RANGE_NO;
3769
18.4k
    class_op_state = CLASS_OP_EMPTY;
3770
18.4k
    class_start = NULL;
3771
3772
18.4k
    for (;;)
3773
393k
      {
3774
393k
      BOOL char_is_literal = TRUE;
3775
3776
      /* Inside \Q...\E everything is literal except \E */
3777
3778
393k
      if (inescq)
3779
2.08k
        {
3780
2.08k
        if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
3781
0
          {
3782
0
          inescq = FALSE;                   /* Reset literal state */
3783
0
          ptr++;                            /* Skip the 'E' */
3784
0
          goto CLASS_CONTINUE;
3785
0
          }
3786
3787
        /* Surprisingly, you cannot use \Q..\E to escape a character inside a
3788
        Perl extended class. However, empty \Q\E sequences are allowed, so here
3789
        we're only giving an error if the \Q..\E is non-empty. */
3790
3791
2.08k
        if (class_mode_state == CLASS_MODE_PERL_EXT)
3792
0
          {
3793
0
          errorcode = ERR116;
3794
0
          goto FAILED;
3795
0
          }
3796
3797
2.08k
        goto CLASS_LITERAL;
3798
2.08k
        }
3799
3800
      /* Skip over space and tab (only) in extended-more mode, or anywhere
3801
      inside a Perl extended class (which implies /xx). */
3802
3803
391k
      if ((c == CHAR_SPACE || c == CHAR_HT) &&
3804
391k
          ((options & PCRE2_EXTENDED_MORE) != 0 ||
3805
4.39k
           class_mode_state >= CLASS_MODE_PERL_EXT))
3806
0
        goto CLASS_CONTINUE;
3807
3808
      /* Handle POSIX class names. Perl allows a negation extension of the
3809
      form [:^name:]. A square bracket that doesn't match the syntax is
3810
      treated as a literal. We also recognize the POSIX constructions
3811
      [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3812
      5.6 and 5.8 do. */
3813
3814
391k
      if (class_depth_m1 >= 0 &&
3815
391k
          c == CHAR_LEFT_SQUARE_BRACKET &&
3816
391k
          ptrend - ptr >= 3 &&
3817
391k
          (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
3818
8.96k
           *ptr == CHAR_EQUALS_SIGN) &&
3819
391k
          check_posix_syntax(ptr, ptrend, &tempptr))
3820
0
        {
3821
0
        BOOL posix_negate = FALSE;
3822
0
        int posix_class;
3823
3824
        /* Perl treats a hyphen before a POSIX class as a literal, not the
3825
        start of a range. However, it gives a warning in its warning mode. PCRE
3826
        does not have a warning mode, so we give an error, because this is
3827
        likely an error on the user's part. */
3828
3829
0
        if (class_range_state == RANGE_STARTED)
3830
0
          {
3831
0
          ptr = tempptr + 2;
3832
0
          errorcode = ERR50;
3833
0
          goto FAILED;
3834
0
          }
3835
3836
        /* Perl treats a hyphen after a POSIX class as a literal, not the
3837
        start of a range. However, it gives a warning in its warning mode
3838
        unless the hyphen is the last character in the class. PCRE does not
3839
        have a warning mode, so we give an error, because this is likely an
3840
        error on the user's part.
3841
3842
        Roll back to the hyphen for the error position. */
3843
3844
0
        if (class_range_state == RANGE_FORBID_STARTED)
3845
0
          {
3846
0
          ptr = class_range_forbid_ptr;
3847
0
          errorcode = ERR50;
3848
0
          goto FAILED;
3849
0
          }
3850
3851
        /* Disallow implicit union in Perl extended classes. */
3852
3853
0
        if (class_op_state == CLASS_OP_OPERAND &&
3854
0
            class_mode_state == CLASS_MODE_PERL_EXT)
3855
0
          {
3856
0
          ptr = tempptr + 2;
3857
0
          errorcode = ERR113;
3858
0
          goto FAILED;
3859
0
          }
3860
3861
0
        if (*ptr != CHAR_COLON)
3862
0
          {
3863
0
          ptr = tempptr + 2;
3864
0
          errorcode = ERR13;
3865
0
          goto FAILED;
3866
0
          }
3867
3868
0
        if (*(++ptr) == CHAR_CIRCUMFLEX_ACCENT)
3869
0
          {
3870
0
          posix_negate = TRUE;
3871
0
          ptr++;
3872
0
          }
3873
3874
0
        posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
3875
0
        ptr = tempptr + 2;
3876
0
        if (posix_class < 0)
3877
0
          {
3878
0
          errorcode = ERR30;
3879
0
          goto FAILED;
3880
0
          }
3881
3882
        /* Set "a hyphen is forbidden to be the start of a range". For the '-]'
3883
        case, the hyphen is treated as a literal, but for '-1' it is disallowed
3884
        (because it would be interpreted as range). */
3885
3886
0
        class_range_state = RANGE_FORBID_NO;
3887
0
        class_op_state = CLASS_OP_OPERAND;
3888
3889
        /* When PCRE2_UCP is set, unless PCRE2_EXTRA_ASCII_POSIX is set, some
3890
        of the POSIX classes are converted to use Unicode properties \p or \P
3891
        or, in one case, \h or \H. The substitutes table has two values per
3892
        class, containing the type and value of a \p or \P item. The special
3893
        cases are specified with a negative type: a non-zero value causes \h or
3894
        \H to be used, and a zero value falls through to behave like a non-UCP
3895
        POSIX class. There are now also some extra options that force ASCII for
3896
        some classes. */
3897
3898
0
#ifdef SUPPORT_UNICODE
3899
0
        if ((options & PCRE2_UCP) != 0 &&
3900
0
            (xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0 &&
3901
0
            !((xoptions & PCRE2_EXTRA_ASCII_DIGIT) != 0 &&
3902
0
              (posix_class == PC_DIGIT || posix_class == PC_XDIGIT)))
3903
0
          {
3904
0
          int ptype = posix_substitutes[2*posix_class];
3905
0
          int pvalue = posix_substitutes[2*posix_class + 1];
3906
3907
0
          if (ptype >= 0)
3908
0
            {
3909
0
            *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_P : ESC_p);
3910
0
            *parsed_pattern++ = (ptype << 16) | pvalue;
3911
0
            goto CLASS_CONTINUE;
3912
0
            }
3913
3914
0
          if (pvalue != 0)
3915
0
            {
3916
0
            *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_H : ESC_h);
3917
0
            goto CLASS_CONTINUE;
3918
0
            }
3919
3920
          /* Fall through */
3921
0
          }
3922
0
#endif  /* SUPPORT_UNICODE */
3923
3924
        /* Non-UCP POSIX class */
3925
3926
0
        *parsed_pattern++ = posix_negate? META_POSIX_NEG : META_POSIX;
3927
0
        *parsed_pattern++ = posix_class;
3928
0
        }
3929
3930
      /* Check for the start of the outermost class, or the start of a nested class. */
3931
3932
391k
      else if ((c == CHAR_LEFT_SQUARE_BRACKET &&
3933
391k
                (class_depth_m1 < 0 || class_mode_state == CLASS_MODE_ALT_EXT ||
3934
27.4k
                 class_mode_state == CLASS_MODE_PERL_EXT)) ||
3935
391k
               (c == CHAR_LEFT_PARENTHESIS &&
3936
372k
                class_mode_state == CLASS_MODE_PERL_EXT))
3937
18.4k
        {
3938
18.4k
        uint32_t start_c = c;
3939
18.4k
        uint32_t new_class_mode_state;
3940
3941
        /* Update the class mode, if moving into a 'leaf' inside a Perl extended
3942
        class. */
3943
3944
18.4k
        if (start_c == CHAR_LEFT_SQUARE_BRACKET &&
3945
18.4k
            class_mode_state == CLASS_MODE_PERL_EXT && class_depth_m1 >= 0)
3946
0
          new_class_mode_state = CLASS_MODE_PERL_EXT_LEAF;
3947
18.4k
        else
3948
18.4k
          new_class_mode_state = class_mode_state;
3949
3950
        /* Tidy up the other class before starting the nested class. */
3951
        /* -[ beginning a nested class is a literal '-' */
3952
3953
18.4k
        if (class_range_state == RANGE_STARTED)
3954
0
          parsed_pattern[-1] = CHAR_MINUS;
3955
3956
        /* Disallow implicit union in Perl extended classes. */
3957
3958
18.4k
        if (class_op_state == CLASS_OP_OPERAND &&
3959
18.4k
            class_mode_state == CLASS_MODE_PERL_EXT)
3960
0
          {
3961
0
          errorcode = ERR113;
3962
0
          goto FAILED;
3963
0
          }
3964
3965
        /* Validate nesting depth */
3966
18.4k
        if (class_depth_m1 >= ECLASS_NEST_LIMIT - 1)
3967
0
          {
3968
0
          errorcode = ERR107;
3969
0
          goto FAILED;        /* Classes too deeply nested */
3970
0
          }
3971
3972
        /* Process the character class start. If the first character is '^', set
3973
        the negation flag. If the first few characters (either before or after ^)
3974
        are \Q\E or \E or space or tab in extended-more mode, we skip them too.
3975
        This makes for compatibility with Perl. */
3976
3977
18.4k
        negate_class = FALSE;
3978
18.4k
        for (;;)
3979
26.0k
          {
3980
26.0k
          if (ptr >= ptrend)
3981
3
            {
3982
3
            if (start_c == CHAR_LEFT_PARENTHESIS)
3983
0
              errorcode = ERR14;  /* Missing terminating ')' */
3984
3
            else
3985
3
              errorcode = ERR6;   /* Missing terminating ']' */
3986
3
            goto FAILED;
3987
3
            }
3988
3989
26.0k
          GETCHARINCTEST(c, ptr);
3990
26.0k
          if (new_class_mode_state == CLASS_MODE_PERL_EXT) break;
3991
26.0k
          else if (c == CHAR_BACKSLASH)
3992
675
            {
3993
675
            if (ptr < ptrend && *ptr == CHAR_E) ptr++;
3994
494
            else if (ptrend - ptr >= 3 &&
3995
494
                PRIV(strncmp_c8)(ptr, STR_Q STR_BACKSLASH STR_E, 3) == 0)
3996
0
              ptr += 3;
3997
494
            else
3998
494
              break;
3999
675
            }
4000
25.3k
          else if ((c == CHAR_SPACE || c == CHAR_HT) &&  /* Note: just these two */
4001
25.3k
                   ((options & PCRE2_EXTENDED_MORE) != 0 ||
4002
17
                    new_class_mode_state >= CLASS_MODE_PERL_EXT))
4003
0
            continue;
4004
25.3k
          else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
4005
7.38k
            negate_class = TRUE;
4006
17.9k
          else break;
4007
26.0k
          }
4008
4009
        /* Now the real contents of the class; c has the first "real" character.
4010
        Empty classes are permitted only if the option is set, and if it's not
4011
        a Perl-extended class. */
4012
4013
18.4k
        if (c == CHAR_RIGHT_SQUARE_BRACKET &&
4014
18.4k
            (cb->external_options & PCRE2_ALLOW_EMPTY_CLASS) != 0 &&
4015
18.4k
            new_class_mode_state < CLASS_MODE_PERL_EXT)
4016
0
          {
4017
0
          PCRE2_ASSERT(start_c == CHAR_LEFT_SQUARE_BRACKET);
4018
4019
0
          if (class_start != NULL)
4020
0
            {
4021
0
            PCRE2_ASSERT(class_depth_m1 >= 0);
4022
            /* Represents that the class is an extended class. */
4023
0
            *class_start |= CLASS_IS_ECLASS;
4024
0
            class_start = NULL;
4025
0
            }
4026
4027
0
          *parsed_pattern++ = negate_class? META_CLASS_EMPTY_NOT : META_CLASS_EMPTY;
4028
4029
          /* Leave nesting depth unchanged; but check for zero depth to handle the
4030
          very first (top-level) class being empty. */
4031
0
          if (class_depth_m1 < 0) break;
4032
4033
0
          class_range_state = RANGE_NO; /* for processing the containing class */
4034
0
          class_op_state = CLASS_OP_OPERAND;
4035
0
          goto CLASS_CONTINUE;
4036
0
          }
4037
4038
        /* Enter a non-empty class. */
4039
4040
18.4k
        if (class_start != NULL)
4041
0
          {
4042
0
          PCRE2_ASSERT(class_depth_m1 >= 0);
4043
          /* Represents that the class is an extended class. */
4044
0
          *class_start |= CLASS_IS_ECLASS;
4045
0
          class_start = NULL;
4046
0
          }
4047
4048
18.4k
        class_start = parsed_pattern;
4049
18.4k
        *parsed_pattern++ = negate_class? META_CLASS_NOT : META_CLASS;
4050
18.4k
        class_range_state = RANGE_NO;
4051
18.4k
        class_op_state = CLASS_OP_EMPTY;
4052
18.4k
        class_mode_state = new_class_mode_state;
4053
18.4k
        ++class_depth_m1;
4054
18.4k
        if (class_maxdepth_m1 < class_depth_m1)
4055
18.4k
          class_maxdepth_m1 = class_depth_m1;
4056
        /* Reset; no op seen yet at new depth. */
4057
18.4k
        cb->class_op_used[class_depth_m1] = 0;
4058
4059
        /* Implement the special start-of-class literal meaning of ']'. */
4060
18.4k
        if (c == CHAR_RIGHT_SQUARE_BRACKET &&
4061
18.4k
            new_class_mode_state != CLASS_MODE_PERL_EXT)
4062
596
          {
4063
596
          class_range_state = RANGE_OK_LITERAL;
4064
596
          class_op_state = CLASS_OP_OPERAND;
4065
596
          PARSED_LITERAL(c, parsed_pattern);
4066
596
          goto CLASS_CONTINUE;
4067
596
          }
4068
4069
17.8k
        continue;  /* We have already loaded c with the next character */
4070
18.4k
        }
4071
4072
      /* Check for the end of the class. */
4073
4074
372k
      else if (c == CHAR_RIGHT_SQUARE_BRACKET ||
4075
372k
               (c == CHAR_RIGHT_PARENTHESIS && class_mode_state == CLASS_MODE_PERL_EXT))
4076
17.9k
        {
4077
        /* In Perl extended mode, the ']' can only be used to match the
4078
        opening '[', and ')' must match an opening parenthesis. */
4079
17.9k
        if (class_mode_state == CLASS_MODE_PERL_EXT)
4080
0
          {
4081
0
          if (c == CHAR_RIGHT_SQUARE_BRACKET && class_depth_m1 != 0)
4082
0
            {
4083
0
            errorcode = ERR14;
4084
0
            goto FAILED_BACK;
4085
0
            }
4086
0
          if (c == CHAR_RIGHT_PARENTHESIS && class_depth_m1 < 1)
4087
0
            {
4088
0
            errorcode = ERR22;
4089
0
            goto FAILED;
4090
0
            }
4091
0
          }
4092
4093
        /* Check no trailing operator. */
4094
17.9k
        if (class_op_state == CLASS_OP_OPERATOR)
4095
0
          {
4096
0
          errorcode = ERR110;
4097
0
          goto FAILED;
4098
0
          }
4099
4100
        /* Check no empty expression for Perl extended expressions. */
4101
17.9k
        if (class_mode_state == CLASS_MODE_PERL_EXT &&
4102
17.9k
            class_op_state == CLASS_OP_EMPTY)
4103
0
          {
4104
0
          errorcode = ERR114;
4105
0
          goto FAILED;
4106
0
          }
4107
4108
        /* -] at the end of a class is a literal '-' */
4109
17.9k
        if (class_range_state == RANGE_STARTED)
4110
217
          parsed_pattern[-1] = CHAR_MINUS;
4111
4112
17.9k
        *parsed_pattern++ = META_CLASS_END;
4113
4114
17.9k
        if (--class_depth_m1 < 0)
4115
17.9k
          {
4116
          /* Check for and consume ')' after '(?[...]'. */
4117
17.9k
          PCRE2_ASSERT(class_mode_state != CLASS_MODE_PERL_EXT_LEAF);
4118
17.9k
          if (class_mode_state == CLASS_MODE_PERL_EXT)
4119
0
            {
4120
0
            if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4121
0
              {
4122
0
              errorcode = ERR115;
4123
0
              goto FAILED;
4124
0
              }
4125
4126
0
            ptr++;
4127
0
            }
4128
4129
17.9k
          break;
4130
17.9k
          }
4131
4132
0
        class_range_state = RANGE_NO; /* for processing the containing class */
4133
0
        class_op_state = CLASS_OP_OPERAND;
4134
0
        if (class_mode_state == CLASS_MODE_PERL_EXT_LEAF)
4135
0
          class_mode_state = CLASS_MODE_PERL_EXT;
4136
        /* The extended class flag has already
4137
        been set for the parent class. */
4138
0
        class_start = NULL;
4139
0
        }
4140
4141
      /* Handle a Perl set binary operator */
4142
4143
354k
      else if (class_mode_state == CLASS_MODE_PERL_EXT &&
4144
354k
               (c == CHAR_PLUS || c == CHAR_VERTICAL_LINE || c == CHAR_MINUS ||
4145
2
                c == CHAR_AMPERSAND || c == CHAR_CIRCUMFLEX_ACCENT))
4146
0
        {
4147
        /* Check that there was a preceding operand. */
4148
0
        if (class_op_state != CLASS_OP_OPERAND)
4149
0
          {
4150
0
          errorcode = ERR109;
4151
0
          goto FAILED;
4152
0
          }
4153
4154
0
        if (class_start != NULL)
4155
0
          {
4156
0
          PCRE2_ASSERT(class_depth_m1 >= 0);
4157
          /* Represents that the class is an extended class. */
4158
0
          *class_start |= CLASS_IS_ECLASS;
4159
0
          class_start = NULL;
4160
0
          }
4161
4162
0
        PCRE2_ASSERT(class_range_state != RANGE_STARTED &&
4163
0
                     class_range_state != RANGE_FORBID_STARTED);
4164
4165
0
        *parsed_pattern++ = c == CHAR_PLUS? META_ECLASS_OR :
4166
0
                            c == CHAR_VERTICAL_LINE? META_ECLASS_OR :
4167
0
                            c == CHAR_MINUS? META_ECLASS_SUB :
4168
0
                            c == CHAR_AMPERSAND? META_ECLASS_AND :
4169
0
                            META_ECLASS_XOR;
4170
0
        class_range_state = RANGE_NO;
4171
0
        class_op_state = CLASS_OP_OPERATOR;
4172
0
        }
4173
4174
      /* Handle a Perl set unary operator */
4175
4176
354k
      else if (class_mode_state == CLASS_MODE_PERL_EXT &&
4177
354k
               c == CHAR_EXCLAMATION_MARK)
4178
0
        {
4179
        /* Check that the "!" has not got a preceding operand (i.e. it's the
4180
        start of the class, or follows an operator). */
4181
0
        if (class_op_state == CLASS_OP_OPERAND)
4182
0
          {
4183
0
          errorcode = ERR113;
4184
0
          goto FAILED;
4185
0
          }
4186
4187
0
        if (class_start != NULL)
4188
0
          {
4189
0
          PCRE2_ASSERT(class_depth_m1 >= 0);
4190
          /* Represents that the class is an extended class. */
4191
0
          *class_start |= CLASS_IS_ECLASS;
4192
0
          class_start = NULL;
4193
0
          }
4194
4195
0
        PCRE2_ASSERT(class_range_state != RANGE_STARTED &&
4196
0
                     class_range_state != RANGE_FORBID_STARTED);
4197
4198
0
        *parsed_pattern++ = META_ECLASS_NOT;
4199
0
        class_range_state = RANGE_NO;
4200
0
        class_op_state = CLASS_OP_OPERATOR;
4201
0
        }
4202
4203
      /* Handle a UTS#18 set operator */
4204
4205
354k
      else if (class_mode_state == CLASS_MODE_ALT_EXT &&
4206
354k
               (c == CHAR_VERTICAL_LINE || c == CHAR_MINUS ||
4207
0
                c == CHAR_AMPERSAND || c == CHAR_TILDE) &&
4208
354k
               ptr < ptrend && *ptr == c)
4209
0
        {
4210
0
        ++ptr;
4211
4212
        /* Check there isn't a triple-repetition. */
4213
0
        if (ptr < ptrend && *ptr == c)
4214
0
          {
4215
0
          while (ptr < ptrend && *ptr == c) ++ptr;  /* Improve error offset. */
4216
0
          errorcode = ERR108;
4217
0
          goto FAILED;
4218
0
          }
4219
4220
        /* Check for a preceding operand. */
4221
0
        if (class_op_state != CLASS_OP_OPERAND)
4222
0
          {
4223
0
          errorcode = ERR109;
4224
0
          goto FAILED;
4225
0
          }
4226
4227
        /* Check for mixed precedence. Forbid [A--B&&C]. */
4228
0
        if (cb->class_op_used[class_depth_m1] != 0 &&
4229
0
            cb->class_op_used[class_depth_m1] != (uint8_t)c)
4230
0
          {
4231
0
          errorcode = ERR111;
4232
0
          goto FAILED;
4233
0
          }
4234
4235
0
        if (class_start != NULL)
4236
0
          {
4237
0
          PCRE2_ASSERT(class_depth_m1 >= 0);
4238
          /* Represents that the class is an extended class. */
4239
0
          *class_start |= CLASS_IS_ECLASS;
4240
0
          class_start = NULL;
4241
0
          }
4242
4243
        /* Dangling '-' before an operator is a literal */
4244
0
        if (class_range_state == RANGE_STARTED)
4245
0
          parsed_pattern[-1] = CHAR_MINUS;
4246
4247
0
        *parsed_pattern++ = c == CHAR_VERTICAL_LINE? META_ECLASS_OR :
4248
0
                            c == CHAR_MINUS? META_ECLASS_SUB :
4249
0
                            c == CHAR_AMPERSAND? META_ECLASS_AND :
4250
0
                            META_ECLASS_XOR;
4251
0
        class_range_state = RANGE_NO;
4252
0
        class_op_state = CLASS_OP_OPERATOR;
4253
0
        cb->class_op_used[class_depth_m1] = (uint8_t)c;
4254
0
        }
4255
4256
      /* Handle escapes in a class */
4257
4258
354k
      else if (c == CHAR_BACKSLASH)
4259
10.4k
        {
4260
10.4k
        tempptr = ptr;
4261
10.4k
        escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
4262
10.4k
          xoptions, cb->bracount, TRUE, cb);
4263
4264
10.4k
        if (errorcode != 0)
4265
11
          {
4266
11
          if ((xoptions & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0 ||
4267
11
              class_mode_state >= CLASS_MODE_PERL_EXT)
4268
11
            goto FAILED;
4269
0
          ptr = tempptr;
4270
0
          if (ptr >= ptrend) c = CHAR_BACKSLASH; else
4271
0
            {
4272
0
            GETCHARINCTEST(c, ptr);   /* Get character value, increment pointer */
4273
0
            }
4274
0
          escape = 0;                 /* Treat as literal character */
4275
0
          }
4276
4277
10.4k
        switch(escape)
4278
10.4k
          {
4279
5.25k
          case 0:  /* Escaped character code point is in c */
4280
5.25k
          char_is_literal = FALSE;
4281
5.25k
          goto CLASS_LITERAL;      /* (a few lines above) */
4282
4283
53
          case ESC_b:
4284
53
          c = CHAR_BS;    /* \b is backspace in a class */
4285
53
          char_is_literal = FALSE;
4286
53
          goto CLASS_LITERAL;
4287
4288
15
          case ESC_k:
4289
15
          c = CHAR_k;     /* \k is not special in a class, just like \g */
4290
15
          char_is_literal = FALSE;
4291
15
          goto CLASS_LITERAL;
4292
4293
6
          case ESC_Q:
4294
6
          inescq = TRUE;  /* Enter literal mode */
4295
6
          goto CLASS_CONTINUE;
4296
4297
89
          case ESC_E:     /* Ignore orphan \E */
4298
89
          goto CLASS_CONTINUE;
4299
4300
0
          case ESC_B:     /* Always an error in a class */
4301
15
          case ESC_R:
4302
18
          case ESC_X:
4303
18
          errorcode = ERR7;
4304
18
          ptr--;  // TODO https://github.com/PCRE2Project/pcre2/issues/549
4305
18
          goto FAILED;
4306
4307
0
          case ESC_N:     /* Not permitted by Perl either */
4308
0
          errorcode = ERR71;
4309
0
          goto FAILED;
4310
4311
10
          case ESC_H:
4312
17
          case ESC_h:
4313
130
          case ESC_V:
4314
182
          case ESC_v:
4315
182
          *parsed_pattern++ = META_ESCAPE + escape;
4316
182
          break;
4317
4318
          /* These escapes may be converted to Unicode property tests when
4319
          PCRE2_UCP is set. */
4320
4321
1.10k
          case ESC_d:
4322
1.76k
          case ESC_D:
4323
1.80k
          case ESC_s:
4324
1.84k
          case ESC_S:
4325
2.16k
          case ESC_w:
4326
3.59k
          case ESC_W:
4327
3.59k
          parsed_pattern = handle_escdsw(escape, parsed_pattern, options,
4328
3.59k
            xoptions);
4329
3.59k
          break;
4330
4331
          /* Explicit Unicode property matching */
4332
4333
438
          case ESC_P:
4334
1.24k
          case ESC_p:
4335
1.24k
#ifdef SUPPORT_UNICODE
4336
1.24k
            {
4337
1.24k
            BOOL negated;
4338
1.24k
            uint16_t ptype = 0, pdata = 0;
4339
1.24k
            if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
4340
0
              goto FAILED;
4341
4342
            /* In caseless matching, particular characteristics Lu, Ll, and Lt
4343
            get converted to the general characteristic L&. That is, upper,
4344
            lower, and title case letters are all conflated. */
4345
4346
1.24k
            if ((options & PCRE2_CASELESS) != 0 && ptype == PT_PC &&
4347
1.24k
                (pdata == ucp_Lu || pdata == ucp_Ll || pdata == ucp_Lt))
4348
0
              {
4349
0
              ptype = PT_LAMP;
4350
0
              pdata = 0;
4351
0
              }
4352
4353
1.24k
            if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
4354
1.24k
            *parsed_pattern++ = META_ESCAPE + escape;
4355
1.24k
            *parsed_pattern++ = (ptype << 16) | pdata;
4356
1.24k
            }
4357
#else
4358
          errorcode = ERR45;
4359
          goto FAILED;
4360
#endif
4361
0
          break;  /* End \P and \p */
4362
4363
          /* All others are not allowed in a class */
4364
4365
0
          default:
4366
0
          PCRE2_DEBUG_UNREACHABLE();
4367
          /* Fall through */
4368
4369
0
          case ESC_A:
4370
0
          case ESC_Z:
4371
0
          case ESC_z:
4372
0
          case ESC_G:
4373
0
          case ESC_K:
4374
5
          case ESC_C:
4375
5
          errorcode = ERR7;
4376
5
          ptr--;  // TODO https://github.com/PCRE2Project/pcre2/issues/549
4377
5
          goto FAILED;
4378
10.4k
          }
4379
4380
        /* All the switch-cases above which end in "break" describe a set
4381
        of characters. None may start a range. */
4382
4383
        /* The second part of a range can be a single-character escape
4384
        sequence (detected above), but not any of the other escapes. Perl
4385
        treats a hyphen as a literal in such circumstances. However, in Perl's
4386
        warning mode, a warning is given, so PCRE now faults it, as it is
4387
        almost certainly a mistake on the user's part. */
4388
4389
5.02k
        if (class_range_state == RANGE_STARTED)
4390
0
          {
4391
0
          errorcode = ERR50;
4392
0
          goto FAILED;
4393
0
          }
4394
4395
        /* Perl gives a warning unless the hyphen following a multi-character
4396
        escape is the last character in the class. PCRE throws an error. */
4397
4398
5.02k
        if (class_range_state == RANGE_FORBID_STARTED)
4399
0
          {
4400
0
          ptr = class_range_forbid_ptr;
4401
0
          errorcode = ERR50;
4402
0
          goto FAILED;
4403
0
          }
4404
4405
        /* Disallow implicit union in Perl extended classes. */
4406
4407
5.02k
        if (class_op_state == CLASS_OP_OPERAND &&
4408
5.02k
            class_mode_state == CLASS_MODE_PERL_EXT)
4409
0
          {
4410
0
          errorcode = ERR113;
4411
0
          goto FAILED;
4412
0
          }
4413
4414
5.02k
        class_range_state = RANGE_FORBID_NO;
4415
5.02k
        class_op_state = CLASS_OP_OPERAND;
4416
5.02k
        }
4417
4418
      /* Forbid unescaped literals, and the special meaning of '-', inside a
4419
      Perl extended class. */
4420
4421
344k
      else if (class_mode_state == CLASS_MODE_PERL_EXT)
4422
2
        {
4423
2
        errorcode = ERR116;
4424
2
        goto FAILED;
4425
2
        }
4426
4427
      /* Handle potential start of range */
4428
4429
344k
      else if (c == CHAR_MINUS && class_range_state >= RANGE_OK_ESCAPED)
4430
1.96k
        {
4431
1.96k
        *parsed_pattern++ = (class_range_state == RANGE_OK_LITERAL)?
4432
1.92k
          META_RANGE_LITERAL : META_RANGE_ESCAPED;
4433
1.96k
        class_range_state = RANGE_STARTED;
4434
1.96k
        }
4435
4436
      /* Handle forbidden start of range */
4437
4438
342k
      else if (c == CHAR_MINUS && class_range_state == RANGE_FORBID_NO)
4439
0
        {
4440
0
        *parsed_pattern++ = CHAR_MINUS;
4441
0
        class_range_state = RANGE_FORBID_STARTED;
4442
0
        class_range_forbid_ptr = ptr;
4443
0
        }
4444
4445
      /* Handle a literal character */
4446
4447
342k
      else
4448
342k
        {
4449
349k
        CLASS_LITERAL:
4450
4451
        /* Disallow implicit union in Perl extended classes. */
4452
4453
349k
        if (class_op_state == CLASS_OP_OPERAND &&
4454
349k
            class_mode_state == CLASS_MODE_PERL_EXT)
4455
0
          {
4456
0
          errorcode = ERR113;
4457
0
          goto FAILED;
4458
0
          }
4459
4460
349k
        if (class_range_state == RANGE_STARTED)
4461
1.74k
          {
4462
1.74k
          if (c == parsed_pattern[-2])       /* Optimize one-char range */
4463
338
            parsed_pattern--;
4464
1.40k
          else if (parsed_pattern[-2] > c)   /* Check range is in order */
4465
43
            {
4466
43
            errorcode = ERR8;
4467
43
            goto FAILED_BACK;  // TODO https://github.com/PCRE2Project/pcre2/issues/549
4468
43
            }
4469
1.36k
          else
4470
1.36k
            {
4471
1.36k
            if (!char_is_literal && parsed_pattern[-1] == META_RANGE_LITERAL)
4472
88
              parsed_pattern[-1] = META_RANGE_ESCAPED;
4473
1.36k
            PARSED_LITERAL(c, parsed_pattern);
4474
1.36k
            }
4475
1.70k
          class_range_state = RANGE_NO;
4476
1.70k
          class_op_state = CLASS_OP_OPERAND;
4477
1.70k
          }
4478
347k
        else if (class_range_state == RANGE_FORBID_STARTED)
4479
0
          {
4480
0
          ptr = class_range_forbid_ptr;
4481
0
          errorcode = ERR50;
4482
0
          goto FAILED;
4483
0
          }
4484
347k
        else  /* Potential start of range */
4485
347k
          {
4486
347k
          class_range_state = char_is_literal?
4487
342k
            RANGE_OK_LITERAL : RANGE_OK_ESCAPED;
4488
347k
          class_op_state = CLASS_OP_OPERAND;
4489
347k
          PARSED_LITERAL(c, parsed_pattern);
4490
347k
          }
4491
349k
        }
4492
4493
      /* Proceed to next thing in the class. */
4494
4495
357k
      CLASS_CONTINUE:

      /* If the pattern ends while a class is still open, pick the diagnostic
      that matches the construct left unterminated. The three cases form a
      single if / else-if / else chain so that the ERR14 chosen for an open
      '(' inside a Perl extended class is not subsequently overwritten by the
      generic "missing ]" error. */

      if (ptr >= ptrend)
        {
        if (class_mode_state == CLASS_MODE_PERL_EXT && class_depth_m1 > 0)
          errorcode = ERR14;   /* Missing terminating ')' */
        else if (class_mode_state == CLASS_MODE_ALT_EXT &&
                 class_depth_m1 == 0 && class_maxdepth_m1 == 1)
          errorcode = ERR112;  /* Missing terminating ']', but we saw '[ [ ]...' */
        else
          errorcode = ERR6;    /* Missing terminating ']' */
        goto FAILED;
        }
4507
356k
      GETCHARINCTEST(c, ptr);
4508
356k
      }     /* End of class-processing loop */
4509
4510
17.9k
    break;  /* End of character class */
4511
4512
4513
    /* ---- Opening parenthesis ---- */
4514
4515
17.9k
    case CHAR_LEFT_PARENTHESIS:
4516
8.66k
    if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4517
4518
    /* If ( is not followed by ? it is either a capture or a special verb or an
4519
    alpha assertion or a positive non-atomic lookahead. */
4520
4521
8.66k
    if (*ptr != CHAR_QUESTION_MARK)
4522
7.27k
      {
4523
7.27k
      const char *vn;
4524
4525
      /* Handle capturing brackets (or non-capturing if auto-capture is turned
4526
      off). */
4527
4528
7.27k
      if (*ptr != CHAR_ASTERISK)
4529
7.22k
        {
4530
7.22k
        nest_depth++;
4531
7.22k
        if ((options & PCRE2_NO_AUTO_CAPTURE) == 0)
4532
7.21k
          {
4533
7.21k
          if (cb->bracount >= MAX_GROUP_NUMBER)
4534
0
            {
4535
0
            errorcode = ERR97;
4536
0
            goto FAILED;
4537
0
            }
4538
7.21k
          cb->bracount++;
4539
7.21k
          *parsed_pattern++ = META_CAPTURE | cb->bracount;
4540
7.21k
          }
4541
8
        else *parsed_pattern++ = META_NOCAPTURE;
4542
7.22k
        }
4543
4544
      /* Do nothing for (* followed by end of pattern or ) so it gives a "bad
4545
      quantifier" error rather than "(*MARK) must have an argument". */
4546
4547
54
      else if (ptrend - ptr <= 1 || (c = ptr[1]) == CHAR_RIGHT_PARENTHESIS)
4548
0
        break;
4549
4550
      /* Handle "alpha assertions" such as (*pla:...). Most of these are
4551
      synonyms for the historical symbolic assertions, but the script run and
4552
      non-atomic lookaround ones are new. They are distinguished by starting
4553
      with a lower case letter. Checking both ends of the alphabet makes this
4554
      work in all character codes. */
4555
4556
54
      else if (CHMAX_255(c) && (cb->ctypes[c] & ctype_lcletter) != 0)
4557
0
        {
4558
0
        uint32_t meta;
4559
4560
0
        vn = alasnames;
4561
0
        if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
4562
0
          &errorcode, cb)) goto FAILED;
4563
0
        if (ptr >= ptrend || *ptr != CHAR_COLON)
4564
0
          {
4565
0
          errorcode = ERR95;  /* Malformed */
4566
0
          goto FAILED;
4567
0
          }
4568
4569
        /* Scan the table of alpha assertion names */
4570
4571
0
        for (i = 0; i < alascount; i++)
4572
0
          {
4573
0
          if (namelen == alasmeta[i].len &&
4574
0
              PRIV(strncmp_c8)(name, vn, namelen) == 0)
4575
0
            break;
4576
0
          vn += alasmeta[i].len + 1;
4577
0
          }
4578
4579
0
        if (i >= alascount)
4580
0
          {
4581
0
          errorcode = ERR95;  /* Alpha assertion not recognized */
4582
0
          goto FAILED;
4583
0
          }
4584
4585
        /* Check for expecting an assertion condition. If so, only atomic
4586
        lookaround assertions are valid. */
4587
4588
0
        meta = alasmeta[i].meta;
4589
0
        if (prev_expect_cond_assert > 0 &&
4590
0
            (meta < META_LOOKAHEAD || meta > META_LOOKBEHINDNOT))
4591
0
          {
4592
0
          errorcode = ERR28;  /* Atomic assertion expected */
4593
0
          goto FAILED;
4594
0
          }
4595
4596
        /* The lookaround alphabetic synonyms can mostly be handled by jumping
4597
        to the code that handles the traditional symbolic forms. */
4598
4599
0
        switch(meta)
4600
0
          {
4601
0
          default:
4602
0
          PCRE2_DEBUG_UNREACHABLE();
4603
0
          errorcode = ERR89;  /* Unknown code; should never occur because */
4604
0
          goto FAILED;        /* the meta values come from a table above. */
4605
4606
0
          case META_ATOMIC:
4607
0
          goto ATOMIC_GROUP;
4608
4609
0
          case META_LOOKAHEAD:
4610
0
          goto POSITIVE_LOOK_AHEAD;
4611
4612
0
          case META_LOOKAHEAD_NA:
4613
0
          goto POSITIVE_NONATOMIC_LOOK_AHEAD;
4614
4615
0
          case META_LOOKAHEADNOT:
4616
0
          goto NEGATIVE_LOOK_AHEAD;
4617
4618
0
          case META_SCS:
4619
0
          if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4620
4621
0
          if (*ptr != CHAR_LEFT_PARENTHESIS)
4622
0
            {
4623
0
            errorcode = ERR15;
4624
0
            goto FAILED;
4625
0
            }
4626
4627
0
          ptr++;
4628
0
          *parsed_pattern++ = META_SCS;
4629
          /* Temporary variable, zero in the first iteration. */
4630
0
          offset = 0;
4631
4632
0
          for (;;)
4633
0
            {
4634
0
            PCRE2_SIZE next_offset = (PCRE2_SIZE)(ptr - cb->start_pattern);
4635
4636
            /* Handle (*scan_substring:([+-]number)... */
4637
0
            if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61,
4638
0
                &i, &errorcode))
4639
0
              {
4640
0
              PCRE2_ASSERT(i >= 0);
4641
0
              if (i <= 0)
4642
0
                {
4643
0
                errorcode = ERR15;
4644
0
                goto FAILED;
4645
0
                }
4646
0
              meta = META_SCS_NUMBER;
4647
0
              namelen = (uint32_t)i;
4648
0
              }
4649
0
            else if (errorcode != 0) goto FAILED;   /* Number too big */
4650
0
            else
4651
0
              {
4652
0
              if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4653
4654
              /* Handle (*scan_substring:('name') or (*scan_substring:(<name>) */
4655
0
              if (*ptr == CHAR_LESS_THAN_SIGN)
4656
0
                terminator = CHAR_GREATER_THAN_SIGN;
4657
0
              else if (*ptr == CHAR_APOSTROPHE)
4658
0
                terminator = CHAR_APOSTROPHE;
4659
0
              else
4660
0
                {
4661
0
                errorcode = ERR15;
4662
0
                goto FAILED;
4663
0
                }
4664
4665
0
              if (!read_name(&ptr, ptrend, utf, terminator, &next_offset,
4666
0
                  &name, &namelen, &errorcode, cb)) goto FAILED;
4667
4668
0
              meta = META_SCS_NAME;
4669
0
              }
4670
4671
0
            PCRE2_ASSERT(next_offset > 0);
4672
0
            if (offset == 0 || (next_offset - offset) >= 0x10000)
4673
0
              {
4674
0
              *parsed_pattern++ = META_OFFSET;
4675
0
              PUTOFFSET(next_offset, parsed_pattern);
4676
0
              offset = next_offset;
4677
0
              }
4678
4679
            /* The offset is encoded as a relative offset, because for some
4680
            inputs such as ",2" in (*scs:(1,2,3)...), we only have space for
4681
            two uint32_t values, and an opcode and absolute offset may require
4682
            three uint32_t values. */
4683
0
            *parsed_pattern++ = meta | (uint32_t)(next_offset - offset);
4684
0
            *parsed_pattern++ = namelen;
4685
0
            offset = next_offset;
4686
4687
0
            if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4688
4689
0
            if (*ptr == CHAR_RIGHT_PARENTHESIS) break;
4690
4691
0
            if (*ptr != CHAR_COMMA)
4692
0
              {
4693
0
              errorcode = ERR24;
4694
0
              goto FAILED;
4695
0
              }
4696
4697
0
            ptr++;
4698
0
            }
4699
0
          ptr++;
4700
0
          goto POST_ASSERTION;
4701
4702
0
          case META_LOOKBEHIND:
4703
0
          case META_LOOKBEHINDNOT:
4704
0
          case META_LOOKBEHIND_NA:
4705
0
          *parsed_pattern++ = meta;
4706
0
          ptr--;
4707
0
          goto POST_LOOKBEHIND;
4708
4709
          /* The script run facilities are handled here. Unicode support is
4710
          required (give an error if not, as this is a security issue). Always
4711
          record a META_SCRIPT_RUN item. Then, for the atomic version, insert
4712
          META_ATOMIC and remember that we need two META_KETs at the end. */
4713
4714
0
          case META_SCRIPT_RUN:
4715
0
          case META_ATOMIC_SCRIPT_RUN:
4716
0
#ifdef SUPPORT_UNICODE
4717
0
          *parsed_pattern++ = META_SCRIPT_RUN;
4718
0
          nest_depth++;
4719
0
          ptr++;
4720
0
          if (meta == META_ATOMIC_SCRIPT_RUN)
4721
0
            {
4722
0
            *parsed_pattern++ = META_ATOMIC;
4723
0
            if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4724
0
            else if (++top_nest >= end_nests)
4725
0
              {
4726
0
              errorcode = ERR84;
4727
0
              goto FAILED;
4728
0
              }
4729
0
            top_nest->nest_depth = nest_depth;
4730
0
            top_nest->flags = NSF_ATOMICSR;
4731
0
            top_nest->options = options & PARSE_TRACKED_OPTIONS;
4732
0
            top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;
4733
4734
#ifdef PCRE2_DEBUG
4735
            /* We'll write out two META_KETs for a single ")" in the input
4736
            pattern, so we reserve space for that in our bounds check. */
4737
            parsed_pattern_extra++;
4738
#endif
4739
0
            }
4740
0
          break;
4741
#else  /* SUPPORT_UNICODE */
4742
          errorcode = ERR96;
4743
          goto FAILED;
4744
#endif
4745
0
          }
4746
0
        }
4747
4748
4749
      /* ---- Handle (*VERB) and (*VERB:NAME) ---- */
4750
4751
54
      else
4752
54
        {
4753
54
        vn = verbnames;
4754
54
        if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
4755
54
          &errorcode, cb)) goto FAILED;
4756
54
        if (ptr >= ptrend || (*ptr != CHAR_COLON &&
4757
54
                              *ptr != CHAR_RIGHT_PARENTHESIS))
4758
5
          {
4759
5
          errorcode = ERR60;  /* Malformed */
4760
5
          goto FAILED;
4761
5
          }
4762
4763
        /* Scan the table of verb names */
4764
4765
49
        for (i = 0; i < verbcount; i++)
4766
49
          {
4767
49
          if (namelen == verbs[i].len &&
4768
49
              PRIV(strncmp_c8)(name, vn, namelen) == 0)
4769
49
            break;
4770
0
          vn += verbs[i].len + 1;
4771
0
          }
4772
4773
49
        if (i >= verbcount)
4774
0
          {
4775
0
          errorcode = ERR60;  /* Verb not recognized */
4776
0
          goto FAILED;
4777
0
          }
4778
4779
        /* An empty argument is treated as no argument. */
4780
4781
49
        if (*ptr == CHAR_COLON && ptr + 1 < ptrend &&
4782
49
             ptr[1] == CHAR_RIGHT_PARENTHESIS)
4783
0
          ptr++;    /* Advance to the closing parens */
4784
4785
        /* Check for mandatory non-empty argument; this is (*MARK) */
4786
4787
49
        if (verbs[i].has_arg > 0 && *ptr != CHAR_COLON)
4788
0
          {
4789
0
          errorcode = ERR66;
4790
0
          goto FAILED;
4791
0
          }
4792
4793
        /* Remember where this verb, possibly with a preceding (*MARK), starts,
4794
        for handling quantified (*ACCEPT). */
4795
4796
49
        verbstartptr = parsed_pattern;
4797
49
        okquantifier = (verbs[i].meta == META_ACCEPT);
4798
#ifdef PCRE2_DEBUG
4799
        /* Reserve space in our bounds check for optionally wrapping the (*ACCEPT)
4800
        with a non-capturing bracket, if there is a following quantifier. */
4801
        if (okquantifier) parsed_pattern_extra += 2;
4802
#endif
4803
4804
        /* It appears that Perl allows any characters whatsoever, other than a
4805
        closing parenthesis, to appear in arguments ("names"), so we no longer
4806
        insist on letters, digits, and underscores. Perl does not, however, do
4807
        any interpretation within arguments, and has no means of including a
4808
        closing parenthesis. PCRE supports escape processing but only when it
4809
        is requested by an option. We set inverbname TRUE here, and let the
4810
        main loop take care of this so that escape and \x processing is done by
4811
        the main code above. */
4812
4813
49
        if (*ptr++ == CHAR_COLON)   /* Skip past : or ) */
4814
49
          {
4815
          /* Some optional arguments can be treated as a preceding (*MARK) */
4816
4817
49
          if (verbs[i].has_arg < 0)
4818
0
            {
4819
0
            add_after_mark = verbs[i].meta;
4820
0
            *parsed_pattern++ = META_MARK;
4821
0
            }
4822
4823
          /* The remaining verbs with arguments (except *MARK) need a different
4824
          opcode. */
4825
4826
49
          else
4827
49
            {
4828
49
            *parsed_pattern++ = verbs[i].meta +
4829
49
              ((verbs[i].meta != META_MARK)? 0x00010000u:0);
4830
49
            }
4831
4832
          /* Set up for reading the name in the main loop. */
4833
4834
49
          verblengthptr = parsed_pattern++;
4835
49
          verbnamestart = ptr;
4836
49
          inverbname = TRUE;
4837
49
          }
4838
0
        else  /* No verb "name" argument */
4839
0
          {
4840
0
          *parsed_pattern++ = verbs[i].meta;
4841
0
          }
4842
49
        }     /* End of (*VERB) handling */
4843
7.27k
      break;  /* Done with this parenthesis */
4844
7.27k
      }       /* End of groups that don't start with (? */
4845
4846
4847
    /* ---- Items starting (? ---- */
4848
4849
    /* The type of item is determined by what follows (?. Handle (?| and option
4850
    changes under "default" because both need a new block on the nest stack.
4851
    Comments starting with (?# are handled above. Note that there is some
4852
    ambiguity about the sequence (?- because if a digit follows it's a relative
4853
    recursion or subroutine call whereas otherwise it's an option unsetting. */
4854
4855
1.38k
    if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4856
4857
1.38k
    switch(*ptr)
4858
1.38k
      {
4859
403
      default:
4860
403
      if (*ptr == CHAR_MINUS && ptrend - ptr > 1 && IS_DIGIT(ptr[1]))
4861
0
        goto RECURSION_BYNUMBER;  /* The + case is handled by CHAR_PLUS */
4862
4863
      /* We now have either (?| or a (possibly empty) option setting,
4864
      optionally followed by a non-capturing group. */
4865
4866
403
      nest_depth++;
4867
403
      if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4868
143
      else if (++top_nest >= end_nests)
4869
0
        {
4870
0
        errorcode = ERR84;
4871
0
        goto FAILED;
4872
0
        }
4873
403
      top_nest->nest_depth = nest_depth;
4874
403
      top_nest->flags = 0;
4875
403
      top_nest->options = options & PARSE_TRACKED_OPTIONS;
4876
403
      top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;
4877
4878
      /* Start of non-capturing group that resets the capture count for each
4879
      branch. */
4880
4881
403
      if (*ptr == CHAR_VERTICAL_LINE)
4882
0
        {
4883
0
        top_nest->reset_group = (uint16_t)cb->bracount;
4884
0
        top_nest->max_group = (uint16_t)cb->bracount;
4885
0
        top_nest->flags |= NSF_RESET;
4886
0
        cb->external_flags |= PCRE2_DUPCAPUSED;
4887
0
        *parsed_pattern++ = META_NOCAPTURE;
4888
0
        ptr++;
4889
0
        }
4890
4891
      /* Scan for options aimnrsxJU (plus aD/aP/aS/aT/aW) to set or unset. */
4892
4893
403
      else
4894
403
        {
4895
403
        BOOL hyphenok = TRUE;
4896
403
        uint32_t oldoptions = options;
4897
403
        uint32_t oldxoptions = xoptions;
4898
4899
403
        top_nest->reset_group = 0;
4900
403
        top_nest->max_group = 0;
4901
403
        set = unset = 0;
4902
403
        optset = &set;
4903
403
        xset = xunset = 0;
4904
403
        xoptset = &xset;
4905
4906
        /* ^ at the start unsets irmnsx and disables the subsequent use of - */
4907
4908
403
        if (ptr < ptrend && *ptr == CHAR_CIRCUMFLEX_ACCENT)
4909
0
          {
4910
0
          options &= ~(PCRE2_CASELESS|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE|
4911
0
                       PCRE2_DOTALL|PCRE2_EXTENDED|PCRE2_EXTENDED_MORE);
4912
0
          xoptions &= ~(PCRE2_EXTRA_CASELESS_RESTRICT);
4913
0
          hyphenok = FALSE;
4914
0
          ptr++;
4915
0
          }
4916
4917
413
        while (ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS &&
4918
413
                               *ptr != CHAR_COLON)
4919
15
          {
4920
15
          switch (*ptr++)
4921
15
            {
4922
0
            case CHAR_MINUS:
4923
0
            if (!hyphenok)
4924
0
              {
4925
0
              errorcode = ERR94;
4926
0
              ptr--;  /* Correct the offset */
4927
0
              goto FAILED;
4928
0
              }
4929
0
            optset = &unset;
4930
0
            xoptset = &xunset;
4931
0
            hyphenok = FALSE;
4932
0
            break;
4933
4934
            /* There are some two-character sequences that start with 'a'. */
4935
4936
0
            case CHAR_a:
4937
0
            if (ptr < ptrend)
4938
0
              {
4939
0
              if (*ptr == CHAR_D)
4940
0
                {
4941
0
                *xoptset |= PCRE2_EXTRA_ASCII_BSD;
4942
0
                ptr++;
4943
0
                break;
4944
0
                }
4945
0
              if (*ptr == CHAR_P)
4946
0
                {
4947
0
                *xoptset |= (PCRE2_EXTRA_ASCII_POSIX|PCRE2_EXTRA_ASCII_DIGIT);
4948
0
                ptr++;
4949
0
                break;
4950
0
                }
4951
0
              if (*ptr == CHAR_S)
4952
0
                {
4953
0
                *xoptset |= PCRE2_EXTRA_ASCII_BSS;
4954
0
                ptr++;
4955
0
                break;
4956
0
                }
4957
0
              if (*ptr == CHAR_T)
4958
0
                {
4959
0
                *xoptset |= PCRE2_EXTRA_ASCII_DIGIT;
4960
0
                ptr++;
4961
0
                break;
4962
0
                }
4963
0
              if (*ptr == CHAR_W)
4964
0
                {
4965
0
                *xoptset |= PCRE2_EXTRA_ASCII_BSW;
4966
0
                ptr++;
4967
0
                break;
4968
0
                }
4969
0
              }
4970
0
            *xoptset |= PCRE2_EXTRA_ASCII_BSD|PCRE2_EXTRA_ASCII_BSS|
4971
0
                        PCRE2_EXTRA_ASCII_BSW|
4972
0
                        PCRE2_EXTRA_ASCII_DIGIT|PCRE2_EXTRA_ASCII_POSIX;
4973
0
            break;
4974
4975
7
            case CHAR_J:  /* Record that it changed in the external options */
4976
7
            *optset |= PCRE2_DUPNAMES;
4977
7
            cb->external_flags |= PCRE2_JCHANGED;
4978
7
            break;
4979
4980
2
            case CHAR_i: *optset |= PCRE2_CASELESS; break;
4981
0
            case CHAR_m: *optset |= PCRE2_MULTILINE; break;
4982
0
            case CHAR_n: *optset |= PCRE2_NO_AUTO_CAPTURE; break;
4983
1
            case CHAR_r: *xoptset|= PCRE2_EXTRA_CASELESS_RESTRICT; break;
4984
0
            case CHAR_s: *optset |= PCRE2_DOTALL; break;
4985
0
            case CHAR_U: *optset |= PCRE2_UNGREEDY; break;
4986
4987
            /* If x appears twice it sets the extended extended option. */
4988
4989
0
            case CHAR_x:
4990
0
            *optset |= PCRE2_EXTENDED;
4991
0
            if (ptr < ptrend && *ptr == CHAR_x)
4992
0
              {
4993
0
              *optset |= PCRE2_EXTENDED_MORE;
4994
0
              ptr++;
4995
0
              }
4996
0
            break;
4997
4998
5
            default:
4999
5
            errorcode = ERR11;
5000
5
            ptr--;    /* Correct the offset */
5001
5
            goto FAILED;
5002
15
            }
5003
15
          }
5004
5005
        /* If we are setting extended without extended-more, ensure that any
5006
        existing extended-more gets unset. Also, unsetting extended must also
5007
        unset extended-more. */
5008
5009
398
        if ((set & (PCRE2_EXTENDED|PCRE2_EXTENDED_MORE)) == PCRE2_EXTENDED ||
5010
398
            (unset & PCRE2_EXTENDED) != 0)
5011
0
          unset |= PCRE2_EXTENDED_MORE;
5012
5013
398
        options = (options | set) & (~unset);
5014
398
        xoptions = (xoptions | xset) & (~xunset);
5015
5016
        /* If the options ended with ')' this is not the start of a nested
5017
        group with option changes, so the options change at this level.
5018
        In this case, if the previous level set up a nest block, discard the
5019
        one we have just created. Otherwise adjust it for the previous level.
5020
        If the options ended with ':' we are starting a non-capturing group,
5021
        possibly with an options setting. */
5022
5023
398
        if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
5024
398
        if (*ptr++ == CHAR_RIGHT_PARENTHESIS)
5025
38
          {
5026
38
          nest_depth--;  /* This is not a nested group after all. */
5027
38
          if (top_nest > (nest_save *)(cb->start_workspace) &&
5028
38
              (top_nest-1)->nest_depth == nest_depth) top_nest--;
5029
32
          else top_nest->nest_depth = nest_depth;
5030
38
          }
5031
360
        else *parsed_pattern++ = META_NOCAPTURE;
5032
5033
        /* If nothing changed, no need to record. */
5034
5035
398
        if (options != oldoptions || xoptions != oldxoptions)
5036
8
          {
5037
8
          *parsed_pattern++ = META_OPTIONS;
5038
8
          *parsed_pattern++ = options;
5039
8
          *parsed_pattern++ = xoptions;
5040
8
          }
5041
398
        }     /* End options processing */
5042
398
      break;  /* End default case after (? */
5043
5044
5045
      /* ---- Python syntax support ---- */
5046
5047
398
      case CHAR_P:
5048
0
      if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
5049
5050
      /* (?P<name> is the same as (?<name>, which defines a named group. */
5051
5052
0
      if (*ptr == CHAR_LESS_THAN_SIGN)
5053
0
        {
5054
0
        terminator = CHAR_GREATER_THAN_SIGN;
5055
0
        goto DEFINE_NAME;
5056
0
        }
5057
5058
      /* (?P>name) is the same as (?&name), which is a recursion or subroutine
5059
      call. */
5060
5061
0
      if (*ptr == CHAR_GREATER_THAN_SIGN) goto RECURSE_BY_NAME;
5062
5063
      /* (?P=name) is the same as \k<name>, a back reference by name. Anything
5064
      else after (?P is an error. */
5065
5066
0
      if (*ptr != CHAR_EQUALS_SIGN)
5067
0
        {
5068
0
        errorcode = ERR41;
5069
0
        goto FAILED;
5070
0
        }
5071
0
      if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
5072
0
          &namelen, &errorcode, cb)) goto FAILED;
5073
0
      *parsed_pattern++ = META_BACKREF_BYNAME;
5074
0
      *parsed_pattern++ = namelen;
5075
0
      PUTOFFSET(offset, parsed_pattern);
5076
0
      okquantifier = TRUE;
5077
0
      break;   /* End of (?P processing */
5078
5079
5080
      /* ---- Recursion/subroutine calls by number ---- */
5081
5082
0
      case CHAR_R:
5083
0
      i = 0;         /* (?R) == (?R0) */
5084
0
      ptr++;
5085
0
      if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
5086
0
        {
5087
0
        errorcode = ERR58;
5088
0
        goto FAILED;
5089
0
        }
5090
0
      goto SET_RECURSION;
5091
5092
      /* An item starting (?- followed by a digit comes here via the "default"
5093
      case because (?- followed by a non-digit is an options setting. */
5094
5095
0
      case CHAR_PLUS:
5096
0
      if (ptrend - ptr < 2 || !IS_DIGIT(ptr[1]))
5097
0
        {
5098
0
        errorcode = ERR29;   /* Missing number */
5099
0
        goto FAILED;
5100
0
        }
5101
      /* Fall through */
5102
5103
737
      case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
5104
777
      case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
5105
777
      RECURSION_BYNUMBER:
5106
777
      if (!read_number(&ptr, ptrend,
5107
777
          (IS_DIGIT(*ptr))? -1:(int)(cb->bracount), /* + and - are relative */
5108
777
          MAX_GROUP_NUMBER, ERR61,
5109
777
          &i, &errorcode)) goto FAILED;
5110
777
      PCRE2_ASSERT(i >= 0);  /* NB (?0) is permitted, represented by i=0 */
5111
777
      if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
5112
0
        goto UNCLOSED_PARENTHESIS;
5113
5114
777
      SET_RECURSION:
5115
777
      *parsed_pattern++ = META_RECURSE | (uint32_t)i;
5116
777
      offset = (PCRE2_SIZE)(ptr - cb->start_pattern);
5117
777
      ptr++;
5118
777
      PUTOFFSET(offset, parsed_pattern);
5119
777
      okquantifier = TRUE;
5120
777
      break;  /* End of recursive call by number handling */
5121
5122
5123
      /* ---- Recursion/subroutine calls by name ---- */
5124
5125
9
      case CHAR_AMPERSAND:
5126
9
      RECURSE_BY_NAME:
5127
9
      if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
5128
9
          &namelen, &errorcode, cb)) goto FAILED;
5129
0
      *parsed_pattern++ = META_RECURSE_BYNAME;
5130
0
      *parsed_pattern++ = namelen;
5131
0
      PUTOFFSET(offset, parsed_pattern);
5132
0
      okquantifier = TRUE;
5133
0
      break;
5134
5135
      /* ---- Callout with numerical or string argument ---- */
5136
5137
0
      case CHAR_C:
5138
0
      if ((xoptions & PCRE2_EXTRA_NEVER_CALLOUT) != 0)
5139
0
        {
5140
0
        errorcode = ERR103;
5141
0
        goto FAILED;
5142
0
        }
5143
5144
0
      if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
5145
5146
      /* If the previous item was a condition starting (?(? an assertion,
5147
      optionally preceded by a callout, is expected. This is checked later on,
5148
      during actual compilation. However we need to identify this kind of
5149
      assertion in this pass because it must not be qualified. The value of
5150
      expect_cond_assert is set to 2 after (?(? is processed. We decrement it
5151
      for a callout - still leaving a positive value that identifies the
5152
      assertion. Multiple callouts or any other items will make it zero or
5153
      less, which doesn't matter because they will cause an error later. */
5154
5155
0
      expect_cond_assert = prev_expect_cond_assert - 1;
5156
5157
      /* If previous_callout is not NULL, it means this follows a previous
5158
      callout. If it was a manual callout, do nothing; this means its "length
5159
      of next pattern item" field will remain zero. If it was an automatic
5160
      callout, abolish it. */
5161
5162
0
      if (previous_callout != NULL && (options & PCRE2_AUTO_CALLOUT) != 0 &&
5163
0
          previous_callout == parsed_pattern - 4 &&
5164
0
          parsed_pattern[-1] == 255)
5165
0
        parsed_pattern = previous_callout;
5166
5167
      /* Save for updating next pattern item length, and skip one item before
5168
      completing. */
5169
5170
0
      previous_callout = parsed_pattern;
5171
0
      after_manual_callout = 1;
5172
5173
      /* Handle a string argument; specific delimiter is required. */
5174
5175
0
      if (*ptr != CHAR_RIGHT_PARENTHESIS && !IS_DIGIT(*ptr))
5176
0
        {
5177
0
        PCRE2_SIZE calloutlength;
5178
0
        PCRE2_SPTR startptr = ptr;
5179
5180
0
        delimiter = 0;
5181
0
        for (i = 0; PRIV(callout_start_delims)[i] != 0; i++)
5182
0
          {
5183
0
          if (*ptr == PRIV(callout_start_delims)[i])
5184
0
            {
5185
0
            delimiter = PRIV(callout_end_delims)[i];
5186
0
            break;
5187
0
            }
5188
0
          }
5189
0
        if (delimiter == 0)
5190
0
          {
5191
0
          errorcode = ERR82;
5192
0
          goto FAILED;
5193
0
          }
5194
5195
0
        *parsed_pattern = META_CALLOUT_STRING;
5196
0
        parsed_pattern += 3;   /* Skip pattern info */
5197
5198
0
        for (;;)
5199
0
          {
5200
0
          if (++ptr >= ptrend)
5201
0
            {
5202
0
            errorcode = ERR81;
5203
0
            ptr = startptr;   /* To give a more useful message */
5204
0
            goto FAILED;
5205
0
            }
5206
0
          if (*ptr == delimiter && (++ptr >= ptrend || *ptr != delimiter))
5207
0
            break;
5208
0
          }
5209
5210
0
        calloutlength = (PCRE2_SIZE)(ptr - startptr);
5211
0
        if (calloutlength > UINT32_MAX)
5212
0
          {
5213
0
          errorcode = ERR72;
5214
0
          goto FAILED;
5215
0
          }
5216
0
        *parsed_pattern++ = (uint32_t)calloutlength;
5217
0
        offset = (PCRE2_SIZE)(startptr - cb->start_pattern);
5218
0
        PUTOFFSET(offset, parsed_pattern);
5219
0
        }
5220
5221
      /* Handle a callout with an optional numerical argument, which must be
5222
      less than or equal to 255. A missing argument gives 0. */
5223
5224
0
      else
5225
0
        {
5226
0
        int n = 0;
5227
0
        *parsed_pattern = META_CALLOUT_NUMBER;     /* Numerical callout */
5228
0
        parsed_pattern += 3;                       /* Skip pattern info */
5229
0
        while (ptr < ptrend && IS_DIGIT(*ptr))
5230
0
          {
5231
0
          n = n * 10 + (*ptr++ - CHAR_0);
5232
0
          if (n > 255)
5233
0
            {
5234
0
            errorcode = ERR38;
5235
0
            goto FAILED;
5236
0
            }
5237
0
          }
5238
0
        *parsed_pattern++ = n;
5239
0
        }
5240
5241
      /* Both formats must have a closing parenthesis */
5242
5243
0
      if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
5244
0
        {
5245
0
        errorcode = ERR39;
5246
0
        goto FAILED;
5247
0
        }
5248
0
      ptr++;
5249
5250
      /* Remember the offset to the next item in the pattern, and set a default
5251
      length. This should get updated after the next item is read. */
5252
5253
0
      previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
5254
0
      previous_callout[2] = 0;
5255
0
      break;                  /* End callout */
5256
5257
5258
      /* ---- Conditional group ---- */
5259
5260
      /* A condition can be an assertion, a number (referring to a numbered
5261
      group's having been set), a name (referring to a named group), or 'R',
5262
      referring to overall recursion. R<digits> and R&name are also permitted
5263
      for recursion state tests. Numbers may be preceded by + or - to specify a
5264
      relative group number.
5265
5266
      There are several syntaxes for testing a named group: (?(name)) is used
5267
      by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
5268
5269
      There are two unfortunate ambiguities. 'R' can be the recursive thing or
5270
      the name 'R' (and similarly for 'R' followed by digits). 'DEFINE' can be
5271
      the Perl DEFINE feature or the Python named test. We look for a name
5272
      first; if not found, we try the other case.
5273
5274
      For compatibility with auto-callouts, we allow a callout to be specified
5275
      before a condition that is an assertion. */
5276
5277
3
      case CHAR_LEFT_PARENTHESIS:
5278
3
      if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
5279
3
      nest_depth++;
5280
5281
      /* If the next character is ? or * there must be an assertion next
5282
      (optionally preceded by a callout). We do not check this here, but
5283
      instead we set expect_cond_assert to 2. If this is still greater than
5284
      zero (callouts decrement it) when the next assertion is read, it will be
5285
      marked as a condition that must not be repeated. A value greater than
5286
      zero also causes checking that an assertion (possibly with callout)
5287
      follows. */
5288
5289
3
      if (*ptr == CHAR_QUESTION_MARK || *ptr == CHAR_ASTERISK)
5290
0
        {
5291
0
        *parsed_pattern++ = META_COND_ASSERT;
5292
0
        ptr--;   /* Pull pointer back to the opening parenthesis. */
5293
0
        expect_cond_assert = 2;
5294
0
        break;  /* End of conditional */
5295
0
        }
5296
5297
      /* Handle (?([+-]number)... */
5298
5299
3
      if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
5300
3
          &errorcode))
5301
0
        {
5302
0
        PCRE2_ASSERT(i >= 0);
5303
0
        if (i <= 0)
5304
0
          {
5305
0
          errorcode = ERR15;
5306
0
          goto FAILED;
5307
0
          }
5308
0
        *parsed_pattern++ = META_COND_NUMBER;
5309
0
        offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
5310
0
        PUTOFFSET(offset, parsed_pattern);
5311
0
        *parsed_pattern++ = i;
5312
0
        }
5313
3
      else if (errorcode != 0) goto FAILED;   /* Number too big */
5314
5315
      /* No number found. Handle the special case (?(VERSION[>]=n.m)... */
5316
5317
3
      else if (ptrend - ptr >= 10 &&
5318
3
               PRIV(strncmp_c8)(ptr, STRING_VERSION, 7) == 0 &&
5319
3
               ptr[7] != CHAR_RIGHT_PARENTHESIS)
5320
0
        {
5321
0
        uint32_t ge = 0;
5322
0
        int major = 0;
5323
0
        int minor = 0;
5324
5325
0
        ptr += 7;
5326
0
        if (*ptr == CHAR_GREATER_THAN_SIGN)
5327
0
          {
5328
0
          ge = 1;
5329
0
          ptr++;
5330
0
          }
5331
5332
        /* NOTE: cannot write IS_DIGIT(*(++ptr)) here because IS_DIGIT
5333
        references its argument twice. */
5334
5335
0
        if (*ptr != CHAR_EQUALS_SIGN || (ptr++, !IS_DIGIT(*ptr)))
5336
0
          goto BAD_VERSION_CONDITION;
5337
5338
0
        if (!read_number(&ptr, ptrend, -1, 1000, ERR79, &major, &errorcode))
5339
0
          goto FAILED;
5340
5341
0
        if (ptr >= ptrend) goto BAD_VERSION_CONDITION;
5342
0
        if (*ptr == CHAR_DOT)
5343
0
          {
5344
0
          if (++ptr >= ptrend || !IS_DIGIT(*ptr)) goto BAD_VERSION_CONDITION;
5345
0
          minor = (*ptr++ - CHAR_0) * 10;
5346
0
          if (ptr >= ptrend) goto BAD_VERSION_CONDITION;
5347
0
          if (IS_DIGIT(*ptr)) minor += *ptr++ - CHAR_0;
5348
0
          if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
5349
0
            goto BAD_VERSION_CONDITION;
5350
0
          }
5351
5352
0
        *parsed_pattern++ = META_COND_VERSION;
5353
0
        *parsed_pattern++ = ge;
5354
0
        *parsed_pattern++ = major;
5355
0
        *parsed_pattern++ = minor;
5356
0
        }
5357
5358
      /* All the remaining cases now require us to read a name. We cannot at
5359
      this stage distinguish ambiguous cases such as (?(R12) which might be a
5360
      recursion test by number or a name, because the named groups have not yet
5361
      all been identified. Those cases are treated as names, but given a
5362
      different META code. */
5363
5364
3
      else
5365
3
        {
5366
3
        BOOL was_r_ampersand = FALSE;
5367
5368
3
        if (*ptr == CHAR_R && ptrend - ptr > 1 && ptr[1] == CHAR_AMPERSAND)
5369
0
          {
5370
0
          terminator = CHAR_RIGHT_PARENTHESIS;
5371
0
          was_r_ampersand = TRUE;
5372
0
          ptr++;
5373
0
          }
5374
3
        else if (*ptr == CHAR_LESS_THAN_SIGN)
5375
0
          terminator = CHAR_GREATER_THAN_SIGN;
5376
3
        else if (*ptr == CHAR_APOSTROPHE)
5377
0
          terminator = CHAR_APOSTROPHE;
5378
3
        else
5379
3
          {
5380
3
          terminator = CHAR_RIGHT_PARENTHESIS;
5381
3
          ptr--;   /* Point to char before name */
5382
3
          }
5383
3
        if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
5384
3
            &errorcode, cb)) goto FAILED;
5385
5386
        /* Handle (?(R&name) */
5387
5388
0
        if (was_r_ampersand)
5389
0
          {
5390
0
          *parsed_pattern = META_COND_RNAME;
5391
0
          ptr--;   /* Back to closing parens */
5392
0
          }
5393
5394
        /* Handle (?(name). If the name is "DEFINE" we identify it with a
5395
        special code. Likewise if the name consists of R followed only by
5396
        digits. Otherwise, handle it like a quoted name. */
5397
5398
0
        else if (terminator == CHAR_RIGHT_PARENTHESIS)
5399
0
          {
5400
0
          if (namelen == 6 && PRIV(strncmp_c8)(name, STRING_DEFINE, 6) == 0)
5401
0
            *parsed_pattern = META_COND_DEFINE;
5402
0
          else
5403
0
            {
5404
0
            for (i = 1; i < (int)namelen; i++)
5405
0
              if (!IS_DIGIT(name[i])) break;
5406
0
            *parsed_pattern = (*name == CHAR_R && i >= (int)namelen)?
5407
0
              META_COND_RNUMBER : META_COND_NAME;
5408
0
            }
5409
0
          ptr--;   /* Back to closing parens */
5410
0
          }
5411
5412
        /* Handle (?('name') or (?(<name>) */
5413
5414
0
        else *parsed_pattern = META_COND_NAME;
5415
5416
        /* All these cases except DEFINE end with the name length and offset;
5417
        DEFINE just has an offset (for the "too many branches" error). */
5418
5419
0
        if (*parsed_pattern++ != META_COND_DEFINE) *parsed_pattern++ = namelen;
5420
0
        PUTOFFSET(offset, parsed_pattern);
5421
0
        }  /* End cases that read a name */
5422
5423
      /* Check the closing parenthesis of the condition */
5424
5425
0
      if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
5426
0
        {
5427
0
        errorcode = ERR24;
5428
0
        goto FAILED;
5429
0
        }
5430
0
      ptr++;
5431
0
      break;  /* End of condition processing */
5432
5433
5434
      /* ---- Atomic group ---- */
5435
5436
0
      case CHAR_GREATER_THAN_SIGN:
5437
0
      ATOMIC_GROUP:                          /* Come from (*atomic: */
5438
0
      *parsed_pattern++ = META_ATOMIC;
5439
0
      nest_depth++;
5440
0
      ptr++;
5441
0
      break;
5442
5443
5444
      /* ---- Lookahead assertions ---- */
5445
5446
26
      case CHAR_EQUALS_SIGN:
5447
26
      POSITIVE_LOOK_AHEAD:                   /* Come from (*pla: */
5448
26
      *parsed_pattern++ = META_LOOKAHEAD;
5449
26
      ptr++;
5450
26
      goto POST_ASSERTION;
5451
5452
0
      case CHAR_ASTERISK:
5453
0
      POSITIVE_NONATOMIC_LOOK_AHEAD:         /* Come from (*napla: */
5454
0
      *parsed_pattern++ = META_LOOKAHEAD_NA;
5455
0
      ptr++;
5456
0
      goto POST_ASSERTION;
5457
5458
12
      case CHAR_EXCLAMATION_MARK:
5459
12
      NEGATIVE_LOOK_AHEAD:                   /* Come from (*nla: */
5460
12
      *parsed_pattern++ = META_LOOKAHEADNOT;
5461
12
      ptr++;
5462
12
      goto POST_ASSERTION;
5463
5464
5465
      /* ---- Lookbehind assertions ---- */
5466
5467
      /* (?< followed by = or ! or * is a lookbehind assertion. Otherwise (?<
5468
      is the start of the name of a capturing group. */
5469
5470
150
      case CHAR_LESS_THAN_SIGN:
5471
150
      if (ptrend - ptr <= 1 ||
5472
150
         (ptr[1] != CHAR_EQUALS_SIGN &&
5473
150
          ptr[1] != CHAR_EXCLAMATION_MARK &&
5474
150
          ptr[1] != CHAR_ASTERISK))
5475
0
        {
5476
0
        terminator = CHAR_GREATER_THAN_SIGN;
5477
0
        goto DEFINE_NAME;
5478
0
        }
5479
150
      *parsed_pattern++ = (ptr[1] == CHAR_EQUALS_SIGN)?
5480
114
        META_LOOKBEHIND : (ptr[1] == CHAR_EXCLAMATION_MARK)?
5481
36
        META_LOOKBEHINDNOT : META_LOOKBEHIND_NA;
5482
5483
150
      POST_LOOKBEHIND:           /* Come from (*plb: (*naplb: and (*nlb: */
5484
150
      *has_lookbehind = TRUE;
5485
150
      offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
5486
150
      PUTOFFSET(offset, parsed_pattern);
5487
150
      ptr += 2;
5488
      /* Fall through */
5489
5490
      /* If the previous item was a condition starting (?(? an assertion,
5491
      optionally preceded by a callout, is expected. This is checked later on,
5492
      during actual compilation. However we need to identify this kind of
5493
      assertion in this pass because it must not be qualified. The value of
5494
      expect_cond_assert is set to 2 after (?(? is processed. We decrement it
5495
      for a callout - still leaving a positive value that identifies the
5496
      assertion. Multiple callouts or any other items will make it zero or
5497
      less, which doesn't matter because they will cause an error later. */
5498
5499
188
      POST_ASSERTION:
5500
188
      nest_depth++;
5501
188
      if (prev_expect_cond_assert > 0)
5502
0
        {
5503
0
        if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
5504
0
        else if (++top_nest >= end_nests)
5505
0
          {
5506
0
          errorcode = ERR84;
5507
0
          goto FAILED;
5508
0
          }
5509
0
        top_nest->nest_depth = nest_depth;
5510
0
        top_nest->flags = NSF_CONDASSERT;
5511
0
        top_nest->options = options & PARSE_TRACKED_OPTIONS;
5512
0
        top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;
5513
0
        }
5514
188
      break;
5515
5516
5517
      /* ---- Define a named group ---- */
5518
5519
      /* A named group may be defined as (?'name') or (?<name>). In the latter
5520
      case we jump to DEFINE_NAME from the disambiguation of (?< above with the
5521
      terminator set to '>'. */
5522
5523
188
      case CHAR_APOSTROPHE:
5524
3
      terminator = CHAR_APOSTROPHE;    /* Terminator */
5525
5526
3
      DEFINE_NAME:
5527
3
      if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
5528
3
          &errorcode, cb)) goto FAILED;
5529
5530
      /* We have a name for this capturing group. It is also assigned a number,
5531
      which is its primary means of identification. */
5532
5533
0
      if (cb->bracount >= MAX_GROUP_NUMBER)
5534
0
        {
5535
0
        errorcode = ERR97;
5536
0
        goto FAILED;
5537
0
        }
5538
0
      cb->bracount++;
5539
0
      *parsed_pattern++ = META_CAPTURE | cb->bracount;
5540
0
      nest_depth++;
5541
5542
      /* Check not too many names */
5543
5544
0
      if (cb->names_found >= MAX_NAME_COUNT)
5545
0
        {
5546
0
        errorcode = ERR49;
5547
0
        goto FAILED;
5548
0
        }
5549
5550
      /* Adjust the entry size to accommodate the longest name found. */
5551
5552
0
      if (namelen + IMM2_SIZE + 1 > cb->name_entry_size)
5553
0
        cb->name_entry_size = (uint16_t)(namelen + IMM2_SIZE + 1);
5554
5555
      /* Scan the list to check for duplicates. For duplicate names, if the
5556
      number is the same, break the loop, which causes the name to be
5557
      discarded; otherwise, if DUPNAMES is not set, give an error.
5558
      If it is set, allow the name with a different number, but continue
5559
      scanning in case this is a duplicate with the same number. For
5560
      non-duplicate names, give an error if the number is duplicated. */
5561
5562
0
      isdupname = FALSE;
5563
0
      ng = cb->named_groups;
5564
0
      for (i = 0; i < cb->names_found; i++, ng++)
5565
0
        {
5566
0
        if (namelen == ng->length &&
5567
0
            PRIV(strncmp)(name, ng->name, (PCRE2_SIZE)namelen) == 0)
5568
0
          {
5569
0
          if (ng->number == cb->bracount) break;
5570
0
          if ((options & PCRE2_DUPNAMES) == 0)
5571
0
            {
5572
0
            errorcode = ERR43;
5573
0
            goto FAILED;
5574
0
            }
5575
0
          isdupname = ng->isdup = TRUE;     /* Mark as a duplicate */
5576
0
          cb->dupnames = TRUE;              /* Duplicate names exist */
5577
0
          }
5578
0
        else if (ng->number == cb->bracount)
5579
0
          {
5580
0
          errorcode = ERR65;
5581
0
          goto FAILED;
5582
0
          }
5583
0
        }
5584
5585
0
      if (i < cb->names_found) break;   /* Ignore duplicate with same number */
5586
5587
      /* Increase the list size if necessary */
5588
5589
0
      if (cb->names_found >= cb->named_group_list_size)
5590
0
        {
5591
0
        uint32_t newsize = cb->named_group_list_size * 2;
5592
0
        named_group *newspace =
5593
0
          cb->cx->memctl.malloc(newsize * sizeof(named_group),
5594
0
          cb->cx->memctl.memory_data);
5595
0
        if (newspace == NULL)
5596
0
          {
5597
0
          errorcode = ERR21;
5598
0
          goto FAILED;
5599
0
          }
5600
5601
0
        memcpy(newspace, cb->named_groups,
5602
0
          cb->named_group_list_size * sizeof(named_group));
5603
0
        if (cb->named_group_list_size > NAMED_GROUP_LIST_SIZE)
5604
0
          cb->cx->memctl.free((void *)cb->named_groups,
5605
0
          cb->cx->memctl.memory_data);
5606
0
        cb->named_groups = newspace;
5607
0
        cb->named_group_list_size = newsize;
5608
0
        }
5609
5610
      /* Add this name to the list */
5611
5612
0
      cb->named_groups[cb->names_found].name = name;
5613
0
      cb->named_groups[cb->names_found].length = (uint16_t)namelen;
5614
0
      cb->named_groups[cb->names_found].number = cb->bracount;
5615
0
      cb->named_groups[cb->names_found].isdup = (uint16_t)isdupname;
5616
0
      cb->names_found++;
5617
0
      break;
5618
5619
5620
      /* ---- Perl extended character class ---- */
5621
5622
      /* These are of the form '(?[...])'. We handle these via the same parser
5623
      that consumes ordinary '[...]' classes, but with a flag set to activate
5624
      the extended behaviour. */
5625
5626
2
      case CHAR_LEFT_SQUARE_BRACKET:
5627
2
      class_mode_state = CLASS_MODE_PERL_EXT;
5628
2
      c = *ptr++;
5629
2
      goto FROM_PERL_EXTENDED_CLASS;
5630
1.38k
      }        /* End of (? switch */
5631
1.36k
    break;     /* End of ( handling */
5632
5633
5634
    /* ---- Branch terminators ---- */
5635
5636
    /* Alternation: reset the capture count if we are in a (?| group. */
5637
5638
30.8k
    case CHAR_VERTICAL_LINE:
5639
30.8k
    if (top_nest != NULL && top_nest->nest_depth == nest_depth &&
5640
30.8k
        (top_nest->flags & NSF_RESET) != 0)
5641
0
      {
5642
0
      if (cb->bracount > top_nest->max_group)
5643
0
        top_nest->max_group = (uint16_t)cb->bracount;
5644
0
      cb->bracount = top_nest->reset_group;
5645
0
      }
5646
30.8k
    *parsed_pattern++ = META_ALT;
5647
30.8k
    break;
5648
5649
    /* End of group; reset the capture count to the maximum if we are in a (?|
5650
    group and/or reset the options that are tracked during parsing. Disallow
5651
    quantifier for a condition that is an assertion. */
5652
5653
5.89k
    case CHAR_RIGHT_PARENTHESIS:
5654
5.89k
    okquantifier = TRUE;
5655
5.89k
    if (top_nest != NULL && top_nest->nest_depth == nest_depth)
5656
363
      {
5657
363
      options = (options & ~PARSE_TRACKED_OPTIONS) | top_nest->options;
5658
363
      xoptions = (xoptions & ~PARSE_TRACKED_EXTRA_OPTIONS) | top_nest->xoptions;
5659
363
      if ((top_nest->flags & NSF_RESET) != 0 &&
5660
363
          top_nest->max_group > cb->bracount)
5661
0
        cb->bracount = top_nest->max_group;
5662
363
      if ((top_nest->flags & NSF_CONDASSERT) != 0)
5663
0
        okquantifier = FALSE;
5664
5665
363
      if ((top_nest->flags & NSF_ATOMICSR) != 0)
5666
0
        {
5667
0
        *parsed_pattern++ = META_KET;
5668
5669
#ifdef PCRE2_DEBUG
5670
        PCRE2_ASSERT(parsed_pattern_extra > 0);
5671
        parsed_pattern_extra--;
5672
#endif
5673
0
        }
5674
5675
363
      if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL;
5676
117
        else top_nest--;
5677
363
      }
5678
5.89k
    if (nest_depth == 0)    /* Unmatched closing parenthesis */
5679
54
      {
5680
54
      errorcode = ERR22;
5681
54
      goto FAILED_BACK;  // TODO https://github.com/PCRE2Project/pcre2/issues/549
5682
54
      }
5683
5.84k
    nest_depth--;
5684
5.84k
    *parsed_pattern++ = META_KET;
5685
5.84k
    break;
5686
933k
    }  /* End of switch on pattern character */
5687
933k
  }    /* End of main character scan loop */
5688
5689
/* End of pattern reached. Check for missing ) at the end of a verb name. */
5690
5691
1.39k
if (inverbname && ptr >= ptrend)
5692
0
  {
5693
0
  errorcode = ERR60;
5694
0
  goto FAILED;
5695
0
  }
5696
5697
5698
1.39k
PARSED_END:
5699
5700
1.39k
PCRE2_ASSERT((parsed_pattern - parsed_pattern_check) +
5701
1.39k
             (parsed_pattern_extra - parsed_pattern_extra_check) <=
5702
1.39k
               max_parsed_pattern(ptr_check, ptr, utf, options));
5703
5704
/* Manage callout for the final item */
5705
5706
1.39k
parsed_pattern = manage_callouts(ptr, &previous_callout, auto_callout,
5707
1.39k
  parsed_pattern, cb);
5708
5709
/* Insert trailing items for word and line matching (features provided for the
5710
benefit of pcre2grep). */
5711
5712
1.39k
if ((xoptions & PCRE2_EXTRA_MATCH_LINE) != 0)
5713
0
  {
5714
0
  *parsed_pattern++ = META_KET;
5715
0
  *parsed_pattern++ = META_DOLLAR;
5716
0
  }
5717
1.39k
else if ((xoptions & PCRE2_EXTRA_MATCH_WORD) != 0)
5718
0
  {
5719
0
  *parsed_pattern++ = META_KET;
5720
0
  *parsed_pattern++ = META_ESCAPE + ESC_b;
5721
0
  }
5722
5723
/* Terminate the parsed pattern, then return success if all groups are closed.
5724
Otherwise we have unclosed parentheses. */
5725
5726
1.39k
if (parsed_pattern >= parsed_pattern_end)
5727
0
  {
5728
0
  PCRE2_DEBUG_UNREACHABLE();
5729
0
  errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
5730
0
  goto FAILED;
5731
0
  }
5732
5733
1.39k
*parsed_pattern = META_END;
5734
1.39k
if (nest_depth == 0) return 0;
5735
5736
129
UNCLOSED_PARENTHESIS:
5737
129
errorcode = ERR14;
5738
5739
/* Come here for all failures. */
5740
5741
1.08k
FAILED:
5742
1.08k
cb->erroroffset = (PCRE2_SIZE)(ptr - cb->start_pattern);
5743
1.08k
return errorcode;
5744
5745
/* Some errors need to indicate the previous character. */
5746
5747
374
FAILED_BACK:
5748
374
ptr--;
5749
374
goto FAILED;
5750
5751
/* This failure happens several times. */
5752
5753
0
BAD_VERSION_CONDITION:
5754
0
errorcode = ERR79;
5755
0
goto FAILED;
5756
129
}
5757
5758
5759
5760
/*************************************************
5761
*       Find first significant opcode            *
5762
*************************************************/
5763
5764
/* This is called by several functions that scan a compiled expression looking
5765
for a fixed first character, or an anchoring opcode etc. It skips over things
5766
that do not influence this. For some calls, it makes sense to skip negative
5767
forward and all backward assertions, and also the \b assertion; for others it
5768
does not.
5769
5770
Arguments:
5771
  code         pointer to the start of the group
5772
  skipassert   TRUE if certain assertions are to be skipped
5773
5774
Returns:       pointer to the first significant opcode
5775
*/
5776
5777
static const PCRE2_UCHAR*
5778
first_significant_code(PCRE2_SPTR code, BOOL skipassert)
5779
3.38k
{
5780
3.38k
for (;;)
5781
3.47k
  {
5782
3.47k
  switch ((int)*code)
5783
3.47k
    {
5784
0
    case OP_ASSERT_NOT:
5785
0
    case OP_ASSERTBACK:
5786
0
    case OP_ASSERTBACK_NOT:
5787
0
    case OP_ASSERTBACK_NA:
5788
0
    if (!skipassert) return code;
5789
0
    do code += GET(code, 1); while (*code == OP_ALT);
5790
0
    code += PRIV(OP_lengths)[*code];
5791
0
    break;
5792
5793
0
    case OP_WORD_BOUNDARY:
5794
168
    case OP_NOT_WORD_BOUNDARY:
5795
168
    case OP_UCP_WORD_BOUNDARY:
5796
287
    case OP_NOT_UCP_WORD_BOUNDARY:
5797
287
    if (!skipassert) return code;
5798
    /* Fall through */
5799
5800
95
    case OP_CALLOUT:
5801
95
    case OP_CREF:
5802
95
    case OP_DNCREF:
5803
95
    case OP_RREF:
5804
95
    case OP_DNRREF:
5805
95
    case OP_FALSE:
5806
95
    case OP_TRUE:
5807
95
    code += PRIV(OP_lengths)[*code];
5808
95
    break;
5809
5810
0
    case OP_CALLOUT_STR:
5811
0
    code += GET(code, 1 + 2*LINK_SIZE);
5812
0
    break;
5813
5814
0
    case OP_SKIPZERO:
5815
0
    code += 2 + GET(code, 2) + LINK_SIZE;
5816
0
    break;
5817
5818
0
    case OP_COND:
5819
0
    case OP_SCOND:
5820
0
    if (code[1+LINK_SIZE] != OP_FALSE ||   /* Not DEFINE */
5821
0
        code[GET(code, 1)] != OP_KET)      /* More than one branch */
5822
0
      return code;
5823
0
    code += GET(code, 1) + 1 + LINK_SIZE;
5824
0
    break;
5825
5826
0
    case OP_MARK:
5827
0
    case OP_COMMIT_ARG:
5828
0
    case OP_PRUNE_ARG:
5829
0
    case OP_SKIP_ARG:
5830
0
    case OP_THEN_ARG:
5831
0
    code += code[1] + PRIV(OP_lengths)[*code];
5832
0
    break;
5833
5834
3.18k
    default:
5835
3.18k
    return code;
5836
3.47k
    }
5837
3.47k
  }
5838
5839
0
PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */
5840
0
}
5841
5842
5843
5844
/*************************************************
5845
*    Find details of duplicate group names       *
5846
*************************************************/
5847
5848
/* This is called from compile_branch() when it needs to know the index and
5849
count of duplicates in the names table when processing named backreferences,
5850
either directly, or as conditions.
5851
5852
Arguments:
5853
  name          points to the name
5854
  length        the length of the name
5855
  indexptr      where to put the index
5856
  countptr      where to put the count of duplicates
5857
  errorcodeptr  where to put an error code
5858
  cb            the compile block
5859
5860
Returns:        TRUE if OK, FALSE if not, error code set
5861
*/
5862
5863
static BOOL
5864
find_dupname_details(PCRE2_SPTR name, uint32_t length, int *indexptr,
5865
  int *countptr, int *errorcodeptr, compile_block *cb)
5866
0
{
5867
0
uint32_t i, groupnumber;
5868
0
int count;
5869
0
PCRE2_UCHAR *slot = cb->name_table;
5870
5871
/* Find the first entry in the table */
5872
5873
0
for (i = 0; i < cb->names_found; i++)
5874
0
  {
5875
0
  if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) == 0 &&
5876
0
      slot[IMM2_SIZE+length] == 0) break;
5877
0
  slot += cb->name_entry_size;
5878
0
  }
5879
5880
/* This should not occur, because this function is called only when we know we
5881
have duplicate names. Give an internal error. */
5882
5883
0
if (i >= cb->names_found)
5884
0
  {
5885
0
  PCRE2_DEBUG_UNREACHABLE();
5886
0
  *errorcodeptr = ERR53;
5887
0
  cb->erroroffset = name - cb->start_pattern;
5888
0
  return FALSE;
5889
0
  }
5890
5891
/* Record the index and then see how many duplicates there are, updating the
5892
backref map and maximum back reference as we do. */
5893
5894
0
*indexptr = i;
5895
0
count = 0;
5896
5897
0
for (;;)
5898
0
  {
5899
0
  count++;
5900
0
  groupnumber = GET2(slot,0);
5901
0
  cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
5902
0
  if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
5903
0
  if (++i >= cb->names_found) break;
5904
0
  slot += cb->name_entry_size;
5905
0
  if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) != 0 ||
5906
0
    (slot+IMM2_SIZE)[length] != 0) break;
5907
0
  }
5908
5909
0
*countptr = count;
5910
0
return TRUE;
5911
0
}
5912
5913
5914
5915
/*************************************************
5916
*           Compile one branch                   *
5917
*************************************************/
5918
5919
/* Scan the parsed pattern, compiling it into a vector of PCRE2_UCHAR. If
5920
the options are changed during the branch, the pointer is used to change the
5921
external options bits. This function is used during the pre-compile phase when
5922
we are trying to find out the amount of memory needed, as well as during the
5923
real compile phase. The value of lengthptr distinguishes the two phases.
5924
5925
Arguments:
5926
  optionsptr        pointer to the option bits
5927
  xoptionsptr       pointer to the extra option bits
5928
  codeptr           points to the pointer to the current code point
5929
  pptrptr           points to the current parsed pattern pointer
5930
  errorcodeptr      points to error code variable
5931
  firstcuptr        place to put the first required code unit
5932
  firstcuflagsptr   place to put the first code unit flags
5933
  reqcuptr          place to put the last required code unit
5934
  reqcuflagsptr     place to put the last required code unit flags
5935
  bcptr             points to current branch chain
5936
  open_caps         points to current capitem
5937
  cb                contains pointers to tables etc.
5938
  lengthptr         NULL during the real compile phase
5939
                    points to length accumulator during pre-compile phase
5940
5941
Returns:            0 There's been an error, *errorcodeptr is non-zero
5942
                   +1 Success, this branch must match at least one character
5943
                   -1 Success, this branch may match an empty string
5944
*/
5945
5946
static int
5947
compile_branch(uint32_t *optionsptr, uint32_t *xoptionsptr,
5948
  PCRE2_UCHAR **codeptr, uint32_t **pptrptr, int *errorcodeptr,
5949
  uint32_t *firstcuptr, uint32_t *firstcuflagsptr, uint32_t *reqcuptr,
5950
  uint32_t *reqcuflagsptr, branch_chain *bcptr, open_capitem *open_caps,
5951
  compile_block *cb, PCRE2_SIZE *lengthptr)
5952
45.4k
{
5953
45.4k
int bravalue = 0;
5954
45.4k
int okreturn = -1;
5955
45.4k
int group_return = 0;
5956
45.4k
uint32_t repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
5957
45.4k
uint32_t greedy_default, greedy_non_default;
5958
45.4k
uint32_t repeat_type, op_type;
5959
45.4k
uint32_t options = *optionsptr;               /* May change dynamically */
5960
45.4k
uint32_t xoptions = *xoptionsptr;             /* May change dynamically */
5961
45.4k
uint32_t firstcu, reqcu;
5962
45.4k
uint32_t zeroreqcu, zerofirstcu;
5963
45.4k
uint32_t *pptr = *pptrptr;
5964
45.4k
uint32_t meta, meta_arg;
5965
45.4k
uint32_t firstcuflags, reqcuflags;
5966
45.4k
uint32_t zeroreqcuflags, zerofirstcuflags;
5967
45.4k
uint32_t req_caseopt, reqvary, tempreqvary;
5968
/* Some opcodes, such as META_SCS_NUMBER or META_SCS_NAME,
5969
depend on the previous value of offset. */
5970
45.4k
PCRE2_SIZE offset = 0;
5971
45.4k
PCRE2_SIZE length_prevgroup = 0;
5972
45.4k
PCRE2_UCHAR *code = *codeptr;
5973
45.4k
PCRE2_UCHAR *last_code = code;
5974
45.4k
PCRE2_UCHAR *orig_code = code;
5975
45.4k
PCRE2_UCHAR *tempcode;
5976
45.4k
PCRE2_UCHAR *previous = NULL;
5977
45.4k
PCRE2_UCHAR op_previous;
5978
45.4k
BOOL groupsetfirstcu = FALSE;
5979
45.4k
BOOL had_accept = FALSE;
5980
45.4k
BOOL matched_char = FALSE;
5981
45.4k
BOOL previous_matched_char = FALSE;
5982
45.4k
BOOL reset_caseful = FALSE;
5983
5984
/* We can fish out the UTF setting once and for all into a BOOL, but we must
5985
not do this for other options (e.g. PCRE2_EXTENDED) that may change dynamically
5986
as we process the pattern. */
5987
5988
45.4k
#ifdef SUPPORT_UNICODE
5989
45.4k
BOOL utf = (options & PCRE2_UTF) != 0;
5990
45.4k
BOOL ucp = (options & PCRE2_UCP) != 0;
5991
#else  /* No Unicode support */
5992
BOOL utf = FALSE;
5993
#endif
5994
5995
/* Set up the default and non-default settings for greediness */
5996
5997
45.4k
greedy_default = ((options & PCRE2_UNGREEDY) != 0);
5998
45.4k
greedy_non_default = greedy_default ^ 1;
5999
6000
/* Initialize no first unit, no required unit. REQ_UNSET means "no char
6001
matching encountered yet". It gets changed to REQ_NONE if we hit something that
6002
matches a non-fixed first unit; reqcu just remains unset if we never find one.
6003
6004
When we hit a repeat whose minimum is zero, we may have to adjust these values
6005
to take the zero repeat into account. This is implemented by setting them to
6006
zerofirstcu and zeroreqcu when such a repeat is encountered. The individual
6007
item types that can be repeated set these backoff variables appropriately. */
6008
6009
45.4k
firstcu = reqcu = zerofirstcu = zeroreqcu = 0;
6010
45.4k
firstcuflags = reqcuflags = zerofirstcuflags = zeroreqcuflags = REQ_UNSET;
6011
6012
/* The variable req_caseopt contains either the REQ_CASELESS bit or zero,
6013
according to the current setting of the caseless flag. The REQ_CASELESS value
6014
leaves the lower 28 bit empty. It is added into the firstcu or reqcu variables
6015
to record the case status of the value. This is used only for ASCII characters.
6016
*/
6017
6018
45.4k
req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
6019
6020
/* Switch on next META item until the end of the branch */
6021
6022
1.12M
for (;; pptr++)
6023
1.16M
  {
6024
1.16M
  BOOL possessive_quantifier;
6025
1.16M
  BOOL note_group_empty;
6026
1.16M
  uint32_t mclength;
6027
1.16M
  uint32_t skipunits;
6028
1.16M
  uint32_t subreqcu, subfirstcu;
6029
1.16M
  uint32_t groupnumber;
6030
1.16M
  uint32_t verbarglen, verbculen;
6031
1.16M
  uint32_t subreqcuflags, subfirstcuflags;
6032
1.16M
  open_capitem *oc;
6033
1.16M
  PCRE2_UCHAR mcbuffer[8];
6034
6035
  /* Get next META item in the pattern and its potential argument. */
6036
6037
1.16M
  meta = META_CODE(*pptr);
6038
1.16M
  meta_arg = META_DATA(*pptr);
6039
6040
  /* If we are in the pre-compile phase, accumulate the length used for the
6041
  previous cycle of this loop, unless the next item is a quantifier. */
6042
6043
1.16M
  if (lengthptr != NULL)
6044
597k
    {
6045
597k
    if (code > cb->start_workspace + cb->workspace_size -
6046
597k
        WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
6047
0
      {
6048
0
      if (code >= cb->start_workspace + cb->workspace_size)
6049
0
        {
6050
0
        PCRE2_DEBUG_UNREACHABLE();
6051
0
        *errorcodeptr = ERR52;  /* Over-ran workspace - internal error */
6052
0
        }
6053
0
      else
6054
0
        *errorcodeptr = ERR86;
6055
0
      return 0;
6056
0
      }
6057
6058
    /* There is at least one situation where code goes backwards: this is the
6059
    case of a zero quantifier after a class (e.g. [ab]{0}). When the quantifier
6060
    is processed, the whole class is eliminated. However, it is created first,
6061
    so we have to allow memory for it. Therefore, don't ever reduce the length
6062
    at this point. */
6063
6064
597k
    if (code < last_code) code = last_code;
6065
6066
    /* If the next thing is not a quantifier, we add the length of the previous
6067
    item into the total, and reset the code pointer to the start of the
6068
    workspace. Otherwise leave the previous item available to be quantified. */
6069
6070
597k
    if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
6071
550k
      {
6072
550k
      if (OFLOW_MAX - *lengthptr < (PCRE2_SIZE)(code - orig_code))
6073
0
        {
6074
0
        *errorcodeptr = ERR20;   /* Integer overflow */
6075
0
        return 0;
6076
0
        }
6077
550k
      *lengthptr += (PCRE2_SIZE)(code - orig_code);
6078
550k
      if (*lengthptr > MAX_PATTERN_SIZE)
6079
0
        {
6080
0
        *errorcodeptr = ERR20;   /* Pattern is too large */
6081
0
        return 0;
6082
0
        }
6083
550k
      code = orig_code;
6084
550k
      }
6085
6086
    /* Remember where this code item starts so we can catch the "backwards"
6087
    case above next time round. */
6088
6089
597k
    last_code = code;
6090
597k
    }
6091
6092
  /* Process the next parsed pattern item. If it is not a quantifier, remember
6093
  where it starts so that it can be quantified when a quantifier follows.
6094
  Checking for the legality of quantifiers happens in parse_regex(), except for
6095
  a quantifier after an assertion that is a condition. */
6096
6097
1.16M
  if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
6098
1.07M
    {
6099
1.07M
    previous = code;
6100
1.07M
    if (matched_char && !had_accept) okreturn = 1;
6101
1.07M
    }
6102
6103
1.16M
  previous_matched_char = matched_char;
6104
1.16M
  matched_char = FALSE;
6105
1.16M
  note_group_empty = FALSE;
6106
1.16M
  skipunits = 0;         /* Default value for most subgroups */
6107
6108
1.16M
  switch(meta)
6109
1.16M
    {
6110
    /* ===================================================================*/
6111
    /* The branch terminates at pattern end or | or ) */
6112
6113
2.33k
    case META_END:
6114
39.5k
    case META_ALT:
6115
45.3k
    case META_KET:
6116
45.3k
    *firstcuptr = firstcu;
6117
45.3k
    *firstcuflagsptr = firstcuflags;
6118
45.3k
    *reqcuptr = reqcu;
6119
45.3k
    *reqcuflagsptr = reqcuflags;
6120
45.3k
    *codeptr = code;
6121
45.3k
    *pptrptr = pptr;
6122
45.3k
    return okreturn;
6123
6124
6125
    /* ===================================================================*/
6126
    /* Handle single-character metacharacters. In multiline mode, ^ disables
6127
    the setting of any following char as a first character. */
6128
6129
10.2k
    case META_CIRCUMFLEX:
6130
10.2k
    if ((options & PCRE2_MULTILINE) != 0)
6131
498
      {
6132
498
      if (firstcuflags == REQ_UNSET)
6133
20
        zerofirstcuflags = firstcuflags = REQ_NONE;
6134
498
      *code++ = OP_CIRCM;
6135
498
      }
6136
9.74k
    else *code++ = OP_CIRC;
6137
10.2k
    break;
6138
6139
2.89k
    case META_DOLLAR:
6140
2.89k
    *code++ = ((options & PCRE2_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
6141
2.89k
    break;
6142
6143
    /* There can never be a first char if '.' is first, whatever happens about
6144
    repeats. The value of reqcu doesn't change either. */
6145
6146
19.2k
    case META_DOT:
6147
19.2k
    matched_char = TRUE;
6148
19.2k
    if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6149
19.2k
    zerofirstcu = firstcu;
6150
19.2k
    zerofirstcuflags = firstcuflags;
6151
19.2k
    zeroreqcu = reqcu;
6152
19.2k
    zeroreqcuflags = reqcuflags;
6153
19.2k
    *code++ = ((options & PCRE2_DOTALL) != 0)? OP_ALLANY: OP_ANY;
6154
19.2k
    break;
6155
6156
6157
    /* ===================================================================*/
6158
    /* Empty character classes are allowed if PCRE2_ALLOW_EMPTY_CLASS is set.
6159
    Otherwise, an initial ']' is taken as a data character. When empty classes
6160
    are allowed, [] must generate an empty class - we have no dedicated opcode
6161
    to optimise the representation, but it's a rare case (the '(*FAIL)'
6162
    construct would be a clearer way for a pattern author to represent a
6163
    non-matching branch, but it does have different semantics to '[]' if both
6164
    are followed by a quantifier). The empty-negated [^] matches any character,
6165
    so is useful: generate OP_ALLANY for this. */
6166
6167
0
    case META_CLASS_EMPTY:
6168
0
    case META_CLASS_EMPTY_NOT:
6169
0
    matched_char = TRUE;
6170
0
    if (meta == META_CLASS_EMPTY_NOT) *code++ = OP_ALLANY;
6171
0
    else
6172
0
      {
6173
0
      *code++ = OP_CLASS;
6174
0
      memset(code, 0, 32);
6175
0
      code += 32 / sizeof(PCRE2_UCHAR);
6176
0
      }
6177
6178
0
    if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6179
0
    zerofirstcu = firstcu;
6180
0
    zerofirstcuflags = firstcuflags;
6181
0
    break;
6182
6183
6184
    /* ===================================================================*/
6185
    /* Non-empty character class. If the included characters are all < 256, we
6186
    build a 32-byte bitmap of the permitted characters, except in the special
6187
    case where there is only one such character. For negated classes, we build
6188
    the map as usual, then invert it at the end. However, we use a different
6189
    opcode so that data characters > 255 can be handled correctly.
6190
6191
    If the class contains characters outside the 0-255 range, a different
6192
    opcode is compiled. It may optionally have a bit map for characters < 256,
6193
    but those above are explicitly listed afterwards. A flag code unit tells
6194
    whether the bitmap is present, and whether this is a negated class or
6195
    not. */
6196
6197
11.8k
    case META_CLASS_NOT:
6198
25.7k
    case META_CLASS:
6199
25.7k
    matched_char = TRUE;
6200
6201
    /* Check for complex extended classes and handle them separately. */
6202
6203
25.7k
    if ((*pptr & CLASS_IS_ECLASS) != 0)
6204
0
      {
6205
0
      if (!PRIV(compile_class_nested)(options, xoptions, &pptr, &code,
6206
0
                                      errorcodeptr, cb, lengthptr))
6207
0
        return 0;
6208
0
      goto CLASS_END_PROCESSING;
6209
0
      }
6210
6211
    /* We can optimize the case of a single character in a class by generating
6212
    OP_CHAR or OP_CHARI if it's positive, or OP_NOT or OP_NOTI if it's
6213
    negative. In the negative case there can be no first char if this item is
6214
    first, whatever repeat count may follow. In the case of reqcu, save the
6215
    previous value for reinstating. */
6216
6217
    /* NOTE: at present this optimization is not effective if the only
6218
    character in a class in 32-bit, non-UCP mode has its top bit set. */
6219
6220
25.7k
    if (pptr[1] < META_END && pptr[2] == META_CLASS_END)
6221
5.96k
      {
6222
5.96k
      uint32_t c = pptr[1];
6223
6224
5.96k
      pptr += 2;                 /* Move on to class end */
6225
5.96k
      if (meta == META_CLASS)    /* A positive one-char class can be */
6226
502
        {                        /* handled as a normal literal character. */
6227
502
        meta = c;                /* Set up the character */
6228
502
        goto NORMAL_CHAR_SET;
6229
502
        }
6230
6231
      /* Handle a negative one-character class */
6232
6233
5.46k
      zeroreqcu = reqcu;
6234
5.46k
      zeroreqcuflags = reqcuflags;
6235
5.46k
      if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6236
5.46k
      zerofirstcu = firstcu;
6237
5.46k
      zerofirstcuflags = firstcuflags;
6238
6239
      /* For caseless UTF or UCP mode, check whether this character has more
6240
      than one other case. If so, generate a special OP_NOTPROP item instead of
6241
      OP_NOTI. When restricted by PCRE2_EXTRA_CASELESS_RESTRICT, ignore any
6242
      caseless set that starts with an ASCII character. If the character is
6243
      affected by the special Turkish rules, hardcode the not-matching
6244
      characters using a caseset. */
6245
6246
5.46k
#ifdef SUPPORT_UNICODE
6247
5.46k
      if ((utf||ucp) && (options & PCRE2_CASELESS) != 0)
6248
767
        {
6249
767
        uint32_t caseset;
6250
6251
767
        if ((xoptions & (PCRE2_EXTRA_TURKISH_CASING|PCRE2_EXTRA_CASELESS_RESTRICT)) ==
6252
767
              PCRE2_EXTRA_TURKISH_CASING &&
6253
767
            UCD_ANY_I(c))
6254
0
          {
6255
0
          caseset = PRIV(ucd_turkish_dotted_i_caseset) + (UCD_DOTTED_I(c)? 0 : 3);
6256
0
          }
6257
767
        else if ((caseset = UCD_CASESET(c)) != 0 &&
6258
767
                 (xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0 &&
6259
767
                 PRIV(ucd_caseless_sets)[caseset] < 128)
6260
0
          {
6261
0
          caseset = 0;  /* Ignore the caseless set if it's restricted. */
6262
0
          }
6263
6264
767
        if (caseset != 0)
6265
6
          {
6266
6
          *code++ = OP_NOTPROP;
6267
6
          *code++ = PT_CLIST;
6268
6
          *code++ = caseset;
6269
6
          break;   /* We are finished with this class */
6270
6
          }
6271
767
        }
6272
5.46k
#endif
6273
      /* Char has only one other (usable) case, or UCP not available */
6274
6275
5.46k
      *code++ = ((options & PCRE2_CASELESS) != 0)? OP_NOTI: OP_NOT;
6276
5.46k
      code += PUTCHAR(c, code);
6277
5.46k
      break;   /* We are finished with this class */
6278
5.46k
      }        /* End of 1-char optimization */
6279
6280
    /* Handle character classes that contain more than just one literal
6281
    character. If there are exactly two characters in a positive class, see if
6282
    they are case partners. This can be optimized to generate a caseless single
6283
    character match (which also sets first/required code units if relevant).
6284
    When casing restrictions apply, ignore a caseless set if both characters
6285
    are ASCII. When Turkish casing applies, an 'i' does not match its normal
6286
    Unicode "othercase". */
6287
6288
19.7k
    if (meta == META_CLASS && pptr[1] < META_END && pptr[2] < META_END &&
6289
19.7k
        pptr[3] == META_CLASS_END)
6290
1.31k
      {
6291
1.31k
      uint32_t c = pptr[1];
6292
6293
1.31k
#ifdef SUPPORT_UNICODE
6294
1.31k
      if ((UCD_CASESET(c) == 0 ||
6295
1.31k
           ((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0 &&
6296
0
            c < 128 && pptr[2] < 128)) &&
6297
1.31k
          !((xoptions & (PCRE2_EXTRA_TURKISH_CASING|PCRE2_EXTRA_CASELESS_RESTRICT)) ==
6298
1.31k
              PCRE2_EXTRA_TURKISH_CASING &&
6299
1.31k
            UCD_ANY_I(c)))
6300
1.31k
#endif
6301
1.31k
        {
6302
1.31k
        uint32_t d;
6303
6304
1.31k
#ifdef SUPPORT_UNICODE
6305
1.31k
        if ((utf || ucp) && c > 127) d = UCD_OTHERCASE(c); else
6306
1.31k
#endif
6307
1.31k
          {
6308
#if PCRE2_CODE_UNIT_WIDTH != 8
6309
          if (c > 255) d = c; else
6310
#endif
6311
1.31k
          d = TABLE_GET(c, cb->fcc, c);
6312
1.31k
          }
6313
6314
1.31k
        if (c != d && pptr[2] == d)
6315
0
          {
6316
0
          pptr += 3;                 /* Move on to class end */
6317
0
          meta = c;
6318
0
          if ((options & PCRE2_CASELESS) == 0)
6319
0
            {
6320
0
            reset_caseful = TRUE;
6321
0
            options |= PCRE2_CASELESS;
6322
0
            req_caseopt = REQ_CASELESS;
6323
0
            }
6324
0
          goto CLASS_CASELESS_CHAR;
6325
0
          }
6326
1.31k
        }
6327
1.31k
      }
6328
6329
    /* Now emit the OP_CLASS/OP_NCLASS/OP_XCLASS/OP_ALLANY opcode. */
6330
6331
19.7k
    pptr = PRIV(compile_class_not_nested)(options, xoptions, pptr + 1,
6332
19.7k
                                          &code, meta == META_CLASS_NOT, NULL,
6333
19.7k
                                          errorcodeptr, cb, lengthptr);
6334
19.7k
    if (pptr == NULL) return 0;
6335
19.7k
    PCRE2_ASSERT(*pptr == META_CLASS_END);
6336
6337
19.7k
    CLASS_END_PROCESSING:
6338
6339
    /* If this class is the first thing in the branch, there can be no first
6340
    char setting, whatever the repeat count. Any reqcu setting must remain
6341
    unchanged after any kind of repeat. */
6342
6343
19.7k
    if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6344
19.7k
    zerofirstcu = firstcu;
6345
19.7k
    zerofirstcuflags = firstcuflags;
6346
19.7k
    zeroreqcu = reqcu;
6347
19.7k
    zeroreqcuflags = reqcuflags;
6348
19.7k
    break;  /* End of class processing */
6349
6350
6351
    /* ===================================================================*/
6352
    /* Deal with (*VERB)s. */
6353
6354
    /* Check for open captures before ACCEPT and close those that are within
6355
    the same assertion level, also converting ACCEPT to ASSERT_ACCEPT in an
6356
    assertion. In the first pass, just accumulate the length required;
6357
    otherwise hitting (*ACCEPT) inside many nested parentheses can cause
6358
    workspace overflow. Do not set firstcu after *ACCEPT. */
6359
6360
0
    case META_ACCEPT:
6361
0
    cb->had_accept = had_accept = TRUE;
6362
0
    for (oc = open_caps;
6363
0
         oc != NULL && oc->assert_depth >= cb->assert_depth;
6364
0
         oc = oc->next)
6365
0
      {
6366
0
      if (lengthptr != NULL)
6367
0
        {
6368
0
        *lengthptr += CU2BYTES(1) + IMM2_SIZE;
6369
0
        }
6370
0
      else
6371
0
        {
6372
0
        *code++ = OP_CLOSE;
6373
0
        PUT2INC(code, 0, oc->number);
6374
0
        }
6375
0
      }
6376
0
    *code++ = (cb->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
6377
0
    if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6378
0
    break;
6379
6380
0
    case META_PRUNE:
6381
0
    case META_SKIP:
6382
0
    cb->had_pruneorskip = TRUE;
6383
    /* Fall through */
6384
0
    case META_COMMIT:
6385
0
    case META_FAIL:
6386
0
    *code++ = verbops[(meta - META_MARK) >> 16];
6387
0
    break;
6388
6389
0
    case META_THEN:
6390
0
    cb->external_flags |= PCRE2_HASTHEN;
6391
0
    *code++ = OP_THEN;
6392
0
    break;
6393
6394
    /* Handle verbs with arguments. Arguments can be very long, especially in
6395
    16- and 32-bit modes, and can overflow the workspace in the first pass.
6396
    However, the argument length is constrained to be small enough to fit in
6397
    one code unit. This check happens in parse_regex(). In the first pass,
6398
    instead of putting the argument into memory, we just update the length
6399
    counter and set up an empty argument. */
6400
6401
0
    case META_THEN_ARG:
6402
0
    cb->external_flags |= PCRE2_HASTHEN;
6403
0
    goto VERB_ARG;
6404
6405
0
    case META_PRUNE_ARG:
6406
0
    case META_SKIP_ARG:
6407
0
    cb->had_pruneorskip = TRUE;
6408
    /* Fall through */
6409
98
    case META_MARK:
6410
98
    case META_COMMIT_ARG:
6411
98
    VERB_ARG:
6412
98
    *code++ = verbops[(meta - META_MARK) >> 16];
6413
    /* The length is in characters. */
6414
98
    verbarglen = *(++pptr);
6415
98
    verbculen = 0;
6416
98
    tempcode = code++;
6417
12.4k
    for (int i = 0; i < (int)verbarglen; i++)
6418
12.3k
      {
6419
12.3k
      meta = *(++pptr);
6420
12.3k
#ifdef SUPPORT_UNICODE
6421
12.3k
      if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
6422
12.3k
#endif
6423
12.3k
        {
6424
12.3k
        mclength = 1;
6425
12.3k
        mcbuffer[0] = meta;
6426
12.3k
        }
6427
12.3k
      if (lengthptr != NULL) *lengthptr += mclength; else
6428
6.17k
        {
6429
6.17k
        memcpy(code, mcbuffer, CU2BYTES(mclength));
6430
6.17k
        code += mclength;
6431
6.17k
        verbculen += mclength;
6432
6.17k
        }
6433
12.3k
      }
6434
6435
98
    *tempcode = verbculen;   /* Fill in the code unit length */
6436
98
    *code++ = 0;             /* Terminating zero */
6437
98
    break;
6438
6439
6440
    /* ===================================================================*/
6441
    /* Handle options change. The new setting must be passed back for use in
6442
    subsequent branches. Reset the greedy defaults and the case value for
6443
    firstcu and reqcu. */
6444
6445
8
    case META_OPTIONS:
6446
8
    *optionsptr = options = *(++pptr);
6447
8
    *xoptionsptr = xoptions = *(++pptr);
6448
8
    greedy_default = ((options & PCRE2_UNGREEDY) != 0);
6449
8
    greedy_non_default = greedy_default ^ 1;
6450
8
    req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
6451
8
    break;
6452
6453
0
    case META_OFFSET:
6454
0
    GETPLUSOFFSET(offset, pptr);
6455
0
    break;
6456
6457
0
    case META_SCS:
6458
0
    bravalue = OP_ASSERT_SCS;
6459
0
    cb->assert_depth += 1;
6460
0
    goto GROUP_PROCESS;
6461
6462
6463
    /* ===================================================================*/
6464
    /* Handle conditional subpatterns. The case of (?(Rdigits) is ambiguous
6465
    because it could be a numerical check on recursion, or a name check on a
6466
    group's being set. The pre-pass sets up META_COND_RNUMBER as a name so that
6467
    we can handle it either way. We first try for a name; if not found, process
6468
    the number. */
6469
6470
0
    case META_COND_RNUMBER:   /* (?(Rdigits) */
6471
0
    case META_COND_NAME:      /* (?(name) or (?'name') or ?(<name>) */
6472
0
    case META_COND_RNAME:     /* (?(R&name) - test for recursion */
6473
0
    case META_SCS_NAME:       /* Name of scan substring */
6474
0
    bravalue = OP_COND;
6475
0
      {
6476
0
      int count, index;
6477
0
      unsigned int i;
6478
0
      PCRE2_SPTR name;
6479
0
      named_group *ng = cb->named_groups;
6480
0
      uint32_t length = *(++pptr);
6481
6482
0
      if (meta == META_SCS_NAME)
6483
0
        offset += meta_arg;
6484
0
      else
6485
0
        GETPLUSOFFSET(offset, pptr);
6486
0
      name = cb->start_pattern + offset;
6487
6488
      /* In the first pass, the names generated in the pre-pass are available,
6489
      but the main name table has not yet been created. Scan the list of names
6490
      generated in the pre-pass in order to get a number and whether or not
6491
      this name is duplicated. If it is not duplicated, we can handle it as a
6492
      numerical group. */
6493
6494
0
      for (i = 0; i < cb->names_found; i++, ng++)
6495
0
        if (length == ng->length &&
6496
0
            PRIV(strncmp)(name, ng->name, length) == 0) break;
6497
6498
0
      if (i >= cb->names_found)
6499
0
        {
6500
        /* If the name was not found we have a bad reference, unless we are
6501
        dealing with R<digits>, which is treated as a recursion test by
6502
        number. */
6503
6504
0
        groupnumber = 0;
6505
0
        if (meta == META_COND_RNUMBER)
6506
0
          {
6507
0
          for (i = 1; i < length; i++)
6508
0
            {
6509
0
            groupnumber = groupnumber * 10 + (name[i] - CHAR_0);
6510
0
            if (groupnumber > MAX_GROUP_NUMBER)
6511
0
              {
6512
0
              *errorcodeptr = ERR61;
6513
0
              cb->erroroffset = offset + i;
6514
0
              return 0;
6515
0
              }
6516
0
            }
6517
0
          }
6518
6519
0
        if (meta != META_COND_RNUMBER || groupnumber > cb->bracount)
6520
0
          {
6521
0
          *errorcodeptr = ERR15;
6522
0
          cb->erroroffset = offset;
6523
0
          return 0;
6524
0
          }
6525
6526
        /* (?(Rdigits) is treated as a recursion test by number. A value of
6527
        zero (which is the result of both (?(R) and (?(R0)) means "any", and is
6528
        translated into RREF_ANY (which is 0xffff). */
6529
6530
0
        if (groupnumber == 0) groupnumber = RREF_ANY;
6531
0
        code[1+LINK_SIZE] = OP_RREF;
6532
0
        PUT2(code, 2+LINK_SIZE, groupnumber);
6533
0
        skipunits = 1+IMM2_SIZE;
6534
0
        goto GROUP_PROCESS_NOTE_EMPTY;
6535
0
        }
6536
0
      else if (!ng->isdup)
6537
0
        {
6538
        /* Otherwise the name was found and is not duplicated */
6539
0
        if (ng->number > cb->top_backref) cb->top_backref = ng->number;
6540
6541
0
        if (meta == META_SCS_NAME)
6542
0
          {
6543
0
          code[0] = OP_CREF;
6544
0
          PUT2(code, 1, ng->number);
6545
0
          code += 1+IMM2_SIZE;
6546
0
          break;
6547
0
          }
6548
6549
0
        code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
6550
0
        PUT2(code, 2+LINK_SIZE, ng->number);
6551
0
        skipunits = 1+IMM2_SIZE;
6552
0
        if (meta != META_SCS_NAME) goto GROUP_PROCESS_NOTE_EMPTY;
6553
0
        cb->assert_depth += 1;
6554
0
        goto GROUP_PROCESS;
6555
0
        }
6556
6557
      /* We have a duplicated name. In the compile pass we have to search the
6558
      main table in order to get the index and count values. */
6559
6560
0
      count = 0;  /* Values for first pass (avoids compiler warning) */
6561
0
      index = 0;
6562
0
      if (lengthptr == NULL && !find_dupname_details(name, length, &index,
6563
0
            &count, errorcodeptr, cb)) return 0;
6564
6565
0
      if (meta == META_SCS_NAME)
6566
0
        {
6567
0
        code[0] = OP_DNCREF;
6568
0
        PUT2(code, 1, index);
6569
0
        PUT2(code, 1+IMM2_SIZE, count);
6570
0
        code += 1+2*IMM2_SIZE;
6571
0
        break;
6572
0
        }
6573
6574
      /* A duplicated name was found. Note that if an R<digits> name is found
6575
      (META_COND_RNUMBER), it is a reference test, not a recursion test. */
6576
6577
0
      code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_DNRREF : OP_DNCREF;
6578
6579
      /* Insert appropriate data values. */
6580
0
      skipunits = 1+2*IMM2_SIZE;
6581
0
      PUT2(code, 2+LINK_SIZE, index);
6582
0
      PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6583
0
      }
6584
6585
0
    PCRE2_ASSERT(meta != META_SCS_NAME);
6586
0
    goto GROUP_PROCESS_NOTE_EMPTY;
6587
6588
    /* The DEFINE condition is always false. Its internal groups may never
6589
    be called, so matched_char must remain false, hence the jump to
6590
    GROUP_PROCESS rather than GROUP_PROCESS_NOTE_EMPTY. */
6591
6592
0
    case META_COND_DEFINE:
6593
0
    bravalue = OP_COND;
6594
0
    GETPLUSOFFSET(offset, pptr);
6595
0
    code[1+LINK_SIZE] = OP_DEFINE;
6596
0
    skipunits = 1;
6597
0
    goto GROUP_PROCESS;
6598
6599
    /* Conditional test of a group's being set. */
6600
6601
0
    case META_COND_NUMBER:
6602
0
    case META_SCS_NUMBER:
6603
0
    bravalue = OP_COND;
6604
0
    if (meta == META_SCS_NUMBER)
6605
0
      offset += meta_arg;
6606
0
    else
6607
0
      GETPLUSOFFSET(offset, pptr);
6608
6609
0
    groupnumber = *(++pptr);
6610
0
    if (groupnumber > cb->bracount)
6611
0
      {
6612
0
      *errorcodeptr = ERR15;
6613
0
      cb->erroroffset = offset;
6614
0
      return 0;
6615
0
      }
6616
0
    if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
6617
6618
0
    if (meta == META_SCS_NUMBER)
6619
0
      {
6620
0
      code[0] = OP_CREF;
6621
0
      PUT2(code, 1, groupnumber);
6622
0
      code += 1+IMM2_SIZE;
6623
0
      break;
6624
0
      }
6625
6626
    /* Point at initial ( for too many branches error */
6627
0
    offset -= 2;
6628
0
    code[1+LINK_SIZE] = OP_CREF;
6629
0
    skipunits = 1+IMM2_SIZE;
6630
0
    PUT2(code, 2+LINK_SIZE, groupnumber);
6631
0
    goto GROUP_PROCESS_NOTE_EMPTY;
6632
6633
    /* Test for the PCRE2 version. */
6634
6635
0
    case META_COND_VERSION:
6636
0
    bravalue = OP_COND;
6637
0
    if (pptr[1] > 0)
6638
0
      code[1+LINK_SIZE] = ((PCRE2_MAJOR > pptr[2]) ||
6639
0
        (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR >= pptr[3]))?
6640
0
          OP_TRUE : OP_FALSE;
6641
0
    else
6642
0
      code[1+LINK_SIZE] = (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR == pptr[3])?
6643
0
        OP_TRUE : OP_FALSE;
6644
0
    skipunits = 1;
6645
0
    pptr += 3;
6646
0
    goto GROUP_PROCESS_NOTE_EMPTY;
6647
6648
    /* The condition is an assertion, possibly preceded by a callout. */
6649
6650
0
    case META_COND_ASSERT:
6651
0
    bravalue = OP_COND;
6652
0
    goto GROUP_PROCESS_NOTE_EMPTY;
6653
6654
6655
    /* ===================================================================*/
6656
    /* Handle all kinds of nested bracketed groups. The non-capturing,
6657
    non-conditional cases are here; others come to GROUP_PROCESS via goto. */
6658
6659
34
    case META_LOOKAHEAD:
6660
34
    bravalue = OP_ASSERT;
6661
34
    cb->assert_depth += 1;
6662
34
    goto GROUP_PROCESS;
6663
6664
0
    case META_LOOKAHEAD_NA:
6665
0
    bravalue = OP_ASSERT_NA;
6666
0
    cb->assert_depth += 1;
6667
0
    goto GROUP_PROCESS;
6668
6669
    /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
6670
    thing to do, but Perl allows all assertions to be quantified, and when
6671
    they contain capturing parentheses there may be a potential use for
6672
    this feature. Not that that applies to a quantified (?!) but we allow
6673
    it for uniformity. */
6674
6675
14
    case META_LOOKAHEADNOT:
6676
14
    if (pptr[1] == META_KET &&
6677
14
         (pptr[2] < META_ASTERISK || pptr[2] > META_MINMAX_QUERY))
6678
0
      {
6679
0
      *code++ = OP_FAIL;
6680
0
      pptr++;
6681
0
      }
6682
14
    else
6683
14
      {
6684
14
      bravalue = OP_ASSERT_NOT;
6685
14
      cb->assert_depth += 1;
6686
14
      goto GROUP_PROCESS;
6687
14
      }
6688
0
    break;
6689
6690
116
    case META_LOOKBEHIND:
6691
116
    bravalue = OP_ASSERTBACK;
6692
116
    cb->assert_depth += 1;
6693
116
    goto GROUP_PROCESS;
6694
6695
0
    case META_LOOKBEHINDNOT:
6696
0
    bravalue = OP_ASSERTBACK_NOT;
6697
0
    cb->assert_depth += 1;
6698
0
    goto GROUP_PROCESS;
6699
6700
58
    case META_LOOKBEHIND_NA:
6701
58
    bravalue = OP_ASSERTBACK_NA;
6702
58
    cb->assert_depth += 1;
6703
58
    goto GROUP_PROCESS;
6704
6705
0
    case META_ATOMIC:
6706
0
    bravalue = OP_ONCE;
6707
0
    goto GROUP_PROCESS_NOTE_EMPTY;
6708
6709
0
    case META_SCRIPT_RUN:
6710
0
    bravalue = OP_SCRIPT_RUN;
6711
0
    goto GROUP_PROCESS_NOTE_EMPTY;
6712
6713
374
    case META_NOCAPTURE:
6714
374
    bravalue = OP_BRA;
6715
    /* Fall through */
6716
6717
    /* Process nested bracketed regex. The nesting depth is maintained for the
6718
    benefit of the stackguard function. The test for too deep nesting is now
6719
    done in parse_regex(). Assertion and DEFINE groups come to GROUP_PROCESS;
6720
    others come to GROUP_PROCESS_NOTE_EMPTY, to indicate that we need to take
6721
    note of whether or not they may match an empty string. */
6722
6723
5.60k
    GROUP_PROCESS_NOTE_EMPTY:
6724
5.60k
    note_group_empty = TRUE;
6725
6726
5.83k
    GROUP_PROCESS:
6727
5.83k
    cb->parens_depth += 1;
6728
5.83k
    *code = bravalue;
6729
5.83k
    pptr++;
6730
5.83k
    tempcode = code;
6731
5.83k
    tempreqvary = cb->req_varyopt;        /* Save value before group */
6732
5.83k
    length_prevgroup = 0;                 /* Initialize for pre-compile phase */
6733
6734
5.83k
    if ((group_return =
6735
5.83k
         compile_regex(
6736
5.83k
         options,                         /* The options state */
6737
5.83k
         xoptions,                        /* The extra options state */
6738
5.83k
         &tempcode,                       /* Where to put code (updated) */
6739
5.83k
         &pptr,                           /* Input pointer (updated) */
6740
5.83k
         errorcodeptr,                    /* Where to put an error message */
6741
5.83k
         skipunits,                       /* Skip over bracket number */
6742
5.83k
         &subfirstcu,                     /* For possible first char */
6743
5.83k
         &subfirstcuflags,
6744
5.83k
         &subreqcu,                       /* For possible last char */
6745
5.83k
         &subreqcuflags,
6746
5.83k
         bcptr,                           /* Current branch chain */
6747
5.83k
         open_caps,                       /* Pointer to capture stack */
6748
5.83k
         cb,                              /* Compile data block */
6749
5.83k
         (lengthptr == NULL)? NULL :      /* Actual compile phase */
6750
5.83k
           &length_prevgroup              /* Pre-compile phase */
6751
5.83k
         )) == 0)
6752
7
      return 0;  /* Error */
6753
6754
5.82k
    cb->parens_depth -= 1;
6755
6756
    /* If that was a non-conditional significant group (not an assertion, not a
6757
    DEFINE) that matches at least one character, then the current item matches
6758
    a character. Conditionals are handled below. */
6759
6760
5.82k
    if (note_group_empty && bravalue != OP_COND && group_return > 0)
6761
1.80k
      matched_char = TRUE;
6762
6763
    /* If we've just compiled an assertion, pop the assert depth. */
6764
6765
5.82k
    if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERT_SCS)
6766
222
      cb->assert_depth -= 1;
6767
6768
    /* At the end of compiling, code is still pointing to the start of the
6769
    group, while tempcode has been updated to point past the end of the group.
6770
    The parsed pattern pointer (pptr) is on the closing META_KET.
6771
6772
    If this is a conditional bracket, check that there are no more than
6773
    two branches in the group, or just one if it's a DEFINE group. We do this
6774
    in the real compile phase, not in the pre-pass, where the whole group may
6775
    not be available. */
6776
6777
5.82k
    if (bravalue == OP_COND && lengthptr == NULL)
6778
0
      {
6779
0
      PCRE2_UCHAR *tc = code;
6780
0
      int condcount = 0;
6781
6782
0
      do {
6783
0
         condcount++;
6784
0
         tc += GET(tc,1);
6785
0
         }
6786
0
      while (*tc != OP_KET);
6787
6788
      /* A DEFINE group is never obeyed inline (the "condition" is always
6789
      false). It must have only one branch. Having checked this, change the
6790
      opcode to OP_FALSE. */
6791
6792
0
      if (code[LINK_SIZE+1] == OP_DEFINE)
6793
0
        {
6794
0
        if (condcount > 1)
6795
0
          {
6796
0
          cb->erroroffset = offset;
6797
0
          *errorcodeptr = ERR54;
6798
0
          return 0;
6799
0
          }
6800
0
        code[LINK_SIZE+1] = OP_FALSE;
6801
0
        bravalue = OP_DEFINE;   /* A flag to suppress char handling below */
6802
0
        }
6803
6804
      /* A "normal" conditional group. If there is just one branch, we must not
6805
      make use of its firstcu or reqcu, because this is equivalent to an
6806
      empty second branch. Also, it may match an empty string. If there are two
6807
      branches, this item must match a character if the group must. */
6808
6809
0
      else
6810
0
        {
6811
0
        if (condcount > 2)
6812
0
          {
6813
0
          cb->erroroffset = offset;
6814
0
          *errorcodeptr = ERR27;
6815
0
          return 0;
6816
0
          }
6817
0
        if (condcount == 1) subfirstcuflags = subreqcuflags = REQ_NONE;
6818
0
          else if (group_return > 0) matched_char = TRUE;
6819
0
        }
6820
0
      }
6821
6822
    /* In the pre-compile phase, update the length by the length of the group,
6823
    less the brackets at either end. Then reduce the compiled code to just a
6824
    set of non-capturing brackets so that it doesn't use much memory if it is
6825
    duplicated by a quantifier.*/
6826
6827
5.82k
    if (lengthptr != NULL)
6828
3.01k
      {
6829
3.01k
      if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
6830
0
        {
6831
0
        *errorcodeptr = ERR20;
6832
0
        return 0;
6833
0
        }
6834
3.01k
      *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
6835
3.01k
      code++;   /* This already contains bravalue */
6836
3.01k
      PUTINC(code, 0, 1 + LINK_SIZE);
6837
3.01k
      *code++ = OP_KET;
6838
3.01k
      PUTINC(code, 0, 1 + LINK_SIZE);
6839
3.01k
      break;    /* No need to waste time with special character handling */
6840
3.01k
      }
6841
6842
    /* Otherwise update the main code pointer to the end of the group. */
6843
6844
2.80k
    code = tempcode;
6845
6846
    /* For a DEFINE group, required and first character settings are not
6847
    relevant. */
6848
6849
2.80k
    if (bravalue == OP_DEFINE) break;
6850
6851
    /* Handle updating of the required and first code units for other types of
6852
    group. Update for normal brackets of all kinds, and conditions with two
6853
    branches (see code above). If the bracket is followed by a quantifier with
6854
    zero repeat, we have to back off. Hence the definition of zeroreqcu and
6855
    zerofirstcu outside the main loop so that they can be accessed for the back
6856
    off. */
6857
6858
2.80k
    zeroreqcu = reqcu;
6859
2.80k
    zeroreqcuflags = reqcuflags;
6860
2.80k
    zerofirstcu = firstcu;
6861
2.80k
    zerofirstcuflags = firstcuflags;
6862
2.80k
    groupsetfirstcu = FALSE;
6863
6864
2.80k
    if (bravalue >= OP_ONCE)  /* Not an assertion */
6865
2.69k
      {
6866
      /* If we have not yet set a firstcu in this branch, take it from the
6867
      subpattern, remembering that it was set here so that a repeat of more
6868
      than one can replicate it as reqcu if necessary. If the subpattern has
6869
      no firstcu, set "none" for the whole branch. In both cases, a zero
6870
      repeat forces firstcu to "none". */
6871
6872
2.69k
      if (firstcuflags == REQ_UNSET && subfirstcuflags != REQ_UNSET)
6873
477
        {
6874
477
        if (subfirstcuflags < REQ_NONE)
6875
225
          {
6876
225
          firstcu = subfirstcu;
6877
225
          firstcuflags = subfirstcuflags;
6878
225
          groupsetfirstcu = TRUE;
6879
225
          }
6880
252
        else firstcuflags = REQ_NONE;
6881
477
        zerofirstcuflags = REQ_NONE;
6882
477
        }
6883
6884
      /* If firstcu was previously set, convert the subpattern's firstcu
6885
      into reqcu if there wasn't one, using the vary flag that was in
6886
      existence beforehand. */
6887
6888
2.21k
      else if (subfirstcuflags < REQ_NONE && subreqcuflags >= REQ_NONE)
6889
65
        {
6890
65
        subreqcu = subfirstcu;
6891
65
        subreqcuflags = subfirstcuflags | tempreqvary;
6892
65
        }
6893
6894
      /* If the subpattern set a required code unit (or set a first code unit
6895
      that isn't really the first code unit - see above), set it. */
6896
6897
2.69k
      if (subreqcuflags < REQ_NONE)
6898
559
        {
6899
559
        reqcu = subreqcu;
6900
559
        reqcuflags = subreqcuflags;
6901
559
        }
6902
2.69k
      }
6903
6904
    /* For a forward assertion, we take the reqcu, if set, provided that the
6905
    group has also set a firstcu. This can be helpful if the pattern that
6906
    follows the assertion doesn't set a different char. For example, it's
6907
    useful for /(?=abcde).+/. We can't set firstcu for an assertion, however
6908
    because it leads to incorrect effect for patterns such as /(?=a)a.+/ when
6909
    the "real" "a" would then become a reqcu instead of a firstcu. This is
6910
    overcome by a scan at the end if there's no firstcu, looking for an
6911
    asserted first char. A similar effect for patterns like /(?=.*X)X$/ means
6912
    we must only take the reqcu when the group also set a firstcu. Otherwise,
6913
    in that example, 'X' ends up set for both. */
6914
6915
111
    else if ((bravalue == OP_ASSERT || bravalue == OP_ASSERT_NA) &&
6916
111
             subreqcuflags < REQ_NONE && subfirstcuflags < REQ_NONE)
6917
15
      {
6918
15
      reqcu = subreqcu;
6919
15
      reqcuflags = subreqcuflags;
6920
15
      }
6921
6922
2.80k
    break;  /* End of nested group handling */
6923
6924
6925
    /* ===================================================================*/
6926
    /* Handle named backreferences and recursions. */
6927
6928
0
    case META_BACKREF_BYNAME:
6929
0
    case META_RECURSE_BYNAME:
6930
0
      {
6931
0
      int count, index;
6932
0
      PCRE2_SPTR name;
6933
0
      BOOL is_dupname = FALSE;
6934
0
      named_group *ng = cb->named_groups;
6935
0
      uint32_t length = *(++pptr);
6936
6937
0
      GETPLUSOFFSET(offset, pptr);
6938
0
      name = cb->start_pattern + offset;
6939
6940
      /* In the first pass, the names generated in the pre-pass are available,
6941
      but the main name table has not yet been created. Scan the list of names
6942
      generated in the pre-pass in order to get a number and whether or not
6943
      this name is duplicated. */
6944
6945
0
      groupnumber = 0;
6946
0
      for (unsigned int i = 0; i < cb->names_found; i++, ng++)
6947
0
        {
6948
0
        if (length == ng->length &&
6949
0
            PRIV(strncmp)(name, ng->name, length) == 0)
6950
0
          {
6951
0
          is_dupname = ng->isdup;
6952
0
          groupnumber = ng->number;
6953
6954
          /* For a recursion, that's all that is needed. We can now go to
6955
          the code that handles numerical recursion, applying it to the first
6956
          group with the given name. */
6957
6958
0
          if (meta == META_RECURSE_BYNAME)
6959
0
            {
6960
0
            meta_arg = groupnumber;
6961
0
            goto HANDLE_NUMERICAL_RECURSION;
6962
0
            }
6963
6964
          /* For a back reference, update the back reference map and the
6965
          maximum back reference. */
6966
6967
0
          cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
6968
0
          if (groupnumber > cb->top_backref)
6969
0
            cb->top_backref = groupnumber;
6970
0
          }
6971
0
        }
6972
6973
      /* If the name was not found we have a bad reference. */
6974
6975
0
      if (groupnumber == 0)
6976
0
        {
6977
0
        *errorcodeptr = ERR15;
6978
0
        cb->erroroffset = offset;
6979
0
        return 0;
6980
0
        }
6981
6982
      /* If a back reference name is not duplicated, we can handle it as
6983
      a numerical reference. */
6984
6985
0
      if (!is_dupname)
6986
0
        {
6987
0
        meta_arg = groupnumber;
6988
0
        goto HANDLE_SINGLE_REFERENCE;
6989
0
        }
6990
6991
      /* If a back reference name is duplicated, we generate a different
6992
      opcode to a numerical back reference. In the second pass we must
6993
      search for the index and count in the final name table. */
6994
6995
0
      count = 0;  /* Values for first pass (avoids compiler warning) */
6996
0
      index = 0;
6997
0
      if (lengthptr == NULL && !find_dupname_details(name, length, &index,
6998
0
            &count, errorcodeptr, cb)) return 0;
6999
7000
0
      if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
7001
0
      *code++ = ((options & PCRE2_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
7002
0
      PUT2INC(code, 0, index);
7003
0
      PUT2INC(code, 0, count);
7004
0
      if ((options & PCRE2_CASELESS) != 0)
7005
0
        *code++ = (((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)?
7006
0
                   REFI_FLAG_CASELESS_RESTRICT : 0) |
7007
0
                  (((xoptions & PCRE2_EXTRA_TURKISH_CASING) != 0)?
7008
0
                   REFI_FLAG_TURKISH_CASING : 0);
7009
0
      }
7010
0
    break;
7011
7012
7013
    /* ===================================================================*/
7014
    /* Handle a numerical callout. */
7015
7016
0
    case META_CALLOUT_NUMBER:
7017
0
    code[0] = OP_CALLOUT;
7018
0
    PUT(code, 1, pptr[1]);               /* Offset to next pattern item */
7019
0
    PUT(code, 1 + LINK_SIZE, pptr[2]);   /* Length of next pattern item */
7020
0
    code[1 + 2*LINK_SIZE] = pptr[3];
7021
0
    pptr += 3;
7022
0
    code += PRIV(OP_lengths)[OP_CALLOUT];
7023
0
    break;
7024
7025
7026
    /* ===================================================================*/
7027
    /* Handle a callout with a string argument. In the pre-pass we just compute
7028
    the length without generating anything. The length in pptr[3] includes both
7029
    delimiters; in the actual compile only the first one is copied, but a
7030
    terminating zero is added. Any doubled delimiters within the string make
7031
    this an overestimate, but it is not worth bothering about. */
7032
7033
0
    case META_CALLOUT_STRING:
7034
0
    if (lengthptr != NULL)
7035
0
      {
7036
0
      *lengthptr += pptr[3] + (1 + 4*LINK_SIZE);
7037
0
      pptr += 3;
7038
0
      SKIPOFFSET(pptr);
7039
0
      }
7040
7041
    /* In the real compile we can copy the string. The starting delimiter is
7042
     included so that the client can discover it if they want. We also pass the
7043
     start offset to help a script language give better error messages. */
7044
7045
0
    else
7046
0
      {
7047
0
      PCRE2_SPTR pp;
7048
0
      uint32_t delimiter;
7049
0
      uint32_t length = pptr[3];
7050
0
      PCRE2_UCHAR *callout_string = code + (1 + 4*LINK_SIZE);
7051
7052
0
      code[0] = OP_CALLOUT_STR;
7053
0
      PUT(code, 1, pptr[1]);               /* Offset to next pattern item */
7054
0
      PUT(code, 1 + LINK_SIZE, pptr[2]);   /* Length of next pattern item */
7055
7056
0
      pptr += 3;
7057
0
      GETPLUSOFFSET(offset, pptr);         /* Offset to string in pattern */
7058
0
      pp = cb->start_pattern + offset;
7059
0
      delimiter = *callout_string++ = *pp++;
7060
0
      if (delimiter == CHAR_LEFT_CURLY_BRACKET)
7061
0
        delimiter = CHAR_RIGHT_CURLY_BRACKET;
7062
0
      PUT(code, 1 + 3*LINK_SIZE, (int)(offset + 1));  /* One after delimiter */
7063
7064
      /* The syntax of the pattern was checked in the parsing scan. The length
7065
      includes both delimiters, but we have passed the opening one just above,
7066
      so we reduce length before testing it. The test is for > 1 because we do
7067
      not want to copy the final delimiter. This also ensures that pp[1] is
7068
      accessible. */
7069
7070
0
      while (--length > 1)
7071
0
        {
7072
0
        if (*pp == delimiter && pp[1] == delimiter)
7073
0
          {
7074
0
          *callout_string++ = delimiter;
7075
0
          pp += 2;
7076
0
          length--;
7077
0
          }
7078
0
        else *callout_string++ = *pp++;
7079
0
        }
7080
0
      *callout_string++ = CHAR_NUL;
7081
7082
      /* Set the length of the entire item, then advance to its end. */
7083
7084
0
      PUT(code, 1 + 2*LINK_SIZE, (int)(callout_string - code));
7085
0
      code = callout_string;
7086
0
      }
7087
0
    break;
7088
7089
7090
    /* ===================================================================*/
7091
    /* Handle repetition. The different types are all sorted out in the parsing
7092
    pass. */
7093
7094
0
    case META_MINMAX_PLUS:
7095
0
    case META_MINMAX_QUERY:
7096
4
    case META_MINMAX:
7097
4
    repeat_min = *(++pptr);
7098
4
    repeat_max = *(++pptr);
7099
4
    goto REPEAT;
7100
7101
9.82k
    case META_ASTERISK:
7102
10.1k
    case META_ASTERISK_PLUS:
7103
13.8k
    case META_ASTERISK_QUERY:
7104
13.8k
    repeat_min = 0;
7105
13.8k
    repeat_max = REPEAT_UNLIMITED;
7106
13.8k
    goto REPEAT;
7107
7108
14.7k
    case META_PLUS:
7109
15.7k
    case META_PLUS_PLUS:
7110
19.4k
    case META_PLUS_QUERY:
7111
19.4k
    repeat_min = 1;
7112
19.4k
    repeat_max = REPEAT_UNLIMITED;
7113
19.4k
    goto REPEAT;
7114
7115
46.0k
    case META_QUERY:
7116
47.4k
    case META_QUERY_PLUS:
7117
58.6k
    case META_QUERY_QUERY:
7118
58.6k
    repeat_min = 0;
7119
58.6k
    repeat_max = 1;
7120
7121
91.9k
    REPEAT:
7122
91.9k
    if (previous_matched_char && repeat_min > 0) matched_char = TRUE;
7123
7124
    /* Remember whether this is a variable length repeat, and default to
7125
    single-char opcodes. */
7126
7127
91.9k
    reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
7128
7129
    /* Adjust first and required code units for a zero repeat. */
7130
7131
91.9k
    if (repeat_min == 0)
7132
72.5k
      {
7133
72.5k
      firstcu = zerofirstcu;
7134
72.5k
      firstcuflags = zerofirstcuflags;
7135
72.5k
      reqcu = zeroreqcu;
7136
72.5k
      reqcuflags = zeroreqcuflags;
7137
72.5k
      }
7138
7139
    /* Note the greediness and possessiveness. */
7140
7141
91.9k
    switch (meta)
7142
91.9k
      {
7143
0
      case META_MINMAX_PLUS:
7144
312
      case META_ASTERISK_PLUS:
7145
1.25k
      case META_PLUS_PLUS:
7146
2.59k
      case META_QUERY_PLUS:
7147
2.59k
      repeat_type = 0;                  /* Force greedy */
7148
2.59k
      possessive_quantifier = TRUE;
7149
2.59k
      break;
7150
7151
0
      case META_MINMAX_QUERY:
7152
3.69k
      case META_ASTERISK_QUERY:
7153
7.37k
      case META_PLUS_QUERY:
7154
18.6k
      case META_QUERY_QUERY:
7155
18.6k
      repeat_type = greedy_non_default;
7156
18.6k
      possessive_quantifier = FALSE;
7157
18.6k
      break;
7158
7159
70.7k
      default:
7160
70.7k
      repeat_type = greedy_default;
7161
70.7k
      possessive_quantifier = FALSE;
7162
70.7k
      break;
7163
91.9k
      }
7164
7165
    /* Save start of previous item, in case we have to move it up in order to
7166
    insert something before it, and remember what it was. */
7167
7168
91.9k
    PCRE2_ASSERT(previous != NULL);
7169
91.9k
    tempcode = previous;
7170
91.9k
    op_previous = *previous;
7171
7172
    /* Now handle repetition for the different types of item. If the repeat
7173
    minimum and the repeat maximum are both 1, we can ignore the quantifier for
7174
    non-parenthesized items, as they have only one alternative. For anything in
7175
    parentheses, we must not ignore if {1} is possessive. */
7176
7177
91.9k
    switch (op_previous)
7178
91.9k
      {
7179
      /* If previous was a character or negated character match, abolish the
7180
      item and generate a repeat item instead. If a char item has a minimum of
7181
      more than one, ensure that it is set in reqcu - it might not be if a
7182
      sequence such as x{3} is the first thing in a branch because the x will
7183
      have gone into firstcu instead.  */
7184
7185
35.0k
      case OP_CHAR:
7186
51.9k
      case OP_CHARI:
7187
55.0k
      case OP_NOT:
7188
56.7k
      case OP_NOTI:
7189
56.7k
      if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7190
56.7k
      op_type = chartypeoffset[op_previous - OP_CHAR];
7191
7192
      /* Deal with UTF characters that take up more than one code unit. */
7193
7194
56.7k
#ifdef MAYBE_UTF_MULTI
7195
56.7k
      if (utf && NOT_FIRSTCU(code[-1]))
7196
14
        {
7197
14
        PCRE2_UCHAR *lastchar = code - 1;
7198
14
        BACKCHAR(lastchar);
7199
14
        mclength = (uint32_t)(code - lastchar);   /* Length of UTF character */
7200
14
        memcpy(mcbuffer, lastchar, CU2BYTES(mclength));  /* Save the char */
7201
14
        }
7202
56.6k
      else
7203
56.6k
#endif  /* MAYBE_UTF_MULTI */
7204
7205
      /* Handle the case of a single code unit - either with no UTF support, or
7206
      with UTF disabled, or for a single-code-unit UTF character. In the latter
7207
      case, for a repeated positive match, get the caseless flag for the
7208
      required code unit from the previous character, because a class like [Aa]
7209
      sets a caseless A but by now the req_caseopt flag has been reset. */
7210
7211
56.6k
        {
7212
56.6k
        mcbuffer[0] = code[-1];
7213
56.6k
        mclength = 1;
7214
56.6k
        if (op_previous <= OP_CHARI && repeat_min > 1)
7215
0
          {
7216
0
          reqcu = mcbuffer[0];
7217
0
          reqcuflags = cb->req_varyopt;
7218
0
          if (op_previous == OP_CHARI) reqcuflags |= REQ_CASELESS;
7219
0
          }
7220
56.6k
        }
7221
56.7k
      goto OUTPUT_SINGLE_REPEAT;  /* Code shared with single character types */
7222
7223
      /* If previous was a character class or a back reference, we put the
7224
      repeat stuff after it, but just skip the item if the repeat was {0,0}. */
7225
7226
0
#ifdef SUPPORT_WIDE_CHARS
7227
1.44k
      case OP_XCLASS:
7228
1.44k
      case OP_ECLASS:
7229
1.44k
#endif
7230
10.4k
      case OP_CLASS:
7231
14.1k
      case OP_NCLASS:
7232
14.1k
      case OP_REF:
7233
14.2k
      case OP_REFI:
7234
14.2k
      case OP_DNREF:
7235
14.2k
      case OP_DNREFI:
7236
7237
14.2k
      if (repeat_max == 0)
7238
0
        {
7239
0
        code = previous;
7240
0
        goto END_REPEAT;
7241
0
        }
7242
14.2k
      if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7243
7244
14.2k
      if (repeat_min == 0 && repeat_max == REPEAT_UNLIMITED)
7245
4.92k
        *code++ = OP_CRSTAR + repeat_type;
7246
9.28k
      else if (repeat_min == 1 && repeat_max == REPEAT_UNLIMITED)
7247
2.02k
        *code++ = OP_CRPLUS + repeat_type;
7248
7.26k
      else if (repeat_min == 0 && repeat_max == 1)
7249
7.26k
        *code++ = OP_CRQUERY + repeat_type;
7250
0
      else
7251
0
        {
7252
0
        *code++ = OP_CRRANGE + repeat_type;
7253
0
        PUT2INC(code, 0, repeat_min);
7254
0
        if (repeat_max == REPEAT_UNLIMITED) repeat_max = 0;  /* 2-byte encoding for max */
7255
0
        PUT2INC(code, 0, repeat_max);
7256
0
        }
7257
14.2k
      break;
7258
7259
      /* Prior to 10.30, repeated recursions were wrapped in OP_ONCE brackets
7260
      because pcre2_match() could not handle backtracking into recursively
7261
      called groups. Now that this backtracking is available, we no longer need
7262
      to do this. However, we still need to replicate recursions as we do for
7263
      groups so as to have independent backtracking points. We can replicate
7264
      for the minimum number of repeats directly. For optional repeats we now
7265
      wrap the recursion in OP_BRA brackets and make use of the bracket
7266
      repetition. */
7267
7268
0
      case OP_RECURSE:
7269
0
      if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
7270
0
        goto END_REPEAT;
7271
7272
      /* Generate unwrapped repeats for a non-zero minimum, except when the
7273
      minimum is 1 and the maximum unlimited, because that can be handled with
7274
      OP_BRA terminated by OP_KETRMAX/MIN. When the maximum is equal to the
7275
      minimum, we just need to generate the appropriate additional copies.
7276
      Otherwise we need to generate one more, to simulate the situation when
7277
      the minimum is zero. */
7278
7279
0
      if (repeat_min > 0 && (repeat_min != 1 || repeat_max != REPEAT_UNLIMITED))
7280
0
        {
7281
0
        int replicate = repeat_min;
7282
0
        if (repeat_min == repeat_max) replicate--;
7283
7284
        /* In the pre-compile phase, we don't actually do the replication. We
7285
        just adjust the length as if we had. Do some paranoid checks for
7286
        potential integer overflow. */
7287
7288
0
        if (lengthptr != NULL)
7289
0
          {
7290
0
          PCRE2_SIZE delta;
7291
0
          if (PRIV(ckd_smul)(&delta, replicate, 1 + LINK_SIZE) ||
7292
0
              OFLOW_MAX - *lengthptr < delta)
7293
0
            {
7294
0
            *errorcodeptr = ERR20;
7295
0
            return 0;
7296
0
            }
7297
0
          *lengthptr += delta;
7298
0
          }
7299
7300
0
        else for (int i = 0; i < replicate; i++)
7301
0
          {
7302
0
          memcpy(code, previous, CU2BYTES(1 + LINK_SIZE));
7303
0
          previous = code;
7304
0
          code += 1 + LINK_SIZE;
7305
0
          }
7306
7307
        /* If the number of repeats is fixed, we are done. Otherwise, adjust
7308
        the counts and fall through. */
7309
7310
0
        if (repeat_min == repeat_max) break;
7311
0
        if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7312
0
        repeat_min = 0;
7313
0
        }
7314
7315
      /* Wrap the recursion call in OP_BRA brackets. */
7316
7317
0
      (void)memmove(previous + 1 + LINK_SIZE, previous, CU2BYTES(1 + LINK_SIZE));
7318
0
      op_previous = *previous = OP_BRA;
7319
0
      PUT(previous, 1, 2 + 2*LINK_SIZE);
7320
0
      previous[2 + 2*LINK_SIZE] = OP_KET;
7321
0
      PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
7322
0
      code += 2 + 2 * LINK_SIZE;
7323
0
      length_prevgroup = 3 + 3*LINK_SIZE;
7324
0
      group_return = -1;  /* Set "may match empty string" */
7325
7326
      /* Now treat as a repeated OP_BRA. */
7327
      /* Fall through */
7328
7329
      /* If previous was a bracket group, we may have to replicate it in
7330
      certain cases. Note that at this point we can encounter only the "basic"
7331
      bracket opcodes such as BRA and CBRA, as this is the place where they get
7332
      converted into the more special varieties such as BRAPOS and SBRA.
7333
      Originally, PCRE did not allow repetition of assertions, but now it does,
7334
      for Perl compatibility. */
7335
7336
4
      case OP_ASSERT:
7337
4
      case OP_ASSERT_NOT:
7338
4
      case OP_ASSERT_NA:
7339
82
      case OP_ASSERTBACK:
7340
82
      case OP_ASSERTBACK_NOT:
7341
82
      case OP_ASSERTBACK_NA:
7342
82
      case OP_ASSERT_SCS:
7343
82
      case OP_ONCE:
7344
82
      case OP_SCRIPT_RUN:
7345
158
      case OP_BRA:
7346
1.44k
      case OP_CBRA:
7347
1.44k
      case OP_COND:
7348
1.44k
        {
7349
1.44k
        int len = (int)(code - previous);
7350
1.44k
        PCRE2_UCHAR *bralink = NULL;
7351
1.44k
        PCRE2_UCHAR *brazeroptr = NULL;
7352
7353
1.44k
        if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
7354
0
          goto END_REPEAT;
7355
7356
        /* Repeating a DEFINE group (or any group where the condition is always
7357
        FALSE and there is only one branch) is pointless, but Perl allows the
7358
        syntax, so we just ignore the repeat. */
7359
7360
1.44k
        if (op_previous == OP_COND && previous[LINK_SIZE+1] == OP_FALSE &&
7361
1.44k
            previous[GET(previous, 1)] != OP_ALT)
7362
0
          goto END_REPEAT;
7363
7364
        /* Perl allows all assertions to be quantified, and when they contain
7365
        capturing parentheses and/or are optional there are potential uses for
7366
        this feature. PCRE2 used to force the maximum quantifier to 1 on the
7367
        invalid grounds that further repetition was never useful. This was
7368
        always a bit pointless, since an assertion could be wrapped with a
7369
        repeated group to achieve the effect. General repetition is now
7370
        permitted, but if the maximum is unlimited it is set to one more than
7371
        the minimum. */
7372
7373
1.44k
        if (op_previous < OP_ONCE)    /* Assertion */
7374
82
          {
7375
82
          if (repeat_max == REPEAT_UNLIMITED) repeat_max = repeat_min + 1;
7376
82
          }
7377
7378
        /* The case of a zero minimum is special because of the need to stick
7379
        OP_BRAZERO in front of it, and because the group appears once in the
7380
        data, whereas in other cases it appears the minimum number of times. For
7381
        this reason, it is simplest to treat this case separately, as otherwise
7382
        the code gets far too messy. There are several special subcases when the
7383
        minimum is zero. */
7384
7385
1.44k
        if (repeat_min == 0)
7386
50
          {
7387
          /* If the maximum is also zero, we used to just omit the group from
7388
          the output altogether, like this:
7389
7390
          ** if (repeat_max == 0)
7391
          **   {
7392
          **   code = previous;
7393
          **   goto END_REPEAT;
7394
          **   }
7395
7396
          However, that fails when a group or a subgroup within it is
7397
          referenced as a subroutine from elsewhere in the pattern, so now we
7398
          stick in OP_SKIPZERO in front of it so that it is skipped on
7399
          execution. As we don't have a list of which groups are referenced, we
7400
          cannot do this selectively.
7401
7402
          If the maximum is 1 or unlimited, we just have to stick in the
7403
          BRAZERO and do no more at this point. */
7404
7405
50
          if (repeat_max <= 1 || repeat_max == REPEAT_UNLIMITED)
7406
50
            {
7407
50
            (void)memmove(previous + 1, previous, CU2BYTES(len));
7408
50
            code++;
7409
50
            if (repeat_max == 0)
7410
0
              {
7411
0
              *previous++ = OP_SKIPZERO;
7412
0
              goto END_REPEAT;
7413
0
              }
7414
50
            brazeroptr = previous;    /* Save for possessive optimizing */
7415
50
            *previous++ = OP_BRAZERO + repeat_type;
7416
50
            }
7417
7418
          /* If the maximum is greater than 1 and limited, we have to replicate
7419
          in a nested fashion, sticking OP_BRAZERO before each set of brackets.
7420
          The first one has to be handled carefully because it's the original
7421
          copy, which has to be moved up. The remainder can be handled by code
7422
          that is common with the non-zero minimum case below. We have to
7423
          adjust the value of repeat_max, since one less copy is required. */
7424
7425
0
          else
7426
0
            {
7427
0
            int linkoffset;
7428
0
            (void)memmove(previous + 2 + LINK_SIZE, previous, CU2BYTES(len));
7429
0
            code += 2 + LINK_SIZE;
7430
0
            *previous++ = OP_BRAZERO + repeat_type;
7431
0
            *previous++ = OP_BRA;
7432
7433
            /* We chain together the bracket link offset fields that have to be
7434
            filled in later when the ends of the brackets are reached. */
7435
7436
0
            linkoffset = (bralink == NULL)? 0 : (int)(previous - bralink);
7437
0
            bralink = previous;
7438
0
            PUTINC(previous, 0, linkoffset);
7439
0
            }
7440
7441
50
          if (repeat_max != REPEAT_UNLIMITED) repeat_max--;
7442
50
          }
7443
7444
        /* If the minimum is greater than zero, replicate the group as many
7445
        times as necessary, and adjust the maximum to the number of subsequent
7446
        copies that we need. */
7447
7448
1.39k
        else
7449
1.39k
          {
7450
1.39k
          if (repeat_min > 1)
7451
0
            {
7452
            /* In the pre-compile phase, we don't actually do the replication.
7453
            We just adjust the length as if we had. Do some paranoid checks for
7454
            potential integer overflow. */
7455
7456
0
            if (lengthptr != NULL)
7457
0
              {
7458
0
              PCRE2_SIZE delta;
7459
0
              if (PRIV(ckd_smul)(&delta, repeat_min - 1,
7460
0
                                 (int)length_prevgroup) ||
7461
0
                  OFLOW_MAX - *lengthptr < delta)
7462
0
                {
7463
0
                *errorcodeptr = ERR20;
7464
0
                return 0;
7465
0
                }
7466
0
              *lengthptr += delta;
7467
0
              }
7468
7469
            /* This is compiling for real. If there is a set first code unit
7470
            for the group, and we have not yet set a "required code unit", set
7471
            it. */
7472
7473
0
            else
7474
0
              {
7475
0
              if (groupsetfirstcu && reqcuflags >= REQ_NONE)
7476
0
                {
7477
0
                reqcu = firstcu;
7478
0
                reqcuflags = firstcuflags;
7479
0
                }
7480
0
              for (uint32_t i = 1; i < repeat_min; i++)
7481
0
                {
7482
0
                memcpy(code, previous, CU2BYTES(len));
7483
0
                code += len;
7484
0
                }
7485
0
              }
7486
0
            }
7487
7488
1.39k
          if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7489
1.39k
          }
7490
7491
        /* This code is common to both the zero and non-zero minimum cases. If
7492
        the maximum is limited, it replicates the group in a nested fashion,
7493
        remembering the bracket starts on a stack. In the case of a zero
7494
        minimum, the first one was set up above. In all cases the repeat_max
7495
        now specifies the number of additional copies needed. Again, we must
7496
        remember to replicate entries on the forward reference list. */
7497
7498
1.44k
        if (repeat_max != REPEAT_UNLIMITED)
7499
132
          {
7500
          /* In the pre-compile phase, we don't actually do the replication. We
7501
          just adjust the length as if we had. For each repetition we must add
7502
          1 to the length for BRAZERO and for all but the last repetition we
7503
          must add 2 + 2*LINKSIZE to allow for the nesting that occurs. Do some
7504
          paranoid checks to avoid integer overflow. */
7505
7506
132
          if (lengthptr != NULL && repeat_max > 0)
7507
41
            {
7508
41
            PCRE2_SIZE delta;
7509
41
            if (PRIV(ckd_smul)(&delta, repeat_max,
7510
41
                               (int)length_prevgroup + 1 + 2 + 2*LINK_SIZE) ||
7511
41
                OFLOW_MAX + (2 + 2*LINK_SIZE) - *lengthptr < delta)
7512
0
              {
7513
0
              *errorcodeptr = ERR20;
7514
0
              return 0;
7515
0
              }
7516
41
            delta -= (2 + 2*LINK_SIZE);   /* Last one doesn't nest */
7517
41
            *lengthptr += delta;
7518
41
            }
7519
7520
          /* This is compiling for real */
7521
7522
132
          else for (uint32_t i = repeat_max; i >= 1; i--)
7523
41
            {
7524
41
            *code++ = OP_BRAZERO + repeat_type;
7525
7526
            /* All but the final copy start a new nesting, maintaining the
7527
            chain of brackets outstanding. */
7528
7529
41
            if (i != 1)
7530
0
              {
7531
0
              int linkoffset;
7532
0
              *code++ = OP_BRA;
7533
0
              linkoffset = (bralink == NULL)? 0 : (int)(code - bralink);
7534
0
              bralink = code;
7535
0
              PUTINC(code, 0, linkoffset);
7536
0
              }
7537
7538
41
            memcpy(code, previous, CU2BYTES(len));
7539
41
            code += len;
7540
41
            }
7541
7542
          /* Now chain through the pending brackets, and fill in their length
7543
          fields (which are holding the chain links pro tem). */
7544
7545
132
          while (bralink != NULL)
7546
0
            {
7547
0
            int oldlinkoffset;
7548
0
            int linkoffset = (int)(code - bralink + 1);
7549
0
            PCRE2_UCHAR *bra = code - linkoffset;
7550
0
            oldlinkoffset = GET(bra, 1);
7551
0
            bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
7552
0
            *code++ = OP_KET;
7553
0
            PUTINC(code, 0, linkoffset);
7554
0
            PUT(bra, 1, linkoffset);
7555
0
            }
7556
132
          }
7557
7558
        /* If the maximum is unlimited, set a repeater in the final copy. For
7559
        SCRIPT_RUN and ONCE brackets, that's all we need to do. However,
7560
        possessively repeated ONCE brackets can be converted into non-capturing
7561
        brackets, as the behaviour of (?:xx)++ is the same as (?>xx)++ and this
7562
        saves having to deal with possessive ONCEs specially.
7563
7564
        Otherwise, when we are doing the actual compile phase, check to see
7565
        whether this group is one that could match an empty string. If so,
7566
        convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
7567
        that runtime checking can be done. [This check is also applied to ONCE
7568
        and SCRIPT_RUN groups at runtime, but in a different way.]
7569
7570
        Then, if the quantifier was possessive and the bracket is not a
7571
        conditional, we convert the BRA code to the POS form, and the KET code
7572
        to KETRPOS. (It turns out to be convenient at runtime to detect this
7573
        kind of subpattern at both the start and at the end.) The use of
7574
        special opcodes makes it possible to reduce greatly the stack usage in
7575
        pcre2_match(). If the group is preceded by OP_BRAZERO, convert this to
7576
        OP_BRAPOSZERO.
7577
7578
        Then, if the minimum number of matches is 1 or 0, cancel the possessive
7579
        flag so that the default action below, of wrapping everything inside
7580
        atomic brackets, does not happen. When the minimum is greater than 1,
7581
        there will be earlier copies of the group, and so we still have to wrap
7582
        the whole thing. */
7583
7584
1.31k
        else
7585
1.31k
          {
7586
1.31k
          PCRE2_UCHAR *ketcode = code - 1 - LINK_SIZE;
7587
1.31k
          PCRE2_UCHAR *bracode = ketcode - GET(ketcode, 1);
7588
7589
          /* Convert possessive ONCE brackets to non-capturing */
7590
7591
1.31k
          if (*bracode == OP_ONCE && possessive_quantifier) *bracode = OP_BRA;
7592
7593
          /* For non-possessive ONCE and for SCRIPT_RUN brackets, all we need
7594
          to do is to set the KET. */
7595
7596
1.31k
          if (*bracode == OP_ONCE || *bracode == OP_SCRIPT_RUN)
7597
0
            *ketcode = OP_KETRMAX + repeat_type;
7598
7599
          /* Handle non-SCRIPT_RUN and non-ONCE brackets and possessive ONCEs
7600
          (which have been converted to non-capturing above). */
7601
7602
1.31k
          else
7603
1.31k
            {
7604
            /* In the compile phase, adjust the opcode if the group can match
7605
            an empty string. For a conditional group with only one branch, the
7606
            value of group_return will not show "could be empty", so we must
7607
            check that separately. */
7608
7609
1.31k
            if (lengthptr == NULL)
7610
608
              {
7611
608
              if (group_return < 0) *bracode += OP_SBRA - OP_BRA;
7612
608
              if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
7613
0
                *bracode = OP_SCOND;
7614
608
              }
7615
7616
            /* Handle possessive quantifiers. */
7617
7618
1.31k
            if (possessive_quantifier)
7619
794
              {
7620
              /* For COND brackets, we wrap the whole thing in a possessively
7621
              repeated non-capturing bracket, because we have not invented POS
7622
              versions of the COND opcodes. */
7623
7624
794
              if (*bracode == OP_COND || *bracode == OP_SCOND)
7625
0
                {
7626
0
                int nlen = (int)(code - bracode);
7627
0
                (void)memmove(bracode + 1 + LINK_SIZE, bracode, CU2BYTES(nlen));
7628
0
                code += 1 + LINK_SIZE;
7629
0
                nlen += 1 + LINK_SIZE;
7630
0
                *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
7631
0
                *code++ = OP_KETRPOS;
7632
0
                PUTINC(code, 0, nlen);
7633
0
                PUT(bracode, 1, nlen);
7634
0
                }
7635
7636
              /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
7637
7638
794
              else
7639
794
                {
7640
794
                *bracode += 1;              /* Switch to xxxPOS opcodes */
7641
794
                *ketcode = OP_KETRPOS;
7642
794
                }
7643
7644
              /* If the minimum is zero, mark it as possessive, then unset the
7645
              possessive flag when the minimum is 0 or 1. */
7646
7647
794
              if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
7648
794
              if (repeat_min < 2) possessive_quantifier = FALSE;
7649
794
              }
7650
7651
            /* Non-possessive quantifier */
7652
7653
523
            else *ketcode = OP_KETRMAX + repeat_type;
7654
1.31k
            }
7655
1.31k
          }
7656
1.44k
        }
7657
1.44k
      break;
7658
7659
      /* If previous was a character type match (\d or similar), abolish it and
7660
      create a suitable repeat item. The code is shared with single-character
7661
      repeats by setting op_type to add a suitable offset into repeat_type.
7662
      Note that the Unicode property types will be present only when
7663
      SUPPORT_UNICODE is defined, but we don't wrap the little bits of code
7664
      here because it just makes it horribly messy. */
7665
7666
19.5k
      default:
7667
19.5k
      if (op_previous >= OP_EODN || op_previous <= OP_WORD_BOUNDARY)
7668
0
        {
7669
0
        PCRE2_DEBUG_UNREACHABLE();
7670
0
        *errorcodeptr = ERR10;  /* Not a character type - internal error */
7671
0
        return 0;
7672
0
        }
7673
19.5k
      else
7674
19.5k
        {
7675
19.5k
        int prop_type, prop_value;
7676
19.5k
        PCRE2_UCHAR *oldcode;
7677
7678
19.5k
        if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7679
7680
19.5k
        op_type = OP_TYPESTAR - OP_STAR;      /* Use type opcodes */
7681
19.5k
        mclength = 0;                         /* Not a character */
7682
7683
19.5k
        if (op_previous == OP_PROP || op_previous == OP_NOTPROP)
7684
1.17k
          {
7685
1.17k
          prop_type = previous[1];
7686
1.17k
          prop_value = previous[2];
7687
1.17k
          }
7688
18.4k
        else
7689
18.4k
          {
7690
          /* Come here from just above with a character in mcbuffer/mclength.
7691
          You must also set op_type before the jump. */
7692
75.1k
          OUTPUT_SINGLE_REPEAT:
7693
75.1k
          prop_type = prop_value = -1;
7694
75.1k
          }
7695
7696
        /* At this point, if prop_type == prop_value == -1 we either have a
7697
        character in mcbuffer when mclength is greater than zero, or we have
7698
        mclength zero, in which case there is a non-property character type in
7699
        op_previous. If prop_type/value are not negative, we have a property
7700
        character type in op_previous. */
7701
7702
76.2k
        oldcode = code;                   /* Save where we were */
7703
76.2k
        code = previous;                  /* Usually overwrite previous item */
7704
7705
        /* If the maximum is zero then the minimum must also be zero; Perl allows
7706
        this case, so we do too - by simply omitting the item altogether. */
7707
7708
76.2k
        if (repeat_max == 0) goto END_REPEAT;
7709
7710
        /* Combine the op_type with the repeat_type */
7711
7712
76.2k
        repeat_type += op_type;
7713
7714
        /* A minimum of zero is handled either as the special case * or ?, or as
7715
        an UPTO, with the maximum given. */
7716
7717
76.2k
        if (repeat_min == 0)
7718
60.3k
          {
7719
60.3k
          if (repeat_max == REPEAT_UNLIMITED) *code++ = OP_STAR + repeat_type;
7720
51.3k
            else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
7721
0
          else
7722
0
            {
7723
0
            *code++ = OP_UPTO + repeat_type;
7724
0
            PUT2INC(code, 0, repeat_max);
7725
0
            }
7726
60.3k
          }
7727
7728
        /* A repeat minimum of 1 is optimized into some special cases. If the
7729
        maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
7730
        left in place and, if the maximum is greater than 1, we use OP_UPTO with
7731
        one less than the maximum. */
7732
7733
15.9k
        else if (repeat_min == 1)
7734
15.9k
          {
7735
15.9k
          if (repeat_max == REPEAT_UNLIMITED)
7736
15.9k
            *code++ = OP_PLUS + repeat_type;
7737
0
          else
7738
0
            {
7739
0
            code = oldcode;  /* Leave previous item in place */
7740
0
            if (repeat_max == 1) goto END_REPEAT;
7741
0
            *code++ = OP_UPTO + repeat_type;
7742
0
            PUT2INC(code, 0, repeat_max - 1);
7743
0
            }
7744
15.9k
          }
7745
7746
        /* The case {n,n} is just an EXACT, while the general case {n,m} is
7747
        handled as an EXACT followed by an UPTO or STAR or QUERY. */
7748
7749
0
        else
7750
0
          {
7751
0
          *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
7752
0
          PUT2INC(code, 0, repeat_min);
7753
7754
          /* Unless repeat_max equals repeat_min, fill in the data for EXACT,
7755
          and then generate the second opcode. For a repeated Unicode property
7756
          match, there are two extra values that define the required property,
7757
          and mclength is set zero to indicate this. */
7758
7759
0
          if (repeat_max != repeat_min)
7760
0
            {
7761
0
            if (mclength > 0)
7762
0
              {
7763
0
              memcpy(code, mcbuffer, CU2BYTES(mclength));
7764
0
              code += mclength;
7765
0
              }
7766
0
            else
7767
0
              {
7768
0
              *code++ = op_previous;
7769
0
              if (prop_type >= 0)
7770
0
                {
7771
0
                *code++ = prop_type;
7772
0
                *code++ = prop_value;
7773
0
                }
7774
0
              }
7775
7776
            /* Now set up the following opcode */
7777
7778
0
            if (repeat_max == REPEAT_UNLIMITED)
7779
0
              *code++ = OP_STAR + repeat_type;
7780
0
            else
7781
0
              {
7782
0
              repeat_max -= repeat_min;
7783
0
              if (repeat_max == 1)
7784
0
                {
7785
0
                *code++ = OP_QUERY + repeat_type;
7786
0
                }
7787
0
              else
7788
0
                {
7789
0
                *code++ = OP_UPTO + repeat_type;
7790
0
                PUT2INC(code, 0, repeat_max);
7791
0
                }
7792
0
              }
7793
0
            }
7794
0
          }
7795
7796
        /* Fill in the character or character type for the final opcode. */
7797
7798
76.2k
        if (mclength > 0)
7799
56.6k
          {
7800
56.6k
          memcpy(code, mcbuffer, CU2BYTES(mclength));
7801
56.6k
          code += mclength;
7802
56.6k
          }
7803
19.5k
        else
7804
19.5k
          {
7805
19.5k
          *code++ = op_previous;
7806
19.5k
          if (prop_type >= 0)
7807
1.17k
            {
7808
1.17k
            *code++ = prop_type;
7809
1.17k
            *code++ = prop_value;
7810
1.17k
            }
7811
19.5k
          }
7812
76.2k
        }
7813
76.2k
      break;
7814
91.9k
      }  /* End of switch on different op_previous values */
7815
7816
7817
    /* If the character following a repeat is '+', possessive_quantifier is
7818
    TRUE. For some opcodes, there are special alternative opcodes for this
7819
    case. For anything else, we wrap the entire repeated item inside OP_ONCE
7820
    brackets. Logically, the '+' notation is just syntactic sugar, taken from
7821
    Sun's Java package, but the special opcodes can optimize it.
7822
7823
    Some (but not all) possessively repeated subpatterns have already been
7824
    completely handled in the code just above. For them, possessive_quantifier
7825
    is always FALSE at this stage. Note that the repeated item starts at
7826
    tempcode, not at previous, which might be the first part of a string whose
7827
    (former) last char we repeated. */
7828
7829
91.9k
    if (possessive_quantifier)
7830
1.80k
      {
7831
1.80k
      int len;
7832
7833
      /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
7834
      However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
7835
      {5,}, or {5,10}). We skip over an EXACT item; if the length of what
7836
      remains is greater than zero, there's a further opcode that can be
7837
      handled. If not, do nothing, leaving the EXACT alone. */
7838
7839
1.80k
      switch(*tempcode)
7840
1.80k
        {
7841
0
        case OP_TYPEEXACT:
7842
0
        tempcode += PRIV(OP_lengths)[*tempcode] +
7843
0
          ((tempcode[1 + IMM2_SIZE] == OP_PROP
7844
0
          || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
7845
0
        break;
7846
7847
        /* CHAR opcodes are used for exacts whose count is 1. */
7848
7849
0
        case OP_CHAR:
7850
0
        case OP_CHARI:
7851
0
        case OP_NOT:
7852
0
        case OP_NOTI:
7853
0
        case OP_EXACT:
7854
0
        case OP_EXACTI:
7855
0
        case OP_NOTEXACT:
7856
0
        case OP_NOTEXACTI:
7857
0
        tempcode += PRIV(OP_lengths)[*tempcode];
7858
0
#ifdef SUPPORT_UNICODE
7859
0
        if (utf && HAS_EXTRALEN(tempcode[-1]))
7860
0
          tempcode += GET_EXTRALEN(tempcode[-1]);
7861
0
#endif
7862
0
        break;
7863
7864
        /* For the class opcodes, the repeat operator appears at the end;
7865
        adjust tempcode to point to it. */
7866
7867
194
        case OP_CLASS:
7868
198
        case OP_NCLASS:
7869
198
        tempcode += 1 + 32/sizeof(PCRE2_UCHAR);
7870
198
        break;
7871
7872
0
#ifdef SUPPORT_WIDE_CHARS
7873
0
        case OP_XCLASS:
7874
0
        case OP_ECLASS:
7875
0
        tempcode += GET(tempcode, 1);
7876
0
        break;
7877
1.80k
#endif
7878
1.80k
        }
7879
7880
      /* If tempcode is equal to code (which points to the end of the repeated
7881
      item), it means we have skipped an EXACT item but there is no following
7882
      QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
7883
      all other cases, tempcode will be pointing to the repeat opcode, and will
7884
      be less than code, so the value of len will be greater than 0. */
7885
7886
1.80k
      len = (int)(code - tempcode);
7887
1.80k
      if (len > 0)
7888
1.80k
        {
7889
1.80k
        unsigned int repcode = *tempcode;
7890
7891
        /* There is a table for possessifying opcodes, all of which are less
7892
        than OP_CALLOUT. A zero entry means there is no possessified version.
7893
        */
7894
7895
1.80k
        if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
7896
1.80k
          *tempcode = opcode_possessify[repcode];
7897
7898
        /* For opcode without a special possessified version, wrap the item in
7899
        ONCE brackets. */
7900
7901
2
        else
7902
2
          {
7903
2
          (void)memmove(tempcode + 1 + LINK_SIZE, tempcode, CU2BYTES(len));
7904
2
          code += 1 + LINK_SIZE;
7905
2
          len += 1 + LINK_SIZE;
7906
2
          tempcode[0] = OP_ONCE;
7907
2
          *code++ = OP_KET;
7908
2
          PUTINC(code, 0, len);
7909
2
          PUT(tempcode, 1, len);
7910
2
          }
7911
1.80k
        }
7912
1.80k
      }
7913
7914
    /* We set the "follows varying string" flag for subsequently encountered
7915
    reqcus if it isn't already set and we have just passed a varying length
7916
    item. */
7917
7918
91.9k
    END_REPEAT:
7919
91.9k
    cb->req_varyopt |= reqvary;
7920
91.9k
    break;
7921
7922
7923
    /* ===================================================================*/
7924
    /* Handle a 32-bit data character with a value greater than META_END. */
7925
7926
0
    case META_BIGVALUE:
7927
0
    pptr++;
7928
0
    goto NORMAL_CHAR;
7929
7930
7931
    /* ===============================================================*/
7932
    /* Handle a back reference by number, which is the meta argument. The
7933
    pattern offsets for back references to group numbers less than 10 are held
7934
    in a special vector, to avoid using more than two parsed pattern elements
7935
    in 64-bit environments. We only need the offset to the first occurrence,
7936
    because if that doesn't fail, subsequent ones will also be OK. */
7937
7938
1.56k
    case META_BACKREF:
7939
1.56k
    if (meta_arg < 10) offset = cb->small_ref_offset[meta_arg];
7940
142
      else GETPLUSOFFSET(offset, pptr);
7941
7942
1.56k
    if (meta_arg > cb->bracount)
7943
100
      {
7944
100
      cb->erroroffset = offset;
7945
100
      *errorcodeptr = ERR15;  /* Non-existent subpattern */
7946
100
      return 0;
7947
100
      }
7948
7949
    /* Come here from named backref handling when the reference is to a
7950
    single group (that is, not to a duplicated name). The back reference
7951
    data will have already been updated. We must disable firstcu if not
7952
    set, to cope with cases like (?=(\w+))\1: which would otherwise set ':'
7953
    later. */
7954
7955
1.46k
    HANDLE_SINGLE_REFERENCE:
7956
1.46k
    if (firstcuflags == REQ_UNSET) zerofirstcuflags = firstcuflags = REQ_NONE;
7957
1.46k
    *code++ = ((options & PCRE2_CASELESS) != 0)? OP_REFI : OP_REF;
7958
1.46k
    PUT2INC(code, 0, meta_arg);
7959
1.46k
    if ((options & PCRE2_CASELESS) != 0)
7960
72
      *code++ = (((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)?
7961
72
                 REFI_FLAG_CASELESS_RESTRICT : 0) |
7962
72
                (((xoptions & PCRE2_EXTRA_TURKISH_CASING) != 0)?
7963
72
                 REFI_FLAG_TURKISH_CASING : 0);
7964
7965
    /* Update the map of back references, and keep the highest one. We
7966
    could do this in parse_regex() for numerical back references, but not
7967
    for named back references, because we don't know the numbers to which
7968
    named back references refer. So we do it all in this function. */
7969
7970
1.46k
    cb->backref_map |= (meta_arg < 32)? (1u << meta_arg) : 1;
7971
1.46k
    if (meta_arg > cb->top_backref) cb->top_backref = meta_arg;
7972
1.46k
    break;
7973
7974
7975
    /* ===============================================================*/
7976
    /* Handle recursion by inserting the number of the called group (which is
7977
    the meta argument) after OP_RECURSE. At the end of compiling the pattern is
7978
    scanned and these numbers are replaced by offsets within the pattern. It is
7979
    done like this to avoid problems with forward references and adjusting
7980
    offsets when groups are duplicated and moved (as discovered in previous
7981
    implementations). Note that a recursion does not have a set first
7982
    character. */
7983
7984
1.08k
    case META_RECURSE:
7985
1.08k
    GETPLUSOFFSET(offset, pptr);
7986
1.08k
    if (meta_arg > cb->bracount)
7987
2
      {
7988
2
      cb->erroroffset = offset;
7989
2
      *errorcodeptr = ERR15;  /* Non-existent subpattern */
7990
2
      return 0;
7991
2
      }
7992
1.08k
    HANDLE_NUMERICAL_RECURSION:
7993
1.08k
    *code = OP_RECURSE;
7994
1.08k
    PUT(code, 1, meta_arg);
7995
1.08k
    code += 1 + LINK_SIZE;
7996
1.08k
    groupsetfirstcu = FALSE;
7997
1.08k
    cb->had_recurse = TRUE;
7998
1.08k
    if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
7999
1.08k
    zerofirstcu = firstcu;
8000
1.08k
    zerofirstcuflags = firstcuflags;
8001
1.08k
    break;
8002
8003
8004
    /* ===============================================================*/
8005
    /* Handle capturing parentheses; the number is the meta argument. */
8006
8007
5.23k
    case META_CAPTURE:
8008
5.23k
    bravalue = OP_CBRA;
8009
5.23k
    skipunits = IMM2_SIZE;
8010
5.23k
    PUT2(code, 1+LINK_SIZE, meta_arg);
8011
5.23k
    cb->lastcapture = meta_arg;
8012
5.23k
    goto GROUP_PROCESS_NOTE_EMPTY;
8013
8014
8015
    /* ===============================================================*/
8016
    /* Handle escape sequence items. For ones like \d, the ESC_values are
8017
    arranged to be the same as the corresponding OP_values in the default case
8018
    when PCRE2_UCP is not set (which is the only case in which they will appear
8019
    here).
8020
8021
    Note: \Q and \E are never seen here, as they were dealt with in
8022
    parse_regex(). Neither are numerical back references or recursions, which
8023
    were turned into META_BACKREF or META_RECURSE items, respectively. \k and
8024
    \g, when followed by names, are turned into META_BACKREF_BYNAME or
8025
    META_RECURSE_BYNAME. */
8026
8027
41.2k
    case META_ESCAPE:
8028
8029
    /* We can test for escape sequences that consume a character because their
8030
    values lie between ESC_b and ESC_Z; this may have to change if any new ones
8031
    are ever created. For these sequences, we disable the setting of a first
8032
    character if it hasn't already been set. */
8033
8034
41.2k
    if (meta_arg > ESC_b && meta_arg < ESC_Z)
8035
39.2k
      {
8036
39.2k
      matched_char = TRUE;
8037
39.2k
      if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
8038
39.2k
      }
8039
8040
    /* Set values to reset to if this is followed by a zero repeat. */
8041
8042
41.2k
    zerofirstcu = firstcu;
8043
41.2k
    zerofirstcuflags = firstcuflags;
8044
41.2k
    zeroreqcu = reqcu;
8045
41.2k
    zeroreqcuflags = reqcuflags;
8046
8047
    /* If Unicode is not supported, \P and \p are not allowed and are
8048
    faulted at parse time, so will never appear here. */
8049
8050
41.2k
#ifdef SUPPORT_UNICODE
8051
41.2k
    if (meta_arg == ESC_P || meta_arg == ESC_p)
8052
1.73k
      {
8053
1.73k
      uint32_t ptype = *(++pptr) >> 16;
8054
1.73k
      uint32_t pdata = *pptr & 0xffff;
8055
8056
      /* In caseless matching, particular characteristics Lu, Ll, and Lt get
8057
      converted to the general characteristic L&. That is, upper, lower, and
8058
      title case letters are all conflated. */
8059
8060
1.73k
      if ((options & PCRE2_CASELESS) != 0 && ptype == PT_PC &&
8061
1.73k
          (pdata == ucp_Lu || pdata == ucp_Ll || pdata == ucp_Lt))
8062
0
        {
8063
0
        ptype = PT_LAMP;
8064
0
        pdata = 0;
8065
0
        }
8066
8067
      /* The special case of \p{Any} is compiled to OP_ALLANY and \P{Any}
8068
      is compiled to [] so as to benefit from the auto-anchoring code. */
8069
8070
1.73k
      if (ptype == PT_ANY)
8071
0
        {
8072
0
        if (meta_arg == ESC_P)
8073
0
          {
8074
0
          *code++ = OP_CLASS;
8075
0
          memset(code, 0, 32);
8076
0
          code += 32 / sizeof(PCRE2_UCHAR);
8077
0
          }
8078
0
        else
8079
0
          *code++ = OP_ALLANY;
8080
0
        }
8081
1.73k
      else
8082
1.73k
        {
8083
1.73k
        *code++ = (meta_arg == ESC_p)? OP_PROP : OP_NOTPROP;
8084
1.73k
        *code++ = ptype;
8085
1.73k
        *code++ = pdata;
8086
1.73k
        }
8087
1.73k
      break;  /* End META_ESCAPE */
8088
1.73k
      }
8089
39.5k
#endif
8090
8091
    /* \K is forbidden in lookarounds since 10.38 because that's what Perl has
8092
    done. However, there's an option, in case anyone was relying on it. */
8093
8094
39.5k
    if (cb->assert_depth > 0 && meta_arg == ESC_K &&
8095
39.5k
        (xoptions & PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK) == 0)
8096
0
      {
8097
0
      *errorcodeptr = ERR99;
8098
0
      return 0;
8099
0
      }
8100
8101
    /* For the rest (including \X when Unicode is supported - if not it's
8102
    faulted at parse time), the OP value is the escape value when PCRE2_UCP is
8103
    not set; if it is set, most of them do not show up here because they are
8104
    converted into Unicode property tests in parse_regex().
8105
8106
    In non-UTF mode, and for both 32-bit modes, we turn \C into OP_ALLANY
8107
    instead of OP_ANYBYTE so that it works in DFA mode and in lookbehinds.
8108
    There are special UCP codes for \B and \b which are used in UCP mode unless
8109
    "word" matching is being forced to ASCII.
8110
8111
    Note that \b and \B do a one-character lookbehind, and \A also behaves as
8112
    if it does. */
8113
8114
39.5k
    switch(meta_arg)
8115
39.5k
      {
8116
230
      case ESC_C:
8117
230
      cb->external_flags |= PCRE2_HASBKC;  /* Record */
8118
#if PCRE2_CODE_UNIT_WIDTH == 32
8119
      meta_arg = OP_ALLANY;
8120
#else
8121
230
      if (!utf) meta_arg = OP_ALLANY;
8122
230
#endif
8123
230
      break;
8124
8125
216
      case ESC_B:
8126
1.39k
      case ESC_b:
8127
1.39k
      if ((options & PCRE2_UCP) != 0 && (xoptions & PCRE2_EXTRA_ASCII_BSW) == 0)
8128
94
        meta_arg = (meta_arg == ESC_B)? OP_NOT_UCP_WORD_BOUNDARY :
8129
94
          OP_UCP_WORD_BOUNDARY;
8130
      /* Fall through */
8131
8132
1.79k
      case ESC_A:
8133
1.79k
      if (cb->max_lookbehind == 0) cb->max_lookbehind = 1;
8134
1.79k
      break;
8135
39.5k
      }
8136
8137
39.5k
    *code++ = meta_arg;
8138
39.5k
    break;  /* End META_ESCAPE */
8139
8140
8141
    /* ===================================================================*/
8142
    /* Handle an unrecognized meta value. A parsed pattern value less than
8143
    META_END is a literal. Otherwise we have a problem. */
8144
8145
923k
    default:
8146
923k
    if (meta >= META_END)
8147
0
      {
8148
0
      PCRE2_DEBUG_UNREACHABLE();
8149
0
      *errorcodeptr = ERR89;  /* Internal error - unrecognized. */
8150
0
      return 0;
8151
0
      }
8152
8153
    /* Handle a literal character. We come here by goto in the case of a
8154
    32-bit, non-UTF character whose value is greater than META_END. */
8155
8156
923k
    NORMAL_CHAR:
8157
923k
    meta = *pptr;     /* Get the full 32 bits */
8158
923k
    NORMAL_CHAR_SET:  /* Character is already in meta */
8159
923k
    matched_char = TRUE;
8160
8161
    /* For caseless UTF or UCP mode, check whether this character has more than
8162
    one other case. If so, generate a special OP_PROP item instead of OP_CHARI.
8163
    When casing restrictions apply, ignore caseless sets that start with an
8164
    ASCII character. If the character is affected by the special Turkish rules,
8165
    hardcode the matching characters using a caseset. */
8166
8167
923k
#ifdef SUPPORT_UNICODE
8168
923k
    if ((utf||ucp) && (options & PCRE2_CASELESS) != 0)
8169
40.3k
      {
8170
40.3k
      uint32_t caseset;
8171
8172
40.3k
      if ((xoptions & (PCRE2_EXTRA_TURKISH_CASING|PCRE2_EXTRA_CASELESS_RESTRICT)) ==
8173
40.3k
            PCRE2_EXTRA_TURKISH_CASING &&
8174
40.3k
          UCD_ANY_I(meta))
8175
0
        {
8176
0
        caseset = PRIV(ucd_turkish_dotted_i_caseset) + (UCD_DOTTED_I(meta)? 0 : 3);
8177
0
        }
8178
40.3k
      else if ((caseset = UCD_CASESET(meta)) != 0 &&
8179
40.3k
               (xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0 &&
8180
40.3k
               PRIV(ucd_caseless_sets)[caseset] < 128)
8181
0
        {
8182
0
        caseset = 0;  /* Ignore the caseless set if it's restricted. */
8183
0
        }
8184
8185
40.3k
      if (caseset != 0)
8186
1.30k
        {
8187
1.30k
        *code++ = OP_PROP;
8188
1.30k
        *code++ = PT_CLIST;
8189
1.30k
        *code++ = caseset;
8190
1.30k
        if (firstcuflags == REQ_UNSET)
8191
22
          firstcuflags = zerofirstcuflags = REQ_NONE;
8192
1.30k
        break;  /* End handling this meta item */
8193
1.30k
        }
8194
40.3k
      }
8195
922k
#endif
8196
8197
    /* Caseful matches, or caseless and not one of the multicase characters. We
8198
    come here by goto in the case of a positive class that contains only
8199
    case-partners of a character with just two cases; matched_char has already
8200
    been set TRUE and options fudged if necessary. */
8201
8202
922k
    CLASS_CASELESS_CHAR:
8203
8204
    /* Get the character's code units into mcbuffer, with the length in
8205
    mclength. When not in UTF mode, the length is always 1. */
8206
8207
922k
#ifdef SUPPORT_UNICODE
8208
922k
    if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
8209
878k
#endif
8210
878k
      {
8211
878k
      mclength = 1;
8212
878k
      mcbuffer[0] = meta;
8213
878k
      }
8214
8215
    /* Generate the appropriate code */
8216
8217
922k
    *code++ = ((options & PCRE2_CASELESS) != 0)? OP_CHARI : OP_CHAR;
8218
922k
    memcpy(code, mcbuffer, CU2BYTES(mclength));
8219
922k
    code += mclength;
8220
8221
    /* Remember if \r or \n were seen */
8222
8223
922k
    if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
8224
23.6k
      cb->external_flags |= PCRE2_HASCRORLF;
8225
8226
    /* Set the first and required code units appropriately. If no previous
8227
    first code unit, set it from this character, but revert to none on a zero
8228
    repeat. Otherwise, leave the firstcu value alone, and don't change it on
8229
    a zero repeat. */
8230
8231
922k
    if (firstcuflags == REQ_UNSET)
8232
22.0k
      {
8233
22.0k
      zerofirstcuflags = REQ_NONE;
8234
22.0k
      zeroreqcu = reqcu;
8235
22.0k
      zeroreqcuflags = reqcuflags;
8236
8237
      /* If the character is more than one code unit long, we can set a single
8238
      firstcu only if it is not to be matched caselessly. Multiple possible
8239
      starting code units may be picked up later in the studying code. */
8240
8241
22.0k
      if (mclength == 1 || req_caseopt == 0)
8242
21.9k
        {
8243
21.9k
        firstcu = mcbuffer[0];
8244
21.9k
        firstcuflags = req_caseopt;
8245
21.9k
        if (mclength != 1)
8246
0
          {
8247
0
          reqcu = code[-1];
8248
0
          reqcuflags = cb->req_varyopt;
8249
0
          }
8250
21.9k
        }
8251
78
      else firstcuflags = reqcuflags = REQ_NONE;
8252
22.0k
      }
8253
8254
    /* firstcu was previously set; we can set reqcu only if the length is
8255
    1 or the matching is caseful. */
8256
8257
900k
    else
8258
900k
      {
8259
900k
      zerofirstcu = firstcu;
8260
900k
      zerofirstcuflags = firstcuflags;
8261
900k
      zeroreqcu = reqcu;
8262
900k
      zeroreqcuflags = reqcuflags;
8263
900k
      if (mclength == 1 || req_caseopt == 0)
8264
900k
        {
8265
900k
        reqcu = code[-1];
8266
900k
        reqcuflags = req_caseopt | cb->req_varyopt;
8267
900k
        }
8268
900k
      }
8269
8270
    /* If caselessness was temporarily instated, reset it. */
8271
8272
922k
    if (reset_caseful)
8273
0
      {
8274
0
      options &= ~PCRE2_CASELESS;
8275
0
      req_caseopt = 0;
8276
0
      reset_caseful = FALSE;
8277
0
      }
8278
8279
922k
    break;    /* End literal character handling */
8280
1.16M
    }         /* End of big switch */
8281
1.16M
  }           /* End of big loop */
8282
8283
0
PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */
8284
0
return 0;                  /* Avoid compiler warnings */
8285
45.4k
}
8286
8287
8288
8289
/*************************************************
8290
*   Compile regex: a sequence of alternatives    *
8291
*************************************************/
8292
8293
/* On entry, pptr is pointing past the bracket meta, but on return it points to
8294
the closing bracket or META_END. The code variable is pointing at the code unit
8295
into which the BRA operator has been stored. This function is used during the
8296
pre-compile phase when we are trying to find out the amount of memory needed,
8297
as well as during the real compile phase. The value of lengthptr distinguishes
8298
the two phases.
8299
8300
Arguments:
8301
  options           option bits, including any changes for this subpattern
8302
  xoptions          extra option bits, ditto
8303
  codeptr           -> the address of the current code pointer
8304
  pptrptr           -> the address of the current parsed pattern pointer
8305
  errorcodeptr      -> pointer to error code variable
8306
  skipunits         skip this many code units at start (for brackets and OP_COND)
8307
  firstcuptr        place to put the first required code unit
8308
  firstcuflagsptr   place to put the first code unit flags
8309
  reqcuptr          place to put the last required code unit
8310
  reqcuflagsptr     place to put the last required code unit flags
8311
  bcptr             pointer to the chain of currently open branches
  open_caps         pointer to the chain of currently open capturing groups
8312
  cb                points to the data block with tables pointers etc.
8313
  lengthptr         NULL during the real compile phase
8314
                    points to length accumulator during pre-compile phase
8315
8316
Returns:            0 There has been an error
8317
                   +1 Success, this group must match at least one character
8318
                   -1 Success, this group may match an empty string
8319
*/
8320
8321
static int
8322
compile_regex(uint32_t options, uint32_t xoptions, PCRE2_UCHAR **codeptr,
8323
  uint32_t **pptrptr, int *errorcodeptr, uint32_t skipunits,
8324
  uint32_t *firstcuptr, uint32_t *firstcuflagsptr, uint32_t *reqcuptr,
8325
  uint32_t *reqcuflagsptr, branch_chain *bcptr, open_capitem *open_caps,
8326
  compile_block *cb, PCRE2_SIZE *lengthptr)
8327
8.26k
{
/* Compile one bracketed item, ending at its closing bracket or at the end of
the pattern: each alternative is compiled by compile_branch(), the alternatives
are linked together, and a terminating OP_KET is appended. */
8328
8.26k
PCRE2_UCHAR *code = *codeptr;
/* last_branch starts at the opening item and is moved to each OP_ALT as it is
inserted; start_bracket stays at the opening item so that the length stored in
the final OP_KET can be computed. */
8329
8.26k
PCRE2_UCHAR *last_branch = code;
8330
8.26k
PCRE2_UCHAR *start_bracket = code;
8331
8.26k
BOOL lookbehind;
8332
8.26k
open_capitem capitem;
8333
8.26k
int capnumber = 0;
8334
8.26k
int okreturn = 1;
8335
8.26k
uint32_t *pptr = *pptrptr;
8336
8.26k
uint32_t firstcu, reqcu;
8337
8.26k
uint32_t lookbehindlength;
8338
8.26k
uint32_t lookbehindminlength;
8339
8.26k
uint32_t firstcuflags, reqcuflags;
8340
8.26k
PCRE2_SIZE length;
8341
8.26k
branch_chain bc;
8342
8343
/* If set, call the external function that checks for stack availability. */
8344
8345
8.26k
if (cb->cx->stack_guard != NULL &&
8346
8.26k
    cb->cx->stack_guard(cb->parens_depth, cb->cx->stack_guard_data))
8347
0
  {
8348
0
  *errorcodeptr= ERR33;
8349
0
  return 0;
8350
0
  }
8351
8352
/* Miscellaneous initialization */
8353
8354
8.26k
bc.outer = bcptr;
8355
8.26k
bc.current_branch = code;
8356
8357
8.26k
firstcu = reqcu = 0;
8358
8.26k
firstcuflags = reqcuflags = REQ_UNSET;
8359
8360
/* Accumulate the length for use in the pre-compile phase. Start with the
8361
length of the BRA and KET and any extra code units that are required at the
8362
beginning. We accumulate in a local variable to save frequent testing of
8363
lengthptr for NULL. We cannot do this by looking at the value of 'code' at the
8364
start and end of each alternative, because compiled items are discarded during
8365
the pre-compile phase so that the workspace is not exceeded. */
8366
8367
8.26k
length = 2 + 2*LINK_SIZE + skipunits;
8368
8369
/* Remember if this is a lookbehind assertion, and if it is, save its length
8370
and skip over the pattern offset. */
8371
8372
8.26k
lookbehind = *code == OP_ASSERTBACK ||
8373
8.26k
             *code == OP_ASSERTBACK_NOT ||
8374
8.26k
             *code == OP_ASSERTBACK_NA;
8375
8376
8.26k
if (lookbehind)
8377
174
  {
  /* The data field of the preceding meta item supplies the (maximum) length
  for the first branch; the next parsed pattern element is read as the minimum
  length for the assertion. */
8378
174
  lookbehindlength = META_DATA(pptr[-1]);
8379
174
  lookbehindminlength = *pptr;
8380
174
  pptr += SIZEOFFSET;
8381
174
  }
8382
8.09k
else lookbehindlength = lookbehindminlength = 0;
8383
8384
/* If this is a capturing subpattern, add to the chain of open capturing items
8385
so that we can detect them if (*ACCEPT) is encountered. Note that only OP_CBRA
8386
need be tested here; changing this opcode to one of its variants, e.g.
8387
OP_SCBRAPOS, happens later, after the group has been compiled. */
8388
8389
8.26k
if (*code == OP_CBRA)
8390
5.23k
  {
8391
5.23k
  capnumber = GET2(code, 1 + LINK_SIZE);
8392
5.23k
  capitem.number = capnumber;
8393
5.23k
  capitem.next = open_caps;
8394
5.23k
  capitem.assert_depth = cb->assert_depth;
8395
5.23k
  open_caps = &capitem;
8396
5.23k
  }
8397
8398
/* Offset is set zero to mark that this bracket is still open */
8399
8400
8.26k
PUT(code, 1, 0);
8401
8.26k
code += 1 + LINK_SIZE + skipunits;
8402
8403
/* Loop for each alternative branch */
8404
8405
8.26k
for (;;)
8406
45.4k
  {
8407
45.4k
  int branch_return;
8408
45.4k
  uint32_t branchfirstcu = 0, branchreqcu = 0;
8409
45.4k
  uint32_t branchfirstcuflags = REQ_UNSET, branchreqcuflags = REQ_UNSET;
8410
8411
  /* Insert OP_REVERSE or OP_VREVERSE if this is a lookbehind assertion. There
8412
  is only a single minimum length for the whole assertion. When the minimum
8413
  length is LOOKBEHIND_MAX it means that all branches are of fixed length,
8414
  though not necessarily the same length. In this case, the original OP_REVERSE
8415
  can be used. It can also be used if a branch in a variable length lookbehind
8416
  has the same maximum and minimum. Otherwise, use OP_VREVERSE, which has both
8417
  maximum and minimum values. */
8418
8419
45.4k
  if (lookbehind && lookbehindlength > 0)
8420
678
    {
8421
678
    if (lookbehindminlength == LOOKBEHIND_MAX ||
8422
678
        lookbehindminlength == lookbehindlength)
8423
182
      {
8424
182
      *code++ = OP_REVERSE;
8425
182
      PUT2INC(code, 0, lookbehindlength);
8426
182
      length += 1 + IMM2_SIZE;
8427
182
      }
8428
496
    else
8429
496
      {
8430
496
      *code++ = OP_VREVERSE;
8431
496
      PUT2INC(code, 0, lookbehindminlength);
8432
496
      PUT2INC(code, 0, lookbehindlength);
8433
496
      length += 1 + 2*IMM2_SIZE;
8434
496
      }
8435
678
    }
8436
8437
  /* Now compile the branch; in the pre-compile phase its length gets added
8438
  into the length. */
8439
8440
45.4k
  if ((branch_return =
8441
45.4k
        compile_branch(&options, &xoptions, &code, &pptr, errorcodeptr,
8442
45.4k
          &branchfirstcu, &branchfirstcuflags, &branchreqcu, &branchreqcuflags,
8443
45.4k
          &bc, open_caps, cb, (lengthptr == NULL)? NULL : &length)) == 0)
8444
109
    /* A zero result from compile_branch() is treated as an error and is
    passed straight back to the caller. */
    return 0;
8445
8446
  /* If a branch can match an empty string, so can the whole group. */
8447
8448
45.3k
  if (branch_return < 0) okreturn = -1;
8449
8450
  /* In the real compile phase, there is some post-processing to be done. */
8451
8452
45.3k
  if (lengthptr == NULL)
8453
22.1k
    {
8454
    /* If this is the first branch, the firstcu and reqcu values for the
8455
    branch become the values for the regex. */
8456
8457
22.1k
    if (*last_branch != OP_ALT)
8458
3.97k
      {
8459
3.97k
      firstcu = branchfirstcu;
8460
3.97k
      firstcuflags = branchfirstcuflags;
8461
3.97k
      reqcu = branchreqcu;
8462
3.97k
      reqcuflags = branchreqcuflags;
8463
3.97k
      }
8464
8465
    /* If this is not the first branch, the first char and reqcu have to
8466
    match the values from all the previous branches, except that if the
8467
    previous value for reqcu didn't have REQ_VARY set, it can still match,
8468
    and we set REQ_VARY for the group from this branch's value. */
8469
8470
18.1k
    else
8471
18.1k
      {
8472
      /* If we previously had a firstcu, but it doesn't match the new branch,
8473
      we have to abandon the firstcu for the regex, but if there was
8474
      previously no reqcu, it takes on the value of the old firstcu. */
8475
8476
18.1k
      if (firstcuflags != branchfirstcuflags || firstcu != branchfirstcu)
8477
14.4k
        {
8478
14.4k
        if (firstcuflags < REQ_NONE)
8479
1.44k
          {
8480
1.44k
          if (reqcuflags >= REQ_NONE)
8481
96
            {
8482
96
            reqcu = firstcu;
8483
96
            reqcuflags = firstcuflags;
8484
96
            }
8485
1.44k
          }
8486
14.4k
        firstcuflags = REQ_NONE;
8487
14.4k
        }
8488
8489
      /* If we (now or from before) have no firstcu, a firstcu from the
8490
      branch becomes a reqcu if there isn't a branch reqcu. */
8491
8492
18.1k
      if (firstcuflags >= REQ_NONE && branchfirstcuflags < REQ_NONE &&
8493
18.1k
          branchreqcuflags >= REQ_NONE)
8494
364
        {
8495
364
        branchreqcu = branchfirstcu;
8496
364
        branchreqcuflags = branchfirstcuflags;
8497
364
        }
8498
8499
      /* Now ensure that the reqcus match */
8500
8501
18.1k
      if (((reqcuflags & ~REQ_VARY) != (branchreqcuflags & ~REQ_VARY)) ||
8502
18.1k
          reqcu != branchreqcu)
8503
16.1k
        reqcuflags = REQ_NONE;
8504
2.02k
      else
8505
2.02k
        {
8506
2.02k
        reqcu = branchreqcu;
8507
2.02k
        reqcuflags |= branchreqcuflags; /* To "or" REQ_VARY if present */
8508
2.02k
        }
8509
18.1k
      }
8510
22.1k
    }
8511
8512
  /* Handle reaching the end of the expression, either ')' or end of pattern.
8513
  In the real compile phase, go back through the alternative branches and
8514
  reverse the chain of offsets, with the field in the BRA item now becoming an
8515
  offset to the first alternative. If there are no alternatives, it points to
8516
  the end of the group. The length in the terminating ket is always the length
8517
  of the whole bracketed item. Return leaving the pointer at the terminating
8518
  char. */
8519
8520
45.3k
  if (META_CODE(*pptr) != META_ALT)
8521
8.15k
    {
8522
8.15k
    if (lengthptr == NULL)
8523
3.97k
      {
8524
3.97k
      uint32_t branch_length = (uint32_t)(code - last_branch);
      /* Walk backwards from the last branch. Each link field currently holds
      the distance back to the previous branch (zero in the opening item); it
      is overwritten with the forward distance to the next branch or the KET.
      The zero read from the opening item ends the loop. */
8525
3.97k
      do
8526
22.1k
        {
8527
22.1k
        uint32_t prev_length = GET(last_branch, 1);
8528
22.1k
        PUT(last_branch, 1, branch_length);
8529
22.1k
        branch_length = prev_length;
8530
22.1k
        last_branch -= branch_length;
8531
22.1k
        }
8532
22.1k
      while (branch_length > 0);
8533
3.97k
      }
8534
8535
    /* Fill in the ket */
8536
8537
8.15k
    *code = OP_KET;
8538
8.15k
    PUT(code, 1, (uint32_t)(code - start_bracket));
8539
8.15k
    code += 1 + LINK_SIZE;
8540
8541
    /* Set values to pass back */
8542
8543
8.15k
    *codeptr = code;
8544
8.15k
    *pptrptr = pptr;
8545
8.15k
    *firstcuptr = firstcu;
8546
8.15k
    *firstcuflagsptr = firstcuflags;
8547
8.15k
    *reqcuptr = reqcu;
8548
8.15k
    *reqcuflagsptr = reqcuflags;
8549
8.15k
    if (lengthptr != NULL)
8550
4.18k
      {
      /* Test before adding, so that the caller's accumulated length is never
      pushed past OFLOW_MAX; ERR20 is reported instead. */
8551
4.18k
      if (OFLOW_MAX - *lengthptr < length)
8552
0
        {
8553
0
        *errorcodeptr = ERR20;
8554
0
        return 0;
8555
0
        }
8556
4.18k
      *lengthptr += length;
8557
4.18k
      }
8558
8.15k
    return okreturn;
8559
8.15k
    }
8560
8561
  /* Another branch follows. In the pre-compile phase, we can move the code
8562
  pointer back to where it was for the start of the first branch. (That is,
8563
  pretend that each branch is the only one.)
8564
8565
  In the real compile phase, insert an ALT node. Its length field points back
8566
  to the previous branch while the bracket remains open. At the end the chain
8567
  is reversed. It's done like this so that the start of the bracket has a
8568
  zero offset until it is closed, making it possible to detect recursion. */
8569
8570
37.1k
  if (lengthptr != NULL)
8571
19.0k
    {
8572
19.0k
    code = *codeptr + 1 + LINK_SIZE + skipunits;
8573
19.0k
    length += 1 + LINK_SIZE;
8574
19.0k
    }
8575
18.1k
  else
8576
18.1k
    {
8577
18.1k
    *code = OP_ALT;
8578
18.1k
    PUT(code, 1, (int)(code - last_branch));
8579
18.1k
    bc.current_branch = last_branch = code;
8580
18.1k
    code += 1 + LINK_SIZE;
8581
18.1k
    }
8582
8583
  /* Set the maximum lookbehind length for the next branch (if not in a
8584
  lookbehind the value will be zero) and then advance past the vertical bar. */
8585
8586
37.1k
  lookbehindlength = META_DATA(*pptr);
8587
37.1k
  pptr++;
8588
37.1k
  }
8589
8590
0
PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */
8591
0
return 0;                  /* Avoid compiler warnings */
8592
8.26k
}
8593
8594
8595
8596
/*************************************************
8597
*          Check for anchored pattern            *
8598
*************************************************/
8599
8600
/* Try to find out if this is an anchored regular expression. Consider each
8601
alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
8602
all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
8603
it's anchored. However, if this is a multiline pattern, then only OP_SOD will
8604
be found, because ^ generates OP_CIRCM in that mode.
8605
8606
We can also consider a regex to be anchored if OP_SOM starts all its branches.
8607
This is the code for \G, which means "match at start of match position, taking
8608
into account the match offset".
8609
8610
A branch is also implicitly anchored if it starts with .* and DOTALL is set,
8611
because that will try the rest of the pattern at all possible matching points,
8612
so there is no point trying again.... er ....
8613
8614
.... except when the .* appears inside capturing parentheses, and there is a
8615
subsequent back reference to those parentheses. We haven't enough information
8616
to catch that case precisely.
8617
8618
At first, the best we could do was to detect when .* was in capturing brackets
8619
and the highest back reference was greater than or equal to that level.
8620
However, by keeping a bitmap of the first 31 back references, we can catch some
8621
of the more common cases more precisely.
8622
8623
... A second exception is when the .* appears inside an atomic group, because
8624
this prevents the number of characters it matches from being adjusted.
8625
8626
Arguments:
8627
  code           points to start of the compiled pattern
8628
  bracket_map    a bitmap of which brackets we are inside while testing; this
8629
                   handles up to substring 31; after that we just have to take
8630
                   the less precise approach
8631
  cb             points to the compile data block
8632
  atomcount      atomic group level
8633
  inassert       TRUE if in an assertion
8634
  dotstar_anchor TRUE if automatic anchoring optimization is enabled
8635
8636
Returns:     TRUE or FALSE
8637
*/
8638
8639
static BOOL
8640
is_anchored(PCRE2_SPTR code, uint32_t bracket_map, compile_block *cb,
8641
  int atomcount, BOOL inassert, BOOL dotstar_anchor)
8642
1.14k
{
8643
1.17k
do {
8644
1.17k
   PCRE2_SPTR scode = first_significant_code(
8645
1.17k
     code + PRIV(OP_lengths)[*code], FALSE);
8646
1.17k
   int op = *scode;
8647
8648
   /* Non-capturing brackets */
8649
8650
1.17k
   if (op == OP_BRA  || op == OP_BRAPOS ||
8651
1.17k
       op == OP_SBRA || op == OP_SBRAPOS)
8652
0
     {
8653
0
     if (!is_anchored(scode, bracket_map, cb, atomcount, inassert, dotstar_anchor))
8654
0
       return FALSE;
8655
0
     }
8656
8657
   /* Capturing brackets */
8658
8659
1.17k
   else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8660
1.17k
            op == OP_SCBRA || op == OP_SCBRAPOS)
8661
5
     {
8662
5
     int n = GET2(scode, 1+LINK_SIZE);
8663
5
     uint32_t new_map = bracket_map | ((n < 32)? (1u << n) : 1);
8664
5
     if (!is_anchored(scode, new_map, cb, atomcount, inassert, dotstar_anchor)) return FALSE;
8665
5
     }
8666
8667
   /* Positive forward assertion */
8668
8669
1.17k
   else if (op == OP_ASSERT || op == OP_ASSERT_NA)
8670
0
     {
8671
0
     if (!is_anchored(scode, bracket_map, cb, atomcount, TRUE, dotstar_anchor)) return FALSE;
8672
0
     }
8673
8674
   /* Condition. If there is no second branch, it can't be anchored. */
8675
8676
1.17k
   else if (op == OP_COND || op == OP_SCOND)
8677
0
     {
8678
0
     if (scode[GET(scode,1)] != OP_ALT) return FALSE;
8679
0
     if (!is_anchored(scode, bracket_map, cb, atomcount, inassert, dotstar_anchor))
8680
0
       return FALSE;
8681
0
     }
8682
8683
   /* Atomic groups */
8684
8685
1.17k
   else if (op == OP_ONCE)
8686
0
     {
8687
0
     if (!is_anchored(scode, bracket_map, cb, atomcount + 1, inassert, dotstar_anchor))
8688
0
       return FALSE;
8689
0
     }
8690
8691
   /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
8692
   it isn't in brackets that are or may be referenced or inside an atomic
8693
   group or an assertion. Also the pattern must not contain *PRUNE or *SKIP,
8694
   because these break the feature. Consider, for example, /(?s).*?(*PRUNE)b/
8695
   with the subject "aab", which matches "b", i.e. not at the start of a line.
8696
   There is also an option that disables auto-anchoring. */
8697
8698
1.17k
   else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
8699
1.17k
             op == OP_TYPEPOSSTAR))
8700
8
     {
8701
8
     if (scode[1] != OP_ALLANY || (bracket_map & cb->backref_map) != 0 ||
8702
8
         atomcount > 0 || cb->had_pruneorskip || inassert || !dotstar_anchor)
8703
7
       return FALSE;
8704
8
     }
8705
8706
   /* Check for explicit anchoring */
8707
8708
1.16k
   else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
8709
8710
47
   code += GET(code, 1);
8711
47
   }
8712
1.14k
while (*code == OP_ALT);   /* Loop for each alternative */
8713
11
return TRUE;
8714
1.14k
}
8715
8716
8717
8718
/*************************************************
8719
*         Check for starting with ^ or .*        *
8720
*************************************************/
8721
8722
/* This is called to find out if every branch starts with ^ or .* so that
8723
"first char" processing can be done to speed things up in multiline
8724
matching and for non-DOTALL patterns that start with .* (which must start at
8725
the beginning or after \n). As in the case of is_anchored() (see above), we
8726
have to take account of back references to capturing brackets that contain .*
8727
because in that case we can't make the assumption. Also, the appearance of .*
8728
inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE
8729
or *SKIP does not count, because once again the assumption no longer holds.
8730
8731
Arguments:
8732
  code           points to start of the compiled pattern or a group
8733
  bracket_map    a bitmap of which brackets we are inside while testing; this
8734
                   handles up to substring 31; after that we just have to take
8735
                   the less precise approach
8736
  cb             points to the compile data
8737
  atomcount      atomic group level
8738
  inassert       TRUE if in an assertion
8739
  dotstar_anchor TRUE if automatic anchoring optimization is enabled
8740
8741
Returns:         TRUE or FALSE
8742
*/
8743
8744
static BOOL
8745
is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
8746
  int atomcount, BOOL inassert, BOOL dotstar_anchor)
8747
1.07k
{
8748
1.10k
do {
8749
1.10k
   PCRE2_SPTR scode = first_significant_code(
8750
1.10k
     code + PRIV(OP_lengths)[*code], FALSE);
8751
1.10k
   int op = *scode;
8752
8753
   /* If we are at the start of a conditional assertion group, *both* the
8754
   conditional assertion *and* what follows the condition must satisfy the test
8755
   for start of line. Other kinds of condition fail. Note that there may be an
8756
   auto-callout at the start of a condition. */
8757
8758
1.10k
   if (op == OP_COND)
8759
0
     {
8760
0
     scode += 1 + LINK_SIZE;
8761
8762
0
     if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
8763
0
       else if (*scode == OP_CALLOUT_STR) scode += GET(scode, 1 + 2*LINK_SIZE);
8764
8765
0
     switch (*scode)
8766
0
       {
8767
0
       case OP_CREF:
8768
0
       case OP_DNCREF:
8769
0
       case OP_RREF:
8770
0
       case OP_DNRREF:
8771
0
       case OP_FAIL:
8772
0
       case OP_FALSE:
8773
0
       case OP_TRUE:
8774
0
       return FALSE;
8775
8776
0
       default:     /* Assertion */
8777
0
       if (!is_startline(scode, bracket_map, cb, atomcount, TRUE, dotstar_anchor))
8778
0
         return FALSE;
8779
0
       do scode += GET(scode, 1); while (*scode == OP_ALT);
8780
0
       scode += 1 + LINK_SIZE;
8781
0
       break;
8782
0
       }
8783
0
     scode = first_significant_code(scode, FALSE);
8784
0
     op = *scode;
8785
0
     }
8786
8787
   /* Non-capturing brackets */
8788
8789
1.10k
   if (op == OP_BRA  || op == OP_BRAPOS ||
8790
1.10k
       op == OP_SBRA || op == OP_SBRAPOS)
8791
0
     {
8792
0
     if (!is_startline(scode, bracket_map, cb, atomcount, inassert, dotstar_anchor))
8793
0
       return FALSE;
8794
0
     }
8795
8796
   /* Capturing brackets */
8797
8798
1.10k
   else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8799
1.10k
            op == OP_SCBRA || op == OP_SCBRAPOS)
8800
0
     {
8801
0
     int n = GET2(scode, 1+LINK_SIZE);
8802
0
     unsigned int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
8803
0
     if (!is_startline(scode, new_map, cb, atomcount, inassert, dotstar_anchor))
8804
0
       return FALSE;
8805
0
     }
8806
8807
   /* Positive forward assertions */
8808
8809
1.10k
   else if (op == OP_ASSERT || op == OP_ASSERT_NA)
8810
0
     {
8811
0
     if (!is_startline(scode, bracket_map, cb, atomcount, TRUE, dotstar_anchor))
8812
0
       return FALSE;
8813
0
     }
8814
8815
   /* Atomic brackets */
8816
8817
1.10k
   else if (op == OP_ONCE)
8818
0
     {
8819
0
     if (!is_startline(scode, bracket_map, cb, atomcount + 1, inassert, dotstar_anchor))
8820
0
       return FALSE;
8821
0
     }
8822
8823
   /* .* means "start at start or after \n" if it isn't in atomic brackets or
8824
   brackets that may be referenced or an assertion, and as long as the pattern
8825
   does not contain *PRUNE or *SKIP, because these break the feature. Consider,
8826
   for example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab",
8827
   i.e. not at the start of a line. There is also an option that disables this
8828
   optimization. */
8829
8830
1.10k
   else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
8831
8
     {
8832
8
     if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 ||
8833
8
         atomcount > 0 || cb->had_pruneorskip || inassert || !dotstar_anchor)
8834
8
       return FALSE;
8835
8
     }
8836
8837
   /* Check for explicit circumflex; anything else gives a FALSE result. Note
8838
   in particular that this includes atomic brackets OP_ONCE because the number
8839
   of characters matched by .* cannot be adjusted inside them. */
8840
8841
1.09k
   else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
8842
8843
   /* Move on to the next alternative */
8844
8845
30
   code += GET(code, 1);
8846
30
   }
8847
1.07k
while (*code == OP_ALT);  /* Loop for each alternative */
8848
0
return TRUE;
8849
1.07k
}
8850
8851
8852
8853
/*************************************************
8854
*   Scan compiled regex for recursion reference  *
8855
*************************************************/
8856
8857
/* This function scans through a compiled pattern until it finds an instance of
8858
OP_RECURSE.
8859
8860
Arguments:
8861
  code        points to start of expression
8862
  utf         TRUE in UTF mode
8863
8864
Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
8865
*/
8866
8867
static PCRE2_UCHAR *
8868
find_recurse(PCRE2_UCHAR *code, BOOL utf)
8869
538
{
8870
538
for (;;)
8871
82.6k
  {
8872
82.6k
  PCRE2_UCHAR c = *code;
8873
82.6k
  if (c == OP_END) return NULL;
8874
82.5k
  if (c == OP_RECURSE) return code;
8875
8876
  /* XCLASS is used for classes that cannot be represented just by a bit map.
8877
  This includes negated single high-valued characters. ECLASS is used for
8878
  classes that use set operations internally. CALLOUT_STR is used for
8879
  callouts with string arguments. In each case the length in the table is
8880
  zero; the actual length is stored in the compiled code. */
8881
8882
82.0k
  if (c == OP_XCLASS || c == OP_ECLASS) code += GET(code, 1);
8883
82.0k
  else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE);
8884
8885
  /* Otherwise, we can get the item's length from the table, except that for
8886
  repeated character types, we have to test for \p and \P, which have an extra
8887
  two code units of parameters, and for MARK/PRUNE/SKIP/THEN with an argument,
8888
  we must add in its length. */
8889
8890
82.0k
  else
8891
82.0k
    {
8892
82.0k
    switch(c)
8893
82.0k
      {
8894
7
      case OP_TYPESTAR:
8895
7
      case OP_TYPEMINSTAR:
8896
7
      case OP_TYPEPLUS:
8897
7
      case OP_TYPEMINPLUS:
8898
15
      case OP_TYPEQUERY:
8899
15
      case OP_TYPEMINQUERY:
8900
15
      case OP_TYPEPOSSTAR:
8901
15
      case OP_TYPEPOSPLUS:
8902
15
      case OP_TYPEPOSQUERY:
8903
15
      if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
8904
15
      break;
8905
8906
0
      case OP_TYPEPOSUPTO:
8907
0
      case OP_TYPEUPTO:
8908
0
      case OP_TYPEMINUPTO:
8909
0
      case OP_TYPEEXACT:
8910
0
      if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
8911
0
        code += 2;
8912
0
      break;
8913
8914
0
      case OP_MARK:
8915
0
      case OP_COMMIT_ARG:
8916
0
      case OP_PRUNE_ARG:
8917
0
      case OP_SKIP_ARG:
8918
0
      case OP_THEN_ARG:
8919
0
      code += code[1];
8920
0
      break;
8921
82.0k
      }
8922
8923
    /* Add in the fixed length from the table */
8924
8925
82.0k
    code += PRIV(OP_lengths)[c];
8926
8927
    /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may
8928
    be followed by a multi-unit character. The length in the table is a
8929
    minimum, so we have to arrange to skip the extra units. */
8930
8931
82.0k
#ifdef MAYBE_UTF_MULTI
8932
82.0k
    if (utf) switch(c)
8933
0
      {
8934
0
      case OP_CHAR:
8935
0
      case OP_CHARI:
8936
0
      case OP_NOT:
8937
0
      case OP_NOTI:
8938
0
      case OP_EXACT:
8939
0
      case OP_EXACTI:
8940
0
      case OP_NOTEXACT:
8941
0
      case OP_NOTEXACTI:
8942
0
      case OP_UPTO:
8943
0
      case OP_UPTOI:
8944
0
      case OP_NOTUPTO:
8945
0
      case OP_NOTUPTOI:
8946
0
      case OP_MINUPTO:
8947
0
      case OP_MINUPTOI:
8948
0
      case OP_NOTMINUPTO:
8949
0
      case OP_NOTMINUPTOI:
8950
0
      case OP_POSUPTO:
8951
0
      case OP_POSUPTOI:
8952
0
      case OP_NOTPOSUPTO:
8953
0
      case OP_NOTPOSUPTOI:
8954
0
      case OP_STAR:
8955
0
      case OP_STARI:
8956
0
      case OP_NOTSTAR:
8957
0
      case OP_NOTSTARI:
8958
0
      case OP_MINSTAR:
8959
0
      case OP_MINSTARI:
8960
0
      case OP_NOTMINSTAR:
8961
0
      case OP_NOTMINSTARI:
8962
0
      case OP_POSSTAR:
8963
0
      case OP_POSSTARI:
8964
0
      case OP_NOTPOSSTAR:
8965
0
      case OP_NOTPOSSTARI:
8966
0
      case OP_PLUS:
8967
0
      case OP_PLUSI:
8968
0
      case OP_NOTPLUS:
8969
0
      case OP_NOTPLUSI:
8970
0
      case OP_MINPLUS:
8971
0
      case OP_MINPLUSI:
8972
0
      case OP_NOTMINPLUS:
8973
0
      case OP_NOTMINPLUSI:
8974
0
      case OP_POSPLUS:
8975
0
      case OP_POSPLUSI:
8976
0
      case OP_NOTPOSPLUS:
8977
0
      case OP_NOTPOSPLUSI:
8978
0
      case OP_QUERY:
8979
0
      case OP_QUERYI:
8980
0
      case OP_NOTQUERY:
8981
0
      case OP_NOTQUERYI:
8982
0
      case OP_MINQUERY:
8983
0
      case OP_MINQUERYI:
8984
0
      case OP_NOTMINQUERY:
8985
0
      case OP_NOTMINQUERYI:
8986
0
      case OP_POSQUERY:
8987
0
      case OP_POSQUERYI:
8988
0
      case OP_NOTPOSQUERY:
8989
0
      case OP_NOTPOSQUERYI:
8990
0
      if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
8991
0
      break;
8992
0
      }
8993
#else
8994
    (void)(utf);  /* Keep compiler happy by referencing function argument */
8995
#endif  /* MAYBE_UTF_MULTI */
8996
82.0k
    }
8997
82.0k
  }
8998
538
}
8999
9000
9001
9002
/*************************************************
9003
*    Check for asserted fixed first code unit    *
9004
*************************************************/
9005
9006
/* During compilation, the "first code unit" settings from forward assertions
9007
are discarded, because they can cause conflicts with actual literals that
9008
follow. However, if we end up without a first code unit setting for an
9009
unanchored pattern, it is worth scanning the regex to see if there is an
9010
initial asserted first code unit. If all branches start with the same asserted
9011
code unit, or with a non-conditional bracket all of whose alternatives start
9012
with the same asserted code unit (recurse ad lib), then we return that code
9013
unit, with the flags set to zero or REQ_CASELESS; otherwise return zero with
9014
REQ_NONE in the flags.
9015
9016
Arguments:
9017
  code       points to start of compiled pattern
9018
  flags      points to the first code unit flags
9019
  inassert   non-zero if in an assertion
9020
9021
Returns:     the fixed first code unit, or 0 with REQ_NONE in flags
9022
*/
9023
9024
static uint32_t
9025
find_firstassertedcu(PCRE2_SPTR code, uint32_t *flags, uint32_t inassert)
9026
1.10k
{
9027
1.10k
uint32_t c = 0;
9028
1.10k
uint32_t cflags = REQ_NONE;
9029
9030
1.10k
*flags = REQ_NONE;
9031
1.10k
do {
9032
1.10k
   uint32_t d;
9033
1.10k
   uint32_t dflags;
9034
1.10k
   int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
9035
1.10k
             *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
9036
1.10k
   PCRE2_SPTR scode = first_significant_code(code + 1+LINK_SIZE + xl, TRUE);
9037
1.10k
   PCRE2_UCHAR op = *scode;
9038
9039
1.10k
   switch(op)
9040
1.10k
     {
9041
411
     default:
9042
411
     return 0;
9043
9044
0
     case OP_BRA:
9045
0
     case OP_BRAPOS:
9046
0
     case OP_CBRA:
9047
0
     case OP_SCBRA:
9048
0
     case OP_CBRAPOS:
9049
0
     case OP_SCBRAPOS:
9050
0
     case OP_ASSERT:
9051
0
     case OP_ASSERT_NA:
9052
0
     case OP_ONCE:
9053
0
     case OP_SCRIPT_RUN:
9054
0
     d = find_firstassertedcu(scode, &dflags, inassert +
9055
0
       ((op == OP_ASSERT || op == OP_ASSERT_NA)?1:0));
9056
0
     if (dflags >= REQ_NONE) return 0;
9057
0
     if (cflags >= REQ_NONE) { c = d; cflags = dflags; }
9058
0
       else if (c != d || cflags != dflags) return 0;
9059
0
     break;
9060
9061
0
     case OP_EXACT:
9062
0
     scode += IMM2_SIZE;
9063
     /* Fall through */
9064
9065
303
     case OP_CHAR:
9066
303
     case OP_PLUS:
9067
303
     case OP_MINPLUS:
9068
305
     case OP_POSPLUS:
9069
305
     if (inassert == 0) return 0;
9070
0
     if (cflags >= REQ_NONE) { c = scode[1]; cflags = 0; }
9071
0
       else if (c != scode[1]) return 0;
9072
0
     break;
9073
9074
0
     case OP_EXACTI:
9075
0
     scode += IMM2_SIZE;
9076
     /* Fall through */
9077
9078
378
     case OP_CHARI:
9079
378
     case OP_PLUSI:
9080
379
     case OP_MINPLUSI:
9081
386
     case OP_POSPLUSI:
9082
386
     if (inassert == 0) return 0;
9083
9084
     /* If the character is more than one code unit long, we cannot set its
9085
     first code unit when matching caselessly. Later scanning may pick up
9086
     multiple code units. */
9087
9088
0
#ifdef SUPPORT_UNICODE
9089
0
#if PCRE2_CODE_UNIT_WIDTH == 8
9090
0
     if (scode[1] >= 0x80) return 0;
9091
#elif PCRE2_CODE_UNIT_WIDTH == 16
9092
     if (scode[1] >= 0xd800 && scode[1] <= 0xdfff) return 0;
9093
#endif
9094
0
#endif
9095
9096
0
     if (cflags >= REQ_NONE) { c = scode[1]; cflags = REQ_CASELESS; }
9097
0
       else if (c != scode[1]) return 0;
9098
0
     break;
9099
1.10k
     }
9100
9101
0
   code += GET(code, 1);
9102
0
   }
9103
1.10k
while (*code == OP_ALT);
9104
9105
0
*flags = cflags;
9106
0
return c;
9107
1.10k
}
9108
9109
9110
9111
/*************************************************
9112
*     Add an entry to the name/number table      *
9113
*************************************************/
9114
9115
/* This function is called between compiling passes to add an entry to the
9116
name/number table, maintaining alphabetical order. Checking for permitted
9117
and forbidden duplicates has already been done.
9118
9119
Arguments:
9120
  cb           the compile data block
9121
  name         the name to add
9122
  length       the length of the name
9123
  groupno      the group number
9124
  tablecount   the count of names in the table so far
9125
9126
Returns:       nothing
9127
*/
9128
9129
static void
9130
add_name_to_table(compile_block *cb, PCRE2_SPTR name, int length,
9131
  unsigned int groupno, uint32_t tablecount)
9132
0
{
9133
0
uint32_t i;
9134
0
PCRE2_UCHAR *slot = cb->name_table;
9135
9136
0
for (i = 0; i < tablecount; i++)
9137
0
  {
9138
0
  int crc = memcmp(name, slot+IMM2_SIZE, CU2BYTES(length));
9139
0
  if (crc == 0 && slot[IMM2_SIZE+length] != 0)
9140
0
    crc = -1; /* Current name is a substring */
9141
9142
  /* Make space in the table and break the loop for an earlier name. For a
9143
  duplicate or later name, carry on. We do this for duplicates so that in the
9144
  simple case (when ?(| is not used) they are in order of their numbers. In all
9145
  cases they are in the order in which they appear in the pattern. */
9146
9147
0
  if (crc < 0)
9148
0
    {
9149
0
    (void)memmove(slot + cb->name_entry_size, slot,
9150
0
      CU2BYTES((tablecount - i) * cb->name_entry_size));
9151
0
    break;
9152
0
    }
9153
9154
  /* Continue the loop for a later or duplicate name */
9155
9156
0
  slot += cb->name_entry_size;
9157
0
  }
9158
9159
0
PUT2(slot, 0, groupno);
9160
0
memcpy(slot + IMM2_SIZE, name, CU2BYTES(length));
9161
9162
/* Add a terminating zero and fill the rest of the slot with zeroes so that
9163
the memory is all initialized. Otherwise valgrind moans about uninitialized
9164
memory when saving serialized compiled patterns. */
9165
9166
0
memset(slot + IMM2_SIZE + length, 0,
9167
0
  CU2BYTES(cb->name_entry_size - length - IMM2_SIZE));
9168
0
}
9169
9170
9171
9172
/*************************************************
9173
*             Skip in parsed pattern             *
9174
*************************************************/
9175
9176
/* This function is called to skip parts of the parsed pattern when finding the
9177
length of a lookbehind branch. It is called after (*ACCEPT) and (*FAIL) to find
9178
the end of the branch, it is called to skip over an internal lookaround or
9179
(DEFINE) group, and it is also called to skip to the end of a class, during
9180
which it will never encounter nested groups (but there's no need to have
9181
special code for that).
9182
9183
When called to find the end of a branch or group, pptr must point to the first
9184
meta code inside the branch, not the branch-starting code. In other cases it
9185
can point to the item that causes the function to be called.
9186
9187
Arguments:
9188
  pptr       current pointer to skip from
9189
  skiptype   PSKIP_CLASS when skipping to end of class
9190
             PSKIP_ALT when META_ALT ends the skip
9191
             PSKIP_KET when only META_KET ends the skip
9192
9193
Returns:     new value of pptr
9194
             NULL if META_END is reached - should never occur
9195
               or for an unknown meta value - likewise
9196
*/
9197
9198
static uint32_t *
9199
parsed_skip(uint32_t *pptr, uint32_t skiptype)
9200
0
{
9201
0
uint32_t nestlevel = 0;
9202
9203
0
for (;; pptr++)
9204
0
  {
9205
0
  uint32_t meta = META_CODE(*pptr);
9206
9207
0
  switch(meta)
9208
0
    {
9209
0
    default:  /* Just skip over most items */
9210
0
    if (meta < META_END) continue;  /* Literal */
9211
0
    break;
9212
9213
0
    case META_END:
9214
9215
    /* The parsed regex is malformed; we have reached the end and did
9216
    not find the end of the construct which we are skipping over. */
9217
9218
0
    PCRE2_DEBUG_UNREACHABLE();
9219
0
    return NULL;
9220
9221
    /* The data for these items is variable in length. */
9222
9223
0
    case META_BACKREF:  /* Offset is present only if group >= 10 */
9224
0
    if (META_DATA(*pptr) >= 10) pptr += SIZEOFFSET;
9225
0
    break;
9226
9227
0
    case META_ESCAPE:
9228
0
    if (*pptr - META_ESCAPE == ESC_P || *pptr - META_ESCAPE == ESC_p)
9229
0
      pptr += 1;     /* Skip prop data */
9230
0
    break;
9231
9232
0
    case META_MARK:     /* Add the length of the name. */
9233
0
    case META_COMMIT_ARG:
9234
0
    case META_PRUNE_ARG:
9235
0
    case META_SKIP_ARG:
9236
0
    case META_THEN_ARG:
9237
0
    pptr += pptr[1];
9238
0
    break;
9239
9240
    /* These are the "active" items in this loop. */
9241
9242
0
    case META_CLASS_END:
9243
0
    if (skiptype == PSKIP_CLASS) return pptr;
9244
0
    break;
9245
9246
0
    case META_ATOMIC:
9247
0
    case META_CAPTURE:
9248
0
    case META_COND_ASSERT:
9249
0
    case META_COND_DEFINE:
9250
0
    case META_COND_NAME:
9251
0
    case META_COND_NUMBER:
9252
0
    case META_COND_RNAME:
9253
0
    case META_COND_RNUMBER:
9254
0
    case META_COND_VERSION:
9255
0
    case META_SCS:
9256
0
    case META_LOOKAHEAD:
9257
0
    case META_LOOKAHEADNOT:
9258
0
    case META_LOOKAHEAD_NA:
9259
0
    case META_LOOKBEHIND:
9260
0
    case META_LOOKBEHINDNOT:
9261
0
    case META_LOOKBEHIND_NA:
9262
0
    case META_NOCAPTURE:
9263
0
    case META_SCRIPT_RUN:
9264
0
    nestlevel++;
9265
0
    break;
9266
9267
0
    case META_ALT:
9268
0
    if (nestlevel == 0 && skiptype == PSKIP_ALT) return pptr;
9269
0
    break;
9270
9271
0
    case META_KET:
9272
0
    if (nestlevel == 0) return pptr;
9273
0
    nestlevel--;
9274
0
    break;
9275
0
    }
9276
9277
  /* The extra data item length for each meta is in a table. */
9278
9279
0
  meta = (meta >> 16) & 0x7fff;
9280
0
  if (meta >= sizeof(meta_extra_lengths)) return NULL;
9281
0
  pptr += meta_extra_lengths[meta];
9282
0
  }
9283
9284
0
PCRE2_UNREACHABLE(); /* Control never reaches here */
9285
0
}
9286
9287
9288
9289
/*************************************************
9290
*       Find length of a parsed group            *
9291
*************************************************/
9292
9293
/* This is called for nested groups within a branch of a lookbehind whose
9294
length is being computed. On entry, the pointer must be at the first element
9295
after the group initializing code. On exit it points to OP_KET. Caching is used
9296
to improve processing speed when the same capturing group occurs many times.
9297
9298
Arguments:
9299
  pptrptr     pointer to pointer in the parsed pattern
9300
  minptr      where to return the minimum length
9301
  isinline    FALSE if a reference or recursion; TRUE for inline group
9302
  errcodeptr  pointer to the errorcode
9303
  lcptr       pointer to the loop counter
9304
  group       number of captured group or -1 for a non-capturing group
9305
  recurses    chain of recurse_check to catch mutual recursion
9306
  cb          pointer to the compile data
9307
9308
Returns:      the maximum group length or a negative number
9309
*/
9310
9311
static int
9312
get_grouplength(uint32_t **pptrptr, int *minptr, BOOL isinline, int *errcodeptr,
9313
  int *lcptr, int group, parsed_recurse_check *recurses, compile_block *cb)
9314
0
{
9315
0
uint32_t *gi = cb->groupinfo + 2 * group;
9316
0
int branchlength, branchminlength;
9317
0
int grouplength = -1;
9318
0
int groupminlength = INT_MAX;
9319
9320
/* The cache can be used only if there is no possibility of there being two
9321
groups with the same number. We do not need to set the end pointer for a group
9322
that is being processed as a back reference or recursion, but we must do so for
9323
an inline group. */
9324
9325
0
if (group > 0 && (cb->external_flags & PCRE2_DUPCAPUSED) == 0)
9326
0
  {
9327
0
  uint32_t groupinfo = gi[0];
9328
0
  if ((groupinfo & GI_NOT_FIXED_LENGTH) != 0) return -1;
9329
0
  if ((groupinfo & GI_SET_FIXED_LENGTH) != 0)
9330
0
    {
9331
0
    if (isinline) *pptrptr = parsed_skip(*pptrptr, PSKIP_KET);
9332
0
    *minptr = gi[1];
9333
0
    return groupinfo & GI_FIXED_LENGTH_MASK;
9334
0
    }
9335
0
  }
9336
9337
/* Scan the group. In this case we find the end pointer of necessity. */
9338
9339
0
for(;;)
9340
0
  {
9341
0
  branchlength = get_branchlength(pptrptr, &branchminlength, errcodeptr, lcptr,
9342
0
    recurses, cb);
9343
0
  if (branchlength < 0) goto ISNOTFIXED;
9344
0
  if (branchlength > grouplength) grouplength = branchlength;
9345
0
  if (branchminlength < groupminlength) groupminlength = branchminlength;
9346
0
  if (**pptrptr == META_KET) break;
9347
0
  *pptrptr += 1;   /* Skip META_ALT */
9348
0
  }
9349
9350
0
if (group > 0)
9351
0
  {
9352
0
  gi[0] |= (uint32_t)(GI_SET_FIXED_LENGTH | grouplength);
9353
0
  gi[1] = groupminlength;
9354
0
  }
9355
9356
0
*minptr = groupminlength;
9357
0
return grouplength;
9358
9359
0
ISNOTFIXED:
9360
0
if (group > 0) gi[0] |= GI_NOT_FIXED_LENGTH;
9361
0
return -1;
9362
0
}
9363
9364
9365
9366
/*************************************************
9367
*        Find length of a parsed branch          *
9368
*************************************************/
9369
9370
/* Return fixed maximum and minimum lengths for a branch in a lookbehind,
9371
giving an error if the length is not limited. On entry, *pptrptr points to the
9372
first element inside the branch. On exit it is set to point to the ALT or KET.
9373
9374
Arguments:
9375
  pptrptr     pointer to pointer in the parsed pattern
9376
  minptr      where to return the minimum length
9377
  errcodeptr  pointer to error code
9378
  lcptr       pointer to loop counter
9379
  recurses    chain of recurse_check to catch mutual recursion
9380
  cb          pointer to compile block
9381
9382
Returns:      the maximum length, or a negative value on error
9383
*/
9384
9385
static int
9386
get_branchlength(uint32_t **pptrptr, int *minptr, int *errcodeptr, int *lcptr,
9387
  parsed_recurse_check *recurses, compile_block *cb)
9388
454
{
9389
454
int branchlength = 0;
9390
454
int branchminlength = 0;
9391
454
int grouplength, groupminlength;
9392
454
uint32_t lastitemlength = 0;
9393
454
uint32_t lastitemminlength = 0;
9394
454
uint32_t *pptr = *pptrptr;
9395
454
PCRE2_SIZE offset;
9396
454
parsed_recurse_check this_recurse;
9397
9398
/* A large and/or complex regex can take too long to process. This can happen
9399
more often when (?| groups are present in the pattern because their length
9400
cannot be cached. */
9401
9402
454
if ((*lcptr)++ > 2000)
9403
0
  {
9404
0
  *errcodeptr = ERR35;  /* Lookbehind is too complicated */
9405
0
  return -1;
9406
0
  }
9407
9408
/* Scan the branch, accumulating the length. */
9409
9410
9.75k
for (;; pptr++)
9411
10.2k
  {
9412
10.2k
  parsed_recurse_check *r;
9413
10.2k
  uint32_t *gptr, *gptrend;
9414
10.2k
  uint32_t escape;
9415
10.2k
  uint32_t min, max;
9416
10.2k
  uint32_t group = 0;
9417
10.2k
  uint32_t itemlength = 0;
9418
10.2k
  uint32_t itemminlength = 0;
9419
9420
10.2k
  if (*pptr < META_END)
9421
9.56k
    {
9422
9.56k
    itemlength = itemminlength = 1;
9423
9.56k
    }
9424
9425
647
  else switch (META_CODE(*pptr))
9426
647
    {
9427
88
    case META_KET:
9428
451
    case META_ALT:
9429
451
    goto EXIT;
9430
9431
    /* (*ACCEPT) and (*FAIL) terminate the branch, but we must skip to the
9432
    actual termination. */
9433
9434
0
    case META_ACCEPT:
9435
0
    case META_FAIL:
9436
0
    pptr = parsed_skip(pptr, PSKIP_ALT);
9437
0
    if (pptr == NULL) goto PARSED_SKIP_FAILED;
9438
0
    goto EXIT;
9439
9440
0
    case META_MARK:
9441
0
    case META_COMMIT_ARG:
9442
0
    case META_PRUNE_ARG:
9443
0
    case META_SKIP_ARG:
9444
0
    case META_THEN_ARG:
9445
0
    pptr += pptr[1] + 1;
9446
0
    break;
9447
9448
0
    case META_CIRCUMFLEX:
9449
0
    case META_COMMIT:
9450
83
    case META_DOLLAR:
9451
83
    case META_PRUNE:
9452
83
    case META_SKIP:
9453
83
    case META_THEN:
9454
83
    break;
9455
9456
0
    case META_OPTIONS:
9457
0
    pptr += 2;
9458
0
    break;
9459
9460
0
    case META_BIGVALUE:
9461
0
    itemlength = itemminlength = 1;
9462
0
    pptr += 1;
9463
0
    break;
9464
9465
0
    case META_CLASS:
9466
0
    case META_CLASS_NOT:
9467
0
    itemlength = itemminlength = 1;
9468
0
    pptr = parsed_skip(pptr, PSKIP_CLASS);
9469
0
    if (pptr == NULL) goto PARSED_SKIP_FAILED;
9470
0
    break;
9471
9472
0
    case META_CLASS_EMPTY_NOT:
9473
43
    case META_DOT:
9474
43
    itemlength = itemminlength = 1;
9475
43
    break;
9476
9477
0
    case META_CALLOUT_NUMBER:
9478
0
    pptr += 3;
9479
0
    break;
9480
9481
0
    case META_CALLOUT_STRING:
9482
0
    pptr += 3 + SIZEOFFSET;
9483
0
    break;
9484
9485
    /* Only some escapes consume a character. Of those, \R can match one or two
9486
    characters, but \X is never allowed because it matches an unknown number of
9487
    characters. \C is allowed only in 32-bit and non-UTF 8/16-bit modes. */
9488
9489
0
    case META_ESCAPE:
9490
0
    escape = META_DATA(*pptr);
9491
0
    if (escape == ESC_X) return -1;
9492
0
    if (escape == ESC_R)
9493
0
      {
9494
0
      itemminlength = 1;
9495
0
      itemlength = 2;
9496
0
      }
9497
0
    else if (escape > ESC_b && escape < ESC_Z)
9498
0
      {
9499
0
#if PCRE2_CODE_UNIT_WIDTH != 32
9500
0
      if ((cb->external_options & PCRE2_UTF) != 0 && escape == ESC_C)
9501
0
        {
9502
0
        *errcodeptr = ERR36;
9503
0
        return -1;
9504
0
        }
9505
0
#endif
9506
0
      itemlength = itemminlength = 1;
9507
0
      if (escape == ESC_p || escape == ESC_P) pptr++;  /* Skip prop data */
9508
0
      }
9509
0
    break;
9510
9511
    /* Lookaheads do not contribute to the length of this branch, but they may
9512
    contain lookbehinds within them whose lengths need to be set. */
9513
9514
0
    case META_LOOKAHEAD:
9515
0
    case META_LOOKAHEADNOT:
9516
0
    case META_LOOKAHEAD_NA:
9517
0
    case META_SCS:
9518
0
    *errcodeptr = check_lookbehinds(pptr + 1, &pptr, recurses, cb, lcptr);
9519
0
    if (*errcodeptr != 0) return -1;
9520
9521
    /* Ignore any qualifiers that follow a lookahead assertion. */
9522
9523
0
    switch (pptr[1])
9524
0
      {
9525
0
      case META_ASTERISK:
9526
0
      case META_ASTERISK_PLUS:
9527
0
      case META_ASTERISK_QUERY:
9528
0
      case META_PLUS:
9529
0
      case META_PLUS_PLUS:
9530
0
      case META_PLUS_QUERY:
9531
0
      case META_QUERY:
9532
0
      case META_QUERY_PLUS:
9533
0
      case META_QUERY_QUERY:
9534
0
      pptr++;
9535
0
      break;
9536
9537
0
      case META_MINMAX:
9538
0
      case META_MINMAX_PLUS:
9539
0
      case META_MINMAX_QUERY:
9540
0
      pptr += 3;
9541
0
      break;
9542
9543
0
      default:
9544
0
      break;
9545
0
      }
9546
0
    break;
9547
9548
    /* A nested lookbehind does not contribute any length to this lookbehind,
9549
    but must itself be checked and have its lengths set. Note that
9550
    set_lookbehind_lengths() updates pptr, leaving it pointing to the final ket
9551
    of the group, so no need to update it here. */
9552
9553
0
    case META_LOOKBEHIND:
9554
0
    case META_LOOKBEHINDNOT:
9555
0
    case META_LOOKBEHIND_NA:
9556
0
    if (!set_lookbehind_lengths(&pptr, errcodeptr, lcptr, recurses, cb))
9557
0
      return -1;
9558
0
    break;
9559
9560
    /* Back references and recursions are handled by very similar code. At this
9561
    stage, the names generated in the parsing pass are available, but the main
9562
    name table has not yet been created. So for the named varieties, scan the
9563
    list of names in order to get the number of the first one in the pattern,
9564
    and whether or not this name is duplicated. */
9565
9566
0
    case META_BACKREF_BYNAME:
9567
0
    if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0)
9568
0
      goto ISNOTFIXED;
9569
    /* Fall through */
9570
9571
0
    case META_RECURSE_BYNAME:
9572
0
      {
9573
0
      int i;
9574
0
      PCRE2_SPTR name;
9575
0
      BOOL is_dupname = FALSE;
9576
0
      named_group *ng = cb->named_groups;
9577
0
      uint32_t meta_code = META_CODE(*pptr);
9578
0
      uint32_t length = *(++pptr);
9579
9580
0
      GETPLUSOFFSET(offset, pptr);
9581
0
      name = cb->start_pattern + offset;
9582
0
      for (i = 0; i < cb->names_found; i++, ng++)
9583
0
        {
9584
0
        if (length == ng->length && PRIV(strncmp)(name, ng->name, length) == 0)
9585
0
          {
9586
0
          group = ng->number;
9587
0
          is_dupname = ng->isdup;
9588
0
          break;
9589
0
          }
9590
0
        }
9591
9592
0
      if (group == 0)
9593
0
        {
9594
0
        *errcodeptr = ERR15;  /* Non-existent subpattern */
9595
0
        cb->erroroffset = offset;
9596
0
        return -1;
9597
0
        }
9598
9599
      /* A numerical back reference can be fixed length if duplicate capturing
9600
      groups are not being used. A non-duplicate named back reference can also
9601
      be handled. */
9602
9603
0
      if (meta_code == META_RECURSE_BYNAME ||
9604
0
          (!is_dupname && (cb->external_flags & PCRE2_DUPCAPUSED) == 0))
9605
0
        goto RECURSE_OR_BACKREF_LENGTH;  /* Handle as a numbered version. */
9606
0
      }
9607
0
    goto ISNOTFIXED;                     /* Duplicate name or number */
9608
9609
    /* The offset values for back references < 10 are in a separate vector
9610
    because otherwise they would use more than two parsed pattern elements on
9611
    64-bit systems. */
9612
9613
0
    case META_BACKREF:
9614
0
    if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0 ||
9615
0
        (cb->external_flags & PCRE2_DUPCAPUSED) != 0)
9616
0
      goto ISNOTFIXED;
9617
0
    group = META_DATA(*pptr);
9618
0
    if (group < 10)
9619
0
      {
9620
0
      offset = cb->small_ref_offset[group];
9621
0
      goto RECURSE_OR_BACKREF_LENGTH;
9622
0
      }
9623
9624
    /* Fall through */
9625
    /* For groups >= 10 - picking up group twice does no harm. */
9626
9627
    /* A true recursion implies not fixed length, but a subroutine call may
9628
    be OK. Back reference "recursions" are also failed. */
9629
9630
2
    case META_RECURSE:
9631
2
    group = META_DATA(*pptr);
9632
2
    GETPLUSOFFSET(offset, pptr);
9633
9634
2
    RECURSE_OR_BACKREF_LENGTH:
9635
2
    if (group > cb->bracount)
9636
2
      {
9637
2
      cb->erroroffset = offset;
9638
2
      *errcodeptr = ERR15;  /* Non-existent subpattern */
9639
2
      return -1;
9640
2
      }
9641
0
    if (group == 0) goto ISNOTFIXED;  /* Local recursion */
9642
0
    for (gptr = cb->parsed_pattern; *gptr != META_END; gptr++)
9643
0
      {
9644
0
      if (META_CODE(*gptr) == META_BIGVALUE) gptr++;
9645
0
        else if (*gptr == (META_CAPTURE | group)) break;
9646
0
      }
9647
9648
    /* We must start the search for the end of the group at the first meta code
9649
    inside the group. Otherwise it will be treated as an enclosed group. */
9650
9651
0
    gptrend = parsed_skip(gptr + 1, PSKIP_KET);
9652
0
    if (gptrend == NULL) goto PARSED_SKIP_FAILED;
9653
0
    if (pptr > gptr && pptr < gptrend) goto ISNOTFIXED;  /* Local recursion */
9654
0
    for (r = recurses; r != NULL; r = r->prev) if (r->groupptr == gptr) break;
9655
0
    if (r != NULL) goto ISNOTFIXED;   /* Mutual recursion */
9656
0
    this_recurse.prev = recurses;
9657
0
    this_recurse.groupptr = gptr;
9658
9659
    /* We do not need to know the position of the end of the group, that is,
9660
    gptr is not used after the call to get_grouplength(). Setting the third
9661
    argument FALSE stops it scanning for the end when the length can be found
9662
    in the cache. */
9663
9664
0
    gptr++;
9665
0
    grouplength = get_grouplength(&gptr, &groupminlength, FALSE, errcodeptr,
9666
0
      lcptr, group, &this_recurse, cb);
9667
0
    if (grouplength < 0)
9668
0
      {
9669
0
      if (*errcodeptr == 0) goto ISNOTFIXED;
9670
0
      return -1;  /* Error already set */
9671
0
      }
9672
0
    itemlength = grouplength;
9673
0
    itemminlength = groupminlength;
9674
0
    break;
9675
9676
    /* A (DEFINE) group is never obeyed inline and so it does not contribute to
9677
    the length of this branch. Skip from the following item to the next
9678
    unpaired ket. */
9679
9680
0
    case META_COND_DEFINE:
9681
0
    pptr = parsed_skip(pptr + 1, PSKIP_KET);
9682
0
    break;
9683
9684
    /* Check other nested groups - advance past the initial data for each type
9685
    and then seek a fixed length with get_grouplength(). */
9686
9687
0
    case META_COND_NAME:
9688
0
    case META_COND_NUMBER:
9689
0
    case META_COND_RNAME:
9690
0
    case META_COND_RNUMBER:
9691
0
    pptr += 2 + SIZEOFFSET;
9692
0
    goto CHECK_GROUP;
9693
9694
0
    case META_COND_ASSERT:
9695
0
    pptr += 1;
9696
0
    goto CHECK_GROUP;
9697
9698
0
    case META_COND_VERSION:
9699
0
    pptr += 4;
9700
0
    goto CHECK_GROUP;
9701
9702
0
    case META_CAPTURE:
9703
0
    group = META_DATA(*pptr);
9704
    /* Fall through */
9705
9706
0
    case META_ATOMIC:
9707
0
    case META_NOCAPTURE:
9708
0
    case META_SCRIPT_RUN:
9709
0
    pptr++;
9710
0
    CHECK_GROUP:
9711
0
    grouplength = get_grouplength(&pptr, &groupminlength, TRUE, errcodeptr,
9712
0
      lcptr, group, recurses, cb);
9713
0
    if (grouplength < 0) return -1;
9714
0
    itemlength = grouplength;
9715
0
    itemminlength = groupminlength;
9716
0
    break;
9717
9718
54
    case META_QUERY:
9719
54
    case META_QUERY_PLUS:
9720
67
    case META_QUERY_QUERY:
9721
67
    min = 0;
9722
67
    max = 1;
9723
67
    goto REPETITION;
9724
9725
    /* Exact repetition is OK; variable repetition is not. A repetition of zero
9726
    must subtract the length that has already been added. */
9727
9728
0
    case META_MINMAX:
9729
0
    case META_MINMAX_PLUS:
9730
0
    case META_MINMAX_QUERY:
9731
0
    min = pptr[1];
9732
0
    max = pptr[2];
9733
0
    pptr += 2;
9734
9735
67
    REPETITION:
9736
67
    if (max != REPEAT_UNLIMITED)
9737
67
      {
9738
67
      if (lastitemlength != 0 &&  /* Should not occur, but just in case */
9739
67
          max != 0 &&
9740
67
          (INT_MAX - branchlength)/lastitemlength < max - 1)
9741
0
        {
9742
0
        *errcodeptr = ERR87;  /* Integer overflow; lookbehind too big */
9743
0
        return -1;
9744
0
        }
9745
67
      if (min == 0) branchminlength -= lastitemminlength;
9746
0
        else itemminlength = (min - 1) * lastitemminlength;
9747
67
      if (max == 0) branchlength -= lastitemlength;
9748
67
        else itemlength = (max - 1) * lastitemlength;
9749
67
      break;
9750
67
      }
9751
    /* Fall through */
9752
9753
    /* Any other item means this branch does not have a fixed length. */
9754
9755
1
    default:
9756
1
    ISNOTFIXED:
9757
1
    *errcodeptr = ERR25;   /* Not fixed length */
9758
1
    return -1;
9759
647
    }
9760
9761
  /* Add the item length to the branchlength, checking for integer overflow and
9762
  for the branch length exceeding the overall limit. Later, if there is at
9763
  least one variable-length branch in the group, there is a test for the
9764
  (smaller) variable-length branch length limit. */
9765
9766
9.75k
  if (INT_MAX - branchlength < (int)itemlength ||
9767
9.75k
      (branchlength += itemlength) > LOOKBEHIND_MAX)
9768
0
    {
9769
0
    *errcodeptr = ERR87;
9770
0
    return -1;
9771
0
    }
9772
9773
9.75k
  branchminlength += itemminlength;
9774
9775
  /* Save this item length for use if the next item is a quantifier. */
9776
9777
9.75k
  lastitemlength = itemlength;
9778
9.75k
  lastitemminlength = itemminlength;
9779
9.75k
  }
9780
9781
451
EXIT:
9782
451
*pptrptr = pptr;
9783
451
*minptr = branchminlength;
9784
451
return branchlength;
9785
9786
0
PARSED_SKIP_FAILED:
9787
0
PCRE2_DEBUG_UNREACHABLE();
9788
0
*errcodeptr = ERR90;  /* Unhandled META code - internal error */
9789
0
return -1;
9790
454
}
9791
9792
9793
9794
/*************************************************
9795
*        Set lengths in a lookbehind             *
9796
*************************************************/
9797
9798
/* This function is called for each lookbehind, to set the lengths in its
9799
branches. An error occurs if any branch does not have a limited maximum length
9800
that is less than the limit (65535). On exit, the pointer must be left on the
9801
final ket.
9802
9803
The function also maintains the max_lookbehind value. Any lookbehind branch
9804
that contains a nested lookbehind may actually look further back than the
9805
length of the branch. The additional amount is passed back from
9806
get_branchlength() as an "extra" value.
9807
9808
Arguments:
9809
  pptrptr     pointer to pointer in the parsed pattern
9810
  errcodeptr  pointer to error code
9811
  lcptr       pointer to loop counter
9812
  recurses    chain of recurse_check to catch mutual recursion
9813
  cb          pointer to compile block
9814
9815
Returns:      TRUE if all is well
9816
              FALSE otherwise, with error code and offset set
9817
*/
9818
9819
static BOOL
9820
set_lookbehind_lengths(uint32_t **pptrptr, int *errcodeptr, int *lcptr,
9821
  parsed_recurse_check *recurses, compile_block *cb)
9822
91
{
9823
91
PCRE2_SIZE offset;
9824
91
uint32_t *bptr = *pptrptr;
9825
91
uint32_t *gbptr = bptr;
9826
91
int maxlength = 0;
9827
91
int minlength = INT_MAX;
9828
91
BOOL variable = FALSE;
9829
9830
91
READPLUSOFFSET(offset, bptr);  /* Offset for error messages */
9831
91
*pptrptr += SIZEOFFSET;
9832
9833
/* Each branch can have a different maximum length, but we can keep only a
9834
single minimum for the whole group, because there's nowhere to save individual
9835
values in the META_ALT item. */
9836
9837
91
do
9838
454
  {
9839
454
  int branchlength, branchminlength;
9840
9841
454
  *pptrptr += 1;
9842
454
  branchlength = get_branchlength(pptrptr, &branchminlength, errcodeptr, lcptr,
9843
454
    recurses, cb);
9844
9845
454
  if (branchlength < 0)
9846
3
    {
9847
    /* The errorcode and offset may already be set from a nested lookbehind. */
9848
3
    if (*errcodeptr == 0) *errcodeptr = ERR25;
9849
3
    if (cb->erroroffset == PCRE2_UNSET) cb->erroroffset = offset;
9850
3
    return FALSE;
9851
3
    }
9852
9853
451
  if (branchlength != branchminlength) variable = TRUE;
9854
451
  if (branchminlength < minlength) minlength = branchminlength;
9855
451
  if (branchlength > maxlength) maxlength = branchlength;
9856
451
  if (branchlength > cb->max_lookbehind) cb->max_lookbehind = branchlength;
9857
451
  *bptr |= branchlength;  /* branchlength never more than 65535 */
9858
451
  bptr = *pptrptr;
9859
451
  }
9860
451
while (META_CODE(*bptr) == META_ALT);
9861
9862
/* If any branch is of variable length, the whole lookbehind is of variable
9863
length. If the maximum length of any branch exceeds the maximum for variable
9864
lookbehinds, give an error. Otherwise, the minimum length is set in the word
9865
that follows the original group META value. For a fixed-length lookbehind, this
9866
is set to LOOKBEHIND_MAX, to indicate that each branch is of a fixed (but
9867
possibly different) length. */
9868
9869
88
if (variable)
9870
53
  {
9871
53
  gbptr[1] = minlength;
9872
53
  if ((PCRE2_SIZE)maxlength > cb->max_varlookbehind)
9873
0
    {
9874
0
    *errcodeptr = ERR100;
9875
0
    cb->erroroffset = offset;
9876
0
    return FALSE;
9877
0
    }
9878
53
  }
9879
35
else gbptr[1] = LOOKBEHIND_MAX;
9880
9881
88
return TRUE;
9882
88
}
9883
9884
9885
9886
/*************************************************
9887
*         Check parsed pattern lookbehinds       *
9888
*************************************************/
9889
9890
/* This function is called at the end of parsing a pattern if any lookbehinds
9891
were encountered. It scans the parsed pattern for them, calling
9892
set_lookbehind_lengths() for each one. At the start, the errorcode is zero and
9893
the error offset is marked unset. This enables the functions above not to
9894
override settings from deeper nestings.
9895
9896
This function is called recursively from get_branchlength() for lookaheads in
9897
order to process any lookbehinds that they may contain. It stops when it hits a
9898
non-nested closing parenthesis in this case, returning a pointer to it.
9899
9900
Arguments
9901
  pptr      points to where to start (start of pattern or start of lookahead)
9902
  retptr    if not NULL, return the ket pointer here
9903
  recurses  chain of recurse_check to catch mutual recursion
9904
  cb        points to the compile block
9905
  lcptr     points to loop counter
9906
9907
Returns:    0 on success, or an errorcode (cb->erroroffset will be set)
9908
*/
9909
9910
static int
9911
check_lookbehinds(uint32_t *pptr, uint32_t **retptr,
9912
  parsed_recurse_check *recurses, compile_block *cb, int *lcptr)
9913
21
{
9914
21
int errorcode = 0;
9915
21
int nestlevel = 0;
9916
9917
21
cb->erroroffset = PCRE2_UNSET;
9918
9919
54.2k
for (; *pptr != META_END; pptr++)
9920
54.1k
  {
9921
54.1k
  if (*pptr < META_END) continue;  /* Literal */
9922
9923
5.09k
  switch (META_CODE(*pptr))
9924
5.09k
    {
9925
0
    default:
9926
9927
    /* The following erroroffset is a bogus but safe value. This branch should
9928
    be avoided by providing a proper implementation for all supported cases
9929
    below. */
9930
9931
0
    PCRE2_DEBUG_UNREACHABLE();
9932
0
    cb->erroroffset = 0;
9933
0
    return ERR70;  /* Unrecognized meta code */
9934
9935
8
    case META_ESCAPE:
9936
8
    if (*pptr - META_ESCAPE == ESC_P || *pptr - META_ESCAPE == ESC_p)
9937
0
      pptr += 1;    /* Skip prop data */
9938
8
    break;
9939
9940
538
    case META_KET:
9941
538
    if (--nestlevel < 0)
9942
0
      {
9943
0
      if (retptr != NULL) *retptr = pptr;
9944
0
      return 0;
9945
0
      }
9946
538
    break;
9947
9948
538
    case META_ATOMIC:
9949
351
    case META_CAPTURE:
9950
351
    case META_COND_ASSERT:
9951
351
    case META_SCS:
9952
363
    case META_LOOKAHEAD:
9953
370
    case META_LOOKAHEADNOT:
9954
370
    case META_LOOKAHEAD_NA:
9955
538
    case META_NOCAPTURE:
9956
538
    case META_SCRIPT_RUN:
9957
538
    nestlevel++;
9958
538
    break;
9959
9960
0
    case META_ACCEPT:
9961
2.36k
    case META_ALT:
9962
2.42k
    case META_ASTERISK:
9963
2.42k
    case META_ASTERISK_PLUS:
9964
2.42k
    case META_ASTERISK_QUERY:
9965
2.42k
    case META_BACKREF:
9966
2.42k
    case META_CIRCUMFLEX:
9967
2.43k
    case META_CLASS:
9968
2.43k
    case META_CLASS_EMPTY:
9969
2.43k
    case META_CLASS_EMPTY_NOT:
9970
2.44k
    case META_CLASS_END:
9971
2.44k
    case META_CLASS_NOT:
9972
2.44k
    case META_COMMIT:
9973
2.76k
    case META_DOLLAR:
9974
3.01k
    case META_DOT:
9975
3.01k
    case META_FAIL:
9976
3.43k
    case META_PLUS:
9977
3.45k
    case META_PLUS_PLUS:
9978
3.45k
    case META_PLUS_QUERY:
9979
3.45k
    case META_PRUNE:
9980
3.79k
    case META_QUERY:
9981
3.81k
    case META_QUERY_PLUS:
9982
3.91k
    case META_QUERY_QUERY:
9983
3.91k
    case META_RANGE_ESCAPED:
9984
3.91k
    case META_RANGE_LITERAL:
9985
3.91k
    case META_SKIP:
9986
3.91k
    case META_THEN:
9987
3.91k
    break;
9988
9989
0
    case META_OFFSET:
9990
3
    case META_RECURSE:
9991
3
    pptr += SIZEOFFSET;
9992
3
    break;
9993
9994
0
    case META_BACKREF_BYNAME:
9995
0
    case META_RECURSE_BYNAME:
9996
0
    pptr += 1 + SIZEOFFSET;
9997
0
    break;
9998
9999
0
    case META_COND_DEFINE:
10000
0
    pptr += SIZEOFFSET;
10001
0
    nestlevel++;
10002
0
    break;
10003
10004
0
    case META_COND_NAME:
10005
0
    case META_COND_NUMBER:
10006
0
    case META_COND_RNAME:
10007
0
    case META_COND_RNUMBER:
10008
0
    pptr += 1 + SIZEOFFSET;
10009
0
    nestlevel++;
10010
0
    break;
10011
10012
0
    case META_COND_VERSION:
10013
0
    pptr += 3;
10014
0
    nestlevel++;
10015
0
    break;
10016
10017
0
    case META_CALLOUT_STRING:
10018
0
    pptr += 3 + SIZEOFFSET;
10019
0
    break;
10020
10021
0
    case META_BIGVALUE:
10022
0
    case META_POSIX:
10023
0
    case META_POSIX_NEG:
10024
0
    case META_SCS_NAME:
10025
0
    case META_SCS_NUMBER:
10026
0
    pptr += 1;
10027
0
    break;
10028
10029
0
    case META_MINMAX:
10030
0
    case META_MINMAX_QUERY:
10031
0
    case META_MINMAX_PLUS:
10032
0
    case META_OPTIONS:
10033
0
    pptr += 2;
10034
0
    break;
10035
10036
0
    case META_CALLOUT_NUMBER:
10037
0
    pptr += 3;
10038
0
    break;
10039
10040
0
    case META_MARK:
10041
0
    case META_COMMIT_ARG:
10042
0
    case META_PRUNE_ARG:
10043
0
    case META_SKIP_ARG:
10044
0
    case META_THEN_ARG:
10045
0
    pptr += 1 + pptr[1];
10046
0
    break;
10047
10048
    /* Note that set_lookbehind_lengths() updates pptr, leaving it pointing to
10049
    the final ket of the group, so no need to update it here. */
10050
10051
60
    case META_LOOKBEHIND:
10052
60
    case META_LOOKBEHINDNOT:
10053
91
    case META_LOOKBEHIND_NA:
10054
91
    if (!set_lookbehind_lengths(&pptr, &errorcode, lcptr, recurses, cb))
10055
3
      return errorcode;
10056
88
    break;
10057
5.09k
    }
10058
5.09k
  }
10059
10060
18
return 0;
10061
21
}
10062
10063
10064
10065
/*************************************************
10066
*     External function to compile a pattern     *
10067
*************************************************/
10068
10069
/* This function reads a regular expression in the form of a string and returns
10070
a pointer to a block of store holding a compiled version of the expression.
10071
10072
Arguments:
10073
  pattern       the regular expression
10074
  patlen        the length of the pattern, or PCRE2_ZERO_TERMINATED
10075
  options       option bits
10076
  errorptr      pointer to errorcode
10077
  erroroffset   pointer to error offset
10078
  ccontext      points to a compile context or is NULL
10079
10080
Returns:        pointer to compiled data block, or NULL on error,
10081
                with errorcode and erroroffset set
10082
*/
10083
10084
PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
10085
pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options,
10086
   int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext)
10087
2.39k
{
10088
2.39k
BOOL utf;                             /* Set TRUE for UTF mode */
10089
2.39k
BOOL ucp;                             /* Set TRUE for UCP mode */
10090
2.39k
BOOL has_lookbehind = FALSE;          /* Set TRUE if a lookbehind is found */
10091
2.39k
BOOL zero_terminated;                 /* Set TRUE for zero-terminated pattern */
10092
2.39k
pcre2_real_code *re = NULL;           /* What we will return */
10093
2.39k
compile_block cb;                     /* "Static" compile-time data */
10094
2.39k
const uint8_t *tables;                /* Char tables base pointer */
10095
10096
2.39k
PCRE2_UCHAR *code;                    /* Current pointer in compiled code */
10097
2.39k
PCRE2_UCHAR * codestart;              /* Start of compiled code */
10098
2.39k
PCRE2_SPTR ptr;                       /* Current pointer in pattern */
10099
2.39k
uint32_t *pptr;                       /* Current pointer in parsed pattern */
10100
10101
2.39k
PCRE2_SIZE length = 1;                /* Allow for final END opcode */
10102
2.39k
PCRE2_SIZE usedlength;                /* Actual length used */
10103
2.39k
PCRE2_SIZE re_blocksize;              /* Size of memory block */
10104
2.39k
PCRE2_SIZE parsed_size_needed;        /* Needed for parsed pattern */
10105
10106
2.39k
uint32_t firstcuflags, reqcuflags;    /* Type of first/req code unit */
10107
2.39k
uint32_t firstcu, reqcu;              /* Value of first/req code unit */
10108
2.39k
uint32_t setflags = 0;                /* NL and BSR set flags */
10109
2.39k
uint32_t xoptions;                    /* Flags from context, modified */
10110
10111
2.39k
uint32_t skipatstart;                 /* When checking (*UTF) etc */
10112
2.39k
uint32_t limit_heap  = UINT32_MAX;
10113
2.39k
uint32_t limit_match = UINT32_MAX;    /* Unset match limits */
10114
2.39k
uint32_t limit_depth = UINT32_MAX;
10115
10116
2.39k
int newline = 0;                      /* Unset; can be set by the pattern */
10117
2.39k
int bsr = 0;                          /* Unset; can be set by the pattern */
10118
2.39k
int errorcode = 0;                    /* Initialize to avoid compiler warn */
10119
2.39k
int regexrc;                          /* Return from compile */
10120
10121
2.39k
uint32_t i;                           /* Local loop counter */
10122
10123
/* Enable all optimizations by default. */
10124
2.39k
uint32_t optim_flags = ccontext != NULL ? ccontext->optimization_flags :
10125
2.39k
                                          PCRE2_OPTIMIZATION_ALL;
10126
10127
/* Comments at the head of this file explain about these variables. */
10128
10129
2.39k
uint32_t stack_groupinfo[GROUPINFO_DEFAULT_SIZE];
10130
2.39k
uint32_t stack_parsed_pattern[PARSED_PATTERN_DEFAULT_SIZE];
10131
2.39k
named_group named_groups[NAMED_GROUP_LIST_SIZE];
10132
10133
/* The workspace is used in different ways in the different compiling phases.
10134
It needs to be 16-bit aligned for the preliminary parsing scan. */
10135
10136
2.39k
uint32_t c16workspace[C16_WORK_SIZE];
10137
2.39k
PCRE2_UCHAR *cworkspace = (PCRE2_UCHAR *)c16workspace;
10138
10139
10140
/* -------------- Check arguments and set up the pattern ----------------- */
10141
10142
/* There must be error code and offset pointers. */
10143
10144
2.39k
if (errorptr == NULL || erroroffset == NULL) return NULL;
10145
2.39k
*errorptr = ERR0;
10146
2.39k
*erroroffset = 0;
10147
10148
/* There must be a pattern, but NULL is allowed with zero length. */
10149
10150
2.39k
if (pattern == NULL)
10151
0
  {
10152
0
  if (patlen == 0) pattern = (PCRE2_SPTR)""; else
10153
0
    {
10154
0
    *errorptr = ERR16;
10155
0
    return NULL;
10156
0
    }
10157
0
  }
10158
10159
/* A NULL compile context means "use a default context" */
10160
10161
2.39k
if (ccontext == NULL)
10162
0
  ccontext = (pcre2_compile_context *)(&PRIV(default_compile_context));
10163
10164
/* PCRE2_MATCH_INVALID_UTF implies UTF */
10165
10166
2.39k
if ((options & PCRE2_MATCH_INVALID_UTF) != 0) options |= PCRE2_UTF;
10167
10168
/* Check that all undefined public option bits are zero. */
10169
10170
2.39k
if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0 ||
10171
2.39k
    (ccontext->extra_options & ~PUBLIC_COMPILE_EXTRA_OPTIONS) != 0)
10172
0
  {
10173
0
  *errorptr = ERR17;
10174
0
  return NULL;
10175
0
  }
10176
10177
2.39k
if ((options & PCRE2_LITERAL) != 0 &&
10178
2.39k
    ((options & ~PUBLIC_LITERAL_COMPILE_OPTIONS) != 0 ||
10179
0
     (ccontext->extra_options & ~PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS) != 0))
10180
0
  {
10181
0
  *errorptr = ERR92;
10182
0
  return NULL;
10183
0
  }
10184
10185
/* A zero-terminated pattern is indicated by the special length value
10186
PCRE2_ZERO_TERMINATED. Check for an overlong pattern. */
10187
10188
2.39k
if ((zero_terminated = (patlen == PCRE2_ZERO_TERMINATED)))
10189
0
  patlen = PRIV(strlen)(pattern);
10190
2.39k
(void)zero_terminated; /* Silence compiler; only used if Valgrind enabled */
10191
10192
2.39k
if (patlen > ccontext->max_pattern_length)
10193
0
  {
10194
0
  *errorptr = ERR88;
10195
0
  return NULL;
10196
0
  }
10197
10198
/* Optimization flags in 'options' can override those in the compile context.
10199
This is because some options to disable optimizations were added before the
10200
optimization flags word existed, and we need to continue supporting them
10201
for backwards compatibility. */
10202
10203
2.39k
if ((options & PCRE2_NO_AUTO_POSSESS) != 0)
10204
0
  optim_flags &= ~PCRE2_OPTIM_AUTO_POSSESS;
10205
2.39k
if ((options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
10206
0
  optim_flags &= ~PCRE2_OPTIM_DOTSTAR_ANCHOR;
10207
2.39k
if ((options & PCRE2_NO_START_OPTIMIZE) != 0)
10208
0
  optim_flags &= ~PCRE2_OPTIM_START_OPTIMIZE;
10209
10210
/* From here on, all returns from this function should end up going via the
10211
EXIT label. */
10212
10213
10214
/* ------------ Initialize the "static" compile data -------------- */
10215
10216
2.39k
tables = (ccontext->tables != NULL)? ccontext->tables : PRIV(default_tables);
10217
10218
2.39k
cb.lcc = tables + lcc_offset;          /* Individual */
10219
2.39k
cb.fcc = tables + fcc_offset;          /*   character */
10220
2.39k
cb.cbits = tables + cbits_offset;      /*      tables */
10221
2.39k
cb.ctypes = tables + ctypes_offset;
10222
10223
2.39k
cb.assert_depth = 0;
10224
2.39k
cb.bracount = 0;
10225
2.39k
cb.cx = ccontext;
10226
2.39k
cb.dupnames = FALSE;
10227
2.39k
cb.end_pattern = pattern + patlen;
10228
2.39k
cb.erroroffset = 0;
10229
2.39k
cb.external_flags = 0;
10230
2.39k
cb.external_options = options;
10231
2.39k
cb.groupinfo = stack_groupinfo;
10232
2.39k
cb.had_recurse = FALSE;
10233
2.39k
cb.lastcapture = 0;
10234
2.39k
cb.max_lookbehind = 0;                               /* Max encountered */
10235
2.39k
cb.max_varlookbehind = ccontext->max_varlookbehind;  /* Limit */
10236
2.39k
cb.name_entry_size = 0;
10237
2.39k
cb.name_table = NULL;
10238
2.39k
cb.named_groups = named_groups;
10239
2.39k
cb.named_group_list_size = NAMED_GROUP_LIST_SIZE;
10240
2.39k
cb.names_found = 0;
10241
2.39k
cb.parens_depth = 0;
10242
2.39k
cb.parsed_pattern = stack_parsed_pattern;
10243
2.39k
cb.req_varyopt = 0;
10244
2.39k
cb.start_code = cworkspace;
10245
2.39k
cb.start_pattern = pattern;
10246
2.39k
cb.start_workspace = cworkspace;
10247
2.39k
cb.workspace_size = COMPILE_WORK_SIZE;
10248
2.39k
#ifdef SUPPORT_WIDE_CHARS
10249
2.39k
cb.cranges = NULL;
10250
2.39k
cb.next_cranges = NULL;
10251
2.39k
cb.char_lists_size = 0;
10252
2.39k
#endif
10253
10254
/* Maximum back reference and backref bitmap. The bitmap records up to 31 back
10255
references to help in deciding whether (.*) can be treated as anchored or not.
10256
*/
10257
10258
2.39k
cb.top_backref = 0;
10259
2.39k
cb.backref_map = 0;
10260
10261
/* Escape sequences \1 to \9 are always back references, but as they are only
10262
two characters long, only two elements can be used in the parsed_pattern
10263
vector. The first contains the reference, and we'd like to use the second to
10264
record the offset in the pattern, so that forward references to non-existent
10265
groups can be diagnosed later with an offset. However, on 64-bit systems,
10266
PCRE2_SIZE won't fit. Instead, we have a vector of offsets for the first
10267
occurrence of \1 to \9, indexed by the second parsed_pattern value. All other
10268
references have enough space for the offset to be put into the parsed pattern.
10269
*/
10270
10271
26.3k
for (i = 0; i < 10; i++) cb.small_ref_offset[i] = PCRE2_UNSET;
10272
10273
10274
/* --------------- Start looking at the pattern --------------- */
10275
10276
/* Unless PCRE2_LITERAL is set, check for global one-time option settings at
10277
the start of the pattern, and remember the offset to the actual regex. With
10278
valgrind support, make the terminator of a zero-terminated pattern
10279
inaccessible. This catches bugs that would otherwise only show up for
10280
non-zero-terminated patterns. */
10281
10282
#ifdef SUPPORT_VALGRIND
10283
if (zero_terminated) VALGRIND_MAKE_MEM_NOACCESS(pattern + patlen, CU2BYTES(1));
10284
#endif
10285
10286
2.39k
xoptions = ccontext->extra_options;
10287
2.39k
ptr = pattern;
10288
2.39k
skipatstart = 0;
10289
10290
2.39k
if ((options & PCRE2_LITERAL) == 0)
10291
2.39k
  {
10292
2.39k
  while (patlen - skipatstart >= 2 &&
10293
2.39k
         ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
10294
2.39k
         ptr[skipatstart+1] == CHAR_ASTERISK)
10295
0
    {
10296
0
    for (i = 0; i < sizeof(pso_list)/sizeof(pso); i++)
10297
0
      {
10298
0
      const pso *p = pso_list + i;
10299
10300
0
      if (patlen - skipatstart - 2 >= p->length &&
10301
0
          PRIV(strncmp_c8)(ptr + skipatstart + 2, p->name, p->length) == 0)
10302
0
        {
10303
0
        uint32_t c, pp;
10304
10305
0
        skipatstart += p->length + 2;
10306
0
        switch(p->type)
10307
0
          {
10308
0
          case PSO_OPT:
10309
0
          cb.external_options |= p->value;
10310
0
          break;
10311
10312
0
          case PSO_XOPT:
10313
0
          xoptions |= p->value;
10314
0
          break;
10315
10316
0
          case PSO_FLG:
10317
0
          setflags |= p->value;
10318
0
          break;
10319
10320
0
          case PSO_NL:
10321
0
          newline = p->value;
10322
0
          setflags |= PCRE2_NL_SET;
10323
0
          break;
10324
10325
0
          case PSO_BSR:
10326
0
          bsr = p->value;
10327
0
          setflags |= PCRE2_BSR_SET;
10328
0
          break;
10329
10330
0
          case PSO_LIMM:
10331
0
          case PSO_LIMD:
10332
0
          case PSO_LIMH:
10333
0
          c = 0;
10334
0
          pp = skipatstart;
10335
0
          while (pp < patlen && IS_DIGIT(ptr[pp]))
10336
0
            {
10337
0
            if (c > UINT32_MAX / 10 - 1) break;   /* Integer overflow */
10338
0
            c = c*10 + (ptr[pp++] - CHAR_0);
10339
0
            }
10340
0
          if (pp >= patlen || pp == skipatstart || ptr[pp] != CHAR_RIGHT_PARENTHESIS)
10341
0
            {
10342
0
            errorcode = ERR60;
10343
0
            ptr += pp;
10344
0
            goto HAD_EARLY_ERROR;
10345
0
            }
10346
0
          if (p->type == PSO_LIMH) limit_heap = c;
10347
0
            else if (p->type == PSO_LIMM) limit_match = c;
10348
0
            else limit_depth = c;
10349
0
          skipatstart = ++pp;
10350
0
          break;
10351
10352
0
          case PSO_OPTMZ:
10353
0
          optim_flags &= ~(p->value);
10354
10355
          /* For backward compatibility the three original VERBs to disable
10356
          optimizations need to also update the corresponding bit in the
10357
          external options. */
10358
10359
0
          switch(p->value)
10360
0
            {
10361
0
            case PCRE2_OPTIM_AUTO_POSSESS:
10362
0
            cb.external_options |= PCRE2_NO_AUTO_POSSESS;
10363
0
            break;
10364
10365
0
            case PCRE2_OPTIM_DOTSTAR_ANCHOR:
10366
0
            cb.external_options |= PCRE2_NO_DOTSTAR_ANCHOR;
10367
0
            break;
10368
10369
0
            case PCRE2_OPTIM_START_OPTIMIZE:
10370
0
            cb.external_options |= PCRE2_NO_START_OPTIMIZE;
10371
0
            break;
10372
0
            }
10373
10374
0
          break;
10375
10376
0
          default:
10377
          /* All values in the enum need an explicit entry for this switch
10378
          but until a better way to prevent coding mistakes is invented keep
10379
          a catch all that triggers a debug build assert as a failsafe */
10380
0
          PCRE2_DEBUG_UNREACHABLE();
10381
0
          }
10382
0
        break;   /* Out of the table scan loop */
10383
0
        }
10384
0
      }
10385
0
    if (i >= sizeof(pso_list)/sizeof(pso)) break;   /* Out of pso loop */
10386
0
    }
10387
2.39k
    PCRE2_ASSERT(skipatstart <= patlen);
10388
2.39k
  }
10389
10390
/* End of pattern-start options; advance to start of real regex. */
10391
10392
2.39k
ptr += skipatstart;
10393
10394
/* Can't support UTF or UCP if PCRE2 was built without Unicode support. */
10395
10396
#ifndef SUPPORT_UNICODE
10397
if ((cb.external_options & (PCRE2_UTF|PCRE2_UCP)) != 0)
10398
  {
10399
  errorcode = ERR32;
10400
  goto HAD_EARLY_ERROR;
10401
  }
10402
#endif
10403
10404
/* Check UTF. We have the original options in 'options', with that value as
10405
modified by (*UTF) etc in cb.external_options. The extra option
10406
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not permitted in UTF-16 mode because the
10407
surrogate code points cannot be represented in UTF-16. */
10408
10409
2.39k
utf = (cb.external_options & PCRE2_UTF) != 0;
10410
2.39k
if (utf)
10411
552
  {
10412
552
  if ((options & PCRE2_NEVER_UTF) != 0)
10413
0
    {
10414
0
    errorcode = ERR74;
10415
0
    goto HAD_EARLY_ERROR;
10416
0
    }
10417
552
  if ((options & PCRE2_NO_UTF_CHECK) == 0 &&
10418
552
       (errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset)) != 0)
10419
34
    goto HAD_ERROR;  /* Offset was set by valid_utf() */
10420
10421
#if PCRE2_CODE_UNIT_WIDTH == 16
10422
  if ((ccontext->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0)
10423
    {
10424
    errorcode = ERR91;
10425
    goto HAD_EARLY_ERROR;
10426
    }
10427
#endif
10428
552
  }
10429
10430
/* Check UCP lockout. */
10431
10432
2.35k
ucp = (cb.external_options & PCRE2_UCP) != 0;
10433
2.35k
if (ucp && (cb.external_options & PCRE2_NEVER_UCP) != 0)
10434
0
  {
10435
0
  errorcode = ERR75;
10436
0
  goto HAD_EARLY_ERROR;
10437
0
  }
10438
10439
/* PCRE2_EXTRA_TURKISH_CASING checks */
10440
10441
2.35k
if ((xoptions & PCRE2_EXTRA_TURKISH_CASING) != 0)
10442
0
  {
10443
0
  if (!utf && !ucp)
10444
0
    {
10445
0
    errorcode = ERR104;
10446
0
    goto HAD_EARLY_ERROR;
10447
0
    }
10448
10449
0
#if PCRE2_CODE_UNIT_WIDTH == 8
10450
0
  if (!utf)
10451
0
    {
10452
0
    errorcode = ERR105;
10453
0
    goto HAD_EARLY_ERROR;
10454
0
    }
10455
0
#endif
10456
10457
0
  if ((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)
10458
0
    {
10459
0
    errorcode = ERR106;
10460
0
    goto HAD_EARLY_ERROR;
10461
0
    }
10462
0
  }
10463
10464
/* Process the BSR setting. */
10465
10466
2.35k
if (bsr == 0) bsr = ccontext->bsr_convention;
10467
10468
/* Process the newline setting. */
10469
10470
2.35k
if (newline == 0) newline = ccontext->newline_convention;
10471
2.35k
cb.nltype = NLTYPE_FIXED;
10472
2.35k
switch(newline)
10473
2.35k
  {
10474
0
  case PCRE2_NEWLINE_CR:
10475
0
  cb.nllen = 1;
10476
0
  cb.nl[0] = CHAR_CR;
10477
0
  break;
10478
10479
2.35k
  case PCRE2_NEWLINE_LF:
10480
2.35k
  cb.nllen = 1;
10481
2.35k
  cb.nl[0] = CHAR_NL;
10482
2.35k
  break;
10483
10484
0
  case PCRE2_NEWLINE_NUL:
10485
0
  cb.nllen = 1;
10486
0
  cb.nl[0] = CHAR_NUL;
10487
0
  break;
10488
10489
0
  case PCRE2_NEWLINE_CRLF:
10490
0
  cb.nllen = 2;
10491
0
  cb.nl[0] = CHAR_CR;
10492
0
  cb.nl[1] = CHAR_NL;
10493
0
  break;
10494
10495
0
  case PCRE2_NEWLINE_ANY:
10496
0
  cb.nltype = NLTYPE_ANY;
10497
0
  break;
10498
10499
0
  case PCRE2_NEWLINE_ANYCRLF:
10500
0
  cb.nltype = NLTYPE_ANYCRLF;
10501
0
  break;
10502
10503
0
  default:
10504
0
  PCRE2_DEBUG_UNREACHABLE();
10505
0
  errorcode = ERR56;
10506
0
  goto HAD_EARLY_ERROR;
10507
2.35k
  }
10508
10509
/* Pre-scan the pattern to do two things: (1) Discover the named groups and
10510
their numerical equivalents, so that this information is always available for
10511
the remaining processing. (2) At the same time, parse the pattern and put a
10512
processed version into the parsed_pattern vector. This has escapes interpreted
10513
and comments removed (amongst other things). */
10514
10515
/* Ensure that the parsed pattern buffer is big enough. For many smaller
10516
patterns the vector on the stack (which was set up above) can be used. */
10517
10518
2.35k
parsed_size_needed = max_parsed_pattern(ptr, cb.end_pattern, utf, options);
10519
10520
/* Allow for 2x uint32_t at the start and 2 at the end, for
10521
PCRE2_EXTRA_MATCH_WORD or PCRE2_EXTRA_MATCH_LINE (which are exclusive). */
10522
10523
2.35k
if ((ccontext->extra_options &
10524
2.35k
     (PCRE2_EXTRA_MATCH_WORD|PCRE2_EXTRA_MATCH_LINE)) != 0)
10525
0
  parsed_size_needed += 4;
10526
10527
/* When PCRE2_AUTO_CALLOUT is set we allow for one callout at the end. */
10528
10529
2.35k
if ((options & PCRE2_AUTO_CALLOUT) != 0)
10530
0
  parsed_size_needed += 4;
10531
10532
2.35k
parsed_size_needed += 1;  /* For the final META_END */
10533
10534
2.35k
if (parsed_size_needed > PARSED_PATTERN_DEFAULT_SIZE)
10535
323
  {
10536
323
  uint32_t *heap_parsed_pattern = ccontext->memctl.malloc(
10537
323
    parsed_size_needed * sizeof(uint32_t), ccontext->memctl.memory_data);
10538
323
  if (heap_parsed_pattern == NULL)
10539
0
    {
10540
0
    *errorptr = ERR21;
10541
0
    goto EXIT;
10542
0
    }
10543
323
  cb.parsed_pattern = heap_parsed_pattern;
10544
323
  }
10545
2.35k
cb.parsed_pattern_end = cb.parsed_pattern + parsed_size_needed;
10546
10547
/* Do the parsing scan. */
10548
10549
2.35k
errorcode = parse_regex(ptr, cb.external_options, xoptions, &has_lookbehind, &cb);
10550
2.35k
if (errorcode != 0) goto HAD_CB_ERROR;
10551
10552
/* If there are any lookbehinds, scan the parsed pattern to figure out their
10553
lengths. Workspace is needed to remember whether numbered groups are or are not
10554
of limited length, and if limited, what the minimum and maximum lengths are.
10555
This caching saves re-computing the length of any group that is referenced more
10556
than once, which is particularly relevant when recursion is involved.
10557
Unnumbered groups do not have this exposure because they cannot be referenced.
10558
If there are sufficiently few groups, the default index vector on the stack, as
10559
set up above, can be used. Otherwise we have to get/free some heap memory. The
10560
vector must be initialized to zero. */
10561
10562
1.27k
if (has_lookbehind)
10563
21
  {
10564
21
  int loopcount = 0;
10565
21
  if (cb.bracount >= GROUPINFO_DEFAULT_SIZE/2)
10566
0
    {
10567
0
    cb.groupinfo = ccontext->memctl.malloc(
10568
0
      (2 * (cb.bracount + 1))*sizeof(uint32_t), ccontext->memctl.memory_data);
10569
0
    if (cb.groupinfo == NULL)
10570
0
      {
10571
0
      errorcode = ERR21;
10572
0
      cb.erroroffset = 0;
10573
0
      goto HAD_CB_ERROR;
10574
0
      }
10575
0
    }
10576
21
  memset(cb.groupinfo, 0, (2 * cb.bracount + 1) * sizeof(uint32_t));
10577
21
  errorcode = check_lookbehinds(cb.parsed_pattern, NULL, NULL, &cb, &loopcount);
10578
21
  if (errorcode != 0) goto HAD_CB_ERROR;
10579
21
  }
10580
10581
/* For debugging, there is a function that shows the parsed pattern vector. */
10582
10583
#ifdef DEBUG_SHOW_PARSED
10584
fprintf(stderr, "+++ Pre-scan complete:\n");
10585
show_parsed(&cb);
10586
#endif
10587
10588
/* For debugging capturing information this code can be enabled. */
10589
10590
#ifdef DEBUG_SHOW_CAPTURES
10591
  {
10592
  named_group *ng = cb.named_groups;
10593
  fprintf(stderr, "+++Captures: %d\n", cb.bracount);
10594
  for (i = 0; i < cb.names_found; i++, ng++)
10595
    {
10596
    fprintf(stderr, "+++%3d %.*s\n", ng->number, ng->length, ng->name);
10597
    }
10598
  }
10599
#endif
10600
10601
/* Pretend to compile the pattern while actually just accumulating the amount
10602
of memory required in the 'length' variable. This behaviour is triggered by
10603
passing a non-NULL final argument to compile_regex(). We pass a block of
10604
workspace (cworkspace) for it to compile parts of the pattern into; the
10605
compiled code is discarded when it is no longer needed, so hopefully this
10606
workspace will never overflow, though there is a test for its doing so.
10607
10608
On error, errorcode will be set non-zero, so we don't need to look at the
10609
result of the function. The initial options have been put into the cb block,
10610
but we still have to pass a separate options variable (the first argument)
10611
because the options may change as the pattern is processed. */
10612
10613
1.26k
cb.erroroffset = patlen;   /* For any subsequent errors that do not set it */
10614
1.26k
pptr = cb.parsed_pattern;
10615
1.26k
code = cworkspace;
10616
1.26k
*code = OP_BRA;
10617
10618
1.26k
(void)compile_regex(cb.external_options, xoptions, &code, &pptr,
10619
1.26k
   &errorcode, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, NULL,
10620
1.26k
   &cb, &length);
10621
10622
1.26k
if (errorcode != 0) goto HAD_CB_ERROR;  /* Offset is in cb.erroroffset */
10623
10624
/* This should be caught in compile_regex(), but just in case... */
10625
10626
1.16k
#if defined SUPPORT_WIDE_CHARS
10627
1.16k
PCRE2_ASSERT((cb.char_lists_size & 0x3) == 0);
10628
1.16k
if (length > MAX_PATTERN_SIZE ||
10629
1.16k
    MAX_PATTERN_SIZE - length < (cb.char_lists_size / sizeof(PCRE2_UCHAR)))
10630
#else
10631
if (length > MAX_PATTERN_SIZE)
10632
#endif
10633
0
  {
10634
0
  errorcode = ERR20;
10635
0
  goto HAD_CB_ERROR;
10636
0
  }
10637
10638
/* Compute the size of, then, if not too large, get and initialize the data
10639
block for storing the compiled pattern and names table. Integer overflow should
10640
no longer be possible because nowadays we limit the maximum value of
10641
cb.names_found and cb.name_entry_size. */
10642
10643
1.16k
re_blocksize =
10644
1.16k
  CU2BYTES((PCRE2_SIZE)cb.names_found * (PCRE2_SIZE)cb.name_entry_size);
10645
10646
1.16k
#if defined SUPPORT_WIDE_CHARS
10647
1.16k
if (cb.char_lists_size != 0)
10648
0
  {
10649
0
#if PCRE2_CODE_UNIT_WIDTH != 32
10650
  /* Align to 32 bit first. This ensures the
10651
  allocated area will also be 32 bit aligned. */
10652
0
  re_blocksize = (PCRE2_SIZE)CLIST_ALIGN_TO(re_blocksize, sizeof(uint32_t));
10653
0
#endif
10654
0
  re_blocksize += cb.char_lists_size;
10655
0
  }
10656
1.16k
#endif
10657
10658
1.16k
re_blocksize += CU2BYTES(length);
10659
10660
1.16k
if (re_blocksize > ccontext->max_pattern_compiled_length)
10661
0
  {
10662
0
  errorcode = ERR101;
10663
0
  goto HAD_CB_ERROR;
10664
0
  }
10665
10666
1.16k
re_blocksize += sizeof(pcre2_real_code);
10667
1.16k
re = (pcre2_real_code *)
10668
1.16k
  ccontext->memctl.malloc(re_blocksize, ccontext->memctl.memory_data);
10669
1.16k
if (re == NULL)
10670
0
  {
10671
0
  errorcode = ERR21;
10672
0
  goto HAD_CB_ERROR;
10673
0
  }
10674
10675
/* The compiler may put padding at the end of the pcre2_real_code structure in
10676
order to round it up to a multiple of 4 or 8 bytes. This means that when a
10677
compiled pattern is copied (for example, when serialized) undefined bytes are
10678
read, and this annoys debuggers such as valgrind. To avoid this, we explicitly
10679
write to the last 8 bytes of the structure before setting the fields. */
10680
10681
1.16k
memset((char *)re + sizeof(pcre2_real_code) - 8, 0, 8);
10682
1.16k
re->memctl = ccontext->memctl;
10683
1.16k
re->tables = tables;
10684
1.16k
re->executable_jit = NULL;
10685
1.16k
memset(re->start_bitmap, 0, 32 * sizeof(uint8_t));
10686
1.16k
re->blocksize = re_blocksize;
10687
1.16k
re->code_start = re_blocksize - CU2BYTES(length);
10688
1.16k
re->magic_number = MAGIC_NUMBER;
10689
1.16k
re->compile_options = options;
10690
1.16k
re->overall_options = cb.external_options;
10691
1.16k
re->extra_options = xoptions;
10692
1.16k
re->flags = PCRE2_CODE_UNIT_WIDTH/8 | cb.external_flags | setflags;
10693
1.16k
re->limit_heap = limit_heap;
10694
1.16k
re->limit_match = limit_match;
10695
1.16k
re->limit_depth = limit_depth;
10696
1.16k
re->first_codeunit = 0;
10697
1.16k
re->last_codeunit = 0;
10698
1.16k
re->bsr_convention = bsr;
10699
1.16k
re->newline_convention = newline;
10700
1.16k
re->max_lookbehind = 0;
10701
1.16k
re->minlength = 0;
10702
1.16k
re->top_bracket = 0;
10703
1.16k
re->top_backref = 0;
10704
1.16k
re->name_entry_size = cb.name_entry_size;
10705
1.16k
re->name_count = cb.names_found;
10706
1.16k
re->optimization_flags = optim_flags;
10707
10708
/* The basic block is immediately followed by the name table, and the compiled
10709
code follows after that. */
10710
10711
1.16k
codestart = (PCRE2_UCHAR *)((uint8_t *)re + re->code_start);
10712
10713
/* Update the compile data block for the actual compile. The starting points of
10714
the name/number translation table and of the code are passed around in the
10715
compile data block. The start/end pattern and initial options are already set
10716
from the pre-compile phase, as is the name_entry_size field. */
10717
10718
1.16k
cb.parens_depth = 0;
10719
1.16k
cb.assert_depth = 0;
10720
1.16k
cb.lastcapture = 0;
10721
1.16k
cb.name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code));
10722
1.16k
cb.start_code = codestart;
10723
1.16k
cb.req_varyopt = 0;
10724
1.16k
cb.had_accept = FALSE;
10725
1.16k
cb.had_pruneorskip = FALSE;
10726
1.16k
#ifdef SUPPORT_WIDE_CHARS
10727
1.16k
cb.char_lists_size = 0;
10728
1.16k
#endif
10729
10730
10731
/* If any named groups were found, create the name/number table from the list
10732
created in the pre-pass. */
10733
10734
1.16k
if (cb.names_found > 0)
10735
0
  {
10736
0
  named_group *ng = cb.named_groups;
10737
0
  for (i = 0; i < cb.names_found; i++, ng++)
10738
0
    add_name_to_table(&cb, ng->name, ng->length, ng->number, i);
10739
0
  }
10740
10741
/* Set up a starting, non-extracting bracket, then compile the expression. On
10742
error, errorcode will be set non-zero, so we don't need to look at the result
10743
of the function here. */
10744
10745
1.16k
pptr = cb.parsed_pattern;
10746
1.16k
code = (PCRE2_UCHAR *)codestart;
10747
1.16k
*code = OP_BRA;
10748
1.16k
regexrc = compile_regex(re->overall_options, re->extra_options, &code,
10749
1.16k
  &pptr, &errorcode, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL,
10750
1.16k
  NULL, &cb, NULL);
10751
1.16k
if (regexrc < 0) re->flags |= PCRE2_MATCH_EMPTY;
10752
1.16k
re->top_bracket = cb.bracount;
10753
1.16k
re->top_backref = cb.top_backref;
10754
1.16k
re->max_lookbehind = cb.max_lookbehind;
10755
10756
1.16k
if (cb.had_accept)
10757
0
  {
10758
0
  reqcu = 0;                     /* Must disable after (*ACCEPT) */
10759
0
  reqcuflags = REQ_NONE;
10760
0
  re->flags |= PCRE2_HASACCEPT;  /* Disables minimum length */
10761
0
  }
10762
10763
/* Fill in the final opcode and check for disastrous overflow. If no overflow,
10764
but the estimated length exceeds the really used length, adjust the value of
10765
re->blocksize, and if valgrind support is configured, mark the extra allocated
10766
memory as unaddressable, so that any out-of-bound reads can be detected. */
10767
10768
1.16k
*code++ = OP_END;
10769
1.16k
usedlength = code - codestart;
10770
1.16k
if (usedlength > length)
10771
0
  {
10772
0
  PCRE2_DEBUG_UNREACHABLE();
10773
0
  errorcode = ERR23;  /* Overflow of code block - internal error */
10774
0
  }
10775
1.16k
else
10776
1.16k
  {
10777
1.16k
  re->blocksize -= CU2BYTES(length - usedlength);
10778
#ifdef SUPPORT_VALGRIND
10779
  VALGRIND_MAKE_MEM_NOACCESS(code, CU2BYTES(length - usedlength));
10780
#endif
10781
1.16k
  }
10782
10783
/* Scan the pattern for recursion/subroutine calls and convert the group
10784
numbers into offsets. Maintain a small cache so that repeated groups containing
10785
recursions are efficiently handled. */
10786
10787
1.16k
#define RSCAN_CACHE_SIZE 8
10788
10789
1.16k
if (errorcode == 0 && cb.had_recurse)
10790
21
  {
10791
21
  PCRE2_UCHAR *rcode;
10792
21
  PCRE2_SPTR rgroup;
10793
21
  unsigned int ccount = 0;
10794
21
  int start = RSCAN_CACHE_SIZE;
10795
21
  recurse_cache rc[RSCAN_CACHE_SIZE];
10796
10797
21
  for (rcode = find_recurse(codestart, utf);
10798
538
       rcode != NULL;
10799
517
       rcode = find_recurse(rcode + 1 + LINK_SIZE, utf))
10800
517
    {
10801
517
    int p, groupnumber;
10802
10803
517
    groupnumber = (int)GET(rcode, 1);
10804
517
    if (groupnumber == 0) rgroup = codestart; else
10805
499
      {
10806
499
      PCRE2_SPTR search_from = codestart;
10807
499
      rgroup = NULL;
10808
1.35k
      for (i = 0, p = start; i < ccount; i++, p = (p + 1) & 7)
10809
1.28k
        {
10810
1.28k
        if (groupnumber == rc[p].groupnumber)
10811
425
          {
10812
425
          rgroup = rc[p].group;
10813
425
          break;
10814
425
          }
10815
10816
        /* Group n+1 must always start to the right of group n, so we can save
10817
        search time below when the new group number is greater than any of the
10818
        previously found groups. */
10819
10820
856
        if (groupnumber > rc[p].groupnumber) search_from = rc[p].group;
10821
856
        }
10822
10823
499
      if (rgroup == NULL)
10824
74
        {
10825
74
        rgroup = PRIV(find_bracket)(search_from, utf, groupnumber);
10826
74
        if (rgroup == NULL)
10827
0
          {
10828
0
          PCRE2_DEBUG_UNREACHABLE();
10829
0
          errorcode = ERR53;
10830
0
          break;
10831
0
          }
10832
74
        if (--start < 0) start = RSCAN_CACHE_SIZE - 1;
10833
74
        rc[start].groupnumber = groupnumber;
10834
74
        rc[start].group = rgroup;
10835
74
        if (ccount < RSCAN_CACHE_SIZE) ccount++;
10836
74
        }
10837
499
      }
10838
10839
517
    PUT(rcode, 1, (uint32_t)(rgroup - codestart));
10840
517
    }
10841
21
  }
10842
10843
/* In rare debugging situations we sometimes need to look at the compiled code
10844
at this stage. */
10845
10846
#ifdef DEBUG_CALL_PRINTINT
10847
pcre2_printint(re, stderr, TRUE);
10848
fprintf(stderr, "Length=%lu Used=%lu\n", length, usedlength);
10849
#endif
10850
10851
/* Unless disabled, check whether any single character iterators can be
10852
auto-possessified. The function overwrites the appropriate opcode values, so
10853
the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
10854
used in this code because at least one compiler gives a warning about loss of
10855
"const" attribute if the cast (PCRE2_UCHAR *)codestart is used directly in the
10856
function call. */
10857
10858
1.16k
if (errorcode == 0 && (optim_flags & PCRE2_OPTIM_AUTO_POSSESS) != 0)
10859
1.16k
  {
10860
1.16k
  PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;
10861
1.16k
  if (PRIV(auto_possessify)(temp, &cb) != 0)
10862
0
    {
10863
0
    PCRE2_DEBUG_UNREACHABLE();
10864
0
    errorcode = ERR80;
10865
0
    }
10866
1.16k
  }
10867
10868
/* Failed to compile, or error while post-processing. */
10869
10870
1.16k
if (errorcode != 0) goto HAD_CB_ERROR;
10871
10872
/* Successful compile. If the anchored option was not passed, set it if
10873
we can determine that the pattern is anchored by virtue of ^ characters or \A
10874
or anything else, such as starting with non-atomic .* when DOTALL is set and
10875
there are no occurrences of *PRUNE or *SKIP (though there is an option to
10876
disable this case). */
10877
10878
1.16k
if ((re->overall_options & PCRE2_ANCHORED) == 0)
10879
1.13k
  {
10880
1.13k
  BOOL dotstar_anchor = ((optim_flags & PCRE2_OPTIM_DOTSTAR_ANCHOR) != 0);
10881
1.13k
  if (is_anchored(codestart, 0, &cb, 0, FALSE, dotstar_anchor))
10882
11
    re->overall_options |= PCRE2_ANCHORED;
10883
1.13k
  }
10884
10885
/* Set up the first code unit or startline flag, the required code unit, and
10886
then study the pattern. This code need not be obeyed if PCRE2_OPTIM_START_OPTIMIZE
10887
is disabled, as the data it would create will not be used. Note that a first code
10888
unit (but not the startline flag) is useful for anchored patterns because it
10889
can still give a quick "no match" and also avoid searching for a last code
10890
unit. */
10891
10892
1.16k
if ((optim_flags & PCRE2_OPTIM_START_OPTIMIZE) != 0)
10893
1.16k
  {
10894
1.16k
  int minminlength = 0;  /* For minimal minlength from first/required CU */
10895
10896
  /* If we do not have a first code unit, see if there is one that is asserted
10897
  (these are not saved during the compile because they can cause conflicts with
10898
  actual literals that follow). */
10899
10900
1.16k
  if (firstcuflags >= REQ_NONE) {
10901
1.10k
    uint32_t assertedcuflags = 0;
10902
1.10k
    uint32_t assertedcu = find_firstassertedcu(codestart, &assertedcuflags, 0);
10903
    /* It would be wrong to use the asserted first code unit as `firstcu` for
10904
     * regexes which are able to match a 1-character string (e.g. /(?=a)b?a/)
10905
     * For that example, if we set both firstcu and reqcu to 'a', it would mean
10906
     * the subject string needs to be at least 2 characters long, which is wrong.
10907
     * With more analysis, we would be able to set firstcu in more cases. */
10908
1.10k
    if (assertedcuflags < REQ_NONE && assertedcu != reqcu) {
10909
0
      firstcu = assertedcu;
10910
0
      firstcuflags = assertedcuflags;
10911
0
    }
10912
1.10k
  }
10913
10914
  /* Save the data for a first code unit. The existence of one means the
10915
  minimum length must be at least 1. */
10916
10917
1.16k
  if (firstcuflags < REQ_NONE)
10918
64
    {
10919
64
    re->first_codeunit = firstcu;
10920
64
    re->flags |= PCRE2_FIRSTSET;
10921
64
    minminlength++;
10922
10923
    /* Handle caseless first code units. */
10924
10925
64
    if ((firstcuflags & REQ_CASELESS) != 0)
10926
28
      {
10927
28
      if (firstcu < 128 || (!utf && !ucp && firstcu < 255))
10928
27
        {
10929
27
        if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS;
10930
27
        }
10931
10932
      /* The first code unit is >= 128 in UTF or UCP mode, or >= 255 otherwise.
10933
      In 8-bit UTF mode, code units in the range 128-255 are introductory code
10934
      units and cannot have another case, but if UCP is set they may do. */
10935
10936
1
#ifdef SUPPORT_UNICODE
10937
1
#if PCRE2_CODE_UNIT_WIDTH == 8
10938
1
      else if (ucp && !utf && UCD_OTHERCASE(firstcu) != firstcu)
10939
0
        re->flags |= PCRE2_FIRSTCASELESS;
10940
#else
10941
      else if ((utf || ucp) && firstcu <= MAX_UTF_CODE_POINT &&
10942
               UCD_OTHERCASE(firstcu) != firstcu)
10943
        re->flags |= PCRE2_FIRSTCASELESS;
10944
#endif
10945
28
#endif  /* SUPPORT_UNICODE */
10946
28
      }
10947
64
    }
10948
10949
  /* When there is no first code unit, for non-anchored patterns, see if we can
10950
  set the PCRE2_STARTLINE flag. This is helpful for multiline matches when all
10951
  branches start with ^ and also when all branches start with non-atomic .* for
10952
  non-DOTALL matches when *PRUNE and SKIP are not present. (There is an option
10953
  that disables this case.) */
10954
10955
1.10k
  else if ((re->overall_options & PCRE2_ANCHORED) == 0)
10956
1.07k
    {
10957
1.07k
    BOOL dotstar_anchor = ((optim_flags & PCRE2_OPTIM_DOTSTAR_ANCHOR) != 0);
10958
1.07k
    if (is_startline(codestart, 0, &cb, 0, FALSE, dotstar_anchor))
10959
0
      re->flags |= PCRE2_STARTLINE;
10960
1.07k
    }
10961
10962
  /* Handle the "required code unit", if one is set. In the UTF case we can
10963
  increment the minimum minimum length only if we are sure this really is a
10964
  different character and not a non-starting code unit of the first character,
10965
  because the minimum length count is in characters, not code units. */
10966
10967
1.16k
  if (reqcuflags < REQ_NONE)
10968
140
    {
10969
#if PCRE2_CODE_UNIT_WIDTH == 16
10970
    if ((re->overall_options & PCRE2_UTF) == 0 ||   /* Not UTF */
10971
        firstcuflags >= REQ_NONE ||                 /* First not set */
10972
        (firstcu & 0xf800) != 0xd800 ||             /* First not surrogate */
10973
        (reqcu & 0xfc00) != 0xdc00)                 /* Req not low surrogate */
10974
#elif PCRE2_CODE_UNIT_WIDTH == 8
10975
140
    if ((re->overall_options & PCRE2_UTF) == 0 ||   /* Not UTF */
10976
140
        firstcuflags >= REQ_NONE ||                 /* First not set */
10977
140
        (firstcu & 0x80) == 0 ||                    /* First is ASCII */
10978
140
        (reqcu & 0x80) == 0)                        /* Req is ASCII */
10979
140
#endif
10980
140
      {
10981
140
      minminlength++;
10982
140
      }
10983
10984
    /* In the case of an anchored pattern, set up the value only if it follows
10985
    a variable length item in the pattern. */
10986
10987
140
    if ((re->overall_options & PCRE2_ANCHORED) == 0 ||
10988
140
        (reqcuflags & REQ_VARY) != 0)
10989
135
      {
10990
135
      re->last_codeunit = reqcu;
10991
135
      re->flags |= PCRE2_LASTSET;
10992
10993
      /* Handle caseless required code units as for first code units (above). */
10994
10995
135
      if ((reqcuflags & REQ_CASELESS) != 0)
10996
51
        {
10997
51
        if (reqcu < 128 || (!utf && !ucp && reqcu < 255))
10998
51
          {
10999
51
          if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
11000
51
          }
11001
0
#ifdef SUPPORT_UNICODE
11002
0
#if PCRE2_CODE_UNIT_WIDTH == 8
11003
0
      else if (ucp && !utf && UCD_OTHERCASE(reqcu) != reqcu)
11004
0
        re->flags |= PCRE2_LASTCASELESS;
11005
#else
11006
      else if ((utf || ucp) && reqcu <= MAX_UTF_CODE_POINT &&
11007
               UCD_OTHERCASE(reqcu) != reqcu)
11008
        re->flags |= PCRE2_LASTCASELESS;
11009
#endif
11010
51
#endif  /* SUPPORT_UNICODE */
11011
51
        }
11012
135
      }
11013
140
    }
11014
11015
  /* Study the compiled pattern to set up information such as a bitmap of
11016
  starting code units and a minimum matching length. */
11017
11018
1.16k
  if (PRIV(study)(re) != 0)
11019
0
    {
11020
0
    PCRE2_DEBUG_UNREACHABLE();
11021
0
    errorcode = ERR31;
11022
0
    goto HAD_CB_ERROR;
11023
0
    }
11024
11025
  /* If study() set a bitmap of starting code units, it implies a minimum
11026
  length of at least one. */
11027
11028
1.16k
  if ((re->flags & PCRE2_FIRSTMAPSET) != 0 && minminlength == 0)
11029
542
    minminlength = 1;
11030
11031
  /* If the minimum length set (or not set) by study() is less than the minimum
11032
  implied by required code units, override it. */
11033
11034
1.16k
  if (re->minlength < minminlength) re->minlength = minminlength;
11035
1.16k
  }   /* End of start-of-match optimizations. */
11036
11037
/* Control ends up here in all cases. When running under valgrind, make a
11038
pattern's terminating zero defined again. If memory was obtained for the parsed
11039
version of the pattern, free it before returning. Also free the list of named
11040
groups if a larger one had to be obtained, and likewise the group information
11041
vector. */
11042
11043
1.16k
#ifdef SUPPORT_UNICODE
11044
1.16k
PCRE2_ASSERT(cb.cranges == NULL);
11045
1.16k
#endif
11046
11047
2.39k
EXIT:
11048
#ifdef SUPPORT_VALGRIND
11049
if (zero_terminated) VALGRIND_MAKE_MEM_DEFINED(pattern + patlen, CU2BYTES(1));
11050
#endif
11051
2.39k
if (cb.parsed_pattern != stack_parsed_pattern)
11052
323
  ccontext->memctl.free(cb.parsed_pattern, ccontext->memctl.memory_data);
11053
2.39k
if (cb.named_group_list_size > NAMED_GROUP_LIST_SIZE)
11054
0
  ccontext->memctl.free((void *)cb.named_groups, ccontext->memctl.memory_data);
11055
2.39k
if (cb.groupinfo != stack_groupinfo)
11056
0
  ccontext->memctl.free((void *)cb.groupinfo, ccontext->memctl.memory_data);
11057
11058
2.39k
return re;    /* Will be NULL after an error */
11059
11060
/* Errors discovered in parse_regex() set the offset value in the compile
11061
block. Errors discovered before it is called must compute it from the ptr
11062
value. After parse_regex() is called, the offset in the compile block is set to
11063
the end of the pattern, but certain errors in compile_regex() may reset it if
11064
an offset is available in the parsed pattern. */
11065
11066
1.19k
HAD_CB_ERROR:
11067
1.19k
ptr = pattern + cb.erroroffset;
11068
11069
1.19k
HAD_EARLY_ERROR:
11070
1.19k
PCRE2_ASSERT(ptr >= pattern); /* Ensure we don't return invalid erroroffset */
11071
1.19k
PCRE2_ASSERT(ptr <= (pattern + patlen));
11072
1.19k
*erroroffset = ptr - pattern;
11073
11074
1.22k
HAD_ERROR:
11075
1.22k
*errorptr = errorcode;
11076
1.22k
pcre2_code_free(re);
11077
1.22k
re = NULL;
11078
11079
1.22k
#ifdef SUPPORT_WIDE_CHARS
11080
1.22k
if (cb.cranges != NULL)
11081
21
  {
11082
21
  class_ranges* cranges = cb.cranges;
11083
21
  do
11084
42
    {
11085
42
    class_ranges* next_cranges = cranges->next;
11086
42
    cb.cx->memctl.free(cranges, cb.cx->memctl.memory_data);
11087
42
    cranges = next_cranges;
11088
42
    }
11089
42
  while (cranges != NULL);
11090
21
  }
11091
1.22k
#endif
11092
1.22k
goto EXIT;
11093
1.19k
}
11094
11095
/* These #undefs are here to enable unity builds with CMake. */
11096
11097
#undef NLBLOCK /* Block containing newline information */
11098
#undef PSSTART /* Field containing processed string start */
11099
#undef PSEND   /* Field containing processed string end */
11100
11101
/* End of pcre2_compile.c */