Coverage Report

Created: 2026-05-16 06:35

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/gstreamer/subprojects/pcre2-10.47/src/pcre2_compile.c
Line
Count
Source
1
/*************************************************
2
*      Perl-Compatible Regular Expressions       *
3
*************************************************/
4
5
/* PCRE is a library of functions to support regular expressions whose syntax
6
and semantics are as close as possible to those of the Perl 5 language.
7
8
                       Written by Philip Hazel
9
     Original API code Copyright (c) 1997-2012 University of Cambridge
10
          New API code Copyright (c) 2016-2024 University of Cambridge
11
12
-----------------------------------------------------------------------------
13
Redistribution and use in source and binary forms, with or without
14
modification, are permitted provided that the following conditions are met:
15
16
    * Redistributions of source code must retain the above copyright notice,
17
      this list of conditions and the following disclaimer.
18
19
    * Redistributions in binary form must reproduce the above copyright
20
      notice, this list of conditions and the following disclaimer in the
21
      documentation and/or other materials provided with the distribution.
22
23
    * Neither the name of the University of Cambridge nor the names of its
24
      contributors may be used to endorse or promote products derived from
25
      this software without specific prior written permission.
26
27
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37
POSSIBILITY OF SUCH DAMAGE.
38
-----------------------------------------------------------------------------
39
*/
40
41
42
#include "pcre2_compile.h"
43
44
45
46
0
#define NLBLOCK cb             /* Block containing newline information */
47
#define PSSTART start_pattern  /* Field containing processed string start */
48
0
#define PSEND   end_pattern    /* Field containing processed string end */
49
50
/* In rare error cases debugging might require calling pcre2_printint(). */
51
52
#if 0
53
#ifdef EBCDIC
54
#define PRINTABLE(c) ((c) >= 64 && (c) < 255)
55
#else
56
#define PRINTABLE(c) ((c) >= 32 && (c) < 127)
57
#endif
58
#define CHAR_OUTPUT(c)      (c)
59
#define CHAR_OUTPUT_HEX(c)  (c)
60
#define CHAR_INPUT(c)       (c)
61
#define CHAR_INPUT_HEX(c)   (c)
62
#include "pcre2_printint_inc.h"
63
#undef PRINTABLE
64
#undef CHAR_OUTPUT
65
#undef CHAR_OUTPUT_HEX
66
#undef CHAR_INPUT
67
#define DEBUG_CALL_PRINTINT
68
#endif
69
70
/* Other debugging code can be enabled by these defines. */
71
72
/* #define DEBUG_SHOW_CAPTURES */
73
/* #define DEBUG_SHOW_PARSED */
74
75
/* There are a few things that vary with different code unit sizes. Handle them
76
by defining macros in order to minimize #if usage. */
77
78
#if PCRE2_CODE_UNIT_WIDTH == 8
79
#define STRING_UTFn_RIGHTPAR     STRING_UTF8_RIGHTPAR, 5
80
56
#define XDIGIT(c)                xdigitab[c]
81
82
#else  /* Either 16-bit or 32-bit */
83
#define XDIGIT(c)                (MAX_255(c)? xdigitab[c] : 0xff)
84
85
#if PCRE2_CODE_UNIT_WIDTH == 16
86
#define STRING_UTFn_RIGHTPAR     STRING_UTF16_RIGHTPAR, 6
87
88
#else  /* 32-bit */
89
#define STRING_UTFn_RIGHTPAR     STRING_UTF32_RIGHTPAR, 6
90
#endif
91
#endif
92
93
/* Forward declarations to allow mutual recursion */
94
95
static int
96
  compile_regex(uint32_t, uint32_t, PCRE2_UCHAR **, uint32_t **, int *,
97
    uint32_t, uint32_t *, uint32_t *, uint32_t *, uint32_t *, branch_chain *,
98
    open_capitem *, compile_block *, PCRE2_SIZE *);
99
100
static int
101
  get_branchlength(uint32_t **, int *, int *, int *, parsed_recurse_check *,
102
    compile_block *);
103
104
static BOOL
105
  set_lookbehind_lengths(uint32_t **, int *, int *, parsed_recurse_check *,
106
    compile_block *);
107
108
static int
109
  check_lookbehinds(uint32_t *, uint32_t **, parsed_recurse_check *,
110
    compile_block *, int *);
111
112
113
/*************************************************
114
*      Code parameters and static tables         *
115
*************************************************/
116
117
8
#define MAX_GROUP_NUMBER   65535u
118
696
#define MAX_REPEAT_COUNT   65535u
119
504
#define REPEAT_UNLIMITED   (MAX_REPEAT_COUNT+1)
120
121
/* COMPILE_WORK_SIZE specifies the size of stack workspace, which is used in
122
different ways in the different pattern scans. The parsing and group-
123
identifying pre-scan uses it to handle nesting, and needs it to be 16-bit
124
aligned for this. Having defined the size in code units, we set up
125
C16_WORK_SIZE as the number of elements in the 16-bit vector.
126
127
During the first compiling phase, when determining how much memory is required,
128
the regex is partly compiled into this space, but the compiled parts are
129
discarded as soon as they can be, so that hopefully there will never be an
130
overrun. The code does, however, check for an overrun, which can occur for
131
pathological patterns. The size of the workspace depends on LINK_SIZE because
132
the length of compiled items varies with this.
133
134
In the real compile phase, this workspace is not currently used. */
135
136
16
#define COMPILE_WORK_SIZE (3000*LINK_SIZE)   /* Size in code units */
137
138
#define C16_WORK_SIZE \
139
  ((COMPILE_WORK_SIZE * sizeof(PCRE2_UCHAR))/sizeof(uint16_t))
140
141
/* A uint32_t vector is used for caching information about the size of
142
capturing groups, to improve performance. A default is created on the stack of
143
this size. */
144
145
0
#define GROUPINFO_DEFAULT_SIZE 256
146
147
/* The overrun tests check for a slightly smaller size so that they detect the
148
overrun before it actually does run off the end of the data block. */
149
150
396
#define WORK_SIZE_SAFETY_MARGIN (100)
151
152
/* This value determines the size of the initial vector that is used for
153
remembering named groups during the pre-compile. It is allocated on the stack,
154
but if it is too small, it is expanded, in a similar way to the workspace. The
155
value is the number of slots in the list. */
156
157
32
#define NAMED_GROUP_LIST_SIZE  20
158
159
/* The pre-compiling pass over the pattern creates a parsed pattern in a vector
160
of uint32_t. For short patterns this lives on the stack, with this size. Heap
161
memory is used for longer patterns. */
162
163
16
#define PARSED_PATTERN_DEFAULT_SIZE 1024
164
165
/* Maximum length value to check against when making sure that the variable
166
that holds the compiled pattern length does not overflow. We make it a bit less
167
than INT_MAX to allow for adding in group terminating code units, so that we
168
don't have to check them every time. */
169
170
308
#define OFLOW_MAX (INT_MAX - 20)
171
172
/* Table of extra lengths for each of the meta codes. Must be kept in step with
173
the META_ code definitions (not in this file; presumably pcre2_compile.h). For some items these values are a basic length to which
174
a variable amount has to be added. */
175
176
static unsigned char meta_extra_lengths[] = {
177
  0,             /* META_END */
178
  0,             /* META_ALT */
179
  0,             /* META_ATOMIC */
180
  0,             /* META_BACKREF - more if group is >= 10 */
181
  1+SIZEOFFSET,  /* META_BACKREF_BYNAME */
182
  1,             /* META_BIGVALUE */
183
  3,             /* META_CALLOUT_NUMBER */
184
  3+SIZEOFFSET,  /* META_CALLOUT_STRING */
185
  0,             /* META_CAPTURE */
186
  0,             /* META_CIRCUMFLEX */
187
  0,             /* META_CLASS */
188
  0,             /* META_CLASS_EMPTY */
189
  0,             /* META_CLASS_EMPTY_NOT */
190
  0,             /* META_CLASS_END */
191
  0,             /* META_CLASS_NOT */
192
  0,             /* META_COND_ASSERT */
193
  SIZEOFFSET,    /* META_COND_DEFINE */
194
  1+SIZEOFFSET,  /* META_COND_NAME */
195
  1+SIZEOFFSET,  /* META_COND_NUMBER */
196
  1+SIZEOFFSET,  /* META_COND_RNAME */
197
  1+SIZEOFFSET,  /* META_COND_RNUMBER */
198
  3,             /* META_COND_VERSION */
199
  SIZEOFFSET,    /* META_OFFSET */
200
  0,             /* META_SCS */
201
  1,             /* META_CAPTURE_NAME */
202
  1,             /* META_CAPTURE_NUMBER */
203
  0,             /* META_DOLLAR */
204
  0,             /* META_DOT */
205
  0,             /* META_ESCAPE - one more for ESC_P and ESC_p */
206
  0,             /* META_KET */
207
  0,             /* META_NOCAPTURE */
208
  2,             /* META_OPTIONS */
209
  1,             /* META_POSIX */
210
  1,             /* META_POSIX_NEG */
211
  0,             /* META_RANGE_ESCAPED */
212
  0,             /* META_RANGE_LITERAL */
213
  SIZEOFFSET,    /* META_RECURSE */
214
  1+SIZEOFFSET,  /* META_RECURSE_BYNAME */
215
  0,             /* META_SCRIPT_RUN */
216
  0,             /* META_LOOKAHEAD */
217
  0,             /* META_LOOKAHEADNOT */
218
  SIZEOFFSET,    /* META_LOOKBEHIND */
219
  SIZEOFFSET,    /* META_LOOKBEHINDNOT */
220
  0,             /* META_LOOKAHEAD_NA */
221
  SIZEOFFSET,    /* META_LOOKBEHIND_NA */
222
  1,             /* META_MARK - plus the string length */
223
  0,             /* META_ACCEPT */
224
  0,             /* META_FAIL */
225
  0,             /* META_COMMIT */
226
  1,             /* META_COMMIT_ARG - plus the string length */
227
  0,             /* META_PRUNE */
228
  1,             /* META_PRUNE_ARG - plus the string length */
229
  0,             /* META_SKIP */
230
  1,             /* META_SKIP_ARG - plus the string length */
231
  0,             /* META_THEN */
232
  1,             /* META_THEN_ARG - plus the string length */
233
  0,             /* META_ASTERISK */
234
  0,             /* META_ASTERISK_PLUS */
235
  0,             /* META_ASTERISK_QUERY */
236
  0,             /* META_PLUS */
237
  0,             /* META_PLUS_PLUS */
238
  0,             /* META_PLUS_QUERY */
239
  0,             /* META_QUERY */
240
  0,             /* META_QUERY_PLUS */
241
  0,             /* META_QUERY_QUERY */
242
  2,             /* META_MINMAX */
243
  2,             /* META_MINMAX_PLUS */
244
  2,             /* META_MINMAX_QUERY */
245
  0,             /* META_ECLASS_AND */
246
  0,             /* META_ECLASS_OR */
247
  0,             /* META_ECLASS_SUB */
248
  0,             /* META_ECLASS_XOR */
249
  0              /* META_ECLASS_NOT */
250
};
251
252
/* Types for skipping parts of a parsed pattern. */
253
254
enum { PSKIP_ALT, PSKIP_CLASS, PSKIP_KET };
255
256
/* Values and flags for the unsigned xxcuflags variables that accompany xxcu
257
variables, which are concerned with first and required code units. A value
258
greater than or equal to REQ_NONE means "no code unit set"; otherwise the
259
matching xxcu variable is set, and the low valued bits are relevant. */
260
261
668
#define REQ_UNSET     0xffffffffu  /* Not yet found anything */
262
172
#define REQ_NONE      0xfffffffeu  /* Found not fixed character */
263
24
#define REQ_CASELESS  0x00000001u  /* Code unit in xxcu is caseless */
264
496
#define REQ_VARY      0x00000002u  /* Code unit is followed by non-literal */
265
266
/* These flags are used in the groupinfo vector. */
267
268
0
#define GI_SET_FIXED_LENGTH    0x80000000u
269
0
#define GI_NOT_FIXED_LENGTH    0x40000000u
270
0
#define GI_FIXED_LENGTH_MASK   0x0000ffffu
271
272
/* This simple test for a decimal digit works for both ASCII/Unicode and EBCDIC
273
and is fast (a good compiler can turn it into a subtraction and unsigned
274
comparison). */
275
276
960
#define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
277
278
/* Table to identify hex digits. The tables in chartables are dependent on the
279
locale, and may mark arbitrary characters as digits. We want to recognize only
280
0-9, a-f, and A-F as hex digits, which is why we have a private table here. It
281
costs 256 bytes, but it is a lot faster than doing character value tests (at
282
least in some simple cases I timed), and in some applications one wants PCRE2
283
to compile efficiently as well as match efficiently. The value in the table is
284
the binary hex digit value, or 0xff for non-hex digits. */
285
286
/* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
287
UTF-8 mode. */
288
289
#ifndef EBCDIC
290
static const uint8_t xdigitab[] =
291
  {
292
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   0-  7 */
293
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   8- 15 */
294
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  16- 23 */
295
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  24- 31 */
296
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*    - '  */
297
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  ( - /  */
298
  0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /*  0 - 7  */
299
  0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff, /*  8 - ?  */
300
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  @ - G  */
301
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  H - O  */
302
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  P - W  */
303
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  X - _  */
304
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  ` - g  */
305
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  h - o  */
306
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  p - w  */
307
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  x -127 */
308
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 128-135 */
309
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 136-143 */
310
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144-151 */
311
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 152-159 */
312
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160-167 */
313
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 168-175 */
314
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 176-183 */
315
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191 */
316
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 192-199 */
317
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 200-207 */
318
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 208-215 */
319
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 216-223 */
320
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 224-231 */
321
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 232-239 */
322
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 240-247 */
323
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};/* 248-255 */
324
325
#else
326
327
/* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
328
329
static const uint8_t xdigitab[] =
330
  {
331
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   0-  7  0 */
332
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   8- 15    */
333
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  16- 23 10 */
334
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  24- 31    */
335
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  32- 39 20 */
336
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  40- 47    */
337
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  48- 55 30 */
338
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  56- 63    */
339
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*    - 71 40 */
340
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  72- |     */
341
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  & - 87 50 */
342
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  88- 95    */
343
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  - -103 60 */
344
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 104- ?     */
345
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 112-119 70 */
346
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 120- "     */
347
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* 128- g  80 */
348
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  h -143    */
349
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144- p  90 */
350
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  q -159    */
351
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160- x  A0 */
352
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  y -175    */
353
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  ^ -183 B0 */
354
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191    */
355
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  { - G  C0 */
356
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  H -207    */
357
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  } - P  D0 */
358
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  Q -223    */
359
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  \ - X  E0 */
360
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  Y -239    */
361
  0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /*  0 - 7  F0 */
362
  0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff};/*  8 -255    */
363
#endif  /* EBCDIC */
364
365
366
/* Table for handling alphanumeric escaped characters. Positive returns are
367
simple data values; negative values are for special things like \d and so on.
368
Zero means further processing is needed (for things like \x), or the escape is
369
invalid. */
370
371
/* This is the "normal" table for ASCII systems or for EBCDIC systems running
372
in UTF-8 mode. It runs from '0' to 'z'. */
373
374
#ifndef EBCDIC
375
120
#define ESCAPES_FIRST       CHAR_0
376
68
#define ESCAPES_LAST        CHAR_z
377
0
#define UPPER_CASE(c)       (c-32)
378
379
static const short int escapes[] = {
380
    /* 0 */ 0,                       /* 1 */ 0,
381
    /* 2 */ 0,                       /* 3 */ 0,
382
    /* 4 */ 0,                       /* 5 */ 0,
383
    /* 6 */ 0,                       /* 7 */ 0,
384
    /* 8 */ 0,                       /* 9 */ 0,
385
    /* : */ ESCAPES_FIRST+0x0a,      /* ; */ ESCAPES_FIRST+0x0b,
386
    /* < */ ESCAPES_FIRST+0x0c,      /* = */ ESCAPES_FIRST+0x0d,
387
    /* > */ ESCAPES_FIRST+0x0e,      /* ? */ ESCAPES_FIRST+0x0f,
388
    /* @ */ ESCAPES_FIRST+0x10,      /* A */ -ESC_A,
389
    /* B */ -ESC_B,                  /* C */ -ESC_C,
390
    /* D */ -ESC_D,                  /* E */ -ESC_E,
391
    /* F */ 0,                       /* G */ -ESC_G,
392
    /* H */ -ESC_H,                  /* I */ 0,
393
    /* J */ 0,                       /* K */ -ESC_K,
394
    /* L */ 0,                       /* M */ 0,
395
    /* N */ -ESC_N,                  /* O */ 0,
396
    /* P */ -ESC_P,                  /* Q */ -ESC_Q,
397
    /* R */ -ESC_R,                  /* S */ -ESC_S,
398
    /* T */ 0,                       /* U */ 0,
399
    /* V */ -ESC_V,                  /* W */ -ESC_W,
400
    /* X */ -ESC_X,                  /* Y */ 0,
401
    /* Z */ -ESC_Z,                  /* [ */ ESCAPES_FIRST+0x2b,
402
    /* \ */ ESCAPES_FIRST+0x2c,      /* ] */ ESCAPES_FIRST+0x2d,
403
    /* ^ */ ESCAPES_FIRST+0x2e,      /* _ */ ESCAPES_FIRST+0x2f,
404
    /* ` */ ESCAPES_FIRST+0x30,      /* a */ CHAR_BEL,
405
    /* b */ -ESC_b,                  /* c */ 0,
406
    /* d */ -ESC_d,                  /* e */ CHAR_ESC,
407
    /* f */ CHAR_FF,                 /* g */ 0,
408
    /* h */ -ESC_h,                  /* i */ 0,
409
    /* j */ 0,                       /* k */ -ESC_k,
410
    /* l */ 0,                       /* m */ 0,
411
    /* n */ CHAR_LF,                 /* o */ 0,
412
    /* p */ -ESC_p,                  /* q */ 0,
413
    /* r */ CHAR_CR,                 /* s */ -ESC_s,
414
    /* t */ CHAR_HT,                 /* u */ 0,
415
    /* v */ -ESC_v,                  /* w */ -ESC_w,
416
    /* x */ 0,                       /* y */ 0,
417
    /* z */ -ESC_z
418
};
419
420
#else
421
422
/* This is the "abnormal" table for EBCDIC systems without UTF-8 support.
423
It runs from 'a' to '9'. Our EBCDIC support can be provided via the compiler,
424
which can interpret character literals like 'a' or '[' in an EBCDIC codepage;
425
in this case, there is wide variance between codepages on the interpretation of
426
characters between the letters ('[' and '{' and so on are placed in all sorts of
427
different positions in the table). Thankfully however, all EBCDIC codepages
428
place the letters and digits in the same location, so we hardcode that here.
429
Our EBCDIC support can also be provided via numeric literals instead of
430
character literals, so either way, 'CHAR_a' will be 0x81 when PCRE2 is compiled
431
in EBCDIC mode. */
432
433
#define ESCAPES_FIRST       CHAR_a
434
#define ESCAPES_LAST        CHAR_9
435
#define UPPER_CASE(c)       (c+64)
436
437
static const short int escapes[] = {
438
    /* 0x81 a */ CHAR_BEL,             /* 0x82 b */ -ESC_b,
439
    /* 0x83 c */ 0,                    /* 0x84 d */ -ESC_d,
440
    /* 0x85 e */ CHAR_ESC,             /* 0x86 f */ CHAR_FF,
441
    /* 0x87 g */ 0,                    /* 0x88 h */ -ESC_h,
442
    /* 0x89 i */ 0,                    /* 0x8a   */ ESCAPES_FIRST+0x09,
443
    /* 0x8b   */ ESCAPES_FIRST+0x0a,   /* 0x8c   */ ESCAPES_FIRST+0x0b,
444
    /* 0x8d   */ ESCAPES_FIRST+0x0c,   /* 0x8e   */ ESCAPES_FIRST+0x0d,
445
    /* 0x8f   */ ESCAPES_FIRST+0x0e,   /* 0x90   */ ESCAPES_FIRST+0x0f,
446
    /* 0x91 j */ 0,                    /* 0x92 k */ -ESC_k,
447
    /* 0x93 l */ 0,                    /* 0x94 m */ 0,
448
    /* 0x95 n */ CHAR_LF,              /* 0x96 o */ 0,
449
    /* 0x97 p */ -ESC_p,               /* 0x98 q */ 0,
450
    /* 0x99 r */ CHAR_CR,              /* 0x9a   */ ESCAPES_FIRST+0x19,
451
    /* 0x9b   */ ESCAPES_FIRST+0x1a,   /* 0x9c   */ ESCAPES_FIRST+0x1b,
452
    /* 0x9d   */ ESCAPES_FIRST+0x1c,   /* 0x9e   */ ESCAPES_FIRST+0x1d,
453
    /* 0x9f   */ ESCAPES_FIRST+0x1e,   /* 0xa0   */ ESCAPES_FIRST+0x1f,
454
    /* 0xa1   */ ESCAPES_FIRST+0x20,   /* 0xa2 s */ -ESC_s,
455
    /* 0xa3 t */ CHAR_HT,              /* 0xa4 u */ 0,
456
    /* 0xa5 v */ -ESC_v,               /* 0xa6 w */ -ESC_w,
457
    /* 0xa7 x */ 0,                    /* 0xa8 y */ 0,
458
    /* 0xa9 z */ -ESC_z,               /* 0xaa   */ ESCAPES_FIRST+0x29,
459
    /* 0xab   */ ESCAPES_FIRST+0x2a,   /* 0xac   */ ESCAPES_FIRST+0x2b,
460
    /* 0xad   */ ESCAPES_FIRST+0x2c,   /* 0xae   */ ESCAPES_FIRST+0x2d,
461
    /* 0xaf   */ ESCAPES_FIRST+0x2e,   /* 0xb0   */ ESCAPES_FIRST+0x2f,
462
    /* 0xb1   */ ESCAPES_FIRST+0x30,   /* 0xb2   */ ESCAPES_FIRST+0x31,
463
    /* 0xb3   */ ESCAPES_FIRST+0x32,   /* 0xb4   */ ESCAPES_FIRST+0x33,
464
    /* 0xb5   */ ESCAPES_FIRST+0x34,   /* 0xb6   */ ESCAPES_FIRST+0x35,
465
    /* 0xb7   */ ESCAPES_FIRST+0x36,   /* 0xb8   */ ESCAPES_FIRST+0x37,
466
    /* 0xb9   */ ESCAPES_FIRST+0x38,   /* 0xba   */ ESCAPES_FIRST+0x39,
467
    /* 0xbb   */ ESCAPES_FIRST+0x3a,   /* 0xbc   */ ESCAPES_FIRST+0x3b,
468
    /* 0xbd   */ ESCAPES_FIRST+0x3c,   /* 0xbe   */ ESCAPES_FIRST+0x3d,
469
    /* 0xbf   */ ESCAPES_FIRST+0x3e,   /* 0xc0   */ ESCAPES_FIRST+0x3f,
470
    /* 0xc1 A */ -ESC_A,               /* 0xc2 B */ -ESC_B,
471
    /* 0xc3 C */ -ESC_C,               /* 0xc4 D */ -ESC_D,
472
    /* 0xc5 E */ -ESC_E,               /* 0xc6 F */ 0,
473
    /* 0xc7 G */ -ESC_G,               /* 0xc8 H */ -ESC_H,
474
    /* 0xc9 I */ 0,                    /* 0xca   */ ESCAPES_FIRST+0x49,
475
    /* 0xcb   */ ESCAPES_FIRST+0x4a,   /* 0xcc   */ ESCAPES_FIRST+0x4b,
476
    /* 0xcd   */ ESCAPES_FIRST+0x4c,   /* 0xce   */ ESCAPES_FIRST+0x4d,
477
    /* 0xcf   */ ESCAPES_FIRST+0x4e,   /* 0xd0   */ ESCAPES_FIRST+0x4f,
478
    /* 0xd1 J */ 0,                    /* 0xd2 K */ -ESC_K,
479
    /* 0xd3 L */ 0,                    /* 0xd4 M */ 0,
480
    /* 0xd5 N */ -ESC_N,               /* 0xd6 O */ 0,
481
    /* 0xd7 P */ -ESC_P,               /* 0xd8 Q */ -ESC_Q,
482
    /* 0xd9 R */ -ESC_R,               /* 0xda   */ ESCAPES_FIRST+0x59,
483
    /* 0xdb   */ ESCAPES_FIRST+0x5a,   /* 0xdc   */ ESCAPES_FIRST+0x5b,
484
    /* 0xdd   */ ESCAPES_FIRST+0x5c,   /* 0xde   */ ESCAPES_FIRST+0x5d,
485
    /* 0xdf   */ ESCAPES_FIRST+0x5e,   /* 0xe0   */ ESCAPES_FIRST+0x5f,
486
    /* 0xe1   */ ESCAPES_FIRST+0x60,   /* 0xe2 S */ -ESC_S,
487
    /* 0xe3 T */ 0,                    /* 0xe4 U */ 0,
488
    /* 0xe5 V */ -ESC_V,               /* 0xe6 W */ -ESC_W,
489
    /* 0xe7 X */ -ESC_X,               /* 0xe8 Y */ 0,
490
    /* 0xe9 Z */ -ESC_Z,               /* 0xea   */ ESCAPES_FIRST+0x69,
491
    /* 0xeb   */ ESCAPES_FIRST+0x6a,   /* 0xec   */ ESCAPES_FIRST+0x6b,
492
    /* 0xed   */ ESCAPES_FIRST+0x6c,   /* 0xee   */ ESCAPES_FIRST+0x6d,
493
    /* 0xef   */ ESCAPES_FIRST+0x6e,   /* 0xf0 0 */ 0,
494
    /* 0xf1 1 */ 0,                    /* 0xf2 2 */ 0,
495
    /* 0xf3 3 */ 0,                    /* 0xf4 4 */ 0,
496
    /* 0xf5 5 */ 0,                    /* 0xf6 6 */ 0,
497
    /* 0xf7 7 */ 0,                    /* 0xf8 8 */ 0,
498
    /* 0xf9 9 */ 0,
499
};
500
501
/* We also need a table of characters that may follow \c in an EBCDIC
502
environment for characters 0-31. */
503
504
static unsigned char ebcdic_escape_c[] = {
505
  CHAR_COMMERCIAL_AT, CHAR_A, CHAR_B, CHAR_C, CHAR_D, CHAR_E, CHAR_F, CHAR_G,
506
  CHAR_H, CHAR_I, CHAR_J, CHAR_K, CHAR_L, CHAR_M, CHAR_N, CHAR_O, CHAR_P,
507
  CHAR_Q, CHAR_R, CHAR_S, CHAR_T, CHAR_U, CHAR_V, CHAR_W, CHAR_X, CHAR_Y,
508
  CHAR_Z, CHAR_LEFT_SQUARE_BRACKET, CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
509
  CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE
510
};
511
512
#endif   /* EBCDIC */
513
514
515
/* Table of special "verbs" like (*PRUNE). This is a short table, so it is
516
searched linearly. Put all the names into a single string, in order to reduce
517
the number of relocations when a shared library is dynamically linked. The
518
string is built from string macros so that it works in UTF-8 mode on EBCDIC
519
platforms. */
520
521
typedef struct verbitem {
522
  unsigned int len;          /* Length of verb name */
523
  uint32_t meta;             /* Base META_ code */
524
  int has_arg;               /* Argument requirement */
525
} verbitem;
526
527
static const char verbnames[] =
528
  "\0"                       /* Empty name is a shorthand for MARK */
529
  STRING_MARK0
530
  STRING_ACCEPT0
531
  STRING_F0
532
  STRING_FAIL0
533
  STRING_COMMIT0
534
  STRING_PRUNE0
535
  STRING_SKIP0
536
  STRING_THEN;
537
538
static const verbitem verbs[] = {
539
  { 0, META_MARK,   +1 },  /* > 0 => must have an argument */
540
  { 4, META_MARK,   +1 },
541
  { 6, META_ACCEPT, -1 },  /* < 0 => Optional argument, convert to pre-MARK */
542
  { 1, META_FAIL,   -1 },
543
  { 4, META_FAIL,   -1 },
544
  { 6, META_COMMIT,  0 },
545
  { 5, META_PRUNE,   0 },  /* Optional argument; bump META code if found */
546
  { 4, META_SKIP,    0 },
547
  { 4, META_THEN,    0 }
548
};
549
550
static const int verbcount = sizeof(verbs)/sizeof(verbitem);
551
552
/* Verb opcodes, indexed by their META code offset from META_MARK. */
553
554
static const uint32_t verbops[] = {
555
  OP_MARK, OP_ACCEPT, OP_FAIL, OP_COMMIT, OP_COMMIT_ARG, OP_PRUNE,
556
  OP_PRUNE_ARG, OP_SKIP, OP_SKIP_ARG, OP_THEN, OP_THEN_ARG };
557
558
/* Table of "alpha assertions" like (*pla:...), similar to the (*VERB) table. */
559
560
typedef struct alasitem {
561
  unsigned int len;          /* Length of name */
562
  uint32_t meta;             /* Base META_ code */
563
} alasitem;
564
565
static const char alasnames[] =
566
  STRING_pla0
567
  STRING_plb0
568
  STRING_napla0
569
  STRING_naplb0
570
  STRING_nla0
571
  STRING_nlb0
572
  STRING_positive_lookahead0
573
  STRING_positive_lookbehind0
574
  STRING_non_atomic_positive_lookahead0
575
  STRING_non_atomic_positive_lookbehind0
576
  STRING_negative_lookahead0
577
  STRING_negative_lookbehind0
578
  STRING_scs0
579
  STRING_scan_substring0
580
  STRING_atomic0
581
  STRING_sr0
582
  STRING_asr0
583
  STRING_script_run0
584
  STRING_atomic_script_run;
585
586
static const alasitem alasmeta[] = {
587
  {  3, META_LOOKAHEAD         },
588
  {  3, META_LOOKBEHIND        },
589
  {  5, META_LOOKAHEAD_NA      },
590
  {  5, META_LOOKBEHIND_NA     },
591
  {  3, META_LOOKAHEADNOT      },
592
  {  3, META_LOOKBEHINDNOT     },
593
  { 18, META_LOOKAHEAD         },
594
  { 19, META_LOOKBEHIND        },
595
  { 29, META_LOOKAHEAD_NA      },
596
  { 30, META_LOOKBEHIND_NA     },
597
  { 18, META_LOOKAHEADNOT      },
598
  { 19, META_LOOKBEHINDNOT     },
599
  {  3, META_SCS               },
600
  { 14, META_SCS               },
601
  {  6, META_ATOMIC            },
602
  {  2, META_SCRIPT_RUN        }, /* sr = script run */
603
  {  3, META_ATOMIC_SCRIPT_RUN }, /* asr = atomic script run */
604
  { 10, META_SCRIPT_RUN        }, /* script run */
605
  { 17, META_ATOMIC_SCRIPT_RUN }  /* atomic script run */
606
};
607
608
static const int alascount = sizeof(alasmeta)/sizeof(alasitem);
609
610
/* Offsets from OP_STAR for case-independent and negative repeat opcodes. */
611
612
static uint32_t chartypeoffset[] = {
613
  OP_STAR - OP_STAR,    OP_STARI - OP_STAR,
614
  OP_NOTSTAR - OP_STAR, OP_NOTSTARI - OP_STAR };
615
616
/* Tables of names of POSIX character classes and their lengths. The names are
617
now all in a single string, to reduce the number of relocations when a shared
618
library is dynamically loaded. The list of lengths is terminated by a zero
619
length entry. The first three must be alpha, lower, upper, as this is assumed
620
for handling case independence.
621
622
The indices for several classes are stored in pcre2_compile.h - these must
623
be kept in sync with posix_names, posix_name_lengths, posix_class_maps,
624
and posix_substitutes. */
625
626
static const char posix_names[] =
627
  STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
628
  STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
629
  STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
630
  STRING_word0  STRING_xdigit;
631
632
static const uint8_t posix_name_lengths[] = {
633
  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
634
635
/* Table of class bit maps for each POSIX class. Each class is formed from a
636
base map, with an optional addition or removal of another map. Then, for some
637
classes, there is some additional tweaking: for [:blank:] the vertical space
638
characters are removed, and for [:alpha:] and [:alnum:] the underscore
639
character is removed. The triples in the table consist of the base map offset,
640
second map offset or -1 if no second map, and a non-negative value for map
641
addition or a negative value for map subtraction (if there are two maps). The
642
absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
643
remove vertical space characters, 2 => remove underscore. */
644
645
const int PRIV(posix_class_maps)[] = {
646
  cbit_word,   cbit_digit, -2,            /* alpha */
647
  cbit_lower,  -1,          0,            /* lower */
648
  cbit_upper,  -1,          0,            /* upper */
649
  cbit_word,   -1,          2,            /* alnum - word without underscore */
650
  cbit_print,  cbit_cntrl,  0,            /* ascii */
651
  cbit_space,  -1,          1,            /* blank - a GNU extension */
652
  cbit_cntrl,  -1,          0,            /* cntrl */
653
  cbit_digit,  -1,          0,            /* digit */
654
  cbit_graph,  -1,          0,            /* graph */
655
  cbit_print,  -1,          0,            /* print */
656
  cbit_punct,  -1,          0,            /* punct */
657
  cbit_space,  -1,          0,            /* space */
658
  cbit_word,   -1,          0,            /* word - a Perl extension */
659
  cbit_xdigit, -1,          0             /* xdigit */
660
};
661
662
#ifdef SUPPORT_UNICODE
663
664
/* The POSIX class Unicode property substitutes that are used in UCP mode must
665
be in the order of the POSIX class names, defined above. */
666
667
static int posix_substitutes[] = {
668
  PT_GC, ucp_L,     /* alpha */
669
  PT_PC, ucp_Ll,    /* lower */
670
  PT_PC, ucp_Lu,    /* upper */
671
  PT_ALNUM, 0,      /* alnum */
672
  -1, 0,            /* ascii, treat as non-UCP */
673
  -1, 1,            /* blank, treat as \h */
674
  PT_PC, ucp_Cc,    /* cntrl */
675
  PT_PC, ucp_Nd,    /* digit */
676
  PT_PXGRAPH, 0,    /* graph */
677
  PT_PXPRINT, 0,    /* print */
678
  PT_PXPUNCT, 0,    /* punct */
679
  PT_PXSPACE, 0,    /* space */   /* Xps is POSIX space, but from 8.34 */
680
  PT_WORD, 0,       /* word  */   /* Perl and POSIX space are the same */
681
  PT_PXXDIGIT, 0    /* xdigit */  /* Perl has additional hex digits */
682
};
683
#endif  /* SUPPORT_UNICODE */
684
685
/* Masks for checking option settings. This first mask is the subset of the
compile options that is allowed when PCRE2_LITERAL is set. */

#define PUBLIC_LITERAL_COMPILE_OPTIONS \
  (PCRE2_ANCHORED          | \
   PCRE2_AUTO_CALLOUT      | \
   PCRE2_CASELESS          | \
   PCRE2_ENDANCHORED       | \
   PCRE2_FIRSTLINE         | \
   PCRE2_LITERAL           | \
   PCRE2_MATCH_INVALID_UTF | \
   PCRE2_NO_START_OPTIMIZE | \
   PCRE2_NO_UTF_CHECK      | \
   PCRE2_USE_OFFSET_LIMIT  | \
   PCRE2_UTF)
692
693
/* Mask of the compile option bits: everything in
PUBLIC_LITERAL_COMPILE_OPTIONS plus the bits listed here. */

#define PUBLIC_COMPILE_OPTIONS \
  (PUBLIC_LITERAL_COMPILE_OPTIONS | \
   PCRE2_ALLOW_EMPTY_CLASS        | \
   PCRE2_ALT_BSUX                 | \
   PCRE2_ALT_CIRCUMFLEX           | \
   PCRE2_ALT_VERBNAMES            | \
   PCRE2_DOLLAR_ENDONLY           | \
   PCRE2_DOTALL                   | \
   PCRE2_DUPNAMES                 | \
   PCRE2_EXTENDED                 | \
   PCRE2_EXTENDED_MORE            | \
   PCRE2_MATCH_UNSET_BACKREF      | \
   PCRE2_MULTILINE                | \
   PCRE2_NEVER_BACKSLASH_C        | \
   PCRE2_NEVER_UCP                | \
   PCRE2_NEVER_UTF                | \
   PCRE2_NO_AUTO_CAPTURE          | \
   PCRE2_NO_AUTO_POSSESS          | \
   PCRE2_NO_DOTSTAR_ANCHOR        | \
   PCRE2_UCP                      | \
   PCRE2_UNGREEDY                 | \
   PCRE2_ALT_EXTENDED_CLASS)
701
702
/* Mask of PCRE2_EXTRA_xxx bits; judging by the name, this is the counterpart
of PUBLIC_LITERAL_COMPILE_OPTIONS for the extra options. */

#define PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS \
   (PCRE2_EXTRA_MATCH_LINE        | \
    PCRE2_EXTRA_MATCH_WORD        | \
    PCRE2_EXTRA_CASELESS_RESTRICT | \
    PCRE2_EXTRA_TURKISH_CASING)
705
706
/* Mask of PCRE2_EXTRA_xxx bits: everything in
PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS plus the bits listed here. */

#define PUBLIC_COMPILE_EXTRA_OPTIONS \
   (PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS | \
    PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES  | \
    PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL    | \
    PCRE2_EXTRA_ESCAPED_CR_IS_LF         | \
    PCRE2_EXTRA_ALT_BSUX                 | \
    PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK     | \
    PCRE2_EXTRA_ASCII_BSD                | \
    PCRE2_EXTRA_ASCII_BSS                | \
    PCRE2_EXTRA_ASCII_BSW                | \
    PCRE2_EXTRA_ASCII_POSIX              | \
    PCRE2_EXTRA_ASCII_DIGIT              | \
    PCRE2_EXTRA_PYTHON_OCTAL             | \
    PCRE2_EXTRA_NO_BS0                   | \
    PCRE2_EXTRA_NEVER_CALLOUT)
714
715
/* What follows describes the start-of-pattern items: options such as (*UTF)
and settings such as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and
backward compatibility, (*UTFn) is supported in the relevant libraries, but
(*UTF) is generic and always supported.

These codes say how the value field of a table entry is to be interpreted. */

enum {
  PSO_OPT,     /* Value is an option bit */
  PSO_XOPT,    /* Value is an xoption bit */
  PSO_FLG,     /* Value is a flag bit */
  PSO_NL,      /* Value is a newline type */
  PSO_BSR,     /* Value is a \R type */
  PSO_LIMH,    /* Read integer value for heap limit */
  PSO_LIMM,    /* Read integer value for match limit */
  PSO_LIMD,    /* Read integer value for depth limit */
  PSO_OPTMZ    /* Value is an optimization bit */
};
730
731
/* One entry in the list of start-of-pattern items. */

struct pso {
  const char *name;   /* Text of the item */
  uint16_t length;    /* Length of the name */
  uint16_t type;      /* One of the PSO_xxx codes */
  uint32_t value;     /* Interpretation depends on the type */
};

typedef struct pso pso;
737
738
/* NB: STRING_UTFn_RIGHTPAR contains the length as well */
739
740
static const pso pso_list[] = {
741
  { STRING_UTFn_RIGHTPAR,                  PSO_OPT, PCRE2_UTF },
742
  { STRING_UTF_RIGHTPAR,                4, PSO_OPT, PCRE2_UTF },
743
  { STRING_UCP_RIGHTPAR,                4, PSO_OPT, PCRE2_UCP },
744
  { STRING_NOTEMPTY_RIGHTPAR,           9, PSO_FLG, PCRE2_NOTEMPTY_SET },
745
  { STRING_NOTEMPTY_ATSTART_RIGHTPAR,  17, PSO_FLG, PCRE2_NE_ATST_SET },
746
  { STRING_NO_AUTO_POSSESS_RIGHTPAR,   16, PSO_OPTMZ, PCRE2_OPTIM_AUTO_POSSESS },
747
  { STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR, 18, PSO_OPTMZ, PCRE2_OPTIM_DOTSTAR_ANCHOR },
748
  { STRING_NO_JIT_RIGHTPAR,             7, PSO_FLG, PCRE2_NOJIT },
749
  { STRING_NO_START_OPT_RIGHTPAR,      13, PSO_OPTMZ, PCRE2_OPTIM_START_OPTIMIZE },
750
  { STRING_CASELESS_RESTRICT_RIGHTPAR, 18, PSO_XOPT, PCRE2_EXTRA_CASELESS_RESTRICT },
751
  { STRING_TURKISH_CASING_RIGHTPAR,    15, PSO_XOPT, PCRE2_EXTRA_TURKISH_CASING },
752
  { STRING_LIMIT_HEAP_EQ,              11, PSO_LIMH, 0 },
753
  { STRING_LIMIT_MATCH_EQ,             12, PSO_LIMM, 0 },
754
  { STRING_LIMIT_DEPTH_EQ,             12, PSO_LIMD, 0 },
755
  { STRING_LIMIT_RECURSION_EQ,         16, PSO_LIMD, 0 },
756
  { STRING_CR_RIGHTPAR,                 3, PSO_NL,  PCRE2_NEWLINE_CR },
757
  { STRING_LF_RIGHTPAR,                 3, PSO_NL,  PCRE2_NEWLINE_LF },
758
  { STRING_CRLF_RIGHTPAR,               5, PSO_NL,  PCRE2_NEWLINE_CRLF },
759
  { STRING_ANY_RIGHTPAR,                4, PSO_NL,  PCRE2_NEWLINE_ANY },
760
  { STRING_NUL_RIGHTPAR,                4, PSO_NL,  PCRE2_NEWLINE_NUL },
761
  { STRING_ANYCRLF_RIGHTPAR,            8, PSO_NL,  PCRE2_NEWLINE_ANYCRLF },
762
  { STRING_BSR_ANYCRLF_RIGHTPAR,       12, PSO_BSR, PCRE2_BSR_ANYCRLF },
763
  { STRING_BSR_UNICODE_RIGHTPAR,       12, PSO_BSR, PCRE2_BSR_UNICODE }
764
};
765
766
/* This table is used when converting repeating opcodes into possessified
767
versions as a result of an explicit possessive quantifier such as ++. A zero
768
value means there is no possessified version - in those cases the item in
769
question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
770
because all relevant opcodes are less than that. */
771
772
static const uint8_t opcode_possessify[] = {
773
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0 - 15  */
774
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 16 - 31 */
775
776
  0,                       /* NOTI */
777
  OP_POSSTAR, 0,           /* STAR, MINSTAR */
778
  OP_POSPLUS, 0,           /* PLUS, MINPLUS */
779
  OP_POSQUERY, 0,          /* QUERY, MINQUERY */
780
  OP_POSUPTO, 0,           /* UPTO, MINUPTO */
781
  0,                       /* EXACT */
782
  0, 0, 0, 0,              /* POS{STAR,PLUS,QUERY,UPTO} */
783
784
  OP_POSSTARI, 0,          /* STARI, MINSTARI */
785
  OP_POSPLUSI, 0,          /* PLUSI, MINPLUSI */
786
  OP_POSQUERYI, 0,         /* QUERYI, MINQUERYI */
787
  OP_POSUPTOI, 0,          /* UPTOI, MINUPTOI */
788
  0,                       /* EXACTI */
789
  0, 0, 0, 0,              /* POS{STARI,PLUSI,QUERYI,UPTOI} */
790
791
  OP_NOTPOSSTAR, 0,        /* NOTSTAR, NOTMINSTAR */
792
  OP_NOTPOSPLUS, 0,        /* NOTPLUS, NOTMINPLUS */
793
  OP_NOTPOSQUERY, 0,       /* NOTQUERY, NOTMINQUERY */
794
  OP_NOTPOSUPTO, 0,        /* NOTUPTO, NOTMINUPTO */
795
  0,                       /* NOTEXACT */
796
  0, 0, 0, 0,              /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
797
798
  OP_NOTPOSSTARI, 0,       /* NOTSTARI, NOTMINSTARI */
799
  OP_NOTPOSPLUSI, 0,       /* NOTPLUSI, NOTMINPLUSI */
800
  OP_NOTPOSQUERYI, 0,      /* NOTQUERYI, NOTMINQUERYI */
801
  OP_NOTPOSUPTOI, 0,       /* NOTUPTOI, NOTMINUPTOI */
802
  0,                       /* NOTEXACTI */
803
  0, 0, 0, 0,              /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
804
805
  OP_TYPEPOSSTAR, 0,       /* TYPESTAR, TYPEMINSTAR */
806
  OP_TYPEPOSPLUS, 0,       /* TYPEPLUS, TYPEMINPLUS */
807
  OP_TYPEPOSQUERY, 0,      /* TYPEQUERY, TYPEMINQUERY */
808
  OP_TYPEPOSUPTO, 0,       /* TYPEUPTO, TYPEMINUPTO */
809
  0,                       /* TYPEEXACT */
810
  0, 0, 0, 0,              /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
811
812
  OP_CRPOSSTAR, 0,         /* CRSTAR, CRMINSTAR */
813
  OP_CRPOSPLUS, 0,         /* CRPLUS, CRMINPLUS */
814
  OP_CRPOSQUERY, 0,        /* CRQUERY, CRMINQUERY */
815
  OP_CRPOSRANGE, 0,        /* CRRANGE, CRMINRANGE */
816
  0, 0, 0, 0,              /* CRPOS{STAR,PLUS,QUERY,RANGE} */
817
818
  0, 0, 0, 0,              /* CLASS, NCLASS, XCLASS, ECLASS */
819
  0, 0,                    /* REF, REFI */
820
  0, 0,                    /* DNREF, DNREFI */
821
  0, 0,                    /* RECURSE, CALLOUT */
822
};
823
824
/* Compile-time check that the table has the correct size. */
825
STATIC_ASSERT(sizeof(opcode_possessify) == OP_CALLOUT+1, opcode_possessify);
826
827
828
#ifdef DEBUG_SHOW_PARSED
829
/*************************************************
830
*     Show the parsed pattern for debugging      *
831
*************************************************/
832
833
/* For debugging the pre-scan, this code, which outputs the parsed data vector,
834
can be enabled. */
835
836
static void show_parsed(compile_block *cb)
837
{
838
uint32_t *pptr = cb->parsed_pattern;
839
840
for (;;)
841
  {
842
  int max, min;
843
  PCRE2_SIZE offset;
844
  uint32_t i;
845
  uint32_t length;
846
  uint32_t meta_arg = META_DATA(*pptr);
847
848
  fprintf(stderr, "+++ %02d %.8x ", (int)(pptr - cb->parsed_pattern), *pptr);
849
850
  if (*pptr < META_END)
851
    {
852
    if (*pptr > 32 && *pptr < 128) fprintf(stderr, "%c", *pptr);
853
    pptr++;
854
    }
855
856
  else switch (META_CODE(*pptr++))
857
    {
858
    default:
859
    fprintf(stderr, "**** OOPS - unknown META value - giving up ****\n");
860
    return;
861
862
    case META_END:
863
    fprintf(stderr, "META_END\n");
864
    return;
865
866
    case META_CAPTURE:
867
    fprintf(stderr, "META_CAPTURE %d", meta_arg);
868
    break;
869
870
    case META_RECURSE:
871
    GETOFFSET(offset, pptr);
872
    fprintf(stderr, "META_RECURSE %d %zd", meta_arg, offset);
873
    break;
874
875
    case META_BACKREF:
876
    if (meta_arg < 10)
877
      offset = cb->small_ref_offset[meta_arg];
878
    else
879
      GETOFFSET(offset, pptr);
880
    fprintf(stderr, "META_BACKREF %d %zd", meta_arg, offset);
881
    break;
882
883
    case META_ESCAPE:
884
    if (meta_arg == ESC_P || meta_arg == ESC_p)
885
      {
886
      uint32_t ptype = *pptr >> 16;
887
      uint32_t pvalue = *pptr++ & 0xffff;
888
      fprintf(stderr, "META \\%c %d %d", (meta_arg == ESC_P)? CHAR_P:CHAR_p,
889
        ptype, pvalue);
890
      }
891
    else
892
      {
893
      uint32_t cc;
894
      /* There's just one escape we might have here that isn't negated in the
895
      escapes table. */
896
      if (meta_arg == ESC_g) cc = CHAR_g;
897
      else for (cc = ESCAPES_FIRST; cc <= ESCAPES_LAST; cc++)
898
        {
899
        if (meta_arg == (uint32_t)(-escapes[cc - ESCAPES_FIRST])) break;
900
        }
901
      if (cc > ESCAPES_LAST) cc = CHAR_QUESTION_MARK;
902
      fprintf(stderr, "META \\%c", cc);
903
      }
904
    break;
905
906
    case META_MINMAX:
907
    min = *pptr++;
908
    max = *pptr++;
909
    if (max != REPEAT_UNLIMITED)
910
      fprintf(stderr, "META {%d,%d}", min, max);
911
    else
912
      fprintf(stderr, "META {%d,}", min);
913
    break;
914
915
    case META_MINMAX_QUERY:
916
    min = *pptr++;
917
    max = *pptr++;
918
    if (max != REPEAT_UNLIMITED)
919
      fprintf(stderr, "META {%d,%d}?", min, max);
920
    else
921
      fprintf(stderr, "META {%d,}?", min);
922
    break;
923
924
    case META_MINMAX_PLUS:
925
    min = *pptr++;
926
    max = *pptr++;
927
    if (max != REPEAT_UNLIMITED)
928
      fprintf(stderr, "META {%d,%d}+", min, max);
929
    else
930
      fprintf(stderr, "META {%d,}+", min);
931
    break;
932
933
    case META_BIGVALUE: fprintf(stderr, "META_BIGVALUE %.8x", *pptr++); break;
934
    case META_CIRCUMFLEX: fprintf(stderr, "META_CIRCUMFLEX"); break;
935
    case META_COND_ASSERT: fprintf(stderr, "META_COND_ASSERT"); break;
936
    case META_DOLLAR: fprintf(stderr, "META_DOLLAR"); break;
937
    case META_DOT: fprintf(stderr, "META_DOT"); break;
938
    case META_ASTERISK: fprintf(stderr, "META *"); break;
939
    case META_ASTERISK_QUERY: fprintf(stderr, "META *?"); break;
940
    case META_ASTERISK_PLUS: fprintf(stderr, "META *+"); break;
941
    case META_PLUS: fprintf(stderr, "META +"); break;
942
    case META_PLUS_QUERY: fprintf(stderr, "META +?"); break;
943
    case META_PLUS_PLUS: fprintf(stderr, "META ++"); break;
944
    case META_QUERY: fprintf(stderr, "META ?"); break;
945
    case META_QUERY_QUERY: fprintf(stderr, "META ??"); break;
946
    case META_QUERY_PLUS: fprintf(stderr, "META ?+"); break;
947
948
    case META_ATOMIC: fprintf(stderr, "META (?>"); break;
949
    case META_NOCAPTURE: fprintf(stderr, "META (?:"); break;
950
    case META_LOOKAHEAD: fprintf(stderr, "META (?="); break;
951
    case META_LOOKAHEADNOT: fprintf(stderr, "META (?!"); break;
952
    case META_LOOKAHEAD_NA: fprintf(stderr, "META (*napla:"); break;
953
    case META_SCRIPT_RUN: fprintf(stderr, "META (*sr:"); break;
954
    case META_KET: fprintf(stderr, "META )"); break;
955
    case META_ALT: fprintf(stderr, "META | %d", meta_arg); break;
956
957
    case META_CLASS: fprintf(stderr, "META ["); break;
958
    case META_CLASS_NOT: fprintf(stderr, "META [^"); break;
959
    case META_CLASS_END: fprintf(stderr, "META ]"); break;
960
    case META_CLASS_EMPTY: fprintf(stderr, "META []"); break;
961
    case META_CLASS_EMPTY_NOT: fprintf(stderr, "META [^]"); break;
962
963
    case META_RANGE_LITERAL: fprintf(stderr, "META - (literal)"); break;
964
    case META_RANGE_ESCAPED: fprintf(stderr, "META - (escaped)"); break;
965
966
    case META_POSIX: fprintf(stderr, "META_POSIX %d", *pptr++); break;
967
    case META_POSIX_NEG: fprintf(stderr, "META_POSIX_NEG %d", *pptr++); break;
968
969
    case META_ACCEPT: fprintf(stderr, "META (*ACCEPT)"); break;
970
    case META_FAIL: fprintf(stderr, "META (*FAIL)"); break;
971
    case META_COMMIT: fprintf(stderr, "META (*COMMIT)"); break;
972
    case META_PRUNE: fprintf(stderr, "META (*PRUNE)"); break;
973
    case META_SKIP: fprintf(stderr, "META (*SKIP)"); break;
974
    case META_THEN: fprintf(stderr, "META (*THEN)"); break;
975
976
    case META_OPTIONS:
977
    fprintf(stderr, "META_OPTIONS 0x%08x 0x%08x", pptr[0], pptr[1]);
978
    pptr += 2;
979
    break;
980
981
    case META_LOOKBEHIND:
982
    fprintf(stderr, "META (?<= %d %d", meta_arg, *pptr);
983
    pptr += 2;
984
    break;
985
986
    case META_LOOKBEHIND_NA:
987
    fprintf(stderr, "META (*naplb: %d %d", meta_arg, *pptr);
988
    pptr += 2;
989
    break;
990
991
    case META_LOOKBEHINDNOT:
992
    fprintf(stderr, "META (?<! %d %d", meta_arg, *pptr);
993
    pptr += 2;
994
    break;
995
996
    case META_CALLOUT_NUMBER:
997
    fprintf(stderr, "META (?C%d) next=%d/%d", pptr[2], pptr[0],
998
       pptr[1]);
999
    pptr += 3;
1000
    break;
1001
1002
    case META_CALLOUT_STRING:
1003
      {
1004
      uint32_t patoffset = *pptr++;    /* Offset of next pattern item */
1005
      uint32_t patlength = *pptr++;    /* Length of next pattern item */
1006
      fprintf(stderr, "META (?Cstring) length=%d offset=", *pptr++);
1007
      GETOFFSET(offset, pptr);
1008
      fprintf(stderr, "%zd next=%d/%d", offset, patoffset, patlength);
1009
      }
1010
    break;
1011
1012
    case META_RECURSE_BYNAME:
1013
    fprintf(stderr, "META (?(&name) length=%d offset=", *pptr++);
1014
    GETOFFSET(offset, pptr);
1015
    fprintf(stderr, "%zd", offset);
1016
    break;
1017
1018
    case META_BACKREF_BYNAME:
1019
    fprintf(stderr, "META_BACKREF_BYNAME length=%d offset=", *pptr++);
1020
    GETOFFSET(offset, pptr);
1021
    fprintf(stderr, "%zd", offset);
1022
    break;
1023
1024
    case META_COND_NUMBER:
1025
    fprintf(stderr, "META_COND_NUMBER %d offset=", pptr[SIZEOFFSET]);
1026
    GETOFFSET(offset, pptr);
1027
    fprintf(stderr, "%zd", offset);
1028
    pptr++;
1029
    break;
1030
1031
    case META_COND_DEFINE:
1032
    fprintf(stderr, "META (?(DEFINE) offset=");
1033
    GETOFFSET(offset, pptr);
1034
    fprintf(stderr, "%zd", offset);
1035
    break;
1036
1037
    case META_COND_VERSION:
1038
    fprintf(stderr, "META (?(VERSION%s", (*pptr++ == 0)? "=" : ">=");
1039
    fprintf(stderr, "%d.", *pptr++);
1040
    fprintf(stderr, "%d)", *pptr++);
1041
    break;
1042
1043
    case META_COND_NAME:
1044
    fprintf(stderr, "META (?(<name>) length=%d offset=", *pptr++);
1045
    GETOFFSET(offset, pptr);
1046
    fprintf(stderr, "%zd", offset);
1047
    break;
1048
1049
    case META_COND_RNAME:
1050
    fprintf(stderr, "META (?(R&name) length=%d offset=", *pptr++);
1051
    GETOFFSET(offset, pptr);
1052
    fprintf(stderr, "%zd", offset);
1053
    break;
1054
1055
    /* This is kept as a name, because it might be. */
1056
1057
    case META_COND_RNUMBER:
1058
    fprintf(stderr, "META (?(Rnumber) length=%d offset=", *pptr++);
1059
    GETOFFSET(offset, pptr);
1060
    fprintf(stderr, "%zd", offset);
1061
    break;
1062
1063
    case META_OFFSET:
1064
    fprintf(stderr, "META_OFFSET offset=");
1065
    GETOFFSET(offset, pptr);
1066
    fprintf(stderr, "%zd", offset);
1067
    break;
1068
1069
    case META_SCS:
1070
    fprintf(stderr, "META (*scan_substring:");
1071
    break;
1072
1073
    case META_CAPTURE_NAME:
1074
    fprintf(stderr, "META_CAPTURE_NAME length=%d relative_offset=%d", *pptr++, (int)meta_arg);
1075
    break;
1076
1077
    case META_CAPTURE_NUMBER:
1078
    fprintf(stderr, "META_CAPTURE_NUMBER %d relative_offset=%d", *pptr++, (int)meta_arg);
1079
    break;
1080
1081
    case META_MARK:
1082
    fprintf(stderr, "META (*MARK:");
1083
    goto SHOWARG;
1084
1085
    case META_COMMIT_ARG:
1086
    fprintf(stderr, "META (*COMMIT:");
1087
    goto SHOWARG;
1088
1089
    case META_PRUNE_ARG:
1090
    fprintf(stderr, "META (*PRUNE:");
1091
    goto SHOWARG;
1092
1093
    case META_SKIP_ARG:
1094
    fprintf(stderr, "META (*SKIP:");
1095
    goto SHOWARG;
1096
1097
    case META_THEN_ARG:
1098
    fprintf(stderr, "META (*THEN:");
1099
    SHOWARG:
1100
    length = *pptr++;
1101
    for (i = 0; i < length; i++)
1102
      {
1103
      uint32_t cc = *pptr++;
1104
      if (cc > 32 && cc < 128) fprintf(stderr, "%c", cc);
1105
        else fprintf(stderr, "\\x{%x}", cc);
1106
      }
1107
    fprintf(stderr, ") length=%u", length);
1108
    break;
1109
1110
    case META_ECLASS_AND: fprintf(stderr, "META_ECLASS_AND"); break;
1111
    case META_ECLASS_OR: fprintf(stderr, "META_ECLASS_OR"); break;
1112
    case META_ECLASS_SUB: fprintf(stderr, "META_ECLASS_SUB"); break;
1113
    case META_ECLASS_XOR: fprintf(stderr, "META_ECLASS_XOR"); break;
1114
    case META_ECLASS_NOT: fprintf(stderr, "META_ECLASS_NOT"); break;
1115
    }
1116
  fprintf(stderr, "\n");
1117
  }
1118
return;
1119
}
1120
#endif  /* DEBUG_SHOW_PARSED */
1121
1122
1123
1124
/*************************************************
1125
*               Copy compiled code               *
1126
*************************************************/
1127
1128
/* Compiled JIT code cannot be copied, so the new compiled block has no
1129
associated JIT data. */
1130
1131
PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
1132
pcre2_code_copy(const pcre2_code *code)
1133
0
{
1134
0
PCRE2_SIZE *ref_count;
1135
0
pcre2_code *newcode;
1136
1137
0
if (code == NULL) return NULL;
1138
0
newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
1139
0
if (newcode == NULL) return NULL;
1140
0
memcpy(newcode, code, code->blocksize);
1141
0
newcode->executable_jit = NULL;
1142
1143
/* If the code is one that has been deserialized, increment the reference count
1144
in the decoded tables. */
1145
1146
0
if ((code->flags & PCRE2_DEREF_TABLES) != 0)
1147
0
  {
1148
0
  ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
1149
0
  (*ref_count)++;
1150
0
  }
1151
1152
0
return newcode;
1153
0
}
1154
1155
1156
1157
/*************************************************
1158
*     Copy compiled code and character tables    *
1159
*************************************************/
1160
1161
/* Compiled JIT code cannot be copied, so the new compiled block has no
1162
associated JIT data. This version of code_copy also makes a separate copy of
1163
the character tables. */
1164
1165
PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
1166
pcre2_code_copy_with_tables(const pcre2_code *code)
1167
0
{
1168
0
PCRE2_SIZE* ref_count;
1169
0
pcre2_code *newcode;
1170
0
uint8_t *newtables;
1171
1172
0
if (code == NULL) return NULL;
1173
0
newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
1174
0
if (newcode == NULL) return NULL;
1175
0
memcpy(newcode, code, code->blocksize);
1176
0
newcode->executable_jit = NULL;
1177
1178
0
newtables = code->memctl.malloc(TABLES_LENGTH + sizeof(PCRE2_SIZE),
1179
0
  code->memctl.memory_data);
1180
0
if (newtables == NULL)
1181
0
  {
1182
0
  code->memctl.free((void *)newcode, code->memctl.memory_data);
1183
0
  return NULL;
1184
0
  }
1185
0
memcpy(newtables, code->tables, TABLES_LENGTH);
1186
0
ref_count = (PCRE2_SIZE *)(newtables + TABLES_LENGTH);
1187
0
*ref_count = 1;
1188
1189
0
newcode->tables = newtables;
1190
0
newcode->flags |= PCRE2_DEREF_TABLES;
1191
0
return newcode;
1192
0
}
1193
1194
1195
1196
/*************************************************
1197
*               Free compiled code               *
1198
*************************************************/
1199
1200
PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
1201
pcre2_code_free(pcre2_code *code)
1202
0
{
1203
0
PCRE2_SIZE* ref_count;
1204
1205
0
if (code != NULL)
1206
0
  {
1207
0
#ifdef SUPPORT_JIT
1208
0
  if (code->executable_jit != NULL)
1209
0
    PRIV(jit_free)(code->executable_jit, &code->memctl);
1210
0
#endif
1211
1212
0
  if ((code->flags & PCRE2_DEREF_TABLES) != 0)
1213
0
    {
1214
    /* Decoded tables belong to the codes after deserialization, and they must
1215
    be freed when there are no more references to them. The *ref_count should
1216
    always be > 0. */
1217
1218
0
    ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
1219
0
    if (*ref_count > 0)
1220
0
      {
1221
0
      (*ref_count)--;
1222
0
      if (*ref_count == 0)
1223
0
        code->memctl.free((void *)code->tables, code->memctl.memory_data);
1224
0
      }
1225
0
    }
1226
1227
0
  code->memctl.free(code, code->memctl.memory_data);
1228
0
  }
1229
0
}
1230
1231
1232
1233
/*************************************************
1234
*         Read a number, possibly signed         *
1235
*************************************************/
1236
1237
/* This function is used to read numbers in the pattern. The initial pointer
1238
must be at the sign or first digit of the number. When relative values
1239
(introduced by + or -) are allowed, they are relative group numbers, and the
1240
result must be greater than zero.
1241
1242
Arguments:
1243
  ptrptr      points to the character pointer variable
1244
  ptrend      points to the end of the input string
1245
  allow_sign  if < 0, sign not allowed; if >= 0, sign is relative to this
1246
  max_value   the largest number allowed;
1247
              you must not pass a value for max_value larger than
1248
              INT_MAX/10 - 1 because this function relies on max_value to
1249
              avoid integer overflow
1250
  max_error   the error to give for an over-large number
1251
  intptr      where to put the result
1252
  errcodeptr  where to put an error code
1253
1254
Returns:      TRUE  - a number was read
1255
              FALSE - errorcode == 0 => no number was found
1256
                      errorcode != 0 => an error occurred
1257
*/
1258
1259
static BOOL
1260
read_number(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, int32_t allow_sign,
1261
  uint32_t max_value, uint32_t max_error, int *intptr, int *errorcodeptr)
1262
192
{
1263
192
int sign = 0;
1264
192
uint32_t n = 0;
1265
192
PCRE2_SPTR ptr = *ptrptr;
1266
192
BOOL yield = FALSE;
1267
1268
192
PCRE2_ASSERT(max_value <= INT_MAX/10 - 1);
1269
1270
192
*errorcodeptr = 0;
1271
1272
192
if (allow_sign >= 0 && ptr < ptrend)
1273
0
  {
1274
0
  if (*ptr == CHAR_PLUS)
1275
0
    {
1276
0
    sign = +1;
1277
0
    max_value -= allow_sign;
1278
0
    ptr++;
1279
0
    }
1280
0
  else if (*ptr == CHAR_MINUS)
1281
0
    {
1282
0
    sign = -1;
1283
0
    ptr++;
1284
0
    }
1285
0
  }
1286
1287
192
if (ptr >= ptrend || !IS_DIGIT(*ptr)) return FALSE;
1288
384
while (ptr < ptrend && IS_DIGIT(*ptr))
1289
192
  {
1290
192
  n = n * 10 + (*ptr++ - CHAR_0);
1291
192
  if (n > max_value)
1292
0
    {
1293
0
    *errorcodeptr = max_error;
1294
0
    while (ptr < ptrend && IS_DIGIT(*ptr)) ptr++;
1295
0
    goto EXIT;
1296
0
    }
1297
192
  }
1298
1299
192
if (allow_sign >= 0 && sign != 0)
1300
0
  {
1301
0
  if (n == 0)
1302
0
    {
1303
0
    *errorcodeptr = ERR26;  /* +0 and -0 are not allowed */
1304
0
    goto EXIT;
1305
0
    }
1306
1307
0
  if (sign > 0) n += allow_sign;
1308
0
  else if (n > (uint32_t)allow_sign)
1309
0
    {
1310
0
    *errorcodeptr = ERR15;  /* Non-existent subpattern */
1311
0
    goto EXIT;
1312
0
    }
1313
0
  else n = allow_sign + 1 - n;
1314
0
  }
1315
1316
192
yield = TRUE;
1317
1318
192
EXIT:
1319
192
*intptr = n;
1320
192
*ptrptr = ptr;
1321
192
return yield;
1322
192
}
1323
1324
1325
1326
/*************************************************
1327
*         Read repeat counts                     *
1328
*************************************************/
1329
1330
/* Read an item of the form {n,m} and return the values when non-NULL pointers
1331
are supplied. Repeat counts must be less than 65536 (MAX_REPEAT_COUNT); a
1332
larger value is used for "unlimited". We have to use signed arguments for
1333
read_number() because it is capable of returning a signed value. As of Perl
1334
5.34.0 either n or m may be absent, but not both. Perl also allows spaces and
1335
tabs after { and before } and between the numbers and the comma, so we do too.
1336
1337
Arguments:
1338
  ptrptr         points to pointer to character after '{'
1339
  ptrend         pointer to end of input
1340
  minp           if not NULL, pointer to int for min
1341
  maxp           if not NULL, pointer to int for max
1342
  errorcodeptr   points to error code variable
1343
1344
Returns:         FALSE if not a repeat quantifier, errorcode set zero
1345
                 FALSE on error, with errorcode set non-zero
1346
                 TRUE on success, with pointer updated to point after '}'
1347
*/
1348
1349
static BOOL
1350
read_repeat_counts(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *minp,
1351
  uint32_t *maxp, int *errorcodeptr)
1352
96
{
1353
96
PCRE2_SPTR p = *ptrptr;
1354
96
PCRE2_SPTR pp;
1355
96
BOOL yield = FALSE;
1356
96
BOOL had_minimum = FALSE;
1357
96
int32_t min = 0;
1358
96
int32_t max = REPEAT_UNLIMITED; /* This value is larger than MAX_REPEAT_COUNT */
1359
1360
96
*errorcodeptr = 0;
1361
96
while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1362
1363
/* Check the syntax before interpreting. Otherwise, a non-quantifier sequence
1364
such as "X{123456ABC" would incorrectly give a "number too big in quantifier"
1365
error. */
1366
1367
96
pp = p;
1368
96
if (pp < ptrend && IS_DIGIT(*pp))
1369
96
  {
1370
96
  had_minimum = TRUE;
1371
96
  while (++pp < ptrend && IS_DIGIT(*pp)) {}
1372
96
  }
1373
1374
96
while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;
1375
96
if (pp >= ptrend) return FALSE;
1376
1377
96
if (*pp == CHAR_RIGHT_CURLY_BRACKET)
1378
0
  {
1379
0
  if (!had_minimum) return FALSE;
1380
0
  }
1381
96
else
1382
96
  {
1383
96
  if (*pp++ != CHAR_COMMA) return FALSE;
1384
96
  while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;
1385
96
  if (pp >= ptrend) return FALSE;
1386
96
  if (IS_DIGIT(*pp))
1387
96
    {
1388
96
    while (++pp < ptrend && IS_DIGIT(*pp)) {}
1389
96
    }
1390
0
  else if (!had_minimum) return FALSE;
1391
96
  while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;
1392
96
  if (pp >= ptrend || *pp != CHAR_RIGHT_CURLY_BRACKET) return FALSE;
1393
96
  }
1394
1395
/* Now process the quantifier for real. We know it must be {n} or {n,} or {,m}
1396
or {n,m}. The only error that read_number() can return is for a number that is
1397
too big. If *errorcodeptr is returned as zero it means no number was found. */
1398
1399
/* Deal with {,m} or n too big. If we successfully read m there is no need to
1400
check m >= n because n defaults to zero. */
1401
1402
96
if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &min, errorcodeptr))
1403
0
  {
1404
0
  if (*errorcodeptr != 0) goto EXIT;    /* n too big */
1405
0
  p++;  /* Skip comma and subsequent spaces */
1406
0
  while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1407
0
  if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max, errorcodeptr))
1408
0
    {
1409
0
    if (*errorcodeptr != 0) goto EXIT;  /* m too big */
1410
0
    }
1411
0
  }
1412
1413
/* Have read one number. Deal with {n} or {n,} or {n,m} */
1414
1415
96
else
1416
96
  {
1417
96
  while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1418
96
  if (*p == CHAR_RIGHT_CURLY_BRACKET)
1419
0
    {
1420
0
    max = min;
1421
0
    }
1422
96
  else   /* Handle {n,} or {n,m} */
1423
96
    {
1424
96
    p++;    /* Skip comma and subsequent spaces */
1425
96
    while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1426
96
    if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max, errorcodeptr))
1427
0
      {
1428
0
      if (*errorcodeptr != 0) goto EXIT;   /* m too big */
1429
0
      }
1430
1431
96
    if (max < min)
1432
0
      {
1433
0
      *errorcodeptr = ERR4;
1434
0
      goto EXIT;
1435
0
      }
1436
96
    }
1437
96
  }
1438
1439
/* Valid quantifier exists */
1440
1441
96
while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1442
96
p++;
1443
96
yield = TRUE;
1444
96
if (minp != NULL) *minp = (uint32_t)min;
1445
96
if (maxp != NULL) *maxp = (uint32_t)max;
1446
1447
/* Update the pattern pointer */
1448
1449
96
EXIT:
1450
96
*ptrptr = p;
1451
96
return yield;
1452
96
}
1453
1454
1455
1456
/*************************************************
1457
*            Handle escapes                      *
1458
*************************************************/
1459
1460
/* This function is called when a \ has been encountered. It either returns a
1461
positive value for a simple escape such as \d, or 0 for a data character, which
1462
is placed in chptr. A backreference to group n is returned as -(n+1). On
1463
entry, ptr is pointing at the character after \. On exit, it points after the
1464
final code unit of the escape sequence.
1465
1466
This function is also called from pcre2_substitute() to handle escape sequences
1467
in replacement strings. In this case, the cb argument is NULL, and in the case
1468
of escapes that have further processing, only sequences that define a data
1469
character are recognised. The options argument is the final value of the
1470
compiled pattern's options.
1471
1472
Arguments:
1473
  ptrptr         points to the input position pointer
1474
  ptrend         points to the end of the input
1475
  chptr          points to a returned data character
1476
  errorcodeptr   points to the errorcode variable (containing zero)
1477
  options        the current options bits
1478
  xoptions       the current extra options bits
1479
  bracount       the number of capturing parentheses encountered so far
1480
  isclass        TRUE if in a character class
1481
  cb             compile data block or NULL when called from pcre2_substitute()
1482
1483
Returns:         zero => a data character
1484
                 positive => a special escape sequence
1485
                 negative => a numerical back reference
1486
                 on error, errorcodeptr is set non-zero
1487
*/
1488
1489
int
1490
PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr,
1491
  int *errorcodeptr, uint32_t options, uint32_t xoptions, uint32_t bracount,
1492
  BOOL isclass, compile_block *cb)
1493
68
{
1494
68
BOOL utf = (options & PCRE2_UTF) != 0;
1495
68
BOOL alt_bsux =
1496
68
  ((options & PCRE2_ALT_BSUX) | (xoptions & PCRE2_EXTRA_ALT_BSUX)) != 0;
1497
68
PCRE2_SPTR ptr = *ptrptr;
1498
68
uint32_t c, cc;
1499
68
int escape = 0;
1500
68
int i;
1501
1502
/* If backslash is at the end of the string, it's an error. */
1503
1504
68
if (ptr >= ptrend)
1505
0
  {
1506
0
  *errorcodeptr = ERR1;
1507
0
  return 0;
1508
0
  }
1509
1510
68
GETCHARINCTEST(c, ptr);         /* Get character value, increment pointer */
1511
68
*errorcodeptr = 0;              /* Be optimistic */
1512
1513
/* Non-alphanumerics are literals, so we just leave the value in c. An initial
1514
value test saves a memory lookup for code points outside the alphanumeric
1515
range. */
1516
1517
68
if (c < ESCAPES_FIRST || c > ESCAPES_LAST) {}  /* Definitely literal */
1518
1519
/* Otherwise, do a table lookup. Non-zero values need little processing here. A
1520
positive value is a literal value for something like \n. A negative value is
1521
the negation of one of the ESC_ macros that is passed back for handling by the
1522
calling function. Some extra checking is needed for \N because only \N{U+dddd}
1523
is supported. If the value is zero, further processing is handled below. */
1524
1525
52
else if ((i = escapes[c - ESCAPES_FIRST]) != 0)
1526
24
  {
1527
24
  if (i > 0)
1528
16
    {
1529
16
    c = (uint32_t)i;
1530
16
    if (c == CHAR_CR && (xoptions & PCRE2_EXTRA_ESCAPED_CR_IS_LF) != 0)
1531
0
      c = CHAR_LF;
1532
16
    }
1533
8
  else  /* Negative table entry */
1534
8
    {
1535
8
    escape = -i;                    /* Else return a special escape */
1536
8
    if (cb != NULL && (escape == ESC_P || escape == ESC_p || escape == ESC_X))
1537
0
      cb->external_flags |= PCRE2_HASBKPORX;   /* Note \P, \p, or \X */
1538
1539
    /* Perl supports \N{name} for character names and \N{U+dddd} for numerical
1540
    Unicode code points, as well as plain \N for "not newline". PCRE does not
1541
    support \N{name}. However, it does support quantification such as \N{2,3},
1542
    so if \N{ is not followed by U+dddd we check for a quantifier. */
1543
1544
8
    if (escape == ESC_N && ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
1545
0
      {
1546
0
      PCRE2_SPTR p = ptr + 1;
1547
1548
      /* Perl ignores spaces and tabs after { */
1549
1550
0
      while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1551
1552
      /* \N{U+ can be handled by the \x{ code. However, this construction is
1553
      not valid in EBCDIC environments because it specifies a Unicode
1554
      character, not a codepoint in the local code. For example \N{U+0041}
1555
      must be "A" in all environments. Also, in Perl, \N{U+ forces Unicode
1556
      casing semantics for the entire pattern, so allow it only in UTF (i.e.
1557
      Unicode) mode. */
1558
1559
0
      if (ptrend - p > 1 && *p == CHAR_U && p[1] == CHAR_PLUS)
1560
0
        {
1561
0
#ifndef EBCDIC
1562
0
        if (utf)
1563
0
          {
1564
0
          ptr = p + 2;
1565
0
          escape = 0;   /* Not a fancy escape after all */
1566
0
          goto COME_FROM_NU;
1567
0
          }
1568
0
#endif
1569
1570
        /* Improve error offset. */
1571
0
        ptr = p + 2;
1572
0
        while (ptr < ptrend && XDIGIT(*ptr) != 0xff) ptr++;
1573
0
        while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
1574
0
        if (ptr < ptrend && *ptr == CHAR_RIGHT_CURLY_BRACKET) ptr++;
1575
1576
0
        *errorcodeptr = ERR93;
1577
0
        }
1578
1579
      /* Give an error in contexts where quantifiers are not allowed
1580
      (character classes; substitution strings). */
1581
1582
0
      else if (isclass || cb == NULL)
1583
0
        {
1584
0
        ptr++; /* Skip over the opening brace */
1585
0
        *errorcodeptr = ERR37;
1586
0
        }
1587
1588
      /* Give an error if what follows is not a quantifier, but don't override
1589
      an error set by the quantifier reader (e.g. number overflow). */
1590
1591
0
      else
1592
0
        {
1593
0
        if (!read_repeat_counts(&p, ptrend, NULL, NULL, errorcodeptr) &&
1594
0
             *errorcodeptr == 0)
1595
0
          {
1596
0
          ptr++; /* Skip over the opening brace */
1597
0
          *errorcodeptr = ERR37;
1598
0
          }
1599
0
        }
1600
0
      }
1601
8
    }
1602
24
  }
1603
1604
/* Escapes that need further processing, including those that are unknown, have
1605
a zero entry in the lookup table. When called from pcre2_substitute(), only \c,
1606
\g, \o, \x, and digits are let through (\u and \U can never appear as they are
1607
used for case forcing). */
1608
1609
28
else
1610
28
  {
1611
28
  int s;
1612
28
  PCRE2_SPTR oldptr;
1613
28
  BOOL overflow;
1614
1615
  /* Filter calls from pcre2_substitute(). */
1616
1617
28
  if (cb == NULL)
1618
0
    {
1619
0
    if (!(c >= CHAR_0 && c <= CHAR_9) && c != CHAR_c && c != CHAR_o &&
1620
0
        c != CHAR_x && c != CHAR_g)
1621
0
      {
1622
0
      *errorcodeptr = ERR3;
1623
0
      goto EXIT;
1624
0
      }
1625
0
    alt_bsux = FALSE;   /* Do not modify \x handling */
1626
0
    }
1627
1628
28
  switch (c)
1629
28
    {
1630
    /* A number of Perl escapes are not handled by PCRE. We give an explicit
1631
    error. */
1632
1633
0
    case CHAR_F:
1634
0
    case CHAR_l:
1635
0
    case CHAR_L:
1636
0
    *errorcodeptr = ERR37;
1637
0
    break;
1638
1639
    /* \u is unrecognized when neither PCRE2_ALT_BSUX nor PCRE2_EXTRA_ALT_BSUX
1640
    is set. Otherwise, \u must be followed by exactly four hex digits or, if
1641
    PCRE2_EXTRA_ALT_BSUX is set, by any number of hex digits in braces.
1642
    Otherwise it is a lowercase u letter. This gives some compatibility with
1643
    ECMAScript (aka JavaScript). Unlike other braced items, white space is NOT
1644
    allowed. When \u{ is not followed by hex digits, a special return is given
1645
    because otherwise \u{ 12} (for example) would be treated as u{12}. */
1646
1647
0
    case CHAR_u:
1648
0
    if (!alt_bsux)
1649
0
      *errorcodeptr = ERR37;
1650
0
    else
1651
0
      {
1652
0
      uint32_t xc;
1653
1654
0
      if (ptr >= ptrend) break;
1655
0
      if (*ptr == CHAR_LEFT_CURLY_BRACKET &&
1656
0
          (xoptions & PCRE2_EXTRA_ALT_BSUX) != 0)
1657
0
        {
1658
0
        PCRE2_SPTR hptr = ptr + 1;
1659
1660
0
        cc = 0;
1661
0
        while (hptr < ptrend && (xc = XDIGIT(*hptr)) != 0xff)
1662
0
          {
1663
0
          if ((cc & 0xf0000000) != 0)  /* Test for 32-bit overflow */
1664
0
            {
1665
0
            *errorcodeptr = ERR77;
1666
0
            ptr = hptr;   /* Show where */
1667
0
            break;        /* *hptr != } will cause another break below */
1668
0
            }
1669
0
          cc = (cc << 4) | xc;
1670
0
          hptr++;
1671
0
          }
1672
1673
0
        if (hptr == ptr + 1 ||   /* No hex digits */
1674
0
            hptr >= ptrend ||    /* Hit end of input */
1675
0
            *hptr != CHAR_RIGHT_CURLY_BRACKET)  /* No } terminator */
1676
0
          {
1677
0
          if (isclass) break; /* In a class, just treat as '\u' literal */
1678
0
          escape = ESC_ub;    /* Special return */
1679
0
          ptr++;              /* Skip { */
1680
0
          break;              /* Hex escape not recognized */
1681
0
          }
1682
1683
0
        c = cc;          /* Accept the code point */
1684
0
        ptr = hptr + 1;
1685
0
        }
1686
1687
0
      else  /* Must be exactly 4 hex digits */
1688
0
        {
1689
0
        if (ptrend - ptr < 4) break;               /* Less than 4 chars */
1690
0
        if ((cc = XDIGIT(ptr[0])) == 0xff) break;  /* Not a hex digit */
1691
0
        if ((xc = XDIGIT(ptr[1])) == 0xff) break;  /* Not a hex digit */
1692
0
        cc = (cc << 4) | xc;
1693
0
        if ((xc = XDIGIT(ptr[2])) == 0xff) break;  /* Not a hex digit */
1694
0
        cc = (cc << 4) | xc;
1695
0
        if ((xc = XDIGIT(ptr[3])) == 0xff) break;  /* Not a hex digit */
1696
0
        c = (cc << 4) | xc;
1697
0
        ptr += 4;
1698
0
        }
1699
1700
0
      if (utf)
1701
0
        {
1702
0
        if (c > 0x10ffffU) *errorcodeptr = ERR77;
1703
0
        else
1704
0
          if (c >= 0xd800 && c <= 0xdfff &&
1705
0
              (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1706
0
                *errorcodeptr = ERR73;
1707
0
        }
1708
0
      else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77;
1709
0
      }
1710
0
    break;
1711
1712
    /* \U is unrecognized unless PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set,
1713
    in which case it is an upper case letter. */
1714
1715
0
    case CHAR_U:
1716
0
    if (!alt_bsux) *errorcodeptr = ERR37;
1717
0
    break;
1718
1719
    /* In a character class, \g is just a literal "g". Outside a character
1720
    class, \g must be followed by one of a number of specific things:
1721
1722
    (1) A number, either plain or braced. If positive, it is an absolute
1723
    backreference. If negative, it is a relative backreference. This is a Perl
1724
    5.10 feature.
1725
1726
    (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1727
    is part of Perl's movement towards a unified syntax for back references. As
1728
    this is synonymous with \k{name}, we fudge it up by pretending it really
1729
    was \k{name}.
1730
1731
    (3) For Oniguruma compatibility we also support \g followed by a name or a
1732
    number either in angle brackets or in single quotes. However, these are
1733
    (possibly recursive) subroutine calls, _not_ backreferences. We return
1734
    the ESC_g code.
1735
1736
    Summary: Return a negative number for a numerical back reference (offset
1737
    by 1), ESC_k for a named back reference, and ESC_g for a named or
1738
    numbered subroutine call.
1739
1740
    The above describes the \g behaviour inside patterns. Inside replacement
1741
    strings (pcre2_substitute) we support only \g<nameornum> for Python
1742
    compatibility. Return ESC_g for the named case, and -(num+1) for the
1743
    numbered case.
1744
    */
1745
1746
0
    case CHAR_g:
1747
0
    if (isclass) break;
1748
1749
0
    if (ptr >= ptrend)
1750
0
      {
1751
0
      *errorcodeptr = ERR57;
1752
0
      break;
1753
0
      }
1754
1755
0
    if (cb == NULL)
1756
0
      {
1757
0
      PCRE2_SPTR p;
1758
      /* Substitution strings */
1759
0
      if (*ptr != CHAR_LESS_THAN_SIGN)
1760
0
        {
1761
0
        *errorcodeptr = ERR57;
1762
0
        break;
1763
0
        }
1764
1765
0
      p = ptr + 1;
1766
1767
0
      if (!read_number(&p, ptrend, -1, MAX_GROUP_NUMBER, ERR61, &s,
1768
0
          errorcodeptr))
1769
0
        {
1770
0
        if (*errorcodeptr == 0) escape = ESC_g;  /* No number found */
1771
0
        break;
1772
0
        }
1773
1774
0
      if (p >= ptrend || *p != CHAR_GREATER_THAN_SIGN)
1775
0
        {
1776
0
        ptr = p;
1777
0
        *errorcodeptr = ERR119;  /* Missing terminator for number */
1778
0
        break;
1779
0
        }
1780
1781
      /* This is the reason that back references are returned as -(s+1) rather
1782
      than just -s. In a pattern, \0 is not a back reference, but \g<0> is
1783
      valid in a substitution string, so this must be representable. */
1784
0
      ptr = p + 1;
1785
0
      escape = -(s+1);
1786
0
      break;
1787
0
      }
1788
1789
0
    if (*ptr == CHAR_LESS_THAN_SIGN || *ptr == CHAR_APOSTROPHE)
1790
0
      {
1791
0
      escape = ESC_g;
1792
0
      break;
1793
0
      }
1794
1795
    /* If there is a brace delimiter, try to read a numerical reference. If
1796
    there isn't one, assume we have a name and treat it as \k. */
1797
1798
0
    if (*ptr == CHAR_LEFT_CURLY_BRACKET)
1799
0
      {
1800
0
      PCRE2_SPTR p = ptr + 1;
1801
1802
0
      while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1803
0
      if (!read_number(&p, ptrend, bracount, MAX_GROUP_NUMBER, ERR61, &s,
1804
0
          errorcodeptr))
1805
0
        {
1806
0
        if (*errorcodeptr == 0) escape = ESC_k;  /* No number found */
1807
0
        break;
1808
0
        }
1809
0
      while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1810
1811
0
      if (p >= ptrend || *p != CHAR_RIGHT_CURLY_BRACKET)
1812
0
        {
1813
0
        ptr = p;
1814
0
        *errorcodeptr = ERR119;  /* Missing terminator for number */
1815
0
        break;
1816
0
        }
1817
0
      ptr = p + 1;
1818
0
      }
1819
1820
    /* Read an undelimited number */
1821
1822
0
    else
1823
0
      {
1824
0
      if (!read_number(&ptr, ptrend, bracount, MAX_GROUP_NUMBER, ERR61, &s,
1825
0
          errorcodeptr))
1826
0
        {
1827
0
        if (*errorcodeptr == 0) *errorcodeptr = ERR57;  /* No number found */
1828
0
        break;
1829
0
        }
1830
0
      }
1831
1832
0
    if (s <= 0)
1833
0
      {
1834
0
      *errorcodeptr = ERR15;
1835
0
      break;
1836
0
      }
1837
1838
0
    escape = -(s+1);
1839
0
    break;
1840
1841
    /* The handling of escape sequences consisting of a string of digits
1842
    starting with one that is not zero is not straightforward. Perl has changed
1843
    over the years. Nowadays \g{} for backreferences and \o{} for octal are
1844
    recommended to avoid the ambiguities in the old syntax.
1845
1846
    Outside a character class, the digits are read as a decimal number. If the
1847
    number is less than 10, or if there are that many previous extracting left
1848
    brackets, it is a back reference. Otherwise, up to three octal digits are
1849
    read to form an escaped character code. Thus \123 is likely to be octal 123
1850
    (cf \0123, which is octal 012 followed by the literal 3). This is the "Perl
1850
    style" of handling ambiguous octal/backreferences such as \12.
1852
1853
    There is an alternative disambiguation strategy, selected by
1854
    PCRE2_EXTRA_PYTHON_OCTAL, which follows Python's behaviour. An octal must
1855
    have either a leading zero, or exactly three octal digits; otherwise it's
1856
    a backreference. The disambiguation is stable, and does not depend on how
1857
    many capture groups are defined (it's simply an invalid backreference if
1858
    there is no corresponding capture group). Additionally, octal values above
1859
    \377 (\xff) are rejected.
1860
1861
    Inside a character class, \ followed by a digit is always either a literal
1862
    8 or 9 or an octal number. */
1863
1864
0
    case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1865
0
    case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
1866
1867
0
    if (isclass)
1868
0
      {
1869
      /* Fall through to octal handling; never a backreference inside a class. */
1870
0
      }
1871
0
    else if ((xoptions & PCRE2_EXTRA_PYTHON_OCTAL) != 0)
1872
0
      {
1873
      /* Python-style disambiguation. */
1874
0
      if (ptr[-1] <= CHAR_7 && ptr + 1 < ptrend && ptr[0] >= CHAR_0 &&
1875
0
          ptr[0] <= CHAR_7 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
1876
0
        {
1877
        /* We peeked a three-digit octal, so fall through */
1878
0
        }
1879
0
      else
1880
0
        {
1881
        /* We are at a digit, so the only possible error from read_number() is
1882
        a number that is too large. */
1883
0
        ptr--;   /* Back to the digit */
1884
1885
0
        if (!read_number(&ptr, ptrend, -1, MAX_GROUP_NUMBER, 0, &s, errorcodeptr))
1886
0
          {
1887
0
          *errorcodeptr = ERR61;
1888
0
          break;
1889
0
          }
1890
1891
0
        escape = -(s+1);
1892
0
        break;
1893
0
        }
1894
0
      }
1895
0
    else
1896
0
      {
1897
      /* Perl-style disambiguation. */
1898
0
      oldptr = ptr;
1899
0
      ptr--;   /* Back to the digit */
1900
1901
      /* As we know we are at a digit, the only possible error from
1902
      read_number() is a number that is too large to be a group number. Because
1903
      that number might be still valid if read as an octal, errorcodeptr is not
1904
      set on failure and therefore a sentinel value of INT_MAX is used instead
1905
      of the original value, and will be used later to properly set the error,
1906
      if not falling through. */
1907
1908
0
      if (!read_number(&ptr, ptrend, -1, MAX_GROUP_NUMBER, 0, &s, errorcodeptr))
1909
0
        s = INT_MAX;
1910
1911
      /* \1 to \9 are always back references. \8x and \9x are too; \1x to \7x
1912
      are octal escapes if there are not that many previous captures. */
1913
1914
0
      if (s < 10 || c >= CHAR_8 || (unsigned)s <= bracount)
1915
0
        {
1916
        /* s > MAX_GROUP_NUMBER should not be possible because of read_number(),
1917
        but we keep it just to be safe and because it will also catch the
1918
        sentinel value that was set on failure by that function. */
1919
1920
0
        if ((unsigned)s > MAX_GROUP_NUMBER)
1921
0
          {
1922
0
          PCRE2_ASSERT(s == INT_MAX);
1923
0
          *errorcodeptr = ERR61;
1924
0
          }
1925
0
        else escape = -(s+1);     /* Indicates a back reference */
1926
0
        break;
1927
0
        }
1928
1929
0
      ptr = oldptr;      /* Put the pointer back and fall through */
1930
0
      }
1931
1932
    /* Handle a digit following \ when the number is not a back reference, or
1933
    we are within a character class. If the first digit is 8 or 9, Perl used to
1934
    generate a binary zero and then treat the digit as a following literal. At
1935
    least by Perl 5.18 this changed so as not to insert the binary zero. */
1936
1937
0
    if (c >= CHAR_8) break;
1938
1939
0
    PCRE2_FALLTHROUGH /* Fall through */
1940
0
1941
0
    /* \0 always starts an octal number, but we may drop through to here with a
1942
0
    larger first octal digit. The original code used just to take the least
1943
0
    significant 8 bits of octal numbers (I think this is what early Perls used
1944
0
    to do). Nowadays we allow for larger numbers in UTF-8 mode and 16/32-bit mode,
1945
0
    but no more than 3 octal digits. */
1946
0
1947
0
    case CHAR_0:
1948
0
    c -= CHAR_0;
1949
0
    while(i++ < 2 && ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1950
0
        c = c * 8 + *ptr++ - CHAR_0;
1951
0
    if (c > 0xff)
1952
0
      {
1953
0
      if ((xoptions & PCRE2_EXTRA_PYTHON_OCTAL) != 0) *errorcodeptr = ERR102;
1954
0
#if PCRE2_CODE_UNIT_WIDTH == 8
1955
0
      else if (!utf) *errorcodeptr = ERR51;
1956
0
#endif
1957
0
      }
1958
1959
    /* PCRE2_EXTRA_NO_BS0 disables the NUL escape '\0' but doesn't affect
1960
    two- or three-character octal escapes \00 and \000, nor \x00. */
1961
1962
0
    if ((xoptions & PCRE2_EXTRA_NO_BS0) != 0 && c == 0 && i == 1)
1963
0
        *errorcodeptr = ERR98;
1964
0
    break;
1965
1966
    /* \o is a relatively new Perl feature, supporting a more general way of
1967
    specifying character codes in octal. The only supported form is \o{ddd},
1968
    with optional spaces or tabs after { and before }. */
1969
1970
0
    case CHAR_o:
1971
0
    if (ptr >= ptrend || *ptr != CHAR_LEFT_CURLY_BRACKET)
1972
0
      {
1973
0
      *errorcodeptr = ERR55;
1974
0
      break;
1975
0
      }
1976
0
    ptr++;
1977
1978
0
    while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
1979
0
    if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
1980
0
      {
1981
0
      *errorcodeptr = ERR78;
1982
0
      break;
1983
0
      }
1984
1985
0
    c = 0;
1986
0
    overflow = FALSE;
1987
0
    while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1988
0
      {
1989
0
      cc = *ptr++;
1990
0
      if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1991
#if PCRE2_CODE_UNIT_WIDTH == 32
1992
      if (c >= 0x20000000u) { overflow = TRUE; break; }
1993
#endif
1994
0
      c = (c << 3) + (cc - CHAR_0);
1995
0
#if PCRE2_CODE_UNIT_WIDTH == 8
1996
0
      if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1997
#elif PCRE2_CODE_UNIT_WIDTH == 16
1998
      if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1999
#elif PCRE2_CODE_UNIT_WIDTH == 32
2000
      if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
2001
#endif
2002
0
      }
2003
2004
0
    while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2005
2006
0
    if (overflow)
2007
0
      {
2008
0
      while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
2009
0
      *errorcodeptr = ERR34;
2010
0
      }
2011
0
    else if (utf && c >= 0xd800 && c <= 0xdfff &&
2012
0
             (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
2013
0
      {
2014
0
      *errorcodeptr = ERR73;
2015
0
      }
2016
0
    else if (ptr < ptrend && *ptr == CHAR_RIGHT_CURLY_BRACKET)
2017
0
      {
2018
0
      ptr++;
2019
0
      }
2020
0
    else
2021
0
      {
2022
0
      *errorcodeptr = ERR64;
2023
0
      goto ESCAPE_FAILED_FORWARD;
2024
0
      }
2025
0
    break;
2026
2027
    /* When PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set, \x must be followed
2028
    by two hexadecimal digits. Otherwise it is a lowercase x letter. */
2029
2030
28
    case CHAR_x:
2031
28
    if (alt_bsux)
2032
0
      {
2033
0
      uint32_t xc;
2034
0
      if (ptrend - ptr < 2) break;               /* Less than 2 characters */
2035
0
      if ((cc = XDIGIT(ptr[0])) == 0xff) break;  /* Not a hex digit */
2036
0
      if ((xc = XDIGIT(ptr[1])) == 0xff) break;  /* Not a hex digit */
2037
0
      c = (cc << 4) | xc;
2038
0
      ptr += 2;
2039
0
      }
2040
2041
    /* Handle \x in Perl's style. \x{ddd} is a character code which can be
2042
    greater than 0xff in UTF-8 or non-8bit mode, but only if the ddd are hex
2043
    digits. If not, { used to be treated as a data character. However, Perl
2044
    seems to read hex digits up to the first non-such, and ignore the rest, so
2045
    that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
2046
    now gives an error. */
2047
2048
28
    else
2049
28
      {
2050
28
      if (ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
2051
0
        {
2052
0
        ptr++;
2053
0
        while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2054
2055
0
#ifndef EBCDIC
2056
0
        COME_FROM_NU:
2057
0
#endif
2058
0
        if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
2059
0
          {
2060
0
          *errorcodeptr = ERR78;
2061
0
          break;
2062
0
          }
2063
0
        c = 0;
2064
0
        overflow = FALSE;
2065
2066
0
        while (ptr < ptrend && (cc = XDIGIT(*ptr)) != 0xff)
2067
0
          {
2068
0
          ptr++;
2069
0
          if (c == 0 && cc == 0) continue;   /* Leading zeroes */
2070
#if PCRE2_CODE_UNIT_WIDTH == 32
2071
          if (c >= 0x10000000l) { overflow = TRUE; break; }
2072
#endif
2073
0
          c = (c << 4) | cc;
2074
0
          if ((utf && c > 0x10ffffU) || (!utf && c > MAX_NON_UTF_CHAR))
2075
0
            {
2076
0
            overflow = TRUE;
2077
0
            break;
2078
0
            }
2079
0
          }
2080
2081
        /* Perl ignores spaces and tabs before } */
2082
2083
0
        while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2084
2085
        /* On overflow, skip remaining hex digits */
2086
2087
0
        if (overflow)
2088
0
          {
2089
0
          while (ptr < ptrend && XDIGIT(*ptr) != 0xff) ptr++;
2090
0
          *errorcodeptr = ERR34;
2091
0
          }
2092
0
        else if (utf && c >= 0xd800 && c <= 0xdfff &&
2093
0
                 (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
2094
0
          {
2095
0
          *errorcodeptr = ERR73;
2096
0
          }
2097
0
        else if (ptr < ptrend && *ptr == CHAR_RIGHT_CURLY_BRACKET)
2098
0
          {
2099
0
          ptr++;
2100
0
          }
2101
2102
        /* If the sequence of hex digits (followed by optional space) does not
2103
        end with '}', give an error. We used just to recognize this construct
2104
        and fall through to the normal \x handling, but nowadays Perl gives an
2105
        error, which seems much more sensible, so we do too. */
2106
2107
0
        else
2108
0
          {
2109
0
          *errorcodeptr = ERR67;
2110
0
          goto ESCAPE_FAILED_FORWARD;
2111
0
          }
2112
0
        }   /* End of \x{} processing */
2113
2114
      /* Read up to two hex digits after \x */
2115
2116
28
      else
2117
28
        {
2118
        /* Perl has the surprising/broken behaviour that \x without following
2119
        hex digits is treated as an escape for NUL. Their source code laments
2120
        this but keeps it for backwards compatibility. A warning is printed
2121
        when "use warnings" is enabled. Because we don't have warnings, we
2122
        simply forbid it. */
2123
28
        if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff)
2124
0
          {
2125
          /* Not a hex digit */
2126
0
          *errorcodeptr = ERR78;
2127
0
          break;
2128
0
          }
2129
28
        ptr++;
2130
28
        c = cc;
2131
2132
        /* With "use re 'strict'" Perl actually requires exactly two digits (error
2133
        for \x, \xA and \xAAA). While \x was already rejected, this seems overly
2134
        strict, and there seems little incentive to align with that, given the
2135
        backwards-compatibility cost.
2136
2137
        For comparison, note that other engines disagree. For example:
2138
          - Java allows 1 or 2 hex digits. Error if 0 digits. No error if >2 digits
2139
          - .NET requires 2 hex digits. Error if 0, 1 digits. No error if >2 digits.
2140
        */
2141
28
        if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break;  /* Not a hex digit */
2142
16
        ptr++;
2143
16
        c = (c << 4) | cc;
2144
16
        }     /* End of \xdd handling */
2145
28
      }       /* End of Perl-style \x handling */
2146
16
    break;
2147
2148
    /* The handling of \c is different in ASCII and EBCDIC environments. In an
2149
    ASCII (or Unicode) environment, an error is given if the character
2150
    following \c is not a printable ASCII character. Otherwise, the following
2151
    character is upper-cased if it is a letter, and after that the 0x40 bit is
2152
    flipped. The result is the value of the escape.
2153
2154
    In an EBCDIC environment the handling of \c is compatible with the
2155
    specification in the perlebcdic document. The following character must be
2156
    a letter or one of a small number of special characters. These provide a
2157
    means of defining the character values 0-31.
2158
2159
    For testing the EBCDIC handling of \c in an ASCII environment, recognize
2160
    the EBCDIC value of 'c' explicitly. */
2161
2162
16
    case CHAR_c:
2163
0
    if (ptr >= ptrend)
2164
0
      {
2165
0
      *errorcodeptr = ERR2;
2166
0
      break;
2167
0
      }
2168
0
    c = *ptr;
2169
0
    if (c >= CHAR_a && c <= CHAR_z) c = UPPER_CASE(c);
2170
2171
    /* Handle \c in an ASCII/Unicode environment. */
2172
2173
0
#ifndef EBCDIC    /* ASCII/UTF-8 coding */
2174
0
    if (c < 32 || c > 126)  /* Excludes all non-printable ASCII */
2175
0
      {
2176
0
      *errorcodeptr = ERR68;
2177
0
      goto ESCAPE_FAILED_FORWARD;
2178
0
      }
2179
0
    c ^= 0x40;
2180
2181
    /* Handle \c in an EBCDIC environment. The special case \c? is converted to
2182
    255 (0xff) or 95 (0x5f) if other characters suggest we are using the
2183
    POSIX-BC encoding. (This is the way Perl indicates that it handles \c?.)
2184
    The other valid sequences correspond to a list of specific characters. */
2185
2186
#else
2187
    if (c == CHAR_QUESTION_MARK)
2188
      c = (CHAR_BACKSLASH == 188 && CHAR_GRAVE_ACCENT == 74)? 0x5f : 0xff;
2189
    else
2190
      {
2191
      for (i = 0; i < 32; i++)
2192
        {
2193
        if (c == ebcdic_escape_c[i]) break;
2194
        }
2195
      if (i < 32)
2196
        c = i;
2197
      else
2198
        {
2199
        *errorcodeptr = ERR68;
2200
        goto ESCAPE_FAILED_FORWARD;
2201
        }
2202
      }
2203
#endif  /* EBCDIC */
2204
2205
0
    ptr++;
2206
0
    break;
2207
2208
    /* Any other alphanumeric following \ is an error. Perl gives an error only
2209
    if in warning mode, but PCRE doesn't have a warning mode. */
2210
2211
0
    default:
2212
0
    *errorcodeptr = ERR3;
2213
0
    break;
2214
28
    }
2215
28
  }
2216
2217
/* Set the pointer to the next character before returning. */
2218
2219
68
EXIT:
2220
68
*ptrptr = ptr;
2221
68
*chptr = c;
2222
68
return escape;
2223
2224
/* Some errors need to indicate the next character. */
2225
2226
0
ESCAPE_FAILED_FORWARD:
2227
0
ptr++;
2228
0
#ifdef SUPPORT_UNICODE
2229
0
if (utf) FORWARDCHARTEST(ptr, ptrend);
2230
0
#endif
2231
0
goto EXIT;
2232
68
}
2233
2234
2235
2236
#ifdef SUPPORT_UNICODE
2237
/*************************************************
2238
*               Handle \P and \p                 *
2239
*************************************************/
2240
2241
/* This function is called after \P or \p has been encountered, provided that
2242
PCRE2 is compiled with support for UTF and Unicode properties. On entry, the
2243
contents of ptrptr are pointing after the P or p. On exit, it is left pointing
2244
after the final code unit of the escape sequence.
2245
2246
Arguments:
2247
  ptrptr         the pattern position pointer
2248
  utf            true if the input is UTF-encoded
2249
  negptr         a boolean that is set TRUE for negation else FALSE
2250
  ptypeptr       an unsigned int that is set to the type value
2251
  pdataptr       an unsigned int that is set to the detailed property value
2252
  errorcodeptr   the error code variable
2253
  cb             the compile data
2254
2255
Returns:         TRUE if the type value was found, or FALSE for an invalid type
2256
*/
2257
2258
static BOOL
2259
get_ucp(PCRE2_SPTR *ptrptr, BOOL utf, BOOL *negptr, uint16_t *ptypeptr,
2260
  uint16_t *pdataptr, int *errorcodeptr, compile_block *cb)
2261
0
{
2262
0
uint32_t c;
2263
0
ptrdiff_t i;
2264
0
PCRE2_SIZE bot, top;
2265
0
PCRE2_SPTR ptr = *ptrptr;
2266
0
PCRE2_UCHAR name[50];
2267
0
PCRE2_UCHAR *vptr = NULL;
2268
0
uint16_t ptscript = PT_NOTSCRIPT;
2269
2270
#ifndef MAYBE_UTF_MULTI
2271
(void)utf;  /* Avoid compiler warning */
2272
#endif
2273
2274
0
if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2275
0
GETCHARINCTEST(c, ptr);
2276
0
*negptr = FALSE;
2277
2278
/* \P or \p can be followed by a name in {}, optionally preceded by ^ for
2279
negation. We must be handling Unicode encoding here, though we may be compiling
2280
for UTF-8 input in an EBCDIC environment. (PCRE2 does not support both EBCDIC
2281
input and Unicode input in the same build.) In accordance with Unicode's "loose
2282
matching" rules, ASCII white space, hyphens, and underscores are ignored. We
2283
don't use isspace() or tolower() because (a) code points may be greater than
2284
255, and (b) they wouldn't work when compiling for Unicode in an EBCDIC
2285
environment. */
2286
2287
0
if (c == CHAR_LEFT_CURLY_BRACKET)
2288
0
  {
2289
0
  if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2290
2291
0
  for (i = 0; i < (int)(sizeof(name) / sizeof(PCRE2_UCHAR)) - 1; i++)
2292
0
    {
2293
0
    REDO:
2294
2295
0
    if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2296
0
    GETCHARINCTEST(c, ptr);
2297
2298
    /* Skip ignorable Unicode characters. */
2299
2300
0
    if (c == CHAR_UNDERSCORE || c == CHAR_MINUS || c == CHAR_SPACE ||
2301
0
        (c >= CHAR_HT && c <= CHAR_CR))
2302
0
      {
2303
0
      goto REDO;
2304
0
      }
2305
2306
    /* The first significant character being circumflex negates the meaning of
2307
    the item. */
2308
2309
0
    if (i == 0 && !*negptr && c == CHAR_CIRCUMFLEX_ACCENT)
2310
0
      {
2311
0
      *negptr = TRUE;
2312
0
      goto REDO;
2313
0
      }
2314
2315
0
    if (c == CHAR_RIGHT_CURLY_BRACKET) break;
2316
2317
    /* Names consist of ASCII letters and digits, but equals and colon may also
2318
    occur as a name/value separator. We must also allow for \p{L&}. A simple
2319
    check for a value between '&' and 'z' suffices because anything else in a
2320
    name or value will cause an "unknown property" error anyway. */
2321
2322
0
    if (c < CHAR_AMPERSAND || c > CHAR_z) goto ERROR_RETURN;
2323
2324
    /* Lower case a capital letter or remember where the name/value separator
2325
    is. */
2326
2327
0
    if (c >= CHAR_A && c <= CHAR_Z) c |= 0x20;
2328
0
    else if ((c == CHAR_COLON || c == CHAR_EQUALS_SIGN) && vptr == NULL)
2329
0
      vptr = name + i;
2330
2331
0
    name[i] = c;
2332
0
    }
2333
2334
  /* Error if the loop didn't end with '}' - either we hit the end of the
2335
  pattern or the name was longer than any legal property name. */
2336
2337
0
  if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
2338
0
  name[i] = 0;
2339
0
  }
2340
2341
/* If { doesn't follow \p or \P there is just one following character, which
2342
must be an ASCII letter. */
2343
2344
0
else if (c >= CHAR_A && c <= CHAR_Z)
2345
0
  {
2346
0
  name[0] = c | 0x20;  /* Lower case */
2347
0
  name[1] = 0;
2348
0
  }
2349
0
else if (c >= CHAR_a && c <= CHAR_z)
2350
0
  {
2351
0
  name[0] = c;
2352
0
  name[1] = 0;
2353
0
  }
2354
0
else goto ERROR_RETURN;
2355
2356
0
*ptrptr = ptr;   /* Update pattern pointer */
2357
2358
/* If the property contains ':' or '=' we have class name and value separately
2359
specified. The following are supported:
2360
2361
  . Bidi_Class (synonym bc), for which the property names are "bidi<name>".
2362
  . Script (synonym sc) for which the property name is the script name
2363
  . Script_Extensions (synonym scx), ditto
2364
2365
As this is a small number, we currently just check the names directly. If this
2366
grows, a sorted table and a switch will be neater.
2367
2368
For both the script properties, set a PT_xxx value so that (1) they can be
2369
distinguished and (2) invalid script names that happen to be the name of
2370
another property can be diagnosed. */
2371
2372
0
if (vptr != NULL)
2373
0
  {
2374
0
  int offset = 0;
2375
0
  PCRE2_UCHAR sname[8];
2376
2377
0
  *vptr = 0;   /* Terminate property name */
2378
0
  if (PRIV(strcmp_c8)(name, STRING_bidiclass) == 0 ||
2379
0
      PRIV(strcmp_c8)(name, STRING_bc) == 0)
2380
0
    {
2381
0
    offset = 4;
2382
0
    sname[0] = CHAR_b;
2383
0
    sname[1] = CHAR_i;  /* There is no strcpy_c8 function */
2384
0
    sname[2] = CHAR_d;
2385
0
    sname[3] = CHAR_i;
2386
0
    }
2387
2388
0
  else if (PRIV(strcmp_c8)(name, STRING_script) == 0 ||
2389
0
           PRIV(strcmp_c8)(name, STRING_sc) == 0)
2390
0
    ptscript = PT_SC;
2391
2392
0
  else if (PRIV(strcmp_c8)(name, STRING_scriptextensions) == 0 ||
2393
0
           PRIV(strcmp_c8)(name, STRING_scx) == 0)
2394
0
    ptscript = PT_SCX;
2395
2396
0
  else
2397
0
    {
2398
0
    *errorcodeptr = ERR47;
2399
0
    return FALSE;
2400
0
    }
2401
2402
  /* Adjust the string in name[] as needed */
2403
2404
0
  memmove(name + offset, vptr + 1, (name + i - vptr)*sizeof(PCRE2_UCHAR));
2405
0
  if (offset != 0) memmove(name, sname, offset*sizeof(PCRE2_UCHAR));
2406
0
  }
2407
2408
/* Search for a recognized property using binary chop. */
2409
2410
0
bot = 0;
2411
0
top = PRIV(utt_size);
2412
2413
0
while (bot < top)
2414
0
  {
2415
0
  int r;
2416
0
  i = (bot + top) >> 1;
2417
0
  r = PRIV(strcmp_c8)(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
2418
2419
  /* When a matching property is found, some extra checking is needed when the
2420
  \p{xx:yy} syntax is used and xx is either sc or scx. */
2421
2422
0
  if (r == 0)
2423
0
    {
2424
0
    *pdataptr = PRIV(utt)[i].value;
2425
0
    if (vptr == NULL || ptscript == PT_NOTSCRIPT)
2426
0
      {
2427
0
      *ptypeptr = PRIV(utt)[i].type;
2428
0
      return TRUE;
2429
0
      }
2430
2431
0
    switch (PRIV(utt)[i].type)
2432
0
      {
2433
0
      case PT_SC:
2434
0
      *ptypeptr = PT_SC;
2435
0
      return TRUE;
2436
2437
0
      case PT_SCX:
2438
0
      *ptypeptr = ptscript;
2439
0
      return TRUE;
2440
0
      }
2441
2442
0
    break;  /* Non-script found */
2443
0
    }
2444
2445
0
  if (r > 0) bot = i + 1; else top = i;
2446
0
  }
2447
2448
0
*errorcodeptr = ERR47;   /* Unrecognized property */
2449
0
return FALSE;
2450
2451
0
ERROR_RETURN:            /* Malformed \P or \p */
2452
0
*errorcodeptr = ERR46;
2453
0
*ptrptr = ptr;
2454
0
return FALSE;
2455
0
}
2456
#endif
2457
2458
2459
2460
/*************************************************
2461
*           Check for POSIX class syntax         *
2462
*************************************************/
2463
2464
/* This function is called when the sequence "[:" or "[." or "[=" is
2465
encountered in a character class. It checks whether this is followed by a
2466
sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2467
reach an unescaped ']' without the special preceding character, return FALSE.
2468
2469
Originally, this function only recognized a sequence of letters between the
2470
terminators, but it seems that Perl recognizes any sequence of characters,
2471
though of course unknown POSIX names are subsequently rejected. Perl gives an
2472
"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2473
didn't consider this to be a POSIX class. Likewise for [:1234:].
2474
2475
The problem in trying to be exactly like Perl is in the handling of escapes. We
2476
have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2477
class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2478
below handles the special cases \\ and \], but does not try to do any other
2479
escape processing. This makes it different from Perl for cases such as
2480
[:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
2481
not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
2482
when Perl does, I think.
2483
2484
A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2485
It seems that the appearance of a nested POSIX class supersedes an apparent
2486
external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2487
a digit. This is handled by returning FALSE if the start of a new group with
2488
the same terminator is encountered, since the next closing sequence must close
2489
the nested group, not the outer one.
2490
2491
In Perl, unescaped square brackets may also appear as part of class names. For
2492
example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2493
[:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2494
seem right at all. PCRE does not allow closing square brackets in POSIX class
2495
names.
2496
2497
Arguments:
2498
  ptr      pointer to the character after the initial [ (colon, dot, equals)
2499
  ptrend   pointer to the end of the pattern
2500
  endptr   where to return a pointer to the terminating ':', '.', or '='
2501
2502
Returns:   TRUE or FALSE
2503
*/
2504
2505
static BOOL
2506
check_posix_syntax(PCRE2_SPTR ptr, PCRE2_SPTR ptrend, PCRE2_SPTR *endptr)
2507
0
{
2508
0
PCRE2_UCHAR terminator;  /* Don't combine these lines; the Solaris cc */
2509
0
terminator = *ptr++;     /* compiler warns about "non-constant" initializer. */
2510
2511
0
for (; ptrend - ptr >= 2; ptr++)
2512
0
  {
2513
0
  if (*ptr == CHAR_BACKSLASH &&
2514
0
      (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET || ptr[1] == CHAR_BACKSLASH))
2515
0
    ptr++;
2516
2517
0
  else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
2518
0
            *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2519
2520
0
  else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2521
0
    {
2522
0
    *endptr = ptr;
2523
0
    return TRUE;
2524
0
    }
2525
0
  }
2526
2527
0
return FALSE;
2528
0
}
2529
2530
2531
2532
/*************************************************
2533
*          Check POSIX class name                *
2534
*************************************************/
2535
2536
/* This function is called to check the name given in a POSIX-style class entry
2537
such as [:alnum:].
2538
2539
Arguments:
2540
  ptr        points to the first letter
2541
  len        the length of the name
2542
2543
Returns:     a value representing the name, or -1 if unknown
2544
*/
2545
2546
static int
2547
check_posix_name(PCRE2_SPTR ptr, int len)
2548
0
{
2549
0
const char *pn = posix_names;
2550
0
int yield = 0;
2551
0
while (posix_name_lengths[yield] != 0)
2552
0
  {
2553
0
  if (len == posix_name_lengths[yield] &&
2554
0
    PRIV(strncmp_c8)(ptr, pn, (unsigned int)len) == 0) return yield;
2555
0
  pn += posix_name_lengths[yield] + 1;
2556
0
  yield++;
2557
0
  }
2558
0
return -1;
2559
0
}
2560
2561
2562
2563
/*************************************************
2564
*       Read a subpattern or VERB name           *
2565
*************************************************/
2566
2567
/* This function is called from parse_regex() below whenever it needs to read
2568
the name of a subpattern or a (*VERB) or an (*alpha_assertion). The initial
2569
pointer must be to the preceding character. If that character is '*' we are
2570
reading a verb or alpha assertion name. The pointer is updated to point after
2571
the name, for a VERB or alpha assertion name, or after the name's terminator
2572
for a subpattern name. Returning both the offset and the name pointer is
2573
redundant information, but some callers use one and some the other, so it is
2574
simplest just to return both. When the name is in braces, spaces and tabs are
2575
allowed (and ignored) at either end.
2576
2577
Arguments:
2578
  ptrptr      points to the character pointer variable
2579
  ptrend      points to the end of the input string
2580
  utf         true if the input is UTF-encoded
2581
  terminator  the terminator of a subpattern name must be this
2582
  offsetptr   where to put the offset from the start of the pattern
2583
  nameptr     where to put a pointer to the name in the input
2584
  namelenptr  where to put the length of the name
2585
  errcodeptr  where to put an error code
2586
  cb          pointer to the compile data block
2587
2588
Returns:    TRUE if a name was read
2589
            FALSE otherwise, with error code set
2590
*/
2591
2592
static BOOL
2593
read_name(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, BOOL utf, uint32_t terminator,
2594
  PCRE2_SIZE *offsetptr, PCRE2_SPTR *nameptr, uint32_t *namelenptr,
2595
  int *errorcodeptr, compile_block *cb)
2596
0
{
2597
0
PCRE2_SPTR ptr = *ptrptr;
2598
0
BOOL is_group = (*ptr++ != CHAR_ASTERISK);
2599
0
BOOL is_braced = terminator == CHAR_RIGHT_CURLY_BRACKET;
2600
2601
0
if (is_braced)
2602
0
  while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2603
2604
0
if (ptr >= ptrend)                 /* No characters in name */
2605
0
  {
2606
0
  *errorcodeptr = is_group? ERR62: /* Subpattern name expected */
2607
0
                            ERR60; /* Verb not recognized or malformed */
2608
0
  goto FAILED;
2609
0
  }
2610
2611
0
*nameptr = ptr;
2612
0
*offsetptr = (PCRE2_SIZE)(ptr - cb->start_pattern);
2613
2614
/* If this logic were ever to change, the matching function in pcre2_substitute.c
2615
ought to be updated to match. */
2616
2617
/* In UTF mode, a group name may contain letters and decimal digits as defined
2618
by Unicode properties, and underscores, but must not start with a digit. */
2619
2620
0
#ifdef SUPPORT_UNICODE
2621
0
if (utf && is_group)
2622
0
  {
2623
0
  uint32_t c, type;
2624
0
  PCRE2_SPTR p = ptr;
2625
2626
0
  GETCHARINC(c, p);  /* Peek at next character */
2627
0
  type = UCD_CHARTYPE(c);
2628
2629
0
  if (type == ucp_Nd)
2630
0
    {
2631
0
    ptr = p;
2632
0
    *errorcodeptr = ERR44;
2633
0
    goto FAILED;
2634
0
    }
2635
2636
0
  for(;;)
2637
0
    {
2638
0
    if (type != ucp_Nd && PRIV(ucp_gentype)[type] != ucp_L &&
2639
0
        c != CHAR_UNDERSCORE) break;
2640
0
    ptr = p;  /* Accept character and peek again */
2641
0
    if (p >= ptrend) break;
2642
0
    GETCHARINC(c, p);
2643
0
    type = UCD_CHARTYPE(c);
2644
0
    }
2645
0
  }
2646
0
else
2647
#else
2648
(void)utf;  /* Avoid compiler warning */
2649
#endif      /* SUPPORT_UNICODE */
2650
2651
/* Handle non-group names and group names in non-UTF modes. A group name must
2652
not start with a digit. If either of the others start with a digit it just
2653
won't be recognized. */
2654
2655
0
  {
2656
0
  if (is_group && IS_DIGIT(*ptr))
2657
0
    {
2658
0
    ++ptr;
2659
0
    *errorcodeptr = ERR44;
2660
0
    goto FAILED;
2661
0
    }
2662
2663
0
  while (ptr < ptrend && MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_word) != 0)
2664
0
    {
2665
0
    ptr++;
2666
0
    }
2667
0
  }
2668
2669
/* Check name length */
2670
2671
0
if (ptr - *nameptr > MAX_NAME_SIZE)
2672
0
  {
2673
0
  *errorcodeptr = ERR48;
2674
0
  goto FAILED;
2675
0
  }
2676
0
*namelenptr = (uint32_t)(ptr - *nameptr);
2677
2678
/* Subpattern names must not be empty, and their terminator is checked here.
2679
(What follows a verb or alpha assertion name is checked separately.) */
2680
2681
0
if (is_group)
2682
0
  {
2683
0
  if (ptr == *nameptr)
2684
0
    {
2685
0
    *errorcodeptr = ERR62;   /* Subpattern name expected */
2686
0
    goto FAILED;
2687
0
    }
2688
0
  if (is_braced)
2689
0
    while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2690
0
  if (terminator != 0)
2691
0
    {
2692
0
    if (ptr >= ptrend || *ptr != (PCRE2_UCHAR)terminator)
2693
0
      {
2694
0
      *errorcodeptr = ERR42;
2695
0
      goto FAILED;
2696
0
      }
2697
0
    ptr++;
2698
0
    }
2699
0
  }
2700
2701
0
*ptrptr = ptr;
2702
0
return TRUE;
2703
2704
0
FAILED:
2705
0
*ptrptr = ptr;
2706
0
return FALSE;
2707
0
}
2708
2709
2710
2711
/**************************************************
2712
*        Parse capturing bracket argument list    *
2713
**************************************************/
2714
2715
/* Reads a list of capture references. The references
2716
can be numbers or names.
2717
2718
Arguments:
2719
  ptrptr           points to the character pointer variable
2720
  ptrend           points to the end of the input string
2721
  utf              true if the input is UTF-encoded
2722
  parsed_pattern   the parsed pattern pointer
2723
  offset           last known offset
2724
  errcodeptr       where to put an error code
2725
  cb               pointer to the compile data block
2726
2727
Returns: updated parsed_pattern pointer on success
2728
         NULL otherwise
2729
*/
2730
2731
static uint32_t *
2732
parse_capture_list(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend,
2733
  BOOL utf, uint32_t *parsed_pattern, PCRE2_SIZE offset,
2734
  int *errorcodeptr, compile_block *cb)
2735
0
{
2736
0
PCRE2_SIZE next_offset;
2737
0
PCRE2_SPTR ptr = *ptrptr;
2738
0
PCRE2_SPTR name;
2739
0
PCRE2_UCHAR terminator;
2740
0
uint32_t meta, namelen;
2741
0
int i;
2742
2743
0
if (ptr >= ptrend || *ptr != CHAR_LEFT_PARENTHESIS)
2744
0
  {
2745
0
  *errorcodeptr = ERR118;
2746
0
  goto FAILED;
2747
0
  }
2748
2749
0
for (;;)
2750
0
  {
2751
0
  ptr++;
2752
0
  next_offset = (PCRE2_SIZE)(ptr - cb->start_pattern);
2753
2754
0
  if (ptr >= ptrend)
2755
0
    {
2756
0
    *errorcodeptr = ERR117;
2757
0
    goto FAILED;
2758
0
    }
2759
2760
  /* Handle [+-]number cases */
2761
0
  if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61,
2762
0
      &i, errorcodeptr))
2763
0
    {
2764
0
    PCRE2_ASSERT(i >= 0);
2765
0
    if (i <= 0)
2766
0
      {
2767
0
      *errorcodeptr = ERR15;
2768
0
      goto FAILED;
2769
0
      }
2770
0
    meta = META_CAPTURE_NUMBER;
2771
0
    namelen = (uint32_t)i;
2772
0
    }
2773
0
  else if (*errorcodeptr != 0) goto FAILED; /* Number too big */
2774
0
  else
2775
0
    {
2776
    /* Handle 'name' or <name> cases. */
2777
0
    if (*ptr == CHAR_LESS_THAN_SIGN)
2778
0
      terminator = CHAR_GREATER_THAN_SIGN;
2779
0
    else if (*ptr == CHAR_APOSTROPHE)
2780
0
      terminator = CHAR_APOSTROPHE;
2781
0
    else
2782
0
      {
2783
0
      *errorcodeptr = ERR117;
2784
0
      goto FAILED;
2785
0
      }
2786
2787
0
    if (!read_name(&ptr, ptrend, utf, terminator, &next_offset,
2788
0
        &name, &namelen, errorcodeptr, cb)) goto FAILED;
2789
2790
0
    meta = META_CAPTURE_NAME;
2791
0
    }
2792
2793
0
  PCRE2_ASSERT(next_offset > 0);
2794
0
  if (offset == 0 || (next_offset - offset) >= 0x10000)
2795
0
    {
2796
0
    *parsed_pattern++ = META_OFFSET;
2797
0
    PUTOFFSET(next_offset, parsed_pattern);
2798
0
    offset = next_offset;
2799
0
    }
2800
2801
  /* The offset is encoded as a relative offset, because for some
2802
  inputs such as ",2" in (1,2,3), we only have space for two uint32_t
2803
  values, and an opcode and absolute offset may require three uint32_t
2804
  values. */
2805
0
  *parsed_pattern++ = meta | (uint32_t)(next_offset - offset);
2806
0
  *parsed_pattern++ = namelen;
2807
0
  offset = next_offset;
2808
2809
0
  if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
2810
2811
0
  if (*ptr == CHAR_RIGHT_PARENTHESIS) break;
2812
2813
0
  if (*ptr != CHAR_COMMA)
2814
0
    {
2815
0
    *errorcodeptr = ERR24;
2816
0
    goto FAILED;
2817
0
    }
2818
0
  }
2819
2820
0
*ptrptr = ptr + 1;
2821
0
return parsed_pattern;
2822
2823
0
UNCLOSED_PARENTHESIS:
2824
0
*errorcodeptr = ERR14;
2825
2826
0
FAILED:
2827
0
*ptrptr = ptr;
2828
0
return NULL;
2829
0
}
2830
2831
2832
2833
/*************************************************
2834
*          Manage callouts at start of cycle     *
2835
*************************************************/
2836
2837
/* At the start of a new item in parse_regex() we are able to record the
2838
details of the previous item in a prior callout, and also to set up an
2839
automatic callout if enabled. Avoid having two adjacent automatic callouts,
2840
which would otherwise happen for items such as \Q that contribute nothing to
2841
the parsed pattern.
2842
2843
Arguments:
2844
  ptr              current pattern pointer
2845
  pcalloutptr      points to a pointer to previous callout, or NULL
2846
  auto_callout     TRUE if auto_callouts are enabled
2847
  parsed_pattern   the parsed pattern pointer
2848
  cb               compile block
2849
2850
Returns: possibly updated parsed_pattern pointer.
2851
*/
2852
2853
static uint32_t *
2854
manage_callouts(PCRE2_SPTR ptr, uint32_t **pcalloutptr, BOOL auto_callout,
2855
  uint32_t *parsed_pattern, compile_block *cb)
2856
276
{
2857
276
uint32_t *previous_callout = *pcalloutptr;
2858
2859
276
if (previous_callout != NULL) previous_callout[2] = (uint32_t)(ptr -
2860
0
  cb->start_pattern - (PCRE2_SIZE)previous_callout[1]);
2861
2862
276
if (!auto_callout) previous_callout = NULL; else
2863
0
  {
2864
0
  if (previous_callout == NULL ||
2865
0
      previous_callout != parsed_pattern - 4 ||
2866
0
      previous_callout[3] != 255)
2867
0
    {
2868
0
    previous_callout = parsed_pattern;  /* Set up new automatic callout */
2869
0
    parsed_pattern += 4;
2870
0
    previous_callout[0] = META_CALLOUT_NUMBER;
2871
0
    previous_callout[2] = 0;
2872
0
    previous_callout[3] = 255;
2873
0
    }
2874
0
  previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
2875
0
  }
2876
2877
276
*pcalloutptr = previous_callout;
2878
276
return parsed_pattern;
2879
276
}
2880
2881
2882
2883
/*************************************************
2884
*          Handle \d, \D, \s, \S, \w, \W         *
2885
*************************************************/
2886
2887
/* This function is called from parse_regex() below, both for freestanding
2888
escapes, and those within classes, to handle those escapes that may change when
2889
Unicode property support is requested. Note that PCRE2_UCP will never be set
2890
without Unicode support because that is checked when pcre2_compile() is called.
2891
2892
Arguments:
2893
  escape          the ESC_... value
2894
  parsed_pattern  where to add the code
2895
  options         options bits
2896
  xoptions        extra options bits
2897
2898
Returns:          updated value of parsed_pattern
2899
*/
2900
static uint32_t *
2901
handle_escdsw(int escape, uint32_t *parsed_pattern, uint32_t options,
2902
  uint32_t xoptions)
2903
8
{
2904
8
uint32_t ascii_option = 0;
2905
8
uint32_t prop = ESC_p;
2906
2907
8
switch(escape)
2908
8
  {
2909
0
  case ESC_D:
2910
0
  prop = ESC_P;
2911
0
  PCRE2_FALLTHROUGH /* Fall through */
2912
0
  case ESC_d:
2913
0
  ascii_option = PCRE2_EXTRA_ASCII_BSD;
2914
0
  break;
2915
2916
0
  case ESC_S:
2917
0
  prop = ESC_P;
2918
0
  PCRE2_FALLTHROUGH /* Fall through */
2919
8
  case ESC_s:
2920
8
  ascii_option = PCRE2_EXTRA_ASCII_BSS;
2921
8
  break;
2922
2923
0
  case ESC_W:
2924
0
  prop = ESC_P;
2925
0
  PCRE2_FALLTHROUGH /* Fall through */
2926
0
  case ESC_w:
2927
0
  ascii_option = PCRE2_EXTRA_ASCII_BSW;
2928
0
  break;
2929
8
  }
2930
2931
8
if ((options & PCRE2_UCP) == 0 || (xoptions & ascii_option) != 0)
2932
0
  {
2933
0
  *parsed_pattern++ = META_ESCAPE + escape;
2934
0
  }
2935
8
else
2936
8
  {
2937
8
  *parsed_pattern++ = META_ESCAPE + prop;
2938
8
  switch(escape)
2939
8
    {
2940
0
    case ESC_d:
2941
0
    case ESC_D:
2942
0
    *parsed_pattern++ = (PT_PC << 16) | ucp_Nd;
2943
0
    break;
2944
2945
8
    case ESC_s:
2946
8
    case ESC_S:
2947
8
    *parsed_pattern++ = PT_SPACE << 16;
2948
8
    break;
2949
2950
0
    case ESC_w:
2951
0
    case ESC_W:
2952
0
    *parsed_pattern++ = PT_WORD << 16;
2953
0
    break;
2954
8
    }
2955
8
  }
2956
2957
8
return parsed_pattern;
2958
8
}
2959
2960
2961
2962
/*************************************************
2963
* Maximum size of parsed_pattern for given input *
2964
*************************************************/
2965
2966
/* This function is called from parse_regex() below, to determine the amount
2967
of memory to allocate for parsed_pattern. It is also called to check whether
2968
the amount of data written respects the amount of memory allocated.
2969
2970
Arguments:
2971
  ptr             points to the start of the pattern
2972
  ptrend          points to the end of the pattern
2973
  utf             TRUE in UTF mode
2974
  options         the options bits
2975
2976
Returns:          the number of uint32_t units for parsed_pattern
2977
*/
2978
static ptrdiff_t
2979
max_parsed_pattern(PCRE2_SPTR ptr, PCRE2_SPTR ptrend, BOOL utf,
2980
  uint32_t options)
2981
16
{
2982
16
PCRE2_SIZE big32count = 0;
2983
16
ptrdiff_t parsed_size_needed;
2984
2985
/* When PCRE2_AUTO_CALLOUT is not set, in all but one case the number of
2986
unsigned 32-bit ints written out to the parsed pattern is bounded by the length
2987
of the pattern. The exceptional case is when running in 32-bit, non-UTF mode,
2988
when literal characters greater than META_END (0x80000000) have to be coded as
2989
two units. In this case, therefore, we scan the pattern to check for such
2990
values. */
2991
2992
#if PCRE2_CODE_UNIT_WIDTH == 32
2993
if (!utf)
2994
  {
2995
  PCRE2_SPTR p;
2996
  for (p = ptr; p < ptrend; p++) if (*p >= META_END) big32count++;
2997
  }
2998
#else
2999
16
(void)utf;  /* Avoid compiler warning */
3000
16
#endif
3001
3002
16
parsed_size_needed = (ptrend - ptr) + big32count;
3003
3004
/* When PCRE2_AUTO_CALLOUT is set we have to assume a numerical callout (4
3005
elements) for each character. This is overkill, but memory is plentiful these
3006
days. */
3007
3008
16
if ((options & PCRE2_AUTO_CALLOUT) != 0)
3009
0
  parsed_size_needed += (ptrend - ptr) * 4;
3010
3011
16
return parsed_size_needed;
3012
16
}
3013
3014
3015
3016
/*************************************************
3017
*      Parse regex and identify named groups     *
3018
*************************************************/
3019
3020
/* This function is called first of all. It scans the pattern and does two
3021
things: (1) It identifies capturing groups and makes a table of named capturing
3022
groups so that information about them is fully available to both the compiling
3023
scans. (2) It writes a parsed version of the pattern with comments omitted and
3024
escapes processed into the parsed_pattern vector.
3025
3026
Arguments:
3027
  ptr             points to the start of the pattern
3028
  options         compiling dynamic options (may change during the scan)
  xoptions        extra options bits (PCRE2_EXTRA_xxx)
3029
  has_lookbehind  points to a boolean, set TRUE if a lookbehind is found
3030
  cb              pointer to the compile data block
3031
3032
Returns:   zero on success or a non-zero error code, with the
3033
             error offset placed in the cb field
3034
*/
3035
3036
/* State that is saved for a nested group while the pattern is being parsed,
and the flag bits that go with it. */

typedef struct nest_save {
  uint16_t  nest_depth;
  uint16_t  reset_group;
  uint16_t  max_group;
  uint16_t  flags;      /* Presumably NSF_xxx bits; confirm in parse_regex() */
  uint32_t  options;
  uint32_t  xoptions;
} nest_save;

#define NSF_RESET          0x0001u
#define NSF_CONDASSERT     0x0002u
#define NSF_ATOMICSR       0x0004u

/* Options that are changeable within the pattern must be tracked during
parsing. Some (e.g. PCRE2_EXTENDED) are implemented entirely during parsing,
but all must be tracked so that META_OPTIONS items set the correct values for
the main compiling phase. The first mask applies to the main options word and
the second to the extra options word. */

#define PARSE_TRACKED_OPTIONS (PCRE2_CASELESS|PCRE2_DOTALL|PCRE2_DUPNAMES| \
  PCRE2_EXTENDED|PCRE2_EXTENDED_MORE|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE| \
  PCRE2_UNGREEDY)

#define PARSE_TRACKED_EXTRA_OPTIONS (PCRE2_EXTRA_CASELESS_RESTRICT| \
  PCRE2_EXTRA_ASCII_BSD|PCRE2_EXTRA_ASCII_BSS|PCRE2_EXTRA_ASCII_BSW| \
  PCRE2_EXTRA_ASCII_DIGIT|PCRE2_EXTRA_ASCII_POSIX)

/* States used for analyzing ranges in character classes. The order matters:
the two OK values must be last. */

enum {
  RANGE_NO,             /* After '[' (initial), or '[a-z'; hyphen is literal */
  RANGE_STARTED,        /* After '[1-'; last-emitted code is META_RANGE_XYZ */
  RANGE_FORBID_NO,      /* After '[\d'; '-]' is allowed but not '-1]' */
  RANGE_FORBID_STARTED, /* After '[\d-' */
  RANGE_OK_ESCAPED,     /* After '[\1'; hyphen may be a range */
  RANGE_OK_LITERAL      /* After '[1'; hyphen may be a range */
};

/* States used for analyzing operators and operands in extended character
classes. */

enum {
  CLASS_OP_EMPTY,    /* At start of an expression; empty previous contents */
  CLASS_OP_OPERAND,  /* Have preceding operand; after "z" a "--" can follow */
  CLASS_OP_OPERATOR  /* Have preceding operator; after "--" operand must follow */
};

/* States used for determining the parse mode in character classes. The order
matters: the two PERL_EXT values must be last. */

enum {
  CLASS_MODE_NORMAL,        /* Ordinary PCRE2 '[...]' class. */
  CLASS_MODE_ALT_EXT,       /* UTS#18-style extended '[...]' class. */
  CLASS_MODE_PERL_EXT,      /* Perl extended '(?[...])' class. */
  CLASS_MODE_PERL_EXT_LEAF  /* Leaf within extended '(?[ [...] ])' class. */
};

/* Only in 32-bit mode can there be literals > META_END. A macro encapsulates
the storing of literal values in the main parsed pattern, where they can always
be quantified. The macro assigns to a variable called okquantifier, which must
be in scope wherever it is used. Note that outside 32-bit mode it expands to
two statements without enclosing braces, so it must not be used as the
unbraced body of an if, else, or loop. */

#if PCRE2_CODE_UNIT_WIDTH == 32
#define PARSED_LITERAL(c, p) \
  { \
  if (c >= META_END) *p++ = META_BIGVALUE; \
  *p++ = c; \
  okquantifier = TRUE; \
  }
#else
#define PARSED_LITERAL(c, p) *p++ = c; okquantifier = TRUE;
#endif
3109
3110
/* Here's the actual function. */
3111
3112
static int parse_regex(PCRE2_SPTR ptr, uint32_t options, uint32_t xoptions,
3113
  BOOL *has_lookbehind, compile_block *cb)
3114
16
{
3115
16
uint32_t c;
3116
16
uint32_t delimiter;
3117
16
uint32_t namelen;
3118
16
uint32_t class_range_state;
3119
16
uint32_t class_op_state;
3120
16
uint32_t class_mode_state;
3121
16
uint32_t *class_start;
3122
16
uint32_t *verblengthptr = NULL;     /* Value avoids compiler warning */
3123
16
uint32_t *verbstartptr = NULL;
3124
16
uint32_t *previous_callout = NULL;
3125
16
uint32_t *parsed_pattern = cb->parsed_pattern;
3126
16
uint32_t *parsed_pattern_end = cb->parsed_pattern_end;
3127
16
uint32_t *this_parsed_item = NULL;
3128
16
uint32_t *prev_parsed_item = NULL;
3129
16
uint32_t meta_quantifier = 0;
3130
16
uint32_t add_after_mark = 0;
3131
16
uint16_t nest_depth = 0;
3132
16
int16_t class_depth_m1 = -1; /* The m1 means minus 1. */
3133
16
int16_t class_maxdepth_m1 = -1;
3134
16
uint16_t hash;
3135
16
int after_manual_callout = 0;
3136
16
int expect_cond_assert = 0;
3137
16
int errorcode = 0;
3138
16
int escape;
3139
16
int i;
3140
16
BOOL inescq = FALSE;
3141
16
BOOL inverbname = FALSE;
3142
16
BOOL utf = (options & PCRE2_UTF) != 0;
3143
16
BOOL auto_callout = (options & PCRE2_AUTO_CALLOUT) != 0;
3144
16
BOOL is_dupname;
3145
16
BOOL negate_class;
3146
16
BOOL okquantifier = FALSE;
3147
16
PCRE2_SPTR thisptr;
3148
16
PCRE2_SPTR name;
3149
16
PCRE2_SPTR ptrend = cb->end_pattern;
3150
16
PCRE2_SPTR verbnamestart = NULL;    /* Value avoids compiler warning */
3151
16
PCRE2_SPTR class_range_forbid_ptr = NULL;
3152
16
named_group *ng;
3153
16
nest_save *top_nest, *end_nests;
3154
#ifdef PCRE2_DEBUG
3155
uint32_t *parsed_pattern_check;
3156
ptrdiff_t parsed_pattern_extra = 0;
3157
ptrdiff_t parsed_pattern_extra_check = 0;
3158
PCRE2_SPTR ptr_check;
3159
#endif
3160
3161
16
PCRE2_ASSERT(parsed_pattern != NULL);
3162
3163
/* Insert leading items for word and line matching (features provided for the
3164
benefit of pcre2grep). */
3165
3166
16
if ((xoptions & PCRE2_EXTRA_MATCH_LINE) != 0)
3167
0
  {
3168
0
  *parsed_pattern++ = META_CIRCUMFLEX;
3169
0
  *parsed_pattern++ = META_NOCAPTURE;
3170
0
  }
3171
16
else if ((xoptions & PCRE2_EXTRA_MATCH_WORD) != 0)
3172
0
  {
3173
0
  *parsed_pattern++ = META_ESCAPE + ESC_b;
3174
0
  *parsed_pattern++ = META_NOCAPTURE;
3175
0
  }
3176
3177
#ifdef PCRE2_DEBUG
3178
parsed_pattern_check = parsed_pattern;
3179
ptr_check = ptr;
3180
#endif
3181
3182
/* If the pattern is actually a literal string, process it separately to avoid
3183
cluttering up the main loop. */
3184
3185
16
if ((options & PCRE2_LITERAL) != 0)
3186
0
  {
3187
0
  while (ptr < ptrend)
3188
0
    {
3189
    /* LCOV_EXCL_START */
3190
0
    if (parsed_pattern >= parsed_pattern_end)
3191
0
      {
3192
0
      PCRE2_DEBUG_UNREACHABLE();
3193
0
      errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
3194
0
      goto FAILED;
3195
0
      }
3196
    /* LCOV_EXCL_STOP */
3197
3198
0
    thisptr = ptr;
3199
0
    GETCHARINCTEST(c, ptr);
3200
0
    if (auto_callout)
3201
0
      parsed_pattern = manage_callouts(thisptr, &previous_callout,
3202
0
        auto_callout, parsed_pattern, cb);
3203
0
    PARSED_LITERAL(c, parsed_pattern);
3204
0
    }
3205
0
  goto PARSED_END;
3206
0
  }
3207
3208
/* Process a real regex which may contain meta-characters. */
3209
3210
16
top_nest = NULL;
3211
16
end_nests = (nest_save *)(cb->start_workspace + cb->workspace_size);
3212
3213
/* The size of the nest_save structure might not be a factor of the size of the
3214
workspace. Therefore we must round down end_nests so as to correctly avoid
3215
creating a nest_save that spans the end of the workspace. */
3216
3217
16
end_nests = (nest_save *)((char *)end_nests -
3218
16
  ((cb->workspace_size * sizeof(PCRE2_UCHAR)) % sizeof(nest_save)));
3219
3220
/* PCRE2_EXTENDED_MORE implies PCRE2_EXTENDED */
3221
3222
16
if ((options & PCRE2_EXTENDED_MORE) != 0) options |= PCRE2_EXTENDED;
3223
3224
/* Now scan the pattern */
3225
3226
396
while (ptr < ptrend)
3227
380
  {
3228
380
  int prev_expect_cond_assert;
3229
380
  uint32_t min_repeat = 0, max_repeat = 0;
3230
380
  uint32_t set, unset, *optset;
3231
380
  uint32_t xset, xunset, *xoptset;
3232
380
  uint32_t terminator;
3233
380
  uint32_t prev_meta_quantifier;
3234
380
  BOOL prev_okquantifier;
3235
380
  PCRE2_SPTR tempptr;
3236
380
  PCRE2_SIZE offset;
3237
3238
380
  if (nest_depth > cb->cx->parens_nest_limit)
3239
0
    {
3240
0
    errorcode = ERR19;
3241
0
    goto FAILED;        /* Parentheses too deeply nested */
3242
0
    }
3243
3244
  /* Check that we haven't emitted too much into parsed_pattern. We allocate
3245
  a suitably-sized buffer upfront, then do unchecked writes to it. If we only
3246
  write a little bit too much, everything will appear to be OK, because the
3247
  upfront size is an overestimate... but a malicious pattern could end up
3248
  forcing a write past the buffer end. We must catch this during
3249
  development. */
3250
3251
#ifdef PCRE2_DEBUG
3252
  /* Strong post-write check. Won't help in release builds - at this point
3253
  the write has already occurred so it's too late. However, should stop us
3254
  committing unsafe code. */
3255
  PCRE2_ASSERT((parsed_pattern - parsed_pattern_check) +
3256
               (parsed_pattern_extra - parsed_pattern_extra_check) <=
3257
                 max_parsed_pattern(ptr_check, ptr, utf, options));
3258
  parsed_pattern_check = parsed_pattern;
3259
  parsed_pattern_extra_check = parsed_pattern_extra;
3260
  ptr_check = ptr;
3261
#endif
3262
3263
  /* LCOV_EXCL_START */
3264
380
  if (parsed_pattern >= parsed_pattern_end)
3265
0
    {
3266
    /* Weak pre-write check; only ensures parsed_pattern[0] is writeable
3267
    (but the code below can write many chars). Better than nothing. */
3268
0
    PCRE2_DEBUG_UNREACHABLE();
3269
0
    errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
3270
0
    goto FAILED;
3271
0
    }
3272
  /* LCOV_EXCL_STOP */
3273
3274
  /* If the last time round this loop something was added, parsed_pattern will
3275
  no longer be equal to this_parsed_item. Remember where the previous item
3276
  started and reset for the next item. Note that sometimes round the loop,
3277
  nothing gets added (e.g. for ignored white space). */
3278
3279
380
  if (this_parsed_item != parsed_pattern)
3280
380
    {
3281
380
    prev_parsed_item = this_parsed_item;
3282
380
    this_parsed_item = parsed_pattern;
3283
380
    }
3284
3285
  /* Get next input character, save its position for callout handling. */
3286
3287
380
  thisptr = ptr;
3288
380
  GETCHARINCTEST(c, ptr);
3289
3290
  /* Copy quoted literals until \E, allowing for the possibility of automatic
3291
  callouts, except when processing a (*VERB) "name".  */
3292
3293
380
  if (inescq)
3294
0
    {
3295
0
    if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
3296
0
      {
3297
0
      inescq = FALSE;
3298
0
      ptr++;   /* Skip E */
3299
0
      }
3300
0
    else
3301
0
      {
3302
0
      if (inverbname)
3303
0
        {                          /* Don't use PARSED_LITERAL() because it */
3304
#if PCRE2_CODE_UNIT_WIDTH == 32    /* sets okquantifier. */
3305
        if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
3306
#endif
3307
0
        *parsed_pattern++ = c;
3308
0
        }
3309
0
      else
3310
0
        {
3311
0
        if (after_manual_callout-- <= 0)
3312
0
          parsed_pattern = manage_callouts(thisptr, &previous_callout,
3313
0
            auto_callout, parsed_pattern, cb);
3314
0
        PARSED_LITERAL(c, parsed_pattern);
3315
0
        }
3316
0
      meta_quantifier = 0;
3317
0
      }
3318
0
    continue;  /* Next character */
3319
0
    }
3320
3321
  /* If we are processing the "name" part of a (*VERB:NAME) item, all
3322
  characters up to the closing parenthesis are literals except when
3323
  PCRE2_ALT_VERBNAMES is set. That causes backslash interpretation, but only \Q
3324
  and \E and escaped characters are allowed (no character types such as \d). If
3325
  PCRE2_EXTENDED is also set, we must ignore white space and # comments. Do
3326
  this by not entering the special (*VERB:NAME) processing - they are then
3327
  picked up below. Note that c is a character, not a code unit, so we must not
3328
  use MAX_255 to test its size because MAX_255 tests code units and is assumed
3329
  TRUE in 8-bit mode. */
3330
3331
380
  if (inverbname &&
3332
0
       (
3333
        /* EITHER: not both options set */
3334
0
        ((options & (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) !=
3335
0
                    (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) ||
3336
0
#ifdef SUPPORT_UNICODE
3337
        /* OR: character > 255 AND not Unicode Pattern White Space */
3338
0
        (c > 255 && (c|1) != 0x200f && (c|1) != 0x2029) ||
3339
0
#endif
3340
        /* OR: not a # comment or isspace() white space */
3341
0
        (c < 256 && c != CHAR_NUMBER_SIGN && (cb->ctypes[c] & ctype_space) == 0
3342
0
#ifdef SUPPORT_UNICODE
3343
        /* and not CHAR_NEL when Unicode is supported */
3344
0
          && c != CHAR_NEL
3345
0
#endif
3346
0
       )))
3347
0
    {
3348
0
    PCRE2_SIZE verbnamelength;
3349
3350
0
    switch(c)
3351
0
      {
3352
0
      default:                     /* Don't use PARSED_LITERAL() because it */
3353
#if PCRE2_CODE_UNIT_WIDTH == 32    /* sets okquantifier. */
3354
      if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
3355
#endif
3356
0
      *parsed_pattern++ = c;
3357
0
      break;
3358
3359
0
      case CHAR_RIGHT_PARENTHESIS:
3360
0
      inverbname = FALSE;
3361
      /* This is the length in characters */
3362
0
      verbnamelength = (PCRE2_SIZE)(parsed_pattern - verblengthptr - 1);
3363
      /* But the limit on the length is in code units */
3364
0
      if (ptr - verbnamestart - 1 > (int)MAX_MARK)
3365
0
        {
3366
0
        ptr--;
3367
0
        errorcode = ERR76;
3368
0
        goto FAILED;
3369
0
        }
3370
0
      *verblengthptr = (uint32_t)verbnamelength;
3371
3372
      /* If this name was on a verb such as (*ACCEPT) which does not continue,
3373
      a (*MARK) was generated for the name. We now add the original verb as the
3374
      next item. */
3375
3376
0
      if (add_after_mark != 0)
3377
0
        {
3378
0
        *parsed_pattern++ = add_after_mark;
3379
0
        add_after_mark = 0;
3380
0
        }
3381
0
      break;
3382
3383
0
      case CHAR_BACKSLASH:
3384
0
      if ((options & PCRE2_ALT_VERBNAMES) != 0)
3385
0
        {
3386
0
        escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3387
0
          xoptions, cb->bracount, FALSE, cb);
3388
0
        if (errorcode != 0) goto FAILED;
3389
0
        }
3390
0
      else escape = 0;   /* Treat all as literal */
3391
3392
0
      switch(escape)
3393
0
        {
3394
0
        case 0:                    /* Don't use PARSED_LITERAL() because it */
3395
#if PCRE2_CODE_UNIT_WIDTH == 32    /* sets okquantifier. */
3396
        if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
3397
#endif
3398
0
        *parsed_pattern++ = c;
3399
0
        break;
3400
3401
0
        case ESC_ub:
3402
0
        *parsed_pattern++ = CHAR_u;
3403
0
        PARSED_LITERAL(CHAR_LEFT_CURLY_BRACKET, parsed_pattern);
3404
0
        break;
3405
3406
0
        case ESC_Q:
3407
0
        inescq = TRUE;
3408
0
        break;
3409
3410
0
        case ESC_E:           /* Ignore */
3411
0
        break;
3412
3413
0
        default:
3414
0
        errorcode = ERR40;    /* Invalid in verb name */
3415
0
        goto FAILED;
3416
0
        }
3417
0
      }
3418
0
    continue;   /* Next character in pattern */
3419
0
    }
3420
3421
  /* Not a verb name character. At this point we must process everything that
3422
  must not change the quantification state. This is mainly comments, but we
3423
  handle \Q and \E here as well, so that an item such as A\Q\E+ is treated as
3424
  A+, as in Perl. An isolated \E is ignored. */
3425
3426
380
  if (c == CHAR_BACKSLASH && ptr < ptrend)
3427
40
    {
3428
40
    if (*ptr == CHAR_Q || *ptr == CHAR_E)
3429
0
      {
3430
      /* A literal inside a \Q...\E is not allowed if we are expecting a
3431
      conditional assertion, but an empty \Q\E sequence is OK. */
3432
0
      if (expect_cond_assert > 0 && *ptr == CHAR_Q &&
3433
0
          !(ptrend - ptr >= 3 && ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E))
3434
0
        {
3435
0
        ptr--;
3436
0
        errorcode = ERR28;
3437
0
        goto FAILED;
3438
0
        }
3439
0
      inescq = *ptr == CHAR_Q;
3440
0
      ptr++;
3441
0
      continue;
3442
0
      }
3443
40
    }
3444
3445
  /* Skip over whitespace and # comments in extended mode. Note that c is a
3446
  character, not a code unit, so we must not use MAX_255 to test its size
3447
  because MAX_255 tests code units and is assumed TRUE in 8-bit mode. The
3448
  whitespace characters are those designated as "Pattern White Space" by
3449
  Unicode, which are the isspace() characters plus CHAR_NEL (newline), which is
3450
  U+0085 in Unicode, plus U+200E, U+200F, U+2028, and U+2029. These are a
3451
  subset of space characters that match \h and \v. */
3452
3453
380
  if ((options & PCRE2_EXTENDED) != 0)
3454
0
    {
3455
0
    if (c < 256 && (cb->ctypes[c] & ctype_space) != 0) continue;
3456
0
#ifdef SUPPORT_UNICODE
3457
0
    if (c == CHAR_NEL || (c|1) == 0x200f || (c|1) == 0x2029) continue;
3458
0
#endif
3459
0
    if (c == CHAR_NUMBER_SIGN)
3460
0
      {
3461
0
      while (ptr < ptrend)
3462
0
        {
3463
0
        if (IS_NEWLINE(ptr))      /* For non-fixed-length newline cases, */
3464
0
          {                       /* IS_NEWLINE sets cb->nllen. */
3465
0
          ptr += cb->nllen;
3466
0
          break;
3467
0
          }
3468
0
        ptr++;
3469
0
#ifdef SUPPORT_UNICODE
3470
0
        if (utf) FORWARDCHARTEST(ptr, ptrend);
3471
0
#endif
3472
0
        }
3473
0
      continue;  /* Next character in pattern */
3474
0
      }
3475
0
    }
3476
3477
  /* Skip over bracketed comments */
3478
3479
380
  if (c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 2 &&
3480
8
      ptr[0] == CHAR_QUESTION_MARK && ptr[1] == CHAR_NUMBER_SIGN)
3481
0
    {
3482
0
    while (++ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS);
3483
0
    if (ptr >= ptrend)
3484
0
      {
3485
0
      errorcode = ERR18;  /* A special error for missing ) in a comment */
3486
0
      goto FAILED;        /* to make it easier to debug. */
3487
0
      }
3488
0
    ptr++;
3489
0
    continue;  /* Next character in pattern */
3490
0
    }
3491
3492
  /* If the next item is not a quantifier, fill in length of any previous
3493
  callout and create an auto callout if required. */
3494
3495
380
  if (c != CHAR_ASTERISK && c != CHAR_PLUS && c != CHAR_QUESTION_MARK &&
3496
308
       (c != CHAR_LEFT_CURLY_BRACKET ||
3497
48
         (tempptr = ptr,
3498
48
         !read_repeat_counts(&tempptr, ptrend, NULL, NULL, &errorcode))))
3499
260
    {
3500
260
    if (after_manual_callout-- <= 0)
3501
260
      {
3502
260
      parsed_pattern = manage_callouts(thisptr, &previous_callout, auto_callout,
3503
260
        parsed_pattern, cb);
3504
260
      this_parsed_item = parsed_pattern;  /* New start for current item */
3505
260
      }
3506
260
    }
3507
3508
  /* If expect_cond_assert is 2, we have just passed (?( and are expecting an
3509
  assertion, possibly preceded by a callout. If the value is 1, we have just
3510
  had the callout and expect an assertion. There must be at least 3 more
3511
  characters in all cases. When expect_cond_assert is 2, we know that the
3512
  current character is an opening parenthesis, as otherwise we wouldn't be
3513
  here. However, when it is 1, we need to check, and it's easiest just to check
3514
  always. Note that expect_cond_assert may be negative, since all callouts just
3515
  decrement it. */
3516
3517
380
  if (expect_cond_assert > 0)
3518
0
    {
3519
0
    BOOL ok = c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 3 &&
3520
0
              (ptr[0] == CHAR_QUESTION_MARK || ptr[0] == CHAR_ASTERISK);
3521
0
    if (ok)
3522
0
      {
3523
0
      if (ptr[0] == CHAR_ASTERISK)  /* New alpha assertion format, possibly */
3524
0
        {
3525
0
        ok = MAX_255(ptr[1]) && (cb->ctypes[ptr[1]] & ctype_lcletter) != 0;
3526
0
        }
3527
0
      else switch(ptr[1])  /* Traditional symbolic format */
3528
0
        {
3529
0
        case CHAR_C:
3530
0
        ok = expect_cond_assert == 2;
3531
0
        break;
3532
3533
0
        case CHAR_EQUALS_SIGN:
3534
0
        case CHAR_EXCLAMATION_MARK:
3535
0
        break;
3536
3537
0
        case CHAR_LESS_THAN_SIGN:
3538
0
        ok = ptr[2] == CHAR_EQUALS_SIGN || ptr[2] == CHAR_EXCLAMATION_MARK;
3539
0
        break;
3540
3541
0
        default:
3542
0
        ok = FALSE;
3543
0
        }
3544
0
      }
3545
3546
0
    if (!ok)
3547
0
      {
3548
0
      errorcode = ERR28;
3549
0
      if (expect_cond_assert == 2) goto FAILED;
3550
0
      goto FAILED_BACK;
3551
0
      }
3552
0
    }
3553
3554
  /* Remember whether we are expecting a conditional assertion, and set the
3555
  default for this item. */
3556
3557
380
  prev_expect_cond_assert = expect_cond_assert;
3558
380
  expect_cond_assert = 0;
3559
3560
  /* Remember quantification status for the previous significant item, then set
3561
  default for this item. */
3562
3563
380
  prev_okquantifier = okquantifier;
3564
380
  prev_meta_quantifier = meta_quantifier;
3565
380
  okquantifier = FALSE;
3566
380
  meta_quantifier = 0;
3567
3568
  /* If the previous significant item was a quantifier, adjust the parsed code
3569
  if there is a following modifier. The base meta value is always followed by
3570
  the PLUS and QUERY values, in that order. We do this here rather than after
3571
  reading a quantifier so that intervening comments and /x whitespace can be
3572
  ignored without having to replicate code. */
3573
3574
380
  if (prev_meta_quantifier != 0 && (c == CHAR_QUESTION_MARK || c == CHAR_PLUS))
3575
0
    {
3576
0
    parsed_pattern[(prev_meta_quantifier == META_MINMAX)? -3 : -1] =
3577
0
      prev_meta_quantifier + ((c == CHAR_QUESTION_MARK)?
3578
0
        0x00020000u : 0x00010000u);
3579
0
    continue;  /* Next character in pattern */
3580
0
    }
3581
3582
  /* Process the next item in the main part of a pattern. */
3583
3584
380
  switch(c)
3585
380
    {
3586
108
    default:              /* Non-special character */
3587
108
    PARSED_LITERAL(c, parsed_pattern);
3588
108
    break;
3589
3590
3591
    /* ---- Escape sequence ---- */
3592
3593
40
    case CHAR_BACKSLASH:
3594
40
    tempptr = ptr;
3595
40
    escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3596
40
      xoptions, cb->bracount, FALSE, cb);
3597
40
    if (errorcode != 0)
3598
0
      {
3599
0
      ESCAPE_FAILED:
3600
0
      if ((xoptions & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
3601
0
        goto FAILED;
3602
0
      ptr = tempptr;
3603
0
      if (ptr >= ptrend) c = CHAR_BACKSLASH; else
3604
0
        {
3605
0
        GETCHARINCTEST(c, ptr);   /* Get character value, increment pointer */
3606
0
        }
3607
0
      escape = 0;                 /* Treat as literal character */
3608
0
      }
3609
3610
    /* The escape was a data escape or literal character. */
3611
3612
40
    if (escape == 0)
3613
36
      {
3614
36
      PARSED_LITERAL(c, parsed_pattern);
3615
36
      }
3616
3617
    /* The escape was a back (or forward) reference. We keep the offset in
3618
    order to give a more useful diagnostic for a bad forward reference. For
3619
    references to groups numbered less than 10 we can't use more than two items
3620
    in parsed_pattern because they may be just two characters in the input (and
3621
    in a 64-bit world an offset may need two elements). So for them, the offset
3622
    of the first occurrence is held in a special vector. */
3623
3624
4
    else if (escape < 0)
3625
0
      {
3626
0
      offset = (PCRE2_SIZE)(ptr - cb->start_pattern);
3627
0
      escape = -escape - 1;
3628
0
      *parsed_pattern++ = META_BACKREF | (uint32_t)escape;
3629
0
      if (escape < 10)
3630
0
        {
3631
0
        if (cb->small_ref_offset[escape] == PCRE2_UNSET)
3632
0
          cb->small_ref_offset[escape] = offset;
3633
0
        }
3634
0
      else
3635
0
        {
3636
0
        PUTOFFSET(offset, parsed_pattern);
3637
0
        }
3638
0
      okquantifier = TRUE;
3639
0
      }
3640
3641
    /* The escape was a character class such as \d etc. or other special
3642
    escape indicator such as \A or \X. Most of them generate just a single
3643
    parsed item, but \P and \p are followed by a 16-bit type and a 16-bit
3644
    value. They are supported only when Unicode is available. The type and
3645
    value are packed into a single 32-bit value so that the whole sequence
3646
    uses only two elements in the parsed_vector. This is because the same
3647
    coding is used if \d (for example) is turned into \p{Nd} when PCRE2_UCP is
3648
    set.
3649
3650
    There are also some cases where the escape sequence is followed by a name:
3651
    \k{name}, \k<name>, and \k'name' are backreferences by name, and \g<name>
3652
    and \g'name' are subroutine calls by name; \g{name} is a synonym for
3653
    \k{name}. Note that \g<number> and \g'number' are handled by check_escape()
3654
    and returned as a negative value (handled above). A name is coded as an
3655
    offset into the pattern and a length. */
3656
3657
4
    else switch (escape)
3658
4
      {
3659
0
      case ESC_C:
3660
#ifdef NEVER_BACKSLASH_C
3661
      errorcode = ERR85;
3662
      goto ESCAPE_FAILED;
3663
#else
3664
0
      if ((options & PCRE2_NEVER_BACKSLASH_C) != 0)
3665
0
        {
3666
0
        errorcode = ERR83;
3667
0
        goto ESCAPE_FAILED;
3668
0
        }
3669
0
#endif
3670
0
      okquantifier = TRUE;
3671
0
      *parsed_pattern++ = META_ESCAPE + escape;
3672
0
      break;
3673
3674
      /* This is a special return that happens only in EXTRA_ALT_BSUX mode,
3675
      when \u{ is not followed by hex digits and }. It requests two literal
3676
      characters, u and { and we need this, as otherwise \u{ 12} (for example)
3677
      would be treated as u{12} now that spaces are allowed in quantifiers. */
3678
3679
0
      case ESC_ub:
3680
0
      *parsed_pattern++ = CHAR_u;
3681
0
      PARSED_LITERAL(CHAR_LEFT_CURLY_BRACKET, parsed_pattern);
3682
0
      break;
3683
3684
0
      case ESC_X:
3685
#ifndef SUPPORT_UNICODE
3686
      errorcode = ERR45;   /* Supported only with Unicode support */
3687
      goto ESCAPE_FAILED;
3688
#endif
3689
0
      case ESC_H:
3690
0
      case ESC_h:
3691
0
      case ESC_N:
3692
0
      case ESC_R:
3693
0
      case ESC_V:
3694
0
      case ESC_v:
3695
0
      okquantifier = TRUE;
3696
0
      *parsed_pattern++ = META_ESCAPE + escape;
3697
0
      break;
3698
3699
0
      default:  /* \A, \B, \b, \G, \K, \Z, \z cannot be quantified. */
3700
0
      *parsed_pattern++ = META_ESCAPE + escape;
3701
0
      break;
3702
3703
      /* Escapes that may change in UCP mode. */
3704
3705
0
      case ESC_d:
3706
0
      case ESC_D:
3707
4
      case ESC_s:
3708
4
      case ESC_S:
3709
4
      case ESC_w:
3710
4
      case ESC_W:
3711
4
      okquantifier = TRUE;
3712
4
      parsed_pattern = handle_escdsw(escape, parsed_pattern, options,
3713
4
        xoptions);
3714
4
      break;
3715
3716
      /* Unicode property matching */
3717
3718
0
      case ESC_P:
3719
0
      case ESC_p:
3720
0
#ifdef SUPPORT_UNICODE
3721
0
        {
3722
0
        BOOL negated;
3723
0
        uint16_t ptype = 0, pdata = 0;
3724
0
        if (!get_ucp(&ptr, utf, &negated, &ptype, &pdata, &errorcode, cb))
3725
0
          goto ESCAPE_FAILED;
3726
0
        if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
3727
0
        *parsed_pattern++ = META_ESCAPE + escape;
3728
0
        *parsed_pattern++ = (ptype << 16) | pdata;
3729
0
        okquantifier = TRUE;
3730
0
        }
3731
#else
3732
      errorcode = ERR45;
3733
      goto ESCAPE_FAILED;
3734
#endif
3735
0
      break;  /* End \P and \p */
3736
3737
      /* When \g is used with quotes or angle brackets as delimiters, it is a
3738
      numerical or named subroutine call, and control comes here. When used
3739
      with brace delimiters it is a numerical back reference and does not come
3740
      here because check_escape() returns it directly as a reference. \k is
3741
      always a named back reference. */
3742
3743
0
      case ESC_g:
3744
0
      case ESC_k:
3745
0
      if (ptr >= ptrend || (*ptr != CHAR_LEFT_CURLY_BRACKET &&
3746
0
          *ptr != CHAR_LESS_THAN_SIGN && *ptr != CHAR_APOSTROPHE))
3747
0
        {
3748
0
        errorcode = (escape == ESC_g)? ERR57 : ERR69;
3749
0
        goto ESCAPE_FAILED;
3750
0
        }
3751
0
      terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
3752
0
        CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
3753
0
        CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
3754
3755
      /* For a non-braced \g, check for a numerical recursion. */
3756
3757
0
      if (escape == ESC_g && terminator != CHAR_RIGHT_CURLY_BRACKET)
3758
0
        {
3759
0
        PCRE2_SPTR p = ptr + 1;
3760
3761
0
        if (read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
3762
0
            &errorcode))
3763
0
          {
3764
0
          if (p >= ptrend || *p != terminator)
3765
0
            {
3766
0
            ptr = p;
3767
0
            errorcode = ERR119;  /* Missing terminator for number */
3768
0
            goto ESCAPE_FAILED;
3769
0
            }
3770
0
          ptr = p + 1;
3771
0
          goto SET_RECURSION;
3772
0
          }
3773
0
        if (errorcode != 0) goto ESCAPE_FAILED;
3774
0
        }
3775
3776
      /* Not a numerical recursion. Perl allows spaces and tabs after { and
3777
      before } but not for other delimiters. */
3778
3779
0
      if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
3780
0
          &errorcode, cb)) goto ESCAPE_FAILED;
3781
3782
      /* \k and \g when used with braces are back references, whereas \g used
3783
      with quotes or angle brackets is a recursion */
3784
3785
0
      *parsed_pattern++ =
3786
0
        (escape == ESC_k || terminator == CHAR_RIGHT_CURLY_BRACKET)?
3787
0
          META_BACKREF_BYNAME : META_RECURSE_BYNAME;
3788
0
      *parsed_pattern++ = namelen;
3789
3790
0
      PUTOFFSET(offset, parsed_pattern);
3791
0
      okquantifier = TRUE;
3792
0
      break;  /* End special escape processing */
3793
4
      }
3794
40
    break;    /* End escape sequence processing */
3795
3796
3797
    /* ---- Single-character special items ---- */
3798
3799
40
    case CHAR_CIRCUMFLEX_ACCENT:
3800
16
    *parsed_pattern++ = META_CIRCUMFLEX;
3801
16
    break;
3802
3803
0
    case CHAR_DOLLAR_SIGN:
3804
0
    *parsed_pattern++ = META_DOLLAR;
3805
0
    break;
3806
3807
4
    case CHAR_DOT:
3808
4
    *parsed_pattern++ = META_DOT;
3809
4
    okquantifier = TRUE;
3810
4
    break;
3811
3812
3813
    /* ---- Single-character quantifiers ---- */
3814
3815
12
    case CHAR_ASTERISK:
3816
12
    meta_quantifier = META_ASTERISK;
3817
12
    goto CHECK_QUANTIFIER;
3818
3819
28
    case CHAR_PLUS:
3820
28
    meta_quantifier = META_PLUS;
3821
28
    goto CHECK_QUANTIFIER;
3822
3823
32
    case CHAR_QUESTION_MARK:
3824
32
    meta_quantifier = META_QUERY;
3825
32
    goto CHECK_QUANTIFIER;
3826
3827
3828
    /* ---- Potential {n,m} quantifier ---- */
3829
3830
48
    case CHAR_LEFT_CURLY_BRACKET:
3831
48
    if (!read_repeat_counts(&ptr, ptrend, &min_repeat, &max_repeat,
3832
48
        &errorcode))
3833
0
      {
3834
0
      if (errorcode != 0) goto FAILED;     /* Error in quantifier. */
3835
0
      PARSED_LITERAL(c, parsed_pattern);   /* Not a quantifier */
3836
0
      break;                               /* No more quantifier processing */
3837
0
      }
3838
48
    meta_quantifier = META_MINMAX;
3839
    /* Fall through */
3840
3841
3842
    /* ---- Quantifier post-processing ---- */
3843
3844
    /* Check that a quantifier is allowed after the previous item. This
3845
    guarantees that there is a previous item. */
3846
3847
120
    CHECK_QUANTIFIER:
3848
120
    if (!prev_okquantifier)
3849
0
      {
3850
0
      errorcode = ERR9;
3851
0
      goto FAILED;
3852
0
      }
3853
3854
    /* Most (*VERB)s are not allowed to be quantified, but an ungreedy
3855
    quantifier can be useful for (*ACCEPT) - meaning "succeed on backtrack", a
3856
    sort of negated (*COMMIT). We therefore allow (*ACCEPT) to be quantified by
3857
    wrapping it in non-capturing brackets, but we have to allow for a preceding
3858
    (*MARK) for when (*ACCEPT) has an argument. */
3859
3860
120
    if (*prev_parsed_item == META_ACCEPT)
3861
0
      {
3862
0
      uint32_t *p;
3863
0
      for (p = parsed_pattern - 1; p >= verbstartptr; p--) p[1] = p[0];
3864
0
      *verbstartptr = META_NOCAPTURE;
3865
0
      parsed_pattern[1] = META_KET;
3866
0
      parsed_pattern += 2;
3867
3868
#ifdef PCRE2_DEBUG
3869
      PCRE2_ASSERT(parsed_pattern_extra >= 2);
3870
      parsed_pattern_extra -= 2;
3871
#endif
3872
0
      }
3873
3874
    /* Now we can put the quantifier into the parsed pattern vector. At this
3875
    stage, we have only the basic quantifier. The check for a following + or ?
3876
    modifier happens at the top of the loop, after any intervening comments
3877
    have been removed. */
3878
3879
120
    *parsed_pattern++ = meta_quantifier;
3880
120
    if (c == CHAR_LEFT_CURLY_BRACKET)
3881
48
      {
3882
48
      *parsed_pattern++ = min_repeat;
3883
48
      *parsed_pattern++ = max_repeat;
3884
48
      }
3885
120
    break;
3886
3887
3888
    /* ---- Character class ---- */
3889
3890
76
    case CHAR_LEFT_SQUARE_BRACKET:
3891
3892
    /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
3893
    used for "start of word" and "end of word". As these are otherwise illegal
3894
    sequences, we don't break anything by recognizing them. They are replaced
3895
    by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
3896
    erroneous and are handled by the normal code below. */
3897
3898
76
    if (ptrend - ptr >= 6 &&
3899
76
         (PRIV(strncmp_c8)(ptr, STRING_WEIRD_STARTWORD, 6) == 0 ||
3900
76
          PRIV(strncmp_c8)(ptr, STRING_WEIRD_ENDWORD, 6) == 0))
3901
0
      {
3902
0
      *parsed_pattern++ = META_ESCAPE + ESC_b;
3903
3904
0
      if (ptr[2] == CHAR_LESS_THAN_SIGN)
3905
0
        {
3906
0
        *parsed_pattern++ = META_LOOKAHEAD;
3907
0
        }
3908
0
      else
3909
0
        {
3910
0
        *parsed_pattern++ = META_LOOKBEHIND;
3911
0
        *has_lookbehind = TRUE;
3912
3913
        /* The offset is used only for the "non-fixed length" error; this won't
3914
        occur here, so just store zero. */
3915
3916
0
        PUTOFFSET((PCRE2_SIZE)0, parsed_pattern);
3917
0
        }
3918
3919
0
      if ((options & PCRE2_UCP) == 0)
3920
0
        *parsed_pattern++ = META_ESCAPE + ESC_w;
3921
0
      else
3922
0
        {
3923
0
        *parsed_pattern++ = META_ESCAPE + ESC_p;
3924
0
        *parsed_pattern++ = PT_WORD << 16;
3925
0
        }
3926
0
      *parsed_pattern++ = META_KET;
3927
0
      ptr += 6;
3928
0
      okquantifier = TRUE;
3929
0
      break;
3930
0
      }
3931
3932
    /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3933
    they are encountered at the top level, so we'll do that too. */
3934
3935
76
    if (ptr < ptrend && (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
3936
76
         *ptr == CHAR_EQUALS_SIGN) &&
3937
0
        check_posix_syntax(ptr, ptrend, &tempptr))
3938
0
      {
3939
0
      errorcode = (*ptr-- == CHAR_COLON)? ERR12 : ERR13;
3940
0
      ptr = tempptr + 2;
3941
0
      goto FAILED;
3942
0
      }
3943
3944
76
    class_mode_state = ((options & PCRE2_ALT_EXTENDED_CLASS) != 0)?
3945
76
        CLASS_MODE_ALT_EXT : CLASS_MODE_NORMAL;
3946
3947
    /* Jump here from '(?[...])'. That jump must initialize class_mode_state,
3948
    set c to the '[' character, and ptr to just after the '['. */
3949
3950
76
    FROM_PERL_EXTENDED_CLASS:
3951
76
    okquantifier = TRUE;
3952
3953
    /* In an EBCDIC environment, Perl treats alphabetic ranges specially
3954
    because there are holes in the encoding, and simply using the range A-Z
3955
    (for example) would include the characters in the holes. This applies only
3956
    to ranges where both values are literal; [\xC1-\xE9] is different to [A-Z]
3957
    in this respect. In order to accommodate this, we keep track of whether
3958
    character values are literal or not, and a state variable for handling
3959
    ranges. */
3960
3961
    /* Loop for the contents of the class. Classes may be nested, if
3962
    PCRE2_ALT_EXTENDED_CLASS is set, or the class is of the form (?[...]). */
3963
3964
    /* c is still set to '[' so the loop will handle the start of the class. */
3965
3966
76
    class_depth_m1 = -1;
3967
76
    class_maxdepth_m1 = -1;
3968
76
    class_range_state = RANGE_NO;
3969
76
    class_op_state = CLASS_OP_EMPTY;
3970
76
    class_start = NULL;
3971
3972
76
    for (;;)
3973
368
      {
3974
368
      BOOL char_is_literal = TRUE;
3975
3976
      /* Inside \Q...\E everything is literal except \E */
3977
3978
368
      if (inescq)
3979
0
        {
3980
0
        if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
3981
0
          {
3982
0
          inescq = FALSE;                   /* Reset literal state */
3983
0
          ptr++;                            /* Skip the 'E' */
3984
0
          goto CLASS_CONTINUE;
3985
0
          }
3986
3987
        /* Surprisingly, you cannot use \Q..\E to escape a character inside a
3988
        Perl extended class. However, empty \Q\E sequences are allowed, so here
3989
        we're only giving an error if the \Q..\E is non-empty. */
3990
3991
0
        if (class_mode_state == CLASS_MODE_PERL_EXT)
3992
0
          {
3993
0
          errorcode = ERR116;
3994
0
          goto FAILED;
3995
0
          }
3996
3997
0
        goto CLASS_LITERAL;
3998
0
        }
3999
4000
      /* Skip over space and tab (only) in extended-more mode, or anywhere
4001
      inside a Perl extended class (which implies /xx). */
4002
4003
368
      if ((c == CHAR_SPACE || c == CHAR_HT) &&
4004
4
          ((options & PCRE2_EXTENDED_MORE) != 0 ||
4005
4
           class_mode_state >= CLASS_MODE_PERL_EXT))
4006
0
        goto CLASS_CONTINUE;
4007
4008
      /* Handle POSIX class names. Perl allows a negation extension of the
4009
      form [:^name:]. A square bracket that doesn't match the syntax is
4010
      treated as a literal. We also recognize the POSIX constructions
4011
      [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
4012
      5.6 and 5.8 do. */
4013
4014
368
      if (class_depth_m1 >= 0 &&
4015
292
          c == CHAR_LEFT_SQUARE_BRACKET &&
4016
0
          ptrend - ptr >= 3 &&
4017
0
          (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
4018
0
           *ptr == CHAR_EQUALS_SIGN) &&
4019
0
          check_posix_syntax(ptr, ptrend, &tempptr))
4020
0
        {
4021
0
        BOOL posix_negate = FALSE;
4022
0
        int posix_class;
4023
4024
        /* Perl treats a hyphen before a POSIX class as a literal, not the
4025
        start of a range. However, it gives a warning in its warning mode. PCRE
4026
        does not have a warning mode, so we give an error, because this is
4027
        likely an error on the user's part. */
4028
4029
0
        if (class_range_state == RANGE_STARTED)
4030
0
          {
4031
0
          ptr = tempptr + 2;
4032
0
          errorcode = ERR50;
4033
0
          goto FAILED;
4034
0
          }
4035
4036
        /* Perl treats a hyphen after a POSIX class as a literal, not the
4037
        start of a range. However, it gives a warning in its warning mode
4038
        unless the hyphen is the last character in the class. PCRE does not
4039
        have a warning mode, so we give an error, because this is likely an
4040
        error on the user's part.
4041
4042
        Roll back to the hyphen for the error position. */
4043
4044
0
        if (class_range_state == RANGE_FORBID_STARTED)
4045
0
          {
4046
0
          ptr = class_range_forbid_ptr;
4047
0
          errorcode = ERR50;
4048
0
          goto FAILED;
4049
0
          }
4050
4051
        /* Disallow implicit union in Perl extended classes. */
4052
4053
0
        if (class_op_state == CLASS_OP_OPERAND &&
4054
0
            class_mode_state == CLASS_MODE_PERL_EXT)
4055
0
          {
4056
0
          ptr = tempptr + 2;
4057
0
          errorcode = ERR113;
4058
0
          goto FAILED;
4059
0
          }
4060
4061
0
        if (*ptr != CHAR_COLON)
4062
0
          {
4063
0
          ptr = tempptr + 2;
4064
0
          errorcode = ERR13;
4065
0
          goto FAILED;
4066
0
          }
4067
4068
0
        if (*(++ptr) == CHAR_CIRCUMFLEX_ACCENT)
4069
0
          {
4070
0
          posix_negate = TRUE;
4071
0
          ptr++;
4072
0
          }
4073
4074
0
        posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
4075
0
        ptr = tempptr + 2;
4076
0
        if (posix_class < 0)
4077
0
          {
4078
0
          errorcode = ERR30;
4079
0
          goto FAILED;
4080
0
          }
4081
4082
        /* Set "a hyphen is forbidden to be the start of a range". For the '-]'
4083
        case, the hyphen is treated as a literal, but for '-1' it is disallowed
4084
        (because it would be interpreted as range). */
4085
4086
0
        class_range_state = RANGE_FORBID_NO;
4087
0
        class_op_state = CLASS_OP_OPERAND;
4088
4089
        /* When PCRE2_UCP is set, unless PCRE2_EXTRA_ASCII_POSIX is set, some
4090
        of the POSIX classes are converted to use Unicode properties \p or \P
4091
        or, in one case, \h or \H. The substitutes table has two values per
4092
        class, containing the type and value of a \p or \P item. The special
4093
        cases are specified with a negative type: a non-zero value causes \h or
4094
        \H to be used, and a zero value falls through to behave like a non-UCP
4095
        POSIX class. There are now also some extra options that force ASCII for
4096
        some classes. */
4097
4098
0
#ifdef SUPPORT_UNICODE
4099
0
        if ((options & PCRE2_UCP) != 0 &&
4100
0
            (xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0 &&
4101
0
            !((xoptions & PCRE2_EXTRA_ASCII_DIGIT) != 0 &&
4102
0
              (posix_class == PC_DIGIT || posix_class == PC_XDIGIT)))
4103
0
          {
4104
0
          int ptype = posix_substitutes[2*posix_class];
4105
0
          int pvalue = posix_substitutes[2*posix_class + 1];
4106
4107
0
          if (ptype >= 0)
4108
0
            {
4109
0
            *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_P : ESC_p);
4110
0
            *parsed_pattern++ = (ptype << 16) | pvalue;
4111
0
            goto CLASS_CONTINUE;
4112
0
            }
4113
4114
0
          if (pvalue != 0)
4115
0
            {
4116
0
            *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_H : ESC_h);
4117
0
            goto CLASS_CONTINUE;
4118
0
            }
4119
4120
          /* Fall through */
4121
0
          }
4122
0
#endif  /* SUPPORT_UNICODE */
4123
4124
        /* Non-UCP POSIX class */
4125
4126
0
        *parsed_pattern++ = posix_negate? META_POSIX_NEG : META_POSIX;
4127
0
        *parsed_pattern++ = posix_class;
4128
0
        }
4129
4130
      /* Check for the start of the outermost class, or the start of a nested class. */
4131
4132
368
      else if ((c == CHAR_LEFT_SQUARE_BRACKET &&
4133
76
                (class_depth_m1 < 0 || class_mode_state == CLASS_MODE_ALT_EXT ||
4134
0
                 class_mode_state == CLASS_MODE_PERL_EXT)) ||
4135
292
               (c == CHAR_LEFT_PARENTHESIS &&
4136
0
                class_mode_state == CLASS_MODE_PERL_EXT))
4137
76
        {
4138
76
        uint32_t start_c = c;
4139
76
        uint32_t new_class_mode_state;
4140
4141
        /* Update the class mode, if moving into a 'leaf' inside a Perl extended
4142
        class. */
4143
4144
76
        if (start_c == CHAR_LEFT_SQUARE_BRACKET &&
4145
76
            class_mode_state == CLASS_MODE_PERL_EXT && class_depth_m1 >= 0)
4146
0
          new_class_mode_state = CLASS_MODE_PERL_EXT_LEAF;
4147
76
        else
4148
76
          new_class_mode_state = class_mode_state;
4149
4150
        /* Tidy up the other class before starting the nested class. */
4151
        /* -[ beginning a nested class is a literal '-' */
4152
4153
76
        if (class_range_state == RANGE_STARTED)
4154
0
          parsed_pattern[-1] = CHAR_MINUS;
4155
4156
        /* Disallow implicit union in Perl extended classes. */
4157
4158
76
        if (class_op_state == CLASS_OP_OPERAND &&
4159
0
            class_mode_state == CLASS_MODE_PERL_EXT)
4160
0
          {
4161
0
          errorcode = ERR113;
4162
0
          goto FAILED;
4163
0
          }
4164
4165
        /* Validate nesting depth */
4166
76
        if (class_depth_m1 >= ECLASS_NEST_LIMIT - 1)
4167
0
          {
4168
0
          ptr--;  /* Point rightwards at the paren, same as ERR19. */
4169
0
          errorcode = ERR107;  /* Classes too deeply nested */
4170
0
          goto FAILED;
4171
0
          }
4172
4173
        /* Process the character class start. If the first character is '^', set
4174
        the negation flag. If the first few characters (either before or after ^)
4175
        are \Q\E or \E or space or tab in extended-more mode, we skip them too.
4176
        This makes for compatibility with Perl. */
4177
4178
76
        negate_class = FALSE;
4179
76
        for (;;)
4180
76
          {
4181
76
          if (ptr >= ptrend)
4182
0
            {
4183
0
            if (start_c == CHAR_LEFT_PARENTHESIS)
4184
0
              errorcode = ERR14;  /* Missing terminating ')' */
4185
0
            else
4186
0
              errorcode = ERR6;   /* Missing terminating ']' */
4187
0
            goto FAILED;
4188
0
            }
4189
4190
76
          GETCHARINCTEST(c, ptr);
4191
76
          if (new_class_mode_state == CLASS_MODE_PERL_EXT) break;
4192
76
          else if (c == CHAR_BACKSLASH)
4193
12
            {
4194
12
            if (ptr < ptrend && *ptr == CHAR_E) ptr++;
4195
12
            else if (ptrend - ptr >= 3 &&
4196
12
                PRIV(strncmp_c8)(ptr, STR_Q STR_BACKSLASH STR_E, 3) == 0)
4197
0
              ptr += 3;
4198
12
            else
4199
12
              break;
4200
12
            }
4201
64
          else if ((c == CHAR_SPACE || c == CHAR_HT) &&  /* Note: just these two */
4202
4
                   ((options & PCRE2_EXTENDED_MORE) != 0 ||
4203
4
                    new_class_mode_state >= CLASS_MODE_PERL_EXT))
4204
0
            continue;
4205
64
          else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
4206
0
            negate_class = TRUE;
4207
64
          else break;
4208
76
          }
4209
4210
        /* Now the real contents of the class; c has the first "real" character.
4211
        Empty classes are permitted only if the option is set, and if it's not
4212
        a Perl-extended class. */
4213
4214
76
        if (c == CHAR_RIGHT_SQUARE_BRACKET &&
4215
0
            (cb->external_options & PCRE2_ALLOW_EMPTY_CLASS) != 0 &&
4216
0
            new_class_mode_state < CLASS_MODE_PERL_EXT)
4217
0
          {
4218
0
          PCRE2_ASSERT(start_c == CHAR_LEFT_SQUARE_BRACKET);
4219
4220
0
          if (class_start != NULL)
4221
0
            {
4222
0
            PCRE2_ASSERT(class_depth_m1 >= 0);
4223
            /* Represents that the class is an extended class. */
4224
0
            *class_start |= CLASS_IS_ECLASS;
4225
0
            class_start = NULL;
4226
0
            }
4227
4228
0
          *parsed_pattern++ = negate_class? META_CLASS_EMPTY_NOT : META_CLASS_EMPTY;
4229
4230
          /* Leave nesting depth unchanged; but check for zero depth to handle the
4231
          very first (top-level) class being empty. */
4232
0
          if (class_depth_m1 < 0) break;
4233
4234
0
          class_range_state = RANGE_NO; /* for processing the containing class */
4235
0
          class_op_state = CLASS_OP_OPERAND;
4236
0
          goto CLASS_CONTINUE;
4237
0
          }
4238
4239
        /* Enter a non-empty class. */
4240
4241
76
        if (class_start != NULL)
4242
0
          {
4243
0
          PCRE2_ASSERT(class_depth_m1 >= 0);
4244
          /* Represents that the class is an extended class. */
4245
0
          *class_start |= CLASS_IS_ECLASS;
4246
0
          class_start = NULL;
4247
0
          }
4248
4249
76
        class_start = parsed_pattern;
4250
76
        *parsed_pattern++ = negate_class? META_CLASS_NOT : META_CLASS;
4251
76
        class_range_state = RANGE_NO;
4252
76
        class_op_state = CLASS_OP_EMPTY;
4253
76
        class_mode_state = new_class_mode_state;
4254
76
        ++class_depth_m1;
4255
76
        if (class_maxdepth_m1 < class_depth_m1)
4256
76
          class_maxdepth_m1 = class_depth_m1;
4257
        /* Reset; no op seen yet at new depth. */
4258
76
        cb->class_op_used[class_depth_m1] = 0;
4259
4260
        /* Implement the special start-of-class literal meaning of ']'. */
4261
76
        if (c == CHAR_RIGHT_SQUARE_BRACKET &&
4262
0
            new_class_mode_state != CLASS_MODE_PERL_EXT)
4263
0
          {
4264
0
          class_range_state = RANGE_OK_LITERAL;
4265
0
          class_op_state = CLASS_OP_OPERAND;
4266
0
          PARSED_LITERAL(c, parsed_pattern);
4267
0
          goto CLASS_CONTINUE;
4268
0
          }
4269
4270
76
        continue;  /* We have already loaded c with the next character */
4271
76
        }
4272
4273
      /* Check for the end of the class. */
4274
4275
292
      else if (c == CHAR_RIGHT_SQUARE_BRACKET ||
4276
216
               (c == CHAR_RIGHT_PARENTHESIS && class_mode_state == CLASS_MODE_PERL_EXT))
4277
76
        {
4278
        /* In Perl extended mode, the ']' can only be used to match the
4279
        opening '[', and ')' must match an opening parenthesis. */
4280
76
        if (class_mode_state == CLASS_MODE_PERL_EXT)
4281
0
          {
4282
0
          if (c == CHAR_RIGHT_SQUARE_BRACKET && class_depth_m1 != 0)
4283
0
            {
4284
0
            errorcode = ERR14;
4285
0
            ptr--;  /* Correct the offset */
4286
0
            goto FAILED;
4287
0
            }
4288
0
          if (c == CHAR_RIGHT_PARENTHESIS && class_depth_m1 < 1)
4289
0
            {
4290
0
            errorcode = ERR22;
4291
0
            goto FAILED;
4292
0
            }
4293
0
          }
4294
4295
        /* Check no trailing operator. */
4296
76
        if (class_op_state == CLASS_OP_OPERATOR)
4297
0
          {
4298
0
          errorcode = ERR110;
4299
0
          goto FAILED;
4300
0
          }
4301
4302
        /* Check no empty expression for Perl extended expressions. */
4303
76
        if (class_mode_state == CLASS_MODE_PERL_EXT &&
4304
0
            class_op_state == CLASS_OP_EMPTY)
4305
0
          {
4306
0
          errorcode = ERR114;
4307
0
          goto FAILED;
4308
0
          }
4309
4310
        /* -] at the end of a class is a literal '-' */
4311
76
        if (class_range_state == RANGE_STARTED)
4312
0
          parsed_pattern[-1] = CHAR_MINUS;
4313
4314
76
        *parsed_pattern++ = META_CLASS_END;
4315
4316
76
        if (--class_depth_m1 < 0)
4317
76
          {
4318
          /* Check for and consume ')' after '(?[...]'. */
4319
76
          PCRE2_ASSERT(class_mode_state != CLASS_MODE_PERL_EXT_LEAF);
4320
76
          if (class_mode_state == CLASS_MODE_PERL_EXT)
4321
0
            {
4322
0
            if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4323
0
              {
4324
0
              errorcode = ERR115;
4325
0
              goto FAILED;
4326
0
              }
4327
4328
0
            ptr++;
4329
0
            }
4330
4331
76
          break;
4332
76
          }
4333
4334
0
        class_range_state = RANGE_NO; /* for processing the containing class */
4335
0
        class_op_state = CLASS_OP_OPERAND;
4336
0
        if (class_mode_state == CLASS_MODE_PERL_EXT_LEAF)
4337
0
          class_mode_state = CLASS_MODE_PERL_EXT;
4338
        /* The extended class flag has already
4339
        been set for the parent class. */
4340
0
        class_start = NULL;
4341
0
        }
4342
4343
      /* Handle a Perl set binary operator */
4344
4345
216
      else if (class_mode_state == CLASS_MODE_PERL_EXT &&
4346
0
               (c == CHAR_PLUS || c == CHAR_VERTICAL_LINE || c == CHAR_MINUS ||
4347
0
                c == CHAR_AMPERSAND || c == CHAR_CIRCUMFLEX_ACCENT))
4348
0
        {
4349
        /* Check that there was a preceding operand. */
4350
0
        if (class_op_state != CLASS_OP_OPERAND)
4351
0
          {
4352
0
          errorcode = ERR109;
4353
0
          goto FAILED;
4354
0
          }
4355
4356
0
        if (class_start != NULL)
4357
0
          {
4358
0
          PCRE2_ASSERT(class_depth_m1 >= 0);
4359
          /* Represents that the class is an extended class. */
4360
0
          *class_start |= CLASS_IS_ECLASS;
4361
0
          class_start = NULL;
4362
0
          }
4363
4364
0
        PCRE2_ASSERT(class_range_state != RANGE_STARTED &&
4365
0
                     class_range_state != RANGE_FORBID_STARTED);
4366
4367
0
        *parsed_pattern++ = c == CHAR_PLUS? META_ECLASS_OR :
4368
0
                            c == CHAR_VERTICAL_LINE? META_ECLASS_OR :
4369
0
                            c == CHAR_MINUS? META_ECLASS_SUB :
4370
0
                            c == CHAR_AMPERSAND? META_ECLASS_AND :
4371
0
                            META_ECLASS_XOR;
4372
0
        class_range_state = RANGE_NO;
4373
0
        class_op_state = CLASS_OP_OPERATOR;
4374
0
        }
4375
4376
      /* Handle a Perl set unary operator */
4377
4378
216
      else if (class_mode_state == CLASS_MODE_PERL_EXT &&
4379
0
               c == CHAR_EXCLAMATION_MARK)
4380
0
        {
4381
        /* Check that the "!" has not got a preceding operand (i.e. it's the
4382
        start of the class, or follows an operator). */
4383
0
        if (class_op_state == CLASS_OP_OPERAND)
4384
0
          {
4385
0
          errorcode = ERR113;
4386
0
          goto FAILED;
4387
0
          }
4388
4389
0
        if (class_start != NULL)
4390
0
          {
4391
0
          PCRE2_ASSERT(class_depth_m1 >= 0);
4392
          /* Represents that the class is an extended class. */
4393
0
          *class_start |= CLASS_IS_ECLASS;
4394
0
          class_start = NULL;
4395
0
          }
4396
4397
0
        PCRE2_ASSERT(class_range_state != RANGE_STARTED &&
4398
0
                     class_range_state != RANGE_FORBID_STARTED);
4399
4400
0
        *parsed_pattern++ = META_ECLASS_NOT;
4401
0
        class_range_state = RANGE_NO;
4402
0
        class_op_state = CLASS_OP_OPERATOR;
4403
0
        }
4404
4405
      /* Handle a UTS#18 set operator */
4406
4407
216
      else if (class_mode_state == CLASS_MODE_ALT_EXT &&
4408
0
               (c == CHAR_VERTICAL_LINE || c == CHAR_MINUS ||
4409
0
                c == CHAR_AMPERSAND || c == CHAR_TILDE) &&
4410
0
               ptr < ptrend && *ptr == c)
4411
0
        {
4412
0
        ++ptr;
4413
4414
        /* Check there isn't a triple-repetition. */
4415
0
        if (ptr < ptrend && *ptr == c)
4416
0
          {
4417
0
          while (ptr < ptrend && *ptr == c) ++ptr;  /* Improve error offset. */
4418
0
          errorcode = ERR108;
4419
0
          goto FAILED;
4420
0
          }
4421
4422
        /* Check for a preceding operand. */
4423
0
        if (class_op_state != CLASS_OP_OPERAND)
4424
0
          {
4425
0
          errorcode = ERR109;
4426
0
          goto FAILED;
4427
0
          }
4428
4429
        /* Check for mixed precedence. Forbid [A--B&&C]. */
4430
0
        if (cb->class_op_used[class_depth_m1] != 0 &&
4431
0
            cb->class_op_used[class_depth_m1] != (uint8_t)c)
4432
0
          {
4433
0
          errorcode = ERR111;
4434
0
          goto FAILED;
4435
0
          }
4436
4437
0
        if (class_start != NULL)
4438
0
          {
4439
0
          PCRE2_ASSERT(class_depth_m1 >= 0);
4440
          /* Represents that the class is an extended class. */
4441
0
          *class_start |= CLASS_IS_ECLASS;
4442
0
          class_start = NULL;
4443
0
          }
4444
4445
        /* Dangling '-' before an operator is a literal */
4446
0
        if (class_range_state == RANGE_STARTED)
4447
0
          parsed_pattern[-1] = CHAR_MINUS;
4448
4449
0
        *parsed_pattern++ = c == CHAR_VERTICAL_LINE? META_ECLASS_OR :
4450
0
                            c == CHAR_MINUS? META_ECLASS_SUB :
4451
0
                            c == CHAR_AMPERSAND? META_ECLASS_AND :
4452
0
                            META_ECLASS_XOR;
4453
0
        class_range_state = RANGE_NO;
4454
0
        class_op_state = CLASS_OP_OPERATOR;
4455
0
        cb->class_op_used[class_depth_m1] = (uint8_t)c;
4456
0
        }
4457
4458
      /* Handle escapes in a class */
4459
4460
216
      else if (c == CHAR_BACKSLASH)
4461
28
        {
4462
28
        tempptr = ptr;
4463
28
        escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
4464
28
          xoptions, cb->bracount, TRUE, cb);
4465
4466
28
        if (errorcode != 0)
4467
0
          {
4468
0
          if ((xoptions & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0 ||
4469
0
              class_mode_state >= CLASS_MODE_PERL_EXT)
4470
0
            goto FAILED;
4471
0
          ptr = tempptr;
4472
0
          if (ptr >= ptrend) c = CHAR_BACKSLASH; else
4473
0
            {
4474
0
            GETCHARINCTEST(c, ptr);   /* Get character value, increment pointer */
4475
0
            }
4476
0
          escape = 0;                 /* Treat as literal character */
4477
0
          }
4478
4479
28
        switch(escape)
4480
28
          {
4481
24
          case 0:  /* Escaped character code point is in c */
4482
24
          char_is_literal = FALSE;
4483
24
          goto CLASS_LITERAL;      /* (in the literal-character branch below) */
4484
4485
0
          case ESC_b:
4486
0
          c = CHAR_BS;    /* \b is backspace in a class */
4487
0
          char_is_literal = FALSE;
4488
0
          goto CLASS_LITERAL;
4489
4490
0
          case ESC_k:
4491
0
          c = CHAR_k;     /* \k is not special in a class, just like \g */
4492
0
          char_is_literal = FALSE;
4493
0
          goto CLASS_LITERAL;
4494
4495
0
          case ESC_Q:
4496
0
          inescq = TRUE;  /* Enter literal mode */
4497
0
          goto CLASS_CONTINUE;
4498
4499
0
          case ESC_E:     /* Ignore orphan \E */
4500
0
          goto CLASS_CONTINUE;
4501
4502
0
          case ESC_B:     /* Always an error in a class */
4503
0
          case ESC_R:
4504
0
          case ESC_X:
4505
0
          errorcode = ERR7;
4506
0
          goto FAILED;
4507
4508
0
          case ESC_N:     /* Not permitted by Perl either */
4509
0
          errorcode = ERR71;
4510
0
          goto FAILED;
4511
4512
0
          case ESC_H:
4513
0
          case ESC_h:
4514
0
          case ESC_V:
4515
0
          case ESC_v:
4516
0
          *parsed_pattern++ = META_ESCAPE + escape;
4517
0
          break;
4518
4519
          /* These escapes may be converted to Unicode property tests when
4520
          PCRE2_UCP is set. */
4521
4522
0
          case ESC_d:
4523
0
          case ESC_D:
4524
4
          case ESC_s:
4525
4
          case ESC_S:
4526
4
          case ESC_w:
4527
4
          case ESC_W:
4528
4
          parsed_pattern = handle_escdsw(escape, parsed_pattern, options,
4529
4
            xoptions);
4530
4
          break;
4531
4532
          /* Explicit Unicode property matching */
4533
4534
0
          case ESC_P:
4535
0
          case ESC_p:
4536
0
#ifdef SUPPORT_UNICODE
4537
0
            {
4538
0
            BOOL negated;
4539
0
            uint16_t ptype = 0, pdata = 0;
4540
0
            if (!get_ucp(&ptr, utf, &negated, &ptype, &pdata, &errorcode, cb))
4541
0
              goto FAILED;
4542
4543
            /* In caseless matching, particular characteristics Lu, Ll, and Lt
4544
            get converted to the general characteristic L&. That is, upper,
4545
            lower, and title case letters are all conflated. */
4546
4547
0
            if ((options & PCRE2_CASELESS) != 0 && ptype == PT_PC &&
4548
0
                (pdata == ucp_Lu || pdata == ucp_Ll || pdata == ucp_Lt))
4549
0
              {
4550
0
              ptype = PT_LAMP;
4551
0
              pdata = 0;
4552
0
              }
4553
4554
0
            if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
4555
0
            *parsed_pattern++ = META_ESCAPE + escape;
4556
0
            *parsed_pattern++ = (ptype << 16) | pdata;
4557
0
            }
4558
#else
4559
          errorcode = ERR45;
4560
          goto FAILED;
4561
#endif
4562
0
          break;  /* End \P and \p */
4563
4564
          /* All others are not allowed in a class */
4565
4566
          /* LCOV_EXCL_START */
4567
0
          default:
4568
0
          PCRE2_DEBUG_UNREACHABLE();
4569
0
          PCRE2_FALLTHROUGH /* Fall through */
4570
          /* LCOV_EXCL_STOP */
4571
4572
0
          case ESC_A:
4573
0
          case ESC_Z:
4574
0
          case ESC_z:
4575
0
          case ESC_G:
4576
0
          case ESC_K:
4577
0
          case ESC_C:
4578
0
          errorcode = ERR7;
4579
0
          goto FAILED;
4580
28
          }
4581
4582
        /* All the switch-cases above which end in "break" describe a set
4583
        of characters. None may start a range. */
4584
4585
        /* The second part of a range can be a single-character escape
4586
        sequence (detected above), but not any of the other escapes. Perl
4587
        treats a hyphen as a literal in such circumstances. However, in Perl's
4588
        warning mode, a warning is given, so PCRE now faults it, as it is
4589
        almost certainly a mistake on the user's part. */
4590
4591
4
        if (class_range_state == RANGE_STARTED)
4592
0
          {
4593
0
          errorcode = ERR50;
4594
0
          goto FAILED;
4595
0
          }
4596
4597
        /* Perl gives a warning unless the hyphen following a multi-character
4598
        escape is the last character in the class. PCRE throws an error. */
4599
4600
4
        if (class_range_state == RANGE_FORBID_STARTED)
4601
0
          {
4602
0
          ptr = class_range_forbid_ptr;
4603
0
          errorcode = ERR50;
4604
0
          goto FAILED;
4605
0
          }
4606
4607
        /* Disallow implicit union in Perl extended classes. */
4608
4609
4
        if (class_op_state == CLASS_OP_OPERAND &&
4610
0
            class_mode_state == CLASS_MODE_PERL_EXT)
4611
0
          {
4612
0
          errorcode = ERR113;
4613
0
          goto FAILED;
4614
0
          }
4615
4616
4
        class_range_state = RANGE_FORBID_NO;
4617
4
        class_op_state = CLASS_OP_OPERAND;
4618
4
        }
4619
4620
      /* Forbid unescaped literals, and the special meaning of '-', inside a
4621
      Perl extended class. */
4622
4623
188
      else if (class_mode_state == CLASS_MODE_PERL_EXT)
4624
0
        {
4625
0
        errorcode = ERR116;
4626
0
        goto FAILED;
4627
0
        }
4628
4629
      /* Handle potential start of range */
4630
4631
188
      else if (c == CHAR_MINUS && class_range_state >= RANGE_OK_ESCAPED)
4632
56
        {
4633
56
        *parsed_pattern++ = (class_range_state == RANGE_OK_LITERAL)?
4634
56
          META_RANGE_LITERAL : META_RANGE_ESCAPED;
4635
56
        class_range_state = RANGE_STARTED;
4636
56
        }
4637
4638
      /* Handle forbidden start of range */
4639
4640
132
      else if (c == CHAR_MINUS && class_range_state == RANGE_FORBID_NO)
4641
0
        {
4642
0
        *parsed_pattern++ = CHAR_MINUS;
4643
0
        class_range_state = RANGE_FORBID_STARTED;
4644
0
        class_range_forbid_ptr = ptr;
4645
0
        }
4646
4647
      /* Handle a literal character */
4648
4649
132
      else
4650
132
        {
4651
156
        CLASS_LITERAL:
4652
4653
        /* Disallow implicit union in Perl extended classes. */
4654
4655
156
        if (class_op_state == CLASS_OP_OPERAND &&
4656
84
            class_mode_state == CLASS_MODE_PERL_EXT)
4657
0
          {
4658
0
          errorcode = ERR113;
4659
0
          goto FAILED;
4660
0
          }
4661
4662
156
        if (class_range_state == RANGE_STARTED)
4663
56
          {
4664
56
          if (c == parsed_pattern[-2])       /* Optimize one-char range */
4665
0
            parsed_pattern--;
4666
56
          else if (parsed_pattern[-2] > c)   /* Check range is in order */
4667
0
            {
4668
0
            errorcode = ERR8;
4669
0
            goto FAILED;
4670
0
            }
4671
56
          else
4672
56
            {
4673
56
            if (!char_is_literal && parsed_pattern[-1] == META_RANGE_LITERAL)
4674
0
              parsed_pattern[-1] = META_RANGE_ESCAPED;
4675
56
            PARSED_LITERAL(c, parsed_pattern);
4676
56
            }
4677
56
          class_range_state = RANGE_NO;
4678
56
          class_op_state = CLASS_OP_OPERAND;
4679
56
          }
4680
100
        else if (class_range_state == RANGE_FORBID_STARTED)
4681
0
          {
4682
0
          ptr = class_range_forbid_ptr;
4683
0
          errorcode = ERR50;
4684
0
          goto FAILED;
4685
0
          }
4686
100
        else  /* Potential start of range */
4687
100
          {
4688
100
          class_range_state = char_is_literal?
4689
76
            RANGE_OK_LITERAL : RANGE_OK_ESCAPED;
4690
100
          class_op_state = CLASS_OP_OPERAND;
4691
100
          PARSED_LITERAL(c, parsed_pattern);
4692
100
          }
4693
156
        }
4694
4695
      /* Proceed to next thing in the class. */
4696
4697
216
      CLASS_CONTINUE:
4698
216
      if (ptr >= ptrend)
4699
0
        {
4700
0
        if (class_mode_state == CLASS_MODE_PERL_EXT && class_depth_m1 > 0)
4701
0
          errorcode = ERR14;   /* Missing terminating ')' */
4702
0
        if (class_mode_state == CLASS_MODE_ALT_EXT &&
4703
0
            class_depth_m1 == 0 && class_maxdepth_m1 == 1)
4704
0
          errorcode = ERR112;  /* Missing terminating ']', but we saw '[ [ ]...' */
4705
0
        else
4706
0
          errorcode = ERR6;    /* Missing terminating ']' */
4707
0
        goto FAILED;
4708
0
        }
4709
216
      GETCHARINCTEST(c, ptr);
4710
216
      }     /* End of class-processing loop */
4711
4712
76
    break;  /* End of character class */
4713
4714
4715
    /* ---- Opening parenthesis ---- */
4716
4717
76
    case CHAR_LEFT_PARENTHESIS:
4718
8
    if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4719
4720
    /* If ( is not followed by ? it is either a capture or a special verb or an
4721
    alpha assertion or a positive non-atomic lookahead. */
4722
4723
8
    if (*ptr != CHAR_QUESTION_MARK)
4724
8
      {
4725
8
      const char *vn;
4726
4727
      /* Handle capturing brackets (or non-capturing if auto-capture is turned
4728
      off). */
4729
4730
8
      if (*ptr != CHAR_ASTERISK)
4731
8
        {
4732
8
        nest_depth++;
4733
8
        if ((options & PCRE2_NO_AUTO_CAPTURE) == 0)
4734
8
          {
4735
8
          if (cb->bracount >= MAX_GROUP_NUMBER)
4736
0
            {
4737
0
            errorcode = ERR97;
4738
0
            goto FAILED;
4739
0
            }
4740
8
          cb->bracount++;
4741
8
          *parsed_pattern++ = META_CAPTURE | cb->bracount;
4742
8
          }
4743
0
        else *parsed_pattern++ = META_NOCAPTURE;
4744
8
        }
4745
4746
      /* Do nothing for (* followed by end of pattern or ) so it gives a "bad
4747
      quantifier" error rather than "(*MARK) must have an argument". */
4748
4749
0
      else if (ptrend - ptr <= 1 || (c = ptr[1]) == CHAR_RIGHT_PARENTHESIS)
4750
0
        break;
4751
4752
      /* Handle "alpha assertions" such as (*pla:...). Most of these are
4753
      synonyms for the historical symbolic assertions, but the script run and
4754
      non-atomic lookaround ones are new. They are distinguished by starting
4755
      with a lower case letter. Checking both ends of the alphabet makes this
4756
      work in all character codes. */
4757
4758
0
      else if (CHMAX_255(c) && (cb->ctypes[c] & ctype_lcletter) != 0)
4759
0
        {
4760
0
        uint32_t meta;
4761
4762
0
        vn = alasnames;
4763
0
        if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
4764
0
          &errorcode, cb)) goto FAILED;
4765
0
        if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4766
0
        if (*ptr != CHAR_COLON)
4767
0
          {
4768
0
          errorcode = ERR95;  /* Malformed */
4769
0
          goto FAILED_FORWARD;
4770
0
          }
4771
4772
        /* Scan the table of alpha assertion names */
4773
4774
0
        for (i = 0; i < alascount; i++)
4775
0
          {
4776
0
          if (namelen == alasmeta[i].len &&
4777
0
              PRIV(strncmp_c8)(name, vn, namelen) == 0)
4778
0
            break;
4779
0
          vn += alasmeta[i].len + 1;
4780
0
          }
4781
4782
0
        if (i >= alascount)
4783
0
          {
4784
0
          errorcode = ERR95;  /* Alpha assertion not recognized */
4785
0
          goto FAILED;
4786
0
          }
4787
4788
        /* Check for expecting an assertion condition. If so, only atomic
4789
        lookaround assertions are valid. */
4790
4791
0
        meta = alasmeta[i].meta;
4792
0
        if (prev_expect_cond_assert > 0 &&
4793
0
            (meta < META_LOOKAHEAD || meta > META_LOOKBEHINDNOT))
4794
0
          {
4795
0
          errorcode = ERR28;  /* Atomic assertion expected */
4796
0
          goto FAILED;
4797
0
          }
4798
4799
        /* The lookaround alphabetic synonyms can mostly be handled by jumping
4800
        to the code that handles the traditional symbolic forms. */
4801
4802
0
        switch(meta)
4803
0
          {
4804
          /* LCOV_EXCL_START */
4805
0
          default:
4806
0
          PCRE2_DEBUG_UNREACHABLE();
4807
0
          errorcode = ERR89;  /* Unknown code; should never occur because */
4808
0
          goto FAILED;        /* the meta values come from a table above. */
4809
          /* LCOV_EXCL_STOP */
4810
4811
0
          case META_ATOMIC:
4812
0
          goto ATOMIC_GROUP;
4813
4814
0
          case META_LOOKAHEAD:
4815
0
          goto POSITIVE_LOOK_AHEAD;
4816
4817
0
          case META_LOOKAHEAD_NA:
4818
0
          goto POSITIVE_NONATOMIC_LOOK_AHEAD;
4819
4820
0
          case META_LOOKAHEADNOT:
4821
0
          goto NEGATIVE_LOOK_AHEAD;
4822
4823
0
          case META_SCS:
4824
0
          ptr++;
4825
0
          *parsed_pattern++ = META_SCS;
4826
4827
0
          parsed_pattern = parse_capture_list(&ptr, ptrend, utf, parsed_pattern,
4828
0
                                              0, &errorcode, cb);
4829
0
          if (parsed_pattern == NULL) goto FAILED;
4830
0
          goto POST_ASSERTION;
4831
4832
0
          case META_LOOKBEHIND:
4833
0
          case META_LOOKBEHINDNOT:
4834
0
          case META_LOOKBEHIND_NA:
4835
0
          *parsed_pattern++ = meta;
4836
0
          ptr--;
4837
0
          goto POST_LOOKBEHIND;
4838
4839
          /* The script run facilities are handled here. Unicode support is
4840
          required (give an error if not, as this is a security issue). Always
4841
          record a META_SCRIPT_RUN item. Then, for the atomic version, insert
4842
          META_ATOMIC and remember that we need two META_KETs at the end. */
4843
4844
0
          case META_SCRIPT_RUN:
4845
0
          case META_ATOMIC_SCRIPT_RUN:
4846
0
#ifdef SUPPORT_UNICODE
4847
0
          *parsed_pattern++ = META_SCRIPT_RUN;
4848
0
          nest_depth++;
4849
0
          ptr++;
4850
0
          if (meta == META_ATOMIC_SCRIPT_RUN)
4851
0
            {
4852
0
            *parsed_pattern++ = META_ATOMIC;
4853
0
            if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4854
0
            else if (++top_nest >= end_nests)
4855
0
              {
4856
0
              errorcode = ERR84;
4857
0
              goto FAILED;
4858
0
              }
4859
0
            top_nest->nest_depth = nest_depth;
4860
0
            top_nest->flags = NSF_ATOMICSR;
4861
0
            top_nest->options = options & PARSE_TRACKED_OPTIONS;
4862
0
            top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;
4863
4864
#ifdef PCRE2_DEBUG
4865
            /* We'll write out two META_KETs for a single ")" in the input
4866
            pattern, so we reserve space for that in our bounds check. */
4867
            parsed_pattern_extra++;
4868
#endif
4869
0
            }
4870
0
          break;
4871
#else  /* SUPPORT_UNICODE */
4872
          errorcode = ERR96;
4873
          goto FAILED;
4874
#endif
4875
0
          }
4876
0
        }
4877
4878
4879
      /* ---- Handle (*VERB) and (*VERB:NAME) ---- */
4880
4881
0
      else
4882
0
        {
4883
0
        vn = verbnames;
4884
0
        if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
4885
0
          &errorcode, cb)) goto FAILED;
4886
0
        if (ptr >= ptrend || (*ptr != CHAR_COLON &&
4887
0
                              *ptr != CHAR_RIGHT_PARENTHESIS))
4888
0
          {
4889
0
          errorcode = ERR60;  /* Malformed */
4890
0
          goto FAILED;
4891
0
          }
4892
4893
        /* Scan the table of verb names */
4894
4895
0
        for (i = 0; i < verbcount; i++)
4896
0
          {
4897
0
          if (namelen == verbs[i].len &&
4898
0
              PRIV(strncmp_c8)(name, vn, namelen) == 0)
4899
0
            break;
4900
0
          vn += verbs[i].len + 1;
4901
0
          }
4902
4903
0
        if (i >= verbcount)
4904
0
          {
4905
0
          errorcode = ERR60;  /* Verb not recognized */
4906
0
          goto FAILED;
4907
0
          }
4908
4909
        /* An empty argument is treated as no argument. */
4910
4911
0
        if (*ptr == CHAR_COLON && ptr + 1 < ptrend &&
4912
0
             ptr[1] == CHAR_RIGHT_PARENTHESIS)
4913
0
          ptr++;    /* Advance to the closing parens */
4914
4915
        /* Check for mandatory non-empty argument; this is (*MARK) */
4916
4917
0
        if (verbs[i].has_arg > 0 && *ptr != CHAR_COLON)
4918
0
          {
4919
0
          errorcode = ERR66;
4920
0
          goto FAILED;
4921
0
          }
4922
4923
        /* Remember where this verb, possibly with a preceding (*MARK), starts,
4924
        for handling quantified (*ACCEPT). */
4925
4926
0
        verbstartptr = parsed_pattern;
4927
0
        okquantifier = (verbs[i].meta == META_ACCEPT);
4928
#ifdef PCRE2_DEBUG
4929
        /* Reserve space in our bounds check for optionally wrapping the (*ACCEPT)
4930
        with a non-capturing bracket, if there is a following quantifier. */
4931
        if (okquantifier) parsed_pattern_extra += 2;
4932
#endif
4933
4934
        /* It appears that Perl allows any characters whatsoever, other than a
4935
        closing parenthesis, to appear in arguments ("names"), so we no longer
4936
        insist on letters, digits, and underscores. Perl does not, however, do
4937
        any interpretation within arguments, and has no means of including a
4938
        closing parenthesis. PCRE supports escape processing but only when it
4939
        is requested by an option. We set inverbname TRUE here, and let the
4940
        main loop take care of this so that escape and \x processing is done by
4941
        the main code above. */
4942
4943
0
        if (*ptr++ == CHAR_COLON)   /* Skip past : or ) */
4944
0
          {
4945
          /* Some optional arguments can be treated as a preceding (*MARK) */
4946
4947
0
          if (verbs[i].has_arg < 0)
4948
0
            {
4949
0
            add_after_mark = verbs[i].meta;
4950
0
            *parsed_pattern++ = META_MARK;
4951
0
            }
4952
4953
          /* The remaining verbs with arguments (except *MARK) need a different
4954
          opcode. */
4955
4956
0
          else
4957
0
            {
4958
0
            *parsed_pattern++ = verbs[i].meta +
4959
0
              ((verbs[i].meta != META_MARK)? 0x00010000u:0);
4960
0
            }
4961
4962
          /* Set up for reading the name in the main loop. */
4963
4964
0
          verblengthptr = parsed_pattern++;
4965
0
          verbnamestart = ptr;
4966
0
          inverbname = TRUE;
4967
0
          }
4968
0
        else  /* No verb "name" argument */
4969
0
          {
4970
0
          *parsed_pattern++ = verbs[i].meta;
4971
0
          }
4972
0
        }     /* End of (*VERB) handling */
4973
8
      break;  /* Done with this parenthesis */
4974
8
      }       /* End of groups that don't start with (? */
4975
4976
4977
    /* ---- Items starting (? ---- */
4978
4979
    /* The type of item is determined by what follows (?. Handle (?| and option
4980
    changes under "default" because both need a new block on the nest stack.
4981
    Comments starting with (?# are handled above. Note that there is some
4982
    ambiguity about the sequence (?- because if a digit follows it's a relative
4983
    recursion or subroutine call whereas otherwise it's an option unsetting. */
4984
4985
0
    if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4986
4987
0
    switch(*ptr)
4988
0
      {
4989
0
      default:
4990
0
      if (*ptr == CHAR_MINUS && ptrend - ptr > 1 && IS_DIGIT(ptr[1]))
4991
0
        goto RECURSION_BYNUMBER;  /* The + case is handled by CHAR_PLUS */
4992
4993
      /* We now have either (?| or a (possibly empty) option setting,
4994
      optionally followed by a non-capturing group. */
4995
4996
0
      nest_depth++;
4997
0
      if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4998
0
      else if (++top_nest >= end_nests)
4999
0
        {
5000
0
        errorcode = ERR84;
5001
0
        goto FAILED;
5002
0
        }
5003
0
      top_nest->nest_depth = nest_depth;
5004
0
      top_nest->flags = 0;
5005
0
      top_nest->options = options & PARSE_TRACKED_OPTIONS;
5006
0
      top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;
5007
5008
      /* Start of non-capturing group that resets the capture count for each
5009
      branch. */
5010
5011
0
      if (*ptr == CHAR_VERTICAL_LINE)
5012
0
        {
5013
0
        top_nest->reset_group = (uint16_t)cb->bracount;
5014
0
        top_nest->max_group = (uint16_t)cb->bracount;
5015
0
        top_nest->flags |= NSF_RESET;
5016
0
        cb->external_flags |= PCRE2_DUPCAPUSED;
5017
0
        *parsed_pattern++ = META_NOCAPTURE;
5018
0
        ptr++;
5019
0
        }
5020
5021
      /* Scan for options aimnrsxJU (and aD, aP, aS, aT, aW) to be set or unset. */
5022
5023
0
      else
5024
0
        {
5025
0
        BOOL hyphenok = TRUE;
5026
0
        uint32_t oldoptions = options;
5027
0
        uint32_t oldxoptions = xoptions;
5028
5029
0
        top_nest->reset_group = 0;
5030
0
        top_nest->max_group = 0;
5031
0
        set = unset = 0;
5032
0
        optset = &set;
5033
0
        xset = xunset = 0;
5034
0
        xoptset = &xset;
5035
5036
        /* ^ at the start unsets irmnsx and disables the subsequent use of - */
5037
5038
0
        if (ptr < ptrend && *ptr == CHAR_CIRCUMFLEX_ACCENT)
5039
0
          {
5040
0
          options &= ~(PCRE2_CASELESS|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE|
5041
0
                       PCRE2_DOTALL|PCRE2_EXTENDED|PCRE2_EXTENDED_MORE);
5042
0
          xoptions &= ~(PCRE2_EXTRA_CASELESS_RESTRICT);
5043
0
          hyphenok = FALSE;
5044
0
          ptr++;
5045
0
          }
5046
5047
0
        while (ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS &&
5048
0
                               *ptr != CHAR_COLON)
5049
0
          {
5050
0
          switch (*ptr++)
5051
0
            {
5052
0
            case CHAR_MINUS:
5053
0
            if (!hyphenok)
5054
0
              {
5055
0
              errorcode = ERR94;
5056
0
              goto FAILED;
5057
0
              }
5058
0
            optset = &unset;
5059
0
            xoptset = &xunset;
5060
0
            hyphenok = FALSE;
5061
0
            break;
5062
5063
            /* There are some two-character sequences that start with 'a'. */
5064
5065
0
            case CHAR_a:
5066
0
            if (ptr < ptrend)
5067
0
              {
5068
0
              if (*ptr == CHAR_D)
5069
0
                {
5070
0
                *xoptset |= PCRE2_EXTRA_ASCII_BSD;
5071
0
                ptr++;
5072
0
                break;
5073
0
                }
5074
0
              if (*ptr == CHAR_P)
5075
0
                {
5076
0
                *xoptset |= (PCRE2_EXTRA_ASCII_POSIX|PCRE2_EXTRA_ASCII_DIGIT);
5077
0
                ptr++;
5078
0
                break;
5079
0
                }
5080
0
              if (*ptr == CHAR_S)
5081
0
                {
5082
0
                *xoptset |= PCRE2_EXTRA_ASCII_BSS;
5083
0
                ptr++;
5084
0
                break;
5085
0
                }
5086
0
              if (*ptr == CHAR_T)
5087
0
                {
5088
0
                *xoptset |= PCRE2_EXTRA_ASCII_DIGIT;
5089
0
                ptr++;
5090
0
                break;
5091
0
                }
5092
0
              if (*ptr == CHAR_W)
5093
0
                {
5094
0
                *xoptset |= PCRE2_EXTRA_ASCII_BSW;
5095
0
                ptr++;
5096
0
                break;
5097
0
                }
5098
0
              }
5099
0
            *xoptset |= PCRE2_EXTRA_ASCII_BSD|PCRE2_EXTRA_ASCII_BSS|
5100
0
                        PCRE2_EXTRA_ASCII_BSW|
5101
0
                        PCRE2_EXTRA_ASCII_DIGIT|PCRE2_EXTRA_ASCII_POSIX;
5102
0
            break;
5103
5104
0
            case CHAR_J:  /* Record that it changed in the external options */
5105
0
            *optset |= PCRE2_DUPNAMES;
5106
0
            cb->external_flags |= PCRE2_JCHANGED;
5107
0
            break;
5108
5109
0
            case CHAR_i: *optset |= PCRE2_CASELESS; break;
5110
0
            case CHAR_m: *optset |= PCRE2_MULTILINE; break;
5111
0
            case CHAR_n: *optset |= PCRE2_NO_AUTO_CAPTURE; break;
5112
0
            case CHAR_r: *xoptset|= PCRE2_EXTRA_CASELESS_RESTRICT; break;
5113
0
            case CHAR_s: *optset |= PCRE2_DOTALL; break;
5114
0
            case CHAR_U: *optset |= PCRE2_UNGREEDY; break;
5115
5116
            /* If x appears twice it sets the extended extended option. */
5117
5118
0
            case CHAR_x:
5119
0
            *optset |= PCRE2_EXTENDED;
5120
0
            if (ptr < ptrend && *ptr == CHAR_x)
5121
0
              {
5122
0
              *optset |= PCRE2_EXTENDED_MORE;
5123
0
              ptr++;
5124
0
              }
5125
0
            break;
5126
5127
0
            default:
5128
0
            errorcode = ERR11;
5129
0
            goto FAILED;
5130
0
            }
5131
0
          }
5132
5133
        /* If we are setting extended without extended-more, ensure that any
5134
        existing extended-more gets unset. Also, unsetting extended must also
5135
        unset extended-more. */
5136
5137
0
        if ((set & (PCRE2_EXTENDED|PCRE2_EXTENDED_MORE)) == PCRE2_EXTENDED ||
5138
0
            (unset & PCRE2_EXTENDED) != 0)
5139
0
          unset |= PCRE2_EXTENDED_MORE;
5140
5141
0
        options = (options | set) & (~unset);
5142
0
        xoptions = (xoptions | xset) & (~xunset);
5143
5144
        /* If the options ended with ')' this is not the start of a nested
5145
        group with option changes, so the options change at this level.
5146
        In this case, if the previous level set up a nest block, discard the
5147
        one we have just created. Otherwise adjust it for the previous level.
5148
        If the options ended with ':' we are starting a non-capturing group,
5149
        possibly with an options setting. */
5150
5151
0
        if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
5152
0
        if (*ptr++ == CHAR_RIGHT_PARENTHESIS)
5153
0
          {
5154
0
          nest_depth--;  /* This is not a nested group after all. */
5155
0
          if (top_nest > (nest_save *)(cb->start_workspace) &&
5156
0
              (top_nest-1)->nest_depth == nest_depth) top_nest--;
5157
0
          else top_nest->nest_depth = nest_depth;
5158
0
          }
5159
0
        else *parsed_pattern++ = META_NOCAPTURE;
5160
5161
        /* If nothing changed, no need to record. */
5162
5163
0
        if (options != oldoptions || xoptions != oldxoptions)
5164
0
          {
5165
0
          *parsed_pattern++ = META_OPTIONS;
5166
0
          *parsed_pattern++ = options;
5167
0
          *parsed_pattern++ = xoptions;
5168
0
          }
5169
0
        }     /* End options processing */
5170
0
      break;  /* End default case after (? */
5171
5172
5173
      /* ---- Python syntax support ---- */
5174
5175
0
      case CHAR_P:
5176
0
      if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
5177
5178
      /* (?P<name> is the same as (?<name>, which defines a named group. */
5179
5180
0
      if (*ptr == CHAR_LESS_THAN_SIGN)
5181
0
        {
5182
0
        terminator = CHAR_GREATER_THAN_SIGN;
5183
0
        goto DEFINE_NAME;
5184
0
        }
5185
5186
      /* (?P>name) is the same as (?&name), which is a recursion or subroutine
5187
      call. */
5188
5189
0
      if (*ptr == CHAR_GREATER_THAN_SIGN) goto RECURSE_BY_NAME;
5190
5191
      /* (?P=name) is the same as \k<name>, a back reference by name. Anything
5192
      else after (?P is an error. */
5193
5194
0
      if (*ptr != CHAR_EQUALS_SIGN)
5195
0
        {
5196
0
        errorcode = ERR41;
5197
0
        goto FAILED_FORWARD;
5198
0
        }
5199
0
      if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
5200
0
          &namelen, &errorcode, cb)) goto FAILED;
5201
0
      *parsed_pattern++ = META_BACKREF_BYNAME;
5202
0
      *parsed_pattern++ = namelen;
5203
0
      PUTOFFSET(offset, parsed_pattern);
5204
0
      okquantifier = TRUE;
5205
0
      break;   /* End of (?P processing */
5206
5207
5208
      /* ---- Recursion/subroutine calls by number ---- */
5209
5210
0
      case CHAR_R:
5211
0
      i = 0;         /* (?R) == (?R0) */
5212
0
      ptr++;
5213
0
      if (ptr >= ptrend || (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_LEFT_PARENTHESIS))
5214
0
        {
5215
0
        errorcode = ERR58;
5216
0
        goto FAILED;
5217
0
        }
5218
0
      terminator = CHAR_NUL;
5219
0
      goto SET_RECURSION;
5220
5221
      /* An item starting (?- followed by a digit comes here via the "default"
5222
      case because (?- followed by a non-digit is an options setting. */
5223
5224
0
      case CHAR_PLUS:
5225
0
      if (ptr + 1 >= ptrend)
5226
0
        {
5227
0
        ++ptr;
5228
0
        goto UNCLOSED_PARENTHESIS;
5229
0
        }
5230
0
      if (!IS_DIGIT(ptr[1]))
5231
0
        {
5232
0
        errorcode = ERR29;   /* Missing number */
5233
0
        ++ptr;
5234
0
        goto FAILED_FORWARD;
5235
0
        }
5236
0
      PCRE2_FALLTHROUGH /* Fall through */
5237
0
5238
0
      case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
5239
0
      case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
5240
0
      RECURSION_BYNUMBER:
5241
0
      if (!read_number(&ptr, ptrend,
5242
0
          (IS_DIGIT(*ptr))? -1:(int)(cb->bracount), /* + and - are relative */
5243
0
          MAX_GROUP_NUMBER, ERR61,
5244
0
          &i, &errorcode)) goto FAILED;
5245
0
      PCRE2_ASSERT(i >= 0);  /* NB (?0) is permitted, represented by i=0 */
5246
0
      terminator = CHAR_NUL;
5247
5248
0
      SET_RECURSION:
5249
0
      *parsed_pattern++ = META_RECURSE | (uint32_t)i;
5250
0
      offset = (PCRE2_SIZE)(ptr - cb->start_pattern);
5251
      /* End of recursive call by number handling */
5252
0
      goto READ_RECURSION_ARGUMENTS;
5253
5254
5255
      /* ---- Recursion/subroutine calls by name ---- */
5256
5257
0
      case CHAR_AMPERSAND:
5258
0
      RECURSE_BY_NAME:
5259
0
      if (!read_name(&ptr, ptrend, utf, 0, &offset, &name,
5260
0
          &namelen, &errorcode, cb)) goto FAILED;
5261
0
      *parsed_pattern++ = META_RECURSE_BYNAME;
5262
0
      *parsed_pattern++ = namelen;
5263
0
      terminator = CHAR_NUL;
5264
5265
0
      READ_RECURSION_ARGUMENTS:
5266
0
      PUTOFFSET(offset, parsed_pattern);
5267
0
      okquantifier = TRUE;
5268
5269
      /* Arguments are not supported for the \g construct. */
5270
0
      if (terminator != CHAR_NUL) break;
5271
5272
0
      if (ptr < ptrend && *ptr == CHAR_LEFT_PARENTHESIS)
5273
0
        {
5274
0
        parsed_pattern = parse_capture_list(&ptr, ptrend, utf, parsed_pattern,
5275
0
                                            offset, &errorcode, cb);
5276
0
        if (parsed_pattern == NULL) goto FAILED;
5277
0
        }
5278
5279
0
      if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
5280
0
        goto UNCLOSED_PARENTHESIS;
5281
5282
0
      ptr++;
5283
0
      break;
5284
5285
      /* ---- Callout with numerical or string argument ---- */
5286
5287
0
      case CHAR_C:
5288
0
      if ((xoptions & PCRE2_EXTRA_NEVER_CALLOUT) != 0)
5289
0
        {
5290
0
        ptr++;
5291
0
        errorcode = ERR103;
5292
0
        goto FAILED;
5293
0
        }
5294
5295
0
      if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
5296
5297
      /* If the previous item was a condition starting (?(? an assertion,
5298
      optionally preceded by a callout, is expected. This is checked later on,
5299
      during actual compilation. However we need to identify this kind of
5300
      assertion in this pass because it must not be quantified. The value of
5301
      expect_cond_assert is set to 2 after (?(? is processed. We decrement it
5302
      for a callout - still leaving a positive value that identifies the
5303
      assertion. Multiple callouts or any other items will make it zero or
5304
      less, which doesn't matter because they will cause an error later. */
5305
5306
0
      expect_cond_assert = prev_expect_cond_assert - 1;
5307
5308
      /* If previous_callout is not NULL, it means this follows a previous
5309
      callout. If it was a manual callout, do nothing; this means its "length
5310
      of next pattern item" field will remain zero. If it was an automatic
5311
      callout, abolish it. */
5312
5313
0
      if (previous_callout != NULL && (options & PCRE2_AUTO_CALLOUT) != 0 &&
5314
0
          previous_callout == parsed_pattern - 4 &&
5315
0
          parsed_pattern[-1] == 255)
5316
0
        parsed_pattern = previous_callout;
5317
5318
      /* Save for updating next pattern item length, and skip one item before
5319
      completing. */
5320
5321
0
      previous_callout = parsed_pattern;
5322
0
      after_manual_callout = 1;
5323
5324
      /* Handle a string argument; specific delimiter is required. */
5325
5326
0
      if (*ptr != CHAR_RIGHT_PARENTHESIS && !IS_DIGIT(*ptr))
5327
0
        {
5328
0
        PCRE2_SIZE calloutlength;
5329
0
        PCRE2_SPTR startptr = ptr;
5330
5331
0
        delimiter = 0;
5332
0
        for (i = 0; PRIV(callout_start_delims)[i] != 0; i++)
5333
0
          {
5334
0
          if (*ptr == PRIV(callout_start_delims)[i])
5335
0
            {
5336
0
            delimiter = PRIV(callout_end_delims)[i];
5337
0
            break;
5338
0
            }
5339
0
          }
5340
0
        if (delimiter == 0)
5341
0
          {
5342
0
          errorcode = ERR82;
5343
0
          goto FAILED_FORWARD;
5344
0
          }
5345
5346
0
        *parsed_pattern = META_CALLOUT_STRING;
5347
0
        parsed_pattern += 3;   /* Skip pattern info */
5348
5349
0
        for (;;)
5350
0
          {
5351
0
          if (++ptr >= ptrend)
5352
0
            {
5353
0
            errorcode = ERR81;
5354
0
            ptr = startptr;   /* To give a more useful message */
5355
0
            goto FAILED;
5356
0
            }
5357
0
          if (*ptr == delimiter && (++ptr >= ptrend || *ptr != delimiter))
5358
0
            break;
5359
0
          }
5360
5361
0
        calloutlength = (PCRE2_SIZE)(ptr - startptr);
5362
0
        if (calloutlength > UINT32_MAX)
5363
0
          {
5364
0
          errorcode = ERR72;
5365
0
          goto FAILED;
5366
0
          }
5367
0
        *parsed_pattern++ = (uint32_t)calloutlength;
5368
0
        offset = (PCRE2_SIZE)(startptr - cb->start_pattern);
5369
0
        PUTOFFSET(offset, parsed_pattern);
5370
0
        }
5371
5372
      /* Handle a callout with an optional numerical argument, which must be
5373
      less than or equal to 255. A missing argument gives 0. */
5374
5375
0
      else
5376
0
        {
5377
0
        int n = 0;
5378
0
        *parsed_pattern = META_CALLOUT_NUMBER;     /* Numerical callout */
5379
0
        parsed_pattern += 3;                       /* Skip pattern info */
5380
0
        while (ptr < ptrend && IS_DIGIT(*ptr))
5381
0
          {
5382
0
          n = n * 10 + (*ptr++ - CHAR_0);
5383
0
          if (n > 255)
5384
0
            {
5385
0
            errorcode = ERR38;
5386
0
            goto FAILED;
5387
0
            }
5388
0
          }
5389
0
        *parsed_pattern++ = n;
5390
0
        }
5391
5392
      /* Both formats must have a closing parenthesis */
5393
5394
0
      if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
5395
0
        {
5396
0
        errorcode = ERR39;
5397
0
        goto FAILED;
5398
0
        }
5399
0
      ptr++;
5400
5401
      /* Remember the offset to the next item in the pattern, and set a default
5402
      length. This should get updated after the next item is read. */
5403
5404
0
      previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
5405
0
      previous_callout[2] = 0;
5406
0
      break;                  /* End callout */
5407
5408
5409
      /* ---- Conditional group ---- */
5410
5411
      /* A condition can be an assertion, a number (referring to a numbered
5412
      group's having been set), a name (referring to a named group), or 'R',
5413
      referring to overall recursion. R<digits> and R&name are also permitted
5414
      for recursion state tests. Numbers may be preceded by + or - to specify a
5415
      relative group number.
5416
5417
      There are several syntaxes for testing a named group: (?(name)) is used
5418
      by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
5419
5420
      There are two unfortunate ambiguities. 'R' can be the recursive thing or
5421
      the name 'R' (and similarly for 'R' followed by digits). 'DEFINE' can be
5422
      the Perl DEFINE feature or the Python named test. We look for a name
5423
      first; if not found, we try the other case.
5424
5425
      For compatibility with auto-callouts, we allow a callout to be specified
5426
      before a condition that is an assertion. */
5427
5428
0
      case CHAR_LEFT_PARENTHESIS:
5429
0
      if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
5430
0
      nest_depth++;
5431
5432
      /* If the next character is ? or * there must be an assertion next
5433
      (optionally preceded by a callout). We do not check this here, but
5434
      instead we set expect_cond_assert to 2. If this is still greater than
5435
      zero (callouts decrement it) when the next assertion is read, it will be
5436
      marked as a condition that must not be repeated. A value greater than
5437
      zero also causes checking that an assertion (possibly with callout)
5438
      follows. */
5439
5440
0
      if (*ptr == CHAR_QUESTION_MARK || *ptr == CHAR_ASTERISK)
5441
0
        {
5442
0
        *parsed_pattern++ = META_COND_ASSERT;
5443
0
        ptr--;   /* Pull pointer back to the opening parenthesis. */
5444
0
        expect_cond_assert = 2;
5445
0
        break;  /* End of conditional */
5446
0
        }
5447
5448
      /* Handle (?([+-]number)... */
5449
5450
0
      if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
5451
0
          &errorcode))
5452
0
        {
5453
0
        PCRE2_ASSERT(i >= 0);
5454
0
        if (i <= 0)
5455
0
          {
5456
0
          errorcode = ERR15;
5457
0
          goto FAILED;
5458
0
          }
5459
0
        *parsed_pattern++ = META_COND_NUMBER;
5460
0
        offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
5461
0
        PUTOFFSET(offset, parsed_pattern);
5462
0
        *parsed_pattern++ = i;
5463
0
        }
5464
0
      else if (errorcode != 0) goto FAILED;   /* Number too big */
5465
5466
      /* No number found. Handle the special case (?(VERSION[>]=n.m)... */
5467
5468
0
      else if (ptrend - ptr >= 10 &&
5469
0
               PRIV(strncmp_c8)(ptr, STRING_VERSION, 7) == 0 &&
5470
0
               ptr[7] != CHAR_RIGHT_PARENTHESIS)
5471
0
        {
5472
0
        uint32_t ge = 0;
5473
0
        int major = 0;
5474
0
        int minor = 0;
5475
5476
0
        ptr += 7;
5477
0
        if (*ptr == CHAR_GREATER_THAN_SIGN)
5478
0
          {
5479
0
          ge = 1;
5480
0
          ptr++;
5481
0
          }
5482
5483
        /* NOTE: cannot write IS_DIGIT(*(++ptr)) here because IS_DIGIT
5484
        references its argument twice. */
5485
5486
0
        if (*ptr != CHAR_EQUALS_SIGN || (ptr++, !IS_DIGIT(*ptr)))
5487
0
          {
5488
0
          errorcode = ERR79;
5489
0
          if (!ge) goto FAILED_FORWARD;
5490
0
          goto FAILED;
5491
0
          }
5492
5493
0
        if (!read_number(&ptr, ptrend, -1, 1000, ERR79, &major, &errorcode))
5494
0
          goto FAILED;
5495
5496
0
        if (ptr < ptrend && *ptr == CHAR_DOT)
5497
0
          {
5498
0
          if (++ptr >= ptrend || !IS_DIGIT(*ptr))
5499
0
            {
5500
0
            errorcode = ERR79;
5501
0
            if (ptr < ptrend) goto FAILED_FORWARD;
5502
0
            goto FAILED;
5503
0
            }
5504
0
          if (!read_number(&ptr, ptrend, -1, 1000, ERR79, &minor, &errorcode))
5505
0
            goto FAILED;
5506
0
          }
5507
0
        if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
5508
0
          {
5509
0
          errorcode = ERR79;
5510
0
          if (ptr < ptrend) goto FAILED_FORWARD;
5511
0
          goto FAILED;
5512
0
          }
5513
5514
0
        *parsed_pattern++ = META_COND_VERSION;
5515
0
        *parsed_pattern++ = ge;
5516
0
        *parsed_pattern++ = major;
5517
0
        *parsed_pattern++ = minor;
5518
0
        }
5519
5520
      /* All the remaining cases now require us to read a name. We cannot at
5521
      this stage distinguish ambiguous cases such as (?(R12) which might be a
5522
      recursion test by number or a name, because the named groups have not yet
5523
      all been identified. Those cases are treated as names, but given a
5524
      different META code. */
5525
5526
0
      else
5527
0
        {
5528
0
        BOOL was_r_ampersand = FALSE;
5529
5530
0
        if (*ptr == CHAR_R && ptrend - ptr > 1 && ptr[1] == CHAR_AMPERSAND)
5531
0
          {
5532
0
          terminator = CHAR_RIGHT_PARENTHESIS;
5533
0
          was_r_ampersand = TRUE;
5534
0
          ptr++;
5535
0
          }
5536
0
        else if (*ptr == CHAR_LESS_THAN_SIGN)
5537
0
          terminator = CHAR_GREATER_THAN_SIGN;
5538
0
        else if (*ptr == CHAR_APOSTROPHE)
5539
0
          terminator = CHAR_APOSTROPHE;
5540
0
        else
5541
0
          {
5542
0
          terminator = CHAR_RIGHT_PARENTHESIS;
5543
0
          ptr--;   /* Point to char before name */
5544
0
          }
5545
5546
0
        if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
5547
0
            &errorcode, cb)) goto FAILED;
5548
5549
        /* Handle (?(R&name) */
5550
5551
0
        if (was_r_ampersand)
5552
0
          {
5553
0
          *parsed_pattern = META_COND_RNAME;
5554
0
          ptr--;   /* Back to closing parens */
5555
0
          }
5556
5557
        /* Handle (?(name). If the name is "DEFINE" we identify it with a
5558
        special code. Likewise if the name consists of R followed only by
5559
        digits. Otherwise, handle it like a quoted name. */
5560
5561
0
        else if (terminator == CHAR_RIGHT_PARENTHESIS)
5562
0
          {
5563
0
          if (namelen == 6 && PRIV(strncmp_c8)(name, STRING_DEFINE, 6) == 0)
5564
0
            *parsed_pattern = META_COND_DEFINE;
5565
0
          else
5566
0
            {
5567
0
            for (i = 1; i < (int)namelen; i++)
5568
0
              if (!IS_DIGIT(name[i])) break;
5569
0
            *parsed_pattern = (*name == CHAR_R && i >= (int)namelen)?
5570
0
              META_COND_RNUMBER : META_COND_NAME;
5571
0
            }
5572
0
          ptr--;   /* Back to closing parens */
5573
0
          }
5574
5575
        /* Handle (?('name') or (?(<name>) */
5576
5577
0
        else *parsed_pattern = META_COND_NAME;
5578
5579
        /* All these cases except DEFINE end with the name length and offset;
5580
        DEFINE just has an offset (for the "too many branches" error). */
5581
5582
0
        if (*parsed_pattern++ != META_COND_DEFINE) *parsed_pattern++ = namelen;
5583
0
        PUTOFFSET(offset, parsed_pattern);
5584
0
        }  /* End cases that read a name */
5585
5586
      /* Check the closing parenthesis of the condition */
5587
5588
0
      if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
5589
0
        {
5590
0
        errorcode = ERR24;
5591
0
        goto FAILED;
5592
0
        }
5593
0
      ptr++;
5594
0
      break;  /* End of condition processing */
5595
5596
5597
      /* ---- Atomic group ---- */
5598
5599
0
      case CHAR_GREATER_THAN_SIGN:
5600
0
      ATOMIC_GROUP:                          /* Come from (*atomic: */
5601
0
      *parsed_pattern++ = META_ATOMIC;
5602
0
      nest_depth++;
5603
0
      ptr++;
5604
0
      break;
5605
5606
5607
      /* ---- Lookahead assertions ---- */
5608
5609
0
      case CHAR_EQUALS_SIGN:
5610
0
      POSITIVE_LOOK_AHEAD:                   /* Come from (*pla: */
5611
0
      *parsed_pattern++ = META_LOOKAHEAD;
5612
0
      ptr++;
5613
0
      goto POST_ASSERTION;
5614
5615
0
      case CHAR_ASTERISK:
5616
0
      POSITIVE_NONATOMIC_LOOK_AHEAD:         /* Come from (*napla: */
5617
0
      *parsed_pattern++ = META_LOOKAHEAD_NA;
5618
0
      ptr++;
5619
0
      goto POST_ASSERTION;
5620
5621
0
      case CHAR_EXCLAMATION_MARK:
5622
0
      NEGATIVE_LOOK_AHEAD:                   /* Come from (*nla: */
5623
0
      *parsed_pattern++ = META_LOOKAHEADNOT;
5624
0
      ptr++;
5625
0
      goto POST_ASSERTION;
5626
5627
5628
      /* ---- Lookbehind assertions ---- */
5629
5630
      /* (?< followed by = or ! or * is a lookbehind assertion. Otherwise (?<
5631
      is the start of the name of a capturing group. */
5632
5633
0
      case CHAR_LESS_THAN_SIGN:
5634
0
      if (ptrend - ptr <= 1 ||
5635
0
         (ptr[1] != CHAR_EQUALS_SIGN &&
5636
0
          ptr[1] != CHAR_EXCLAMATION_MARK &&
5637
0
          ptr[1] != CHAR_ASTERISK))
5638
0
        {
5639
0
        terminator = CHAR_GREATER_THAN_SIGN;
5640
0
        goto DEFINE_NAME;
5641
0
        }
5642
0
      *parsed_pattern++ = (ptr[1] == CHAR_EQUALS_SIGN)?
5643
0
        META_LOOKBEHIND : (ptr[1] == CHAR_EXCLAMATION_MARK)?
5644
0
        META_LOOKBEHINDNOT : META_LOOKBEHIND_NA;
5645
5646
0
      POST_LOOKBEHIND:           /* Come from (*plb: (*naplb: and (*nlb: */
5647
0
      *has_lookbehind = TRUE;
5648
0
      offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
5649
0
      PUTOFFSET(offset, parsed_pattern);
5650
0
      ptr += 2;
5651
      /* Fall through */
5652
5653
      /* If the previous item was a condition starting (?(? an assertion,
5654
      optionally preceded by a callout, is expected. This is checked later on,
5655
      during actual compilation. However we need to identify this kind of
5656
      assertion in this pass because it must not be quantified. The value of
5657
      expect_cond_assert is set to 2 after (?(? is processed. We decrement it
5658
      for a callout - still leaving a positive value that identifies the
5659
      assertion. Multiple callouts or any other items will make it zero or
5660
      less, which doesn't matter because they will cause an error later. */
5661
5662
0
      POST_ASSERTION:
5663
0
      nest_depth++;
5664
0
      if (prev_expect_cond_assert > 0)
5665
0
        {
5666
0
        if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
5667
0
        else if (++top_nest >= end_nests)
5668
0
          {
5669
0
          errorcode = ERR84;
5670
0
          goto FAILED;
5671
0
          }
5672
0
        top_nest->nest_depth = nest_depth;
5673
0
        top_nest->flags = NSF_CONDASSERT;
5674
0
        top_nest->options = options & PARSE_TRACKED_OPTIONS;
5675
0
        top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;
5676
0
        }
5677
0
      break;
5678
5679
5680
      /* ---- Define a named group ---- */
5681
5682
      /* A named group may be defined as (?'name') or (?<name>). In the latter
5683
      case we jump to DEFINE_NAME from the disambiguation of (?< above with the
5684
      terminator set to '>'. */
5685
5686
0
      case CHAR_APOSTROPHE:
5687
0
      terminator = CHAR_APOSTROPHE;    /* Terminator */
5688
5689
0
      DEFINE_NAME:
5690
0
      if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
5691
0
          &errorcode, cb)) goto FAILED;
5692
5693
      /* We have a name for this capturing group. It is also assigned a number,
5694
      which is its primary means of identification. */
5695
5696
0
      if (cb->bracount >= MAX_GROUP_NUMBER)
5697
0
        {
5698
0
        errorcode = ERR97;
5699
0
        goto FAILED;
5700
0
        }
5701
0
      cb->bracount++;
5702
0
      *parsed_pattern++ = META_CAPTURE | cb->bracount;
5703
0
      nest_depth++;
5704
5705
      /* Check not too many names */
5706
5707
0
      if (cb->names_found >= MAX_NAME_COUNT)
5708
0
        {
5709
0
        errorcode = ERR49;
5710
0
        goto FAILED;
5711
0
        }
5712
5713
      /* Adjust the entry size to accommodate the longest name found. */
5714
5715
0
      if (namelen + IMM2_SIZE + 1 > cb->name_entry_size)
5716
0
        cb->name_entry_size = (uint16_t)(namelen + IMM2_SIZE + 1);
5717
5718
      /* Scan the list to check for duplicates. For duplicate names, if the
5719
      number is the same, break the loop, which causes the name to be
5720
      discarded; otherwise, if DUPNAMES is not set, give an error.
5721
      If it is set, allow the name with a different number, but continue
5722
      scanning in case this is a duplicate with the same number. For
5723
      non-duplicate names, give an error if the number is duplicated. */
5724
5725
0
      is_dupname = FALSE;
5726
0
      hash = PRIV(compile_get_hash_from_name)(name, namelen);
5727
0
      ng = cb->named_groups;
5728
0
      for (i = 0; i < cb->names_found; i++, ng++)
5729
0
        {
5730
0
        if (namelen == ng->length && hash == NAMED_GROUP_GET_HASH(ng) &&
5731
0
            PRIV(strncmp)(name, ng->name, (PCRE2_SIZE)namelen) == 0)
5732
0
          {
5733
          /* When a bracket is referenced by the same name multiple
5734
          times, it is not considered a duplicate and is ignored. */
5735
0
          if (ng->number == cb->bracount) break;
5736
0
          if ((options & PCRE2_DUPNAMES) == 0)
5737
0
            {
5738
0
            errorcode = ERR43;
5739
0
            goto FAILED;
5740
0
            }
5741
5742
0
          ng->hash_dup |= NAMED_GROUP_IS_DUPNAME;
5743
0
          is_dupname = TRUE;                /* Mark as a duplicate */
5744
0
          cb->dupnames = TRUE;              /* Duplicate names exist */
5745
5746
          /* The entry represents a duplicate. */
5747
0
          name = ng->name;
5748
0
          namelen = 0;
5749
5750
          /* Even duplicated names may refer to the same
5751
          capture index. These references are also ignored. */
5752
0
          for (; i < cb->names_found; i++, ng++)
5753
0
            if (ng->name == name && ng->number == cb->bracount)
5754
0
              break;
5755
0
          break;
5756
0
          }
5757
0
        else if (ng->number == cb->bracount)
5758
0
          {
5759
0
          errorcode = ERR65;
5760
0
          goto FAILED;
5761
0
          }
5762
0
        }
5763
5764
      /* Ignore duplicate with same number. */
5765
0
      if (i < cb->names_found) break;
5766
5767
      /* Increase the list size if necessary */
5768
5769
0
      if (cb->names_found >= cb->named_group_list_size)
5770
0
        {
5771
0
        uint32_t newsize = cb->named_group_list_size * 2;
5772
0
        named_group *newspace =
5773
0
          cb->cx->memctl.malloc(newsize * sizeof(named_group),
5774
0
          cb->cx->memctl.memory_data);
5775
0
        if (newspace == NULL)
5776
0
          {
5777
0
          errorcode = ERR21;
5778
0
          goto FAILED;
5779
0
          }
5780
5781
0
        memcpy(newspace, cb->named_groups,
5782
0
          cb->named_group_list_size * sizeof(named_group));
5783
0
        if (cb->named_group_list_size > NAMED_GROUP_LIST_SIZE)
5784
0
          cb->cx->memctl.free((void *)cb->named_groups,
5785
0
          cb->cx->memctl.memory_data);
5786
0
        cb->named_groups = newspace;
5787
0
        cb->named_group_list_size = newsize;
5788
0
        }
5789
5790
      /* Add this name to the list */
5791
0
      if (is_dupname)
5792
0
        hash |= NAMED_GROUP_IS_DUPNAME;
5793
5794
0
      cb->named_groups[cb->names_found].name = name;
5795
0
      cb->named_groups[cb->names_found].length = (uint16_t)namelen;
5796
0
      cb->named_groups[cb->names_found].number = cb->bracount;
5797
0
      cb->named_groups[cb->names_found].hash_dup = hash;
5798
0
      cb->names_found++;
5799
0
      break;
5800
5801
5802
      /* ---- Perl extended character class ---- */
5803
5804
      /* These are of the form '(?[...])'. We handle these via the same parser
5805
      that consumes ordinary '[...]' classes, but with a flag set to activate
5806
      the extended behaviour. */
5807
5808
0
      case CHAR_LEFT_SQUARE_BRACKET:
5809
0
      class_mode_state = CLASS_MODE_PERL_EXT;
5810
0
      c = *ptr++;
5811
0
      goto FROM_PERL_EXTENDED_CLASS;
5812
0
      }        /* End of (? switch */
5813
0
    break;     /* End of ( handling */
5814
5815
5816
    /* ---- Branch terminators ---- */
5817
5818
    /* Alternation: reset the capture count if we are in a (?| group. */
5819
5820
0
    case CHAR_VERTICAL_LINE:
5821
0
    if (top_nest != NULL && top_nest->nest_depth == nest_depth &&
5822
0
        (top_nest->flags & NSF_RESET) != 0)
5823
0
      {
5824
0
      if (cb->bracount > top_nest->max_group)
5825
0
        top_nest->max_group = (uint16_t)cb->bracount;
5826
0
      cb->bracount = top_nest->reset_group;
5827
0
      }
5828
0
    *parsed_pattern++ = META_ALT;
5829
0
    break;
5830
5831
    /* End of group; reset the capture count to the maximum if we are in a (?|
5832
    group and/or reset the options that are tracked during parsing. Disallow
5833
    quantifier for a condition that is an assertion. */
5834
5835
8
    case CHAR_RIGHT_PARENTHESIS:
5836
8
    okquantifier = TRUE;
5837
8
    if (top_nest != NULL && top_nest->nest_depth == nest_depth)
5838
0
      {
5839
0
      options = (options & ~PARSE_TRACKED_OPTIONS) | top_nest->options;
5840
0
      xoptions = (xoptions & ~PARSE_TRACKED_EXTRA_OPTIONS) | top_nest->xoptions;
5841
0
      if ((top_nest->flags & NSF_RESET) != 0 &&
5842
0
          top_nest->max_group > cb->bracount)
5843
0
        cb->bracount = top_nest->max_group;
5844
0
      if ((top_nest->flags & NSF_CONDASSERT) != 0)
5845
0
        okquantifier = FALSE;
5846
5847
0
      if ((top_nest->flags & NSF_ATOMICSR) != 0)
5848
0
        {
5849
0
        *parsed_pattern++ = META_KET;
5850
5851
#ifdef PCRE2_DEBUG
5852
        PCRE2_ASSERT(parsed_pattern_extra > 0);
5853
        parsed_pattern_extra--;
5854
#endif
5855
0
        }
5856
5857
0
      if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL;
5858
0
        else top_nest--;
5859
0
      }
5860
8
    if (nest_depth == 0)    /* Unmatched closing parenthesis */
5861
0
      {
5862
0
      errorcode = ERR22;
5863
0
      goto FAILED;
5864
0
      }
5865
8
    nest_depth--;
5866
8
    *parsed_pattern++ = META_KET;
5867
8
    break;
5868
380
    }  /* End of switch on pattern character */
5869
380
  }    /* End of main character scan loop */
5870
5871
/* End of pattern reached. Check for missing ) at the end of a verb name. */
5872
5873
16
if (inverbname && ptr >= ptrend)
5874
0
  {
5875
0
  errorcode = ERR60;
5876
0
  goto FAILED;
5877
0
  }
5878
5879
5880
16
PARSED_END:
5881
5882
16
PCRE2_ASSERT((parsed_pattern - parsed_pattern_check) +
5883
16
             (parsed_pattern_extra - parsed_pattern_extra_check) <=
5884
16
               max_parsed_pattern(ptr_check, ptr, utf, options));
5885
5886
/* Manage callout for the final item */
5887
5888
16
parsed_pattern = manage_callouts(ptr, &previous_callout, auto_callout,
5889
16
  parsed_pattern, cb);
5890
5891
/* Insert trailing items for word and line matching (features provided for the
5892
benefit of pcre2grep). */
5893
5894
16
if ((xoptions & PCRE2_EXTRA_MATCH_LINE) != 0)
5895
0
  {
5896
0
  *parsed_pattern++ = META_KET;
5897
0
  *parsed_pattern++ = META_DOLLAR;
5898
0
  }
5899
16
else if ((xoptions & PCRE2_EXTRA_MATCH_WORD) != 0)
5900
0
  {
5901
0
  *parsed_pattern++ = META_KET;
5902
0
  *parsed_pattern++ = META_ESCAPE + ESC_b;
5903
0
  }
5904
5905
/* Terminate the parsed pattern, then return success if all groups are closed.
5906
Otherwise we have unclosed parentheses. */
5907
5908
/* LCOV_EXCL_START */
5909
16
if (parsed_pattern >= parsed_pattern_end)
5910
0
  {
5911
0
  PCRE2_DEBUG_UNREACHABLE();
5912
0
  errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
5913
0
  goto FAILED;
5914
0
  }
5915
/* LCOV_EXCL_STOP */
5916
5917
16
*parsed_pattern = META_END;
5918
16
if (nest_depth == 0) return 0;
5919
5920
0
UNCLOSED_PARENTHESIS:
5921
0
errorcode = ERR14;
5922
5923
/* Come here for all failures. */
5924
5925
0
FAILED:
5926
0
cb->erroroffset = (PCRE2_SIZE)(ptr - cb->start_pattern);
5927
0
return errorcode;
5928
5929
/* Some errors need to indicate the previous character. */
5930
5931
0
FAILED_BACK:
5932
0
ptr--;
5933
0
#ifdef SUPPORT_UNICODE
5934
0
if (utf) BACKCHAR(ptr);
5935
0
#endif
5936
0
goto FAILED;
5937
5938
/* Some errors need to indicate the next character. */
5939
5940
0
FAILED_FORWARD:
5941
0
ptr++;
5942
0
#ifdef SUPPORT_UNICODE
5943
0
if (utf) FORWARDCHARTEST(ptr, ptrend);
5944
0
#endif
5945
0
goto FAILED;
5946
0
}
5947
5948
5949
5950
/*************************************************
5951
*       Find first significant opcode            *
5952
*************************************************/
5953
5954
/* This is called by several functions that scan a compiled expression looking
5955
for a fixed first character, or an anchoring opcode etc. It skips over things
5956
that do not influence this. For some calls, it makes sense to skip negative
5957
forward and all backward assertions, and also the \b assertion; for others it
5958
does not.
5959
5960
Arguments:
5961
  code         pointer to the start of the group
5962
  skipassert   TRUE if certain assertions are to be skipped
5963
5964
Returns:       pointer to the first significant opcode
5965
*/
5966
5967
static const PCRE2_UCHAR*
5968
first_significant_code(PCRE2_SPTR code, BOOL skipassert)
5969
24
{
5970
24
for (;;)
5971
24
  {
5972
24
  switch ((int)*code)
5973
24
    {
5974
0
    case OP_ASSERT_NOT:
5975
0
    case OP_ASSERTBACK:
5976
0
    case OP_ASSERTBACK_NOT:
5977
0
    case OP_ASSERTBACK_NA:
5978
0
    if (!skipassert) return code;
5979
0
    do code += GET(code, 1); while (*code == OP_ALT);
5980
0
    code += PRIV(OP_lengths)[*code];
5981
0
    break;
5982
5983
0
    case OP_WORD_BOUNDARY:
5984
0
    case OP_NOT_WORD_BOUNDARY:
5985
0
    case OP_UCP_WORD_BOUNDARY:
5986
0
    case OP_NOT_UCP_WORD_BOUNDARY:
5987
0
    if (!skipassert) return code;
5988
0
    PCRE2_FALLTHROUGH /* Fall through */
5989
0
5990
0
    case OP_CALLOUT:
5991
0
    case OP_CREF:
5992
0
    case OP_DNCREF:
5993
0
    case OP_RREF:
5994
0
    case OP_DNRREF:
5995
0
    case OP_FALSE:
5996
0
    case OP_TRUE:
5997
0
    code += PRIV(OP_lengths)[*code];
5998
0
    break;
5999
6000
0
    case OP_CALLOUT_STR:
6001
0
    code += GET(code, 1 + 2*LINK_SIZE);
6002
0
    break;
6003
6004
0
    case OP_SKIPZERO:
6005
0
    code += 2 + GET(code, 2) + LINK_SIZE;
6006
0
    break;
6007
6008
0
    case OP_COND:
6009
0
    case OP_SCOND:
6010
0
    if (code[1+LINK_SIZE] != OP_FALSE ||   /* Not DEFINE */
6011
0
        code[GET(code, 1)] != OP_KET)      /* More than one branch */
6012
0
      return code;
6013
0
    code += GET(code, 1) + 1 + LINK_SIZE;
6014
0
    break;
6015
6016
0
    case OP_MARK:
6017
0
    case OP_COMMIT_ARG:
6018
0
    case OP_PRUNE_ARG:
6019
0
    case OP_SKIP_ARG:
6020
0
    case OP_THEN_ARG:
6021
0
    code += code[1] + PRIV(OP_lengths)[*code];
6022
0
    break;
6023
6024
24
    default:
6025
24
    return code;
6026
24
    }
6027
24
  }
6028
6029
/* LCOV_EXCL_START */
6030
0
PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */
6031
/* LCOV_EXCL_STOP */
6032
0
}
6033
6034
6035
6036
/*************************************************
6037
*           Compile one branch                   *
6038
*************************************************/
6039
6040
/* Scan the parsed pattern, compiling it into a vector of PCRE2_UCHAR. If
6041
the options are changed during the branch, the pointer is used to change the
6042
external options bits. This function is used during the pre-compile phase when
6043
we are trying to find out the amount of memory needed, as well as during the
6044
real compile phase. The value of lengthptr distinguishes the two phases.
6045
6046
Arguments:
6047
  optionsptr        pointer to the option bits
6048
  xoptionsptr       pointer to the extra option bits
6049
  codeptr           points to the pointer to the current code point
6050
  pptrptr           points to the current parsed pattern pointer
6051
  errorcodeptr      points to error code variable
6052
  firstcuptr        place to put the first required code unit
6053
  firstcuflagsptr   place to put the first code unit flags
6054
  reqcuptr          place to put the last required code unit
6055
  reqcuflagsptr     place to put the last required code unit flags
6056
  bcptr             points to current branch chain
6057
  open_caps         points to current capitem
6058
  cb                contains pointers to tables etc.
6059
  lengthptr         NULL during the real compile phase
6060
                    points to length accumulator during pre-compile phase
6061
6062
Returns:            0 There's been an error, *errorcodeptr is non-zero
6063
                   +1 Success, this branch must match at least one character
6064
                   -1 Success, this branch may match an empty string
6065
*/
6066
6067
static int
6068
compile_branch(uint32_t *optionsptr, uint32_t *xoptionsptr,
6069
  PCRE2_UCHAR **codeptr, uint32_t **pptrptr, int *errorcodeptr,
6070
  uint32_t *firstcuptr, uint32_t *firstcuflagsptr, uint32_t *reqcuptr,
6071
  uint32_t *reqcuflagsptr, branch_chain *bcptr, open_capitem *open_caps,
6072
  compile_block *cb, PCRE2_SIZE *lengthptr)
6073
48
{
6074
48
int bravalue = 0;
6075
48
int okreturn = -1;
6076
48
int group_return = 0;
6077
48
uint32_t repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
6078
48
uint32_t greedy_default, greedy_non_default;
6079
48
uint32_t repeat_type, op_type;
6080
48
uint32_t options = *optionsptr;               /* May change dynamically */
6081
48
uint32_t xoptions = *xoptionsptr;             /* May change dynamically */
6082
48
uint32_t firstcu, reqcu;
6083
48
uint32_t zeroreqcu, zerofirstcu;
6084
48
uint32_t *pptr = *pptrptr;
6085
48
uint32_t meta, meta_arg;
6086
48
uint32_t firstcuflags, reqcuflags;
6087
48
uint32_t zeroreqcuflags, zerofirstcuflags;
6088
48
uint32_t req_caseopt, reqvary, tempreqvary;
6089
/* Some opcodes, such as META_CAPTURE_NUMBER or META_CAPTURE_NAME,
6090
depend on the previous value of offset. */
6091
48
PCRE2_SIZE offset = 0;
6092
48
PCRE2_SIZE length_prevgroup = 0;
6093
48
PCRE2_UCHAR *code = *codeptr;
6094
48
PCRE2_UCHAR *last_code = code;
6095
48
PCRE2_UCHAR *orig_code = code;
6096
48
PCRE2_UCHAR *tempcode;
6097
48
PCRE2_UCHAR *previous = NULL;
6098
48
PCRE2_UCHAR op_previous;
6099
48
BOOL groupsetfirstcu = FALSE;
6100
48
BOOL had_accept = FALSE;
6101
48
BOOL matched_char = FALSE;
6102
48
BOOL previous_matched_char = FALSE;
6103
48
BOOL reset_caseful = FALSE;
6104
6105
/* We can fish out the UTF setting once and for all into a BOOL, but we must
6106
not do this for other options (e.g. PCRE2_EXTENDED) that may change dynamically
6107
as we process the pattern. */
6108
6109
48
#ifdef SUPPORT_UNICODE
6110
48
BOOL utf = (options & PCRE2_UTF) != 0;
6111
48
BOOL ucp = (options & PCRE2_UCP) != 0;
6112
#else  /* No Unicode support */
6113
BOOL utf = FALSE;
6114
#endif
6115
6116
/* Set up the default and non-default settings for greediness */
6117
6118
48
greedy_default = ((options & PCRE2_UNGREEDY) != 0);
6119
48
greedy_non_default = greedy_default ^ 1;
6120
6121
/* Initialize no first unit, no required unit. REQ_UNSET means "no char
6122
matching encountered yet". It gets changed to REQ_NONE if we hit something that
6123
matches a non-fixed first unit; reqcu just remains unset if we never find one.
6124
6125
When we hit a repeat whose minimum is zero, we may have to adjust these values
6126
to take the zero repeat into account. This is implemented by setting them to
6127
zerofirstcu and zeroreqcu when such a repeat is encountered. The individual
6128
item types that can be repeated set these backoff variables appropriately. */
6129
6130
48
firstcu = reqcu = zerofirstcu = zeroreqcu = 0;
6131
48
firstcuflags = reqcuflags = zerofirstcuflags = zeroreqcuflags = REQ_UNSET;
6132
6133
/* The variable req_caseopt contains either the REQ_CASELESS bit or zero,
6134
according to the current setting of the caseless flag. The REQ_CASELESS value
6135
leaves the lower 28 bits empty. It is added into the firstcu or reqcu variables
6136
to record the case status of the value. This is used only for ASCII characters.
6137
*/
6138
6139
48
req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
6140
6141
/* Switch on next META item until the end of the branch */
6142
6143
744
for (;; pptr++)
6144
792
  {
6145
792
  BOOL possessive_quantifier;
6146
792
  BOOL note_group_empty;
6147
792
  uint32_t mclength;
6148
792
  uint32_t skipunits;
6149
792
  uint32_t subreqcu, subfirstcu;
6150
792
  uint32_t groupnumber;
6151
792
  uint32_t verbarglen, verbculen;
6152
792
  uint32_t subreqcuflags, subfirstcuflags;
6153
792
  open_capitem *oc;
6154
792
  PCRE2_UCHAR mcbuffer[8];
6155
6156
  /* Get next META item in the pattern and its potential argument. */
6157
6158
792
  meta = META_CODE(*pptr);
6159
792
  meta_arg = META_DATA(*pptr);
6160
6161
  /* If we are in the pre-compile phase, accumulate the length used for the
6162
  previous cycle of this loop, unless the next item is a quantifier. */
6163
6164
792
  if (lengthptr != NULL)
6165
396
    {
6166
    /* LCOV_EXCL_START */
6167
396
    if (code >= cb->start_workspace + cb->workspace_size)
6168
0
      {
6169
0
      PCRE2_DEBUG_UNREACHABLE();
6170
0
      *errorcodeptr = ERR52;  /* Over-ran workspace - internal error */
6171
0
      cb->erroroffset = 0;
6172
0
      return 0;
6173
0
      }
6174
    /* LCOV_EXCL_STOP */
6175
6176
396
    if (code > cb->start_workspace + cb->workspace_size -
6177
396
        WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
6178
0
      {
6179
0
      *errorcodeptr = ERR86;  /* Pattern too complicated */
6180
0
      cb->erroroffset = 0;
6181
0
      return 0;
6182
0
      }
6183
6184
    /* There is at least one situation where code goes backwards: this is the
6185
    case of a zero quantifier after a class (e.g. [ab]{0}). When the quantifier
6186
    is processed, the whole class is eliminated. However, it is created first,
6187
    so we have to allow memory for it. Therefore, don't ever reduce the length
6188
    at this point. */
6189
6190
396
    if (code < last_code) code = last_code;
6191
6192
    /* If the next thing is not a quantifier, we add the length of the previous
6193
    item into the total, and reset the code pointer to the start of the
6194
    workspace. Otherwise leave the previous item available to be quantified. */
6195
6196
396
    if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
6197
276
      {
6198
276
      if (OFLOW_MAX - *lengthptr < (PCRE2_SIZE)(code - orig_code))
6199
0
        {
6200
0
        *errorcodeptr = ERR20;   /* Integer overflow */
6201
0
        cb->erroroffset = 0;
6202
0
        return 0;
6203
0
        }
6204
276
      *lengthptr += (PCRE2_SIZE)(code - orig_code);
6205
276
      if (*lengthptr > MAX_PATTERN_SIZE)
6206
0
        {
6207
0
        *errorcodeptr = ERR20;   /* Pattern is too large */
6208
0
        cb->erroroffset = 0;
6209
0
        return 0;
6210
0
        }
6211
276
      code = orig_code;
6212
276
      }
6213
6214
    /* Remember where this code item starts so we can catch the "backwards"
6215
    case above next time round. */
6216
6217
396
    last_code = code;
6218
396
    }
6219
6220
  /* Process the next parsed pattern item. If it is not a quantifier, remember
6221
  where it starts so that it can be quantified when a quantifier follows.
6222
  Checking for the legality of quantifiers happens in parse_regex(), except for
6223
  a quantifier after an assertion that is a condition. */
6224
6225
792
  if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
6226
552
    {
6227
552
    previous = code;
6228
552
    if (matched_char && !had_accept) okreturn = 1;
6229
552
    }
6230
6231
792
  previous_matched_char = matched_char;
6232
792
  matched_char = FALSE;
6233
792
  note_group_empty = FALSE;
6234
792
  skipunits = 0;         /* Default value for most subgroups */
6235
6236
792
  switch(meta)
6237
792
    {
6238
    /* ===================================================================*/
6239
    /* The branch terminates at pattern end or | or ) */
6240
6241
32
    case META_END:
6242
32
    case META_ALT:
6243
48
    case META_KET:
6244
48
    *firstcuptr = firstcu;
6245
48
    *firstcuflagsptr = firstcuflags;
6246
48
    *reqcuptr = reqcu;
6247
48
    *reqcuflagsptr = reqcuflags;
6248
48
    *codeptr = code;
6249
48
    *pptrptr = pptr;
6250
48
    return okreturn;
6251
6252
6253
    /* ===================================================================*/
6254
    /* Handle single-character metacharacters. In multiline mode, ^ disables
6255
    the setting of any following char as a first character. */
6256
6257
32
    case META_CIRCUMFLEX:
6258
32
    if ((options & PCRE2_MULTILINE) != 0)
6259
0
      {
6260
0
      if (firstcuflags == REQ_UNSET)
6261
0
        zerofirstcuflags = firstcuflags = REQ_NONE;
6262
0
      *code++ = OP_CIRCM;
6263
0
      }
6264
32
    else *code++ = OP_CIRC;
6265
32
    break;
6266
6267
0
    case META_DOLLAR:
6268
0
    *code++ = ((options & PCRE2_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
6269
0
    break;
6270
6271
    /* There can never be a first char if '.' is first, whatever happens about
6272
    repeats. The value of reqcu doesn't change either. */
6273
6274
8
    case META_DOT:
6275
8
    matched_char = TRUE;
6276
8
    if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6277
8
    zerofirstcu = firstcu;
6278
8
    zerofirstcuflags = firstcuflags;
6279
8
    zeroreqcu = reqcu;
6280
8
    zeroreqcuflags = reqcuflags;
6281
8
    *code++ = ((options & PCRE2_DOTALL) != 0)? OP_ALLANY: OP_ANY;
6282
8
    break;
6283
6284
6285
    /* ===================================================================*/
6286
    /* Empty character classes are allowed if PCRE2_ALLOW_EMPTY_CLASS is set.
6287
    Otherwise, an initial ']' is taken as a data character. When empty classes
6288
    are allowed, [] must generate an empty class - we have no dedicated opcode
6289
    to optimise the representation, but it's a rare case (the '(*FAIL)'
6290
    construct would be a clearer way for a pattern author to represent a
6291
    non-matching branch, but it does have different semantics to '[]' if both
6292
    are followed by a quantifier). The empty-negated [^] matches any character,
6293
    so is useful: generate OP_ALLANY for this. */
6294
6295
0
    case META_CLASS_EMPTY:
6296
0
    case META_CLASS_EMPTY_NOT:
6297
0
    matched_char = TRUE;
6298
0
    if (meta == META_CLASS_EMPTY_NOT) *code++ = OP_ALLANY;
6299
0
    else
6300
0
      {
6301
0
      *code++ = OP_CLASS;
6302
0
      memset(code, 0, 32);
6303
0
      code += 32 / sizeof(PCRE2_UCHAR);
6304
0
      }
6305
6306
0
    if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6307
0
    zerofirstcu = firstcu;
6308
0
    zerofirstcuflags = firstcuflags;
6309
0
    break;
6310
6311
6312
    /* ===================================================================*/
6313
    /* Non-empty character class. If the included characters are all < 256, we
6314
    build a 32-byte bitmap of the permitted characters, except in the special
6315
    case where there is only one such character. For negated classes, we build
6316
    the map as usual, then invert it at the end. However, we use a different
6317
    opcode so that data characters > 255 can be handled correctly.
6318
6319
    If the class contains characters outside the 0-255 range, a different
6320
    opcode is compiled. It may optionally have a bit map for characters < 256,
6321
    but those above are explicitly listed afterwards. A flag code unit tells
6322
    whether the bitmap is present, and whether this is a negated class or
6323
    not. */
6324
6325
0
    case META_CLASS_NOT:
6326
152
    case META_CLASS:
6327
152
    matched_char = TRUE;
6328
6329
    /* Check for complex extended classes and handle them separately. */
6330
6331
152
    if ((*pptr & CLASS_IS_ECLASS) != 0)
6332
0
      {
6333
0
      if (!PRIV(compile_class_nested)(options, xoptions, &pptr, &code,
6334
0
                                      errorcodeptr, cb, lengthptr))
6335
0
        return 0;
6336
0
      goto CLASS_END_PROCESSING;
6337
0
      }
6338
6339
    /* We can optimize the case of a single character in a class by generating
6340
    OP_CHAR or OP_CHARI if it's positive, or OP_NOT or OP_NOTI if it's
6341
    negative. In the negative case there can be no first char if this item is
6342
    first, whatever repeat count may follow. In the case of reqcu, save the
6343
    previous value for reinstating. */
6344
6345
    /* NOTE: at present this optimization is not effective if the only
6346
    character in a class in 32-bit, non-UCP mode has its top bit set. */
6347
6348
152
    if (pptr[1] < META_END && pptr[2] == META_CLASS_END)
6349
8
      {
6350
8
      uint32_t c = pptr[1];
6351
6352
8
      pptr += 2;                 /* Move on to class end */
6353
8
      if (meta == META_CLASS)    /* A positive one-char class can be */
6354
8
        {                        /* handled as a normal literal character. */
6355
8
        meta = c;                /* Set up the character */
6356
8
        goto NORMAL_CHAR_SET;
6357
8
        }
6358
6359
      /* Handle a negative one-character class */
6360
6361
0
      zeroreqcu = reqcu;
6362
0
      zeroreqcuflags = reqcuflags;
6363
0
      if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6364
0
      zerofirstcu = firstcu;
6365
0
      zerofirstcuflags = firstcuflags;
6366
6367
      /* For caseless UTF or UCP mode, check whether this character has more
6368
      than one other case. If so, generate a special OP_NOTPROP item instead of
6369
      OP_NOTI. When restricted by PCRE2_EXTRA_CASELESS_RESTRICT, ignore any
6370
      caseless set that starts with an ASCII character. If the character is
6371
      affected by the special Turkish rules, hardcode the not-matching
6372
      characters using a caseset. */
6373
6374
0
#ifdef SUPPORT_UNICODE
6375
0
      if ((utf||ucp) && (options & PCRE2_CASELESS) != 0)
6376
0
        {
6377
0
        uint32_t caseset;
6378
6379
0
        if ((xoptions & (PCRE2_EXTRA_TURKISH_CASING|PCRE2_EXTRA_CASELESS_RESTRICT)) ==
6380
0
              PCRE2_EXTRA_TURKISH_CASING &&
6381
0
            UCD_ANY_I(c))
6382
0
          {
6383
0
          caseset = PRIV(ucd_turkish_dotted_i_caseset) + (UCD_DOTTED_I(c)? 0 : 3);
6384
0
          }
6385
0
        else if ((caseset = UCD_CASESET(c)) != 0 &&
6386
0
                 (xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0 &&
6387
0
                 PRIV(ucd_caseless_sets)[caseset] < 128)
6388
0
          {
6389
0
          caseset = 0;  /* Ignore the caseless set if it's restricted. */
6390
0
          }
6391
6392
0
        if (caseset != 0)
6393
0
          {
6394
0
          *code++ = OP_NOTPROP;
6395
0
          *code++ = PT_CLIST;
6396
0
          *code++ = caseset;
6397
0
          break;   /* We are finished with this class */
6398
0
          }
6399
0
        }
6400
0
#endif
6401
      /* Char has only one other (usable) case, or UCP not available */
6402
6403
0
      *code++ = ((options & PCRE2_CASELESS) != 0)? OP_NOTI: OP_NOT;
6404
0
      code += PUTCHAR(c, code);
6405
0
      break;   /* We are finished with this class */
6406
0
      }        /* End of 1-char optimization */
6407
6408
    /* Handle character classes that contain more than just one literal
6409
    character. If there are exactly two characters in a positive class, see if
6410
    they are case partners. This can be optimized to generate a caseless single
6411
    character match (which also sets first/required code units if relevant).
6412
    When casing restrictions apply, ignore a caseless set if both characters
6413
    are ASCII. When Turkish casing applies, an 'i' does not match its normal
6414
    Unicode "othercase". */
6415
6416
144
    if (meta == META_CLASS && pptr[1] < META_END && pptr[2] < META_END &&
6417
32
        pptr[3] == META_CLASS_END)
6418
16
      {
6419
16
      uint32_t c = pptr[1];
6420
6421
16
#ifdef SUPPORT_UNICODE
6422
16
      if ((UCD_CASESET(c) == 0 ||
6423
0
           ((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0 &&
6424
0
            c < 128 && pptr[2] < 128)) &&
6425
16
          !((xoptions & (PCRE2_EXTRA_TURKISH_CASING|PCRE2_EXTRA_CASELESS_RESTRICT)) ==
6426
16
              PCRE2_EXTRA_TURKISH_CASING &&
6427
0
            UCD_ANY_I(c)))
6428
16
#endif
6429
16
        {
6430
16
        uint32_t d;
6431
6432
16
#ifdef SUPPORT_UNICODE
6433
16
        if ((utf || ucp) && c > 127) d = UCD_OTHERCASE(c); else
6434
16
#endif
6435
16
          {
6436
#if PCRE2_CODE_UNIT_WIDTH != 8
6437
          if (c > 255) d = c; else
6438
#endif
6439
16
          d = TABLE_GET(c, cb->fcc, c);
6440
16
          }
6441
6442
16
        if (c != d && pptr[2] == d)
6443
0
          {
6444
0
          pptr += 3;                 /* Move on to class end */
6445
0
          meta = c;
6446
0
          if ((options & PCRE2_CASELESS) == 0)
6447
0
            {
6448
0
            reset_caseful = TRUE;
6449
0
            options |= PCRE2_CASELESS;
6450
0
            req_caseopt = REQ_CASELESS;
6451
0
            }
6452
0
          goto CLASS_CASELESS_CHAR;
6453
0
          }
6454
16
        }
6455
16
      }
6456
6457
    /* Now emit the OP_CLASS/OP_NCLASS/OP_XCLASS/OP_ALLANY opcode. */
6458
6459
144
    pptr = PRIV(compile_class_not_nested)(options, xoptions, pptr + 1,
6460
144
                                          &code, meta == META_CLASS_NOT, NULL,
6461
144
                                          errorcodeptr, cb, lengthptr);
6462
144
    if (pptr == NULL) return 0;
6463
144
    PCRE2_ASSERT(*pptr == META_CLASS_END);
6464
6465
144
    CLASS_END_PROCESSING:
6466
6467
    /* If this class is the first thing in the branch, there can be no first
6468
    char setting, whatever the repeat count. Any reqcu setting must remain
6469
    unchanged after any kind of repeat. */
6470
6471
144
    if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6472
144
    zerofirstcu = firstcu;
6473
144
    zerofirstcuflags = firstcuflags;
6474
144
    zeroreqcu = reqcu;
6475
144
    zeroreqcuflags = reqcuflags;
6476
144
    break;  /* End of class processing */
6477
6478
6479
    /* ===================================================================*/
6480
    /* Deal with (*VERB)s. */
6481
6482
    /* Check for open captures before ACCEPT and close those that are within
6483
    the same assertion level, also converting ACCEPT to ASSERT_ACCEPT in an
6484
    assertion. In the first pass, just accumulate the length required;
6485
    otherwise hitting (*ACCEPT) inside many nested parentheses can cause
6486
    workspace overflow. Do not set firstcu after *ACCEPT. */
6487
6488
0
    case META_ACCEPT:
6489
0
    cb->had_accept = had_accept = TRUE;
6490
0
    for (oc = open_caps;
6491
0
         oc != NULL && oc->assert_depth >= cb->assert_depth;
6492
0
         oc = oc->next)
6493
0
      {
6494
0
      if (lengthptr != NULL)
6495
0
        {
6496
0
        *lengthptr += CU2BYTES(1) + IMM2_SIZE;
6497
0
        }
6498
0
      else
6499
0
        {
6500
0
        *code++ = OP_CLOSE;
6501
0
        PUT2INC(code, 0, oc->number);
6502
0
        }
6503
0
      }
6504
0
    *code++ = (cb->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
6505
0
    if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6506
0
    break;
6507
6508
0
    case META_PRUNE:
6509
0
    case META_SKIP:
6510
0
    cb->had_pruneorskip = TRUE;
6511
0
    PCRE2_FALLTHROUGH /* Fall through */
6512
0
    case META_COMMIT:
6513
0
    case META_FAIL:
6514
0
    *code++ = verbops[(meta - META_MARK) >> 16];
6515
0
    break;
6516
6517
0
    case META_THEN:
6518
0
    cb->external_flags |= PCRE2_HASTHEN;
6519
0
    *code++ = OP_THEN;
6520
0
    break;
6521
6522
    /* Handle verbs with arguments. Arguments can be very long, especially in
6523
    16- and 32-bit modes, and can overflow the workspace in the first pass.
6524
    However, the argument length is constrained to be small enough to fit in
6525
    one code unit. This check happens in parse_regex(). In the first pass,
6526
    instead of putting the argument into memory, we just update the length
6527
    counter and set up an empty argument. */
6528
6529
0
    case META_THEN_ARG:
6530
0
    cb->external_flags |= PCRE2_HASTHEN;
6531
0
    goto VERB_ARG;
6532
6533
0
    case META_PRUNE_ARG:
6534
0
    case META_SKIP_ARG:
6535
0
    cb->had_pruneorskip = TRUE;
6536
0
    PCRE2_FALLTHROUGH /* Fall through */
6537
0
    case META_MARK:
6538
0
    case META_COMMIT_ARG:
6539
0
    VERB_ARG:
6540
0
    *code++ = verbops[(meta - META_MARK) >> 16];
6541
    /* The length is in characters. */
6542
0
    verbarglen = *(++pptr);
6543
0
    verbculen = 0;
6544
0
    tempcode = code++;
6545
0
    for (int i = 0; i < (int)verbarglen; i++)
6546
0
      {
6547
0
      meta = *(++pptr);
6548
0
#ifdef SUPPORT_UNICODE
6549
0
      if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
6550
0
#endif
6551
0
        {
6552
0
        mclength = 1;
6553
0
        mcbuffer[0] = meta;
6554
0
        }
6555
0
      if (lengthptr != NULL) *lengthptr += mclength; else
6556
0
        {
6557
0
        memcpy(code, mcbuffer, CU2BYTES(mclength));
6558
0
        code += mclength;
6559
0
        verbculen += mclength;
6560
0
        }
6561
0
      }
6562
6563
0
    *tempcode = verbculen;   /* Fill in the code unit length */
6564
0
    *code++ = 0;             /* Terminating zero */
6565
0
    break;
6566
6567
6568
    /* ===================================================================*/
6569
    /* Handle options change. The new setting must be passed back for use in
6570
    subsequent branches. Reset the greedy defaults and the case value for
6571
    firstcu and reqcu. */
6572
6573
0
    case META_OPTIONS:
6574
0
    *optionsptr = options = *(++pptr);
6575
0
    *xoptionsptr = xoptions = *(++pptr);
6576
0
    greedy_default = ((options & PCRE2_UNGREEDY) != 0);
6577
0
    greedy_non_default = greedy_default ^ 1;
6578
0
    req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
6579
0
    break;
6580
6581
    /* ===================================================================*/
6582
    /* Handle scan substring. Scan substring assertion starts with META_SCS,
6583
    which recursively calls compile_branch. The first opcode processed by
6584
    this recursive call is always META_OFFSET. */
6585
6586
0
    case META_OFFSET:
6587
0
    if (lengthptr != NULL)
6588
0
      {
6589
0
      pptr = PRIV(compile_parse_scan_substr_args)(pptr, errorcodeptr, cb, lengthptr);
6590
0
      if (pptr == NULL)
6591
0
        return 0;
6592
0
      break;
6593
0
      }
6594
6595
0
    while (TRUE)
6596
0
      {
6597
0
      int count, index;
6598
0
      named_group *ng;
6599
6600
0
      switch (META_CODE(*pptr))
6601
0
        {
6602
0
        case META_OFFSET:
6603
0
        pptr++;
6604
0
        SKIPOFFSET(pptr);
6605
0
        continue;
6606
6607
0
        case META_CAPTURE_NAME:
6608
0
        ng = cb->named_groups + pptr[1];
6609
0
        pptr += 2;
6610
0
        count = 0;
6611
0
        index = 0;
6612
6613
0
        if (!PRIV(compile_find_dupname_details)(ng->name, ng->length, &index,
6614
0
          &count, errorcodeptr, cb)) return 0;
6615
6616
0
        code[0] = OP_DNCREF;
6617
0
        PUT2(code, 1, index);
6618
0
        PUT2(code, 1 + IMM2_SIZE, count);
6619
0
        code += 1 + 2 * IMM2_SIZE;
6620
0
        continue;
6621
6622
0
        case META_CAPTURE_NUMBER:
6623
0
        pptr += 2;
6624
0
        if (pptr[-1] == 0) continue;
6625
6626
0
        code[0] = OP_CREF;
6627
0
        PUT2(code, 1, pptr[-1]);
6628
0
        code += 1 + IMM2_SIZE;
6629
0
        continue;
6630
6631
0
        default:
6632
0
        break;
6633
0
        }
6634
6635
0
      break;
6636
0
      }
6637
0
    --pptr;
6638
0
    break;
6639
6640
0
    case META_SCS:
6641
0
    bravalue = OP_ASSERT_SCS;
6642
0
    cb->assert_depth += 1;
6643
0
    goto GROUP_PROCESS;
6644
6645
6646
    /* ===================================================================*/
6647
    /* Handle conditional subpatterns. The case of (?(Rdigits) is ambiguous
6648
    because it could be a numerical check on recursion, or a name check on a
6649
    group's being set. The pre-pass sets up META_COND_RNUMBER as a name so that
6650
    we can handle it either way. We first try for a name; if not found, process
6651
    the number. */
6652
6653
0
    case META_COND_RNUMBER:   /* (?(Rdigits) */
6654
0
    case META_COND_NAME:      /* (?(name) or (?'name') or ?(<name>) */
6655
0
    case META_COND_RNAME:     /* (?(R&name) - test for recursion */
6656
0
    bravalue = OP_COND;
6657
6658
0
    if (lengthptr != NULL)
6659
0
      {
6660
0
      uint32_t i;
6661
0
      PCRE2_SPTR name;
6662
0
      named_group *ng;
6663
0
      uint32_t *start_pptr = pptr;
6664
0
      uint32_t length = *(++pptr);
6665
6666
0
      GETPLUSOFFSET(offset, pptr);
6667
0
      name = cb->start_pattern + offset;
6668
6669
      /* In the first pass, the names generated in the pre-pass are available,
6670
      but the main name table has not yet been created. Scan the list of names
6671
      generated in the pre-pass in order to get a number and whether or not
6672
      this name is duplicated. If it is not duplicated, we can handle it as a
6673
      numerical group. */
6674
6675
0
      ng = PRIV(compile_find_named_group)(name, length, cb);
6676
6677
0
      if (ng == NULL)
6678
0
        {
6679
        /* If the name was not found we have a bad reference, unless we are
6680
        dealing with R<digits>, which is treated as a recursion test by
6681
        number. */
6682
6683
0
        groupnumber = 0;
6684
0
        if (meta == META_COND_RNUMBER)
6685
0
          {
6686
0
          for (i = 1; i < length; i++)
6687
0
            {
6688
0
            groupnumber = groupnumber * 10 + (name[i] - CHAR_0);
6689
0
            if (groupnumber > MAX_GROUP_NUMBER)
6690
0
              {
6691
0
              *errorcodeptr = ERR61;
6692
0
              cb->erroroffset = offset + i;
6693
0
              return 0;
6694
0
              }
6695
0
            }
6696
0
          }
6697
6698
0
        if (meta != META_COND_RNUMBER || groupnumber > cb->bracount)
6699
0
          {
6700
0
          *errorcodeptr = ERR15;
6701
0
          cb->erroroffset = offset;
6702
0
          return 0;
6703
0
          }
6704
6705
        /* (?(Rdigits) is treated as a recursion test by number. A value of
6706
        zero (which is the result of both (?(R) and (?(R0)) means "any", and is
6707
        translated into RREF_ANY (which is 0xffff). */
6708
6709
0
        if (groupnumber == 0) groupnumber = RREF_ANY;
6710
0
        PCRE2_ASSERT(start_pptr[0] == META_COND_RNUMBER);
6711
0
        start_pptr[1] = groupnumber;
6712
0
        skipunits = 1+IMM2_SIZE;
6713
0
        goto GROUP_PROCESS_NOTE_EMPTY;
6714
0
        }
6715
6716
      /* From here on, we know we have a name (not a number),
6717
      so treat META_COND_RNUMBER the same as META_COND_NAME. */
6718
0
      if (meta == META_COND_RNUMBER) meta = META_COND_NAME;
6719
6720
0
      if ((ng->hash_dup & NAMED_GROUP_IS_DUPNAME) == 0)
6721
0
        {
6722
        /* Found a non-duplicated name. Since it is a global,
6723
        it is enough to update it in the pre-processing phase. */
6724
0
        if (ng->number > cb->top_backref) cb->top_backref = ng->number;
6725
6726
0
        start_pptr[0] = meta;
6727
0
        start_pptr[1] = ng->number;
6728
6729
0
        skipunits = 1 + IMM2_SIZE;
6730
0
        goto GROUP_PROCESS_NOTE_EMPTY;
6731
0
        }
6732
6733
      /* We have a duplicated name. In the compile pass we have to search the
6734
      main table in order to get the index and count values. */
6735
6736
0
      start_pptr[0] = meta | 1;
6737
0
      start_pptr[1] = (uint32_t)(ng - cb->named_groups);
6738
6739
      /* A duplicated name was found. Note that if an R<digits> name is found
6740
      (META_COND_RNUMBER), it is a reference test, not a recursion test. */
6741
0
      skipunits = 1 + 2 * IMM2_SIZE;
6742
0
      }
6743
0
    else
6744
0
      {
6745
      /* Otherwise lengthptr is NULL,
6746
      which is the second phase of compilation. */
6747
0
      int count, index;
6748
0
      named_group *ng;
6749
6750
      /* Generate code using the data
6751
      collected in the pre-processing phase. */
6752
6753
0
      if (meta == META_COND_RNUMBER)
6754
0
        {
6755
0
        code[1+LINK_SIZE] = OP_RREF;
6756
0
        PUT2(code, 2 + LINK_SIZE, pptr[1]);
6757
0
        skipunits = 1 + IMM2_SIZE;
6758
0
        pptr += 1 + SIZEOFFSET;
6759
0
        goto GROUP_PROCESS_NOTE_EMPTY;
6760
0
        }
6761
6762
0
      if (meta_arg == 0)
6763
0
        {
6764
0
        code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
6765
0
        PUT2(code, 2 + LINK_SIZE, pptr[1]);
6766
0
        skipunits = 1 + IMM2_SIZE;
6767
0
        pptr += 1 + SIZEOFFSET;
6768
0
        goto GROUP_PROCESS_NOTE_EMPTY;
6769
0
        }
6770
6771
0
      ng = cb->named_groups + pptr[1];
6772
0
      count = 0;  /* Values for first pass (avoids compiler warning) */
6773
0
      index = 0;
6774
6775
      /* The failed case is an internal error. */
6776
0
      if (!PRIV(compile_find_dupname_details)(ng->name, ng->length, &index,
6777
0
            &count, errorcodeptr, cb)) return 0;
6778
6779
      /* A duplicated name was found. Note that if an R<digits> name is found
6780
      (META_COND_RNUMBER), it is a reference test, not a recursion test. */
6781
6782
0
      code[1 + LINK_SIZE] = (meta == META_COND_RNAME)? OP_DNRREF : OP_DNCREF;
6783
6784
      /* Insert appropriate data values. */
6785
0
      PUT2(code, 2 + LINK_SIZE, index);
6786
0
      PUT2(code, 2 + LINK_SIZE + IMM2_SIZE, count);
6787
0
      skipunits = 1 + 2 * IMM2_SIZE;
6788
0
      pptr += 1 + SIZEOFFSET;
6789
0
      }
6790
6791
0
    PCRE2_ASSERT(meta != META_CAPTURE_NAME);
6792
0
    goto GROUP_PROCESS_NOTE_EMPTY;
6793
6794
    /* The DEFINE condition is always false. Its internal groups may never
6795
    be called, so matched_char must remain false, hence the jump to
6796
    GROUP_PROCESS rather than GROUP_PROCESS_NOTE_EMPTY. */
6797
6798
0
    case META_COND_DEFINE:
6799
0
    bravalue = OP_COND;
6800
0
    GETPLUSOFFSET(offset, pptr);
6801
0
    code[1+LINK_SIZE] = OP_DEFINE;
6802
0
    skipunits = 1;
6803
0
    goto GROUP_PROCESS;
6804
6805
    /* Conditional test of a group's being set. */
6806
6807
0
    case META_COND_NUMBER:
6808
0
    bravalue = OP_COND;
6809
0
    GETPLUSOFFSET(offset, pptr);
6810
6811
0
    groupnumber = *(++pptr);
6812
0
    if (groupnumber > cb->bracount)
6813
0
      {
6814
0
      *errorcodeptr = ERR15;
6815
0
      cb->erroroffset = offset;
6816
0
      return 0;
6817
0
      }
6818
0
    if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
6819
6820
    /* Point at initial ( for too many branches error */
6821
0
    offset -= 2;
6822
0
    code[1+LINK_SIZE] = OP_CREF;
6823
0
    skipunits = 1+IMM2_SIZE;
6824
0
    PUT2(code, 2+LINK_SIZE, groupnumber);
6825
0
    goto GROUP_PROCESS_NOTE_EMPTY;
6826
6827
    /* Test for the PCRE2 version. */
6828
6829
0
    case META_COND_VERSION:
6830
0
    bravalue = OP_COND;
6831
0
    if (pptr[1] > 0)
6832
0
      code[1+LINK_SIZE] = ((PCRE2_MAJOR > pptr[2]) ||
6833
0
        (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR >= pptr[3]))?
6834
0
          OP_TRUE : OP_FALSE;
6835
0
    else
6836
0
      code[1+LINK_SIZE] = (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR == pptr[3])?
6837
0
        OP_TRUE : OP_FALSE;
6838
0
    skipunits = 1;
6839
0
    pptr += 3;
6840
0
    goto GROUP_PROCESS_NOTE_EMPTY;
6841
6842
    /* The condition is an assertion, possibly preceded by a callout. */
6843
6844
0
    case META_COND_ASSERT:
6845
0
    bravalue = OP_COND;
6846
0
    goto GROUP_PROCESS_NOTE_EMPTY;
6847
6848
6849
    /* ===================================================================*/
6850
    /* Handle all kinds of nested bracketed groups. The non-capturing,
6851
    non-conditional cases are here; others come to GROUP_PROCESS via goto. */
6852
6853
0
    case META_LOOKAHEAD:
6854
0
    bravalue = OP_ASSERT;
6855
0
    cb->assert_depth += 1;
6856
0
    goto GROUP_PROCESS;
6857
6858
0
    case META_LOOKAHEAD_NA:
6859
0
    bravalue = OP_ASSERT_NA;
6860
0
    cb->assert_depth += 1;
6861
0
    goto GROUP_PROCESS;
6862
6863
    /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
6864
    thing to do, but Perl allows all assertions to be quantified, and when
6865
    they contain capturing parentheses there may be a potential use for
6866
    this feature. Not that that applies to a quantified (?!) but we allow
6867
    it for uniformity. */
6868
6869
0
    case META_LOOKAHEADNOT:
6870
0
    if (pptr[1] == META_KET &&
6871
0
         (pptr[2] < META_ASTERISK || pptr[2] > META_MINMAX_QUERY))
6872
0
      {
6873
0
      *code++ = OP_FAIL;
6874
0
      pptr++;
6875
0
      }
6876
0
    else
6877
0
      {
6878
0
      bravalue = OP_ASSERT_NOT;
6879
0
      cb->assert_depth += 1;
6880
0
      goto GROUP_PROCESS;
6881
0
      }
6882
0
    break;
6883
6884
0
    case META_LOOKBEHIND:
6885
0
    bravalue = OP_ASSERTBACK;
6886
0
    cb->assert_depth += 1;
6887
0
    goto GROUP_PROCESS;
6888
6889
0
    case META_LOOKBEHINDNOT:
6890
0
    bravalue = OP_ASSERTBACK_NOT;
6891
0
    cb->assert_depth += 1;
6892
0
    goto GROUP_PROCESS;
6893
6894
0
    case META_LOOKBEHIND_NA:
6895
0
    bravalue = OP_ASSERTBACK_NA;
6896
0
    cb->assert_depth += 1;
6897
0
    goto GROUP_PROCESS;
6898
6899
0
    case META_ATOMIC:
6900
0
    bravalue = OP_ONCE;
6901
0
    goto GROUP_PROCESS_NOTE_EMPTY;
6902
6903
0
    case META_SCRIPT_RUN:
6904
0
    bravalue = OP_SCRIPT_RUN;
6905
0
    goto GROUP_PROCESS_NOTE_EMPTY;
6906
6907
0
    case META_NOCAPTURE:
6908
0
    bravalue = OP_BRA;
6909
    /* Fall through */
6910
6911
    /* Process nested bracketed regex. The nesting depth is maintained for the
6912
    benefit of the stackguard function. The test for too deep nesting is now
6913
    done in parse_regex(). Assertion and DEFINE groups come to GROUP_PROCESS;
6914
    others come to GROUP_PROCESS_NOTE_EMPTY, to indicate that we need to take
6915
    note of whether or not they may match an empty string. */
6916
6917
16
    GROUP_PROCESS_NOTE_EMPTY:
6918
16
    note_group_empty = TRUE;
6919
6920
16
    GROUP_PROCESS:
6921
16
    cb->parens_depth += 1;
6922
16
    *code = bravalue;
6923
16
    pptr++;
6924
16
    tempcode = code;
6925
16
    tempreqvary = cb->req_varyopt;        /* Save value before group */
6926
16
    length_prevgroup = 0;                 /* Initialize for pre-compile phase */
6927
6928
16
    if ((group_return =
6929
16
         compile_regex(
6930
16
         options,                         /* The options state */
6931
16
         xoptions,                        /* The extra options state */
6932
16
         &tempcode,                       /* Where to put code (updated) */
6933
16
         &pptr,                           /* Input pointer (updated) */
6934
16
         errorcodeptr,                    /* Where to put an error message */
6935
16
         skipunits,                       /* Skip over bracket number */
6936
16
         &subfirstcu,                     /* For possible first char */
6937
16
         &subfirstcuflags,
6938
16
         &subreqcu,                       /* For possible last char */
6939
16
         &subreqcuflags,
6940
16
         bcptr,                           /* Current branch chain */
6941
16
         open_caps,                       /* Pointer to capture stack */
6942
16
         cb,                              /* Compile data block */
6943
16
         (lengthptr == NULL)? NULL :      /* Actual compile phase */
6944
16
           &length_prevgroup              /* Pre-compile phase */
6945
16
         )) == 0)
6946
0
      return 0;  /* Error */
6947
6948
16
    cb->parens_depth -= 1;
6949
6950
    /* If that was a non-conditional significant group (not an assertion, not a
6951
    DEFINE) that matches at least one character, then the current item matches
6952
    a character. Conditionals are handled below. */
6953
6954
16
    if (note_group_empty && bravalue != OP_COND && group_return > 0)
6955
16
      matched_char = TRUE;
6956
6957
    /* If we've just compiled an assertion, pop the assert depth. */
6958
6959
16
    if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERT_SCS)
6960
0
      cb->assert_depth -= 1;
6961
6962
    /* At the end of compiling, code is still pointing to the start of the
6963
    group, while tempcode has been updated to point past the end of the group.
6964
    The parsed pattern pointer (pptr) is on the closing META_KET.
6965
6966
    If this is a conditional bracket, check that there are no more than
6967
    two branches in the group, or just one if it's a DEFINE group. We do this
6968
    in the real compile phase, not in the pre-pass, where the whole group may
6969
    not be available. */
6970
6971
16
    if (bravalue == OP_COND && lengthptr == NULL)
6972
0
      {
6973
0
      PCRE2_UCHAR *tc = code;
6974
0
      int condcount = 0;
6975
6976
0
      do {
6977
0
         condcount++;
6978
0
         tc += GET(tc,1);
6979
0
         }
6980
0
      while (*tc != OP_KET);
6981
6982
      /* A DEFINE group is never obeyed inline (the "condition" is always
6983
      false). It must have only one branch. Having checked this, change the
6984
      opcode to OP_FALSE. */
6985
6986
0
      if (code[LINK_SIZE+1] == OP_DEFINE)
6987
0
        {
6988
0
        if (condcount > 1)
6989
0
          {
6990
0
          cb->erroroffset = offset;
6991
0
          *errorcodeptr = ERR54;
6992
0
          return 0;
6993
0
          }
6994
0
        code[LINK_SIZE+1] = OP_FALSE;
6995
0
        bravalue = OP_DEFINE;   /* A flag to suppress char handling below */
6996
0
        }
6997
6998
      /* A "normal" conditional group. If there is just one branch, we must not
6999
      make use of its firstcu or reqcu, because this is equivalent to an
7000
      empty second branch. Also, it may match an empty string. If there are two
7001
      branches, this item must match a character if the group must. */
7002
7003
0
      else
7004
0
        {
7005
0
        if (condcount > 2)
7006
0
          {
7007
0
          cb->erroroffset = offset;
7008
0
          *errorcodeptr = ERR27;
7009
0
          return 0;
7010
0
          }
7011
0
        if (condcount == 1) subfirstcuflags = subreqcuflags = REQ_NONE;
7012
0
          else if (group_return > 0) matched_char = TRUE;
7013
0
        }
7014
0
      }
7015
7016
    /* In the pre-compile phase, update the length by the length of the group,
7017
    less the brackets at either end. Then reduce the compiled code to just a
7018
    set of non-capturing brackets so that it doesn't use much memory if it is
7019
    duplicated by a quantifier.*/
7020
7021
16
    if (lengthptr != NULL)
7022
8
      {
7023
8
      if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
7024
0
        {
7025
0
        *errorcodeptr = ERR20;
7026
0
        return 0;
7027
0
        }
7028
8
      *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
7029
8
      code++;   /* This already contains bravalue */
7030
8
      PUTINC(code, 0, 1 + LINK_SIZE);
7031
8
      *code++ = OP_KET;
7032
8
      PUTINC(code, 0, 1 + LINK_SIZE);
7033
8
      break;    /* No need to waste time with special character handling */
7034
8
      }
7035
7036
    /* Otherwise update the main code pointer to the end of the group. */
7037
7038
8
    code = tempcode;
7039
7040
    /* For a DEFINE group, required and first character settings are not
7041
    relevant. */
7042
7043
8
    if (bravalue == OP_DEFINE) break;
7044
7045
    /* Handle updating of the required and first code units for other types of
7046
    group. Update for normal brackets of all kinds, and conditions with two
7047
    branches (see code above). If the bracket is followed by a quantifier with
7048
    zero repeat, we have to back off. Hence the definition of zeroreqcu and
7049
    zerofirstcu outside the main loop so that they can be accessed for the back
7050
    off. */
7051
7052
8
    zeroreqcu = reqcu;
7053
8
    zeroreqcuflags = reqcuflags;
7054
8
    zerofirstcu = firstcu;
7055
8
    zerofirstcuflags = firstcuflags;
7056
8
    groupsetfirstcu = FALSE;
7057
7058
8
    if (bravalue >= OP_ONCE)  /* Not an assertion */
7059
8
      {
7060
      /* If we have not yet set a firstcu in this branch, take it from the
7061
      subpattern, remembering that it was set here so that a repeat of more
7062
      than one can replicate it as reqcu if necessary. If the subpattern has
7063
      no firstcu, set "none" for the whole branch. In both cases, a zero
7064
      repeat forces firstcu to "none". */
7065
7066
8
      if (firstcuflags == REQ_UNSET && subfirstcuflags != REQ_UNSET)
7067
4
        {
7068
4
        if (subfirstcuflags < REQ_NONE)
7069
4
          {
7070
4
          firstcu = subfirstcu;
7071
4
          firstcuflags = subfirstcuflags;
7072
4
          groupsetfirstcu = TRUE;
7073
4
          }
7074
0
        else firstcuflags = REQ_NONE;
7075
4
        zerofirstcuflags = REQ_NONE;
7076
4
        }
7077
7078
      /* If firstcu was previously set, convert the subpattern's firstcu
7079
      into reqcu if there wasn't one, using the vary flag that was in
7080
      existence beforehand. */
7081
7082
4
      else if (subfirstcuflags < REQ_NONE && subreqcuflags >= REQ_NONE)
7083
4
        {
7084
4
        subreqcu = subfirstcu;
7085
4
        subreqcuflags = subfirstcuflags | tempreqvary;
7086
4
        }
7087
7088
      /* If the subpattern set a required code unit (or set a first code unit
7089
      that isn't really the first code unit - see above), set it. */
7090
7091
8
      if (subreqcuflags < REQ_NONE)
7092
8
        {
7093
8
        reqcu = subreqcu;
7094
8
        reqcuflags = subreqcuflags;
7095
8
        }
7096
8
      }
7097
7098
    /* For a forward assertion, we take the reqcu, if set, provided that the
7099
    group has also set a firstcu. This can be helpful if the pattern that
7100
    follows the assertion doesn't set a different char. For example, it's
7101
    useful for /(?=abcde).+/. We can't set firstcu for an assertion, however
7102
    because it leads to incorrect effect for patterns such as /(?=a)a.+/ when
7103
    the "real" "a" would then become a reqcu instead of a firstcu. This is
7104
    overcome by a scan at the end if there's no firstcu, looking for an
7105
    asserted first char. A similar effect for patterns like /(?=.*X)X$/ means
7106
    we must only take the reqcu when the group also set a firstcu. Otherwise,
7107
    in that example, 'X' ends up set for both. */
7108
7109
0
    else if ((bravalue == OP_ASSERT || bravalue == OP_ASSERT_NA) &&
7110
0
             subreqcuflags < REQ_NONE && subfirstcuflags < REQ_NONE)
7111
0
      {
7112
0
      reqcu = subreqcu;
7113
0
      reqcuflags = subreqcuflags;
7114
0
      }
7115
7116
8
    break;  /* End of nested group handling */
7117
7118
7119
    /* ===================================================================*/
7120
    /* Handle named backreferences and recursions. */
7121
7122
0
    case META_BACKREF_BYNAME:
7123
0
    case META_RECURSE_BYNAME:
7124
0
      {
7125
0
      int count, index;
7126
0
      PCRE2_SPTR name;
7127
0
      named_group *ng;
7128
0
      uint32_t length = *(++pptr);
7129
7130
0
      GETPLUSOFFSET(offset, pptr);
7131
0
      name = cb->start_pattern + offset;
7132
7133
      /* In the first pass, the names generated in the pre-pass are available,
7134
      but the main name table has not yet been created. Scan the list of names
7135
      generated in the pre-pass in order to get a number and whether or not
7136
      this name is duplicated. */
7137
7138
0
      ng = PRIV(compile_find_named_group)(name, length, cb);
7139
7140
0
      if (ng == NULL)
7141
0
        {
7142
        /* If the name was not found we have a bad reference. */
7143
0
        *errorcodeptr = ERR15;
7144
0
        cb->erroroffset = offset;
7145
0
        return 0;
7146
0
        }
7147
7148
0
      groupnumber = ng->number;
7149
7150
      /* For a recursion, that's all that is needed. We can now go to
7151
      the code that handles numerical recursion, applying it to the first
7152
      group with the given name. */
7153
7154
0
      if (meta == META_RECURSE_BYNAME)
7155
0
        {
7156
0
        meta_arg = groupnumber;
7157
0
        goto HANDLE_NUMERICAL_RECURSION;
7158
0
        }
7159
7160
      /* For a back reference, update the back reference map and the
7161
      maximum back reference. */
7162
7163
0
      cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
7164
0
      if (groupnumber > cb->top_backref)
7165
0
        cb->top_backref = groupnumber;
7166
7167
      /* If a back reference name is not duplicated, we can handle it as
7168
      a numerical reference. */
7169
7170
0
      if ((ng->hash_dup & NAMED_GROUP_IS_DUPNAME) == 0)
7171
0
        {
7172
0
        meta_arg = groupnumber;
7173
0
        goto HANDLE_SINGLE_REFERENCE;
7174
0
        }
7175
7176
      /* If a back reference name is duplicated, we generate a different
7177
      opcode to a numerical back reference. In the second pass we must
7178
      search for the index and count in the final name table. */
7179
7180
0
      count = 0;  /* Values for first pass (avoids compiler warning) */
7181
0
      index = 0;
7182
0
      if (lengthptr == NULL && !PRIV(compile_find_dupname_details)(name, length,
7183
0
            &index, &count, errorcodeptr, cb)) return 0;
7184
7185
0
      if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
7186
0
      *code++ = ((options & PCRE2_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
7187
0
      PUT2INC(code, 0, index);
7188
0
      PUT2INC(code, 0, count);
7189
0
      if ((options & PCRE2_CASELESS) != 0)
7190
0
        *code++ = (((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)?
7191
0
                   REFI_FLAG_CASELESS_RESTRICT : 0) |
7192
0
                  (((xoptions & PCRE2_EXTRA_TURKISH_CASING) != 0)?
7193
0
                   REFI_FLAG_TURKISH_CASING : 0);
7194
0
      }
7195
0
    break;
7196
7197
7198
    /* ===================================================================*/
7199
    /* Handle a numerical callout. */
7200
7201
0
    case META_CALLOUT_NUMBER:
7202
0
    code[0] = OP_CALLOUT;
7203
0
    PUT(code, 1, pptr[1]);               /* Offset to next pattern item */
7204
0
    PUT(code, 1 + LINK_SIZE, pptr[2]);   /* Length of next pattern item */
7205
0
    code[1 + 2*LINK_SIZE] = pptr[3];
7206
0
    pptr += 3;
7207
0
    code += PRIV(OP_lengths)[OP_CALLOUT];
7208
0
    break;
7209
7210
7211
    /* ===================================================================*/
7212
    /* Handle a callout with a string argument. In the pre-pass we just compute
7213
    the length without generating anything. The length in pptr[3] includes both
7214
    delimiters; in the actual compile only the first one is copied, but a
7215
    terminating zero is added. Any doubled delimiters within the string make
7216
    this an overestimate, but it is not worth bothering about. */
7217
7218
0
    case META_CALLOUT_STRING:
7219
0
    if (lengthptr != NULL)
7220
0
      {
7221
0
      *lengthptr += pptr[3] + (1 + 4*LINK_SIZE);
7222
0
      pptr += 3;
7223
0
      SKIPOFFSET(pptr);
7224
0
      }
7225
7226
    /* In the real compile we can copy the string. The starting delimiter is
7227
     included so that the client can discover it if they want. We also pass the
7228
     start offset to help a script language give better error messages. */
7229
7230
0
    else
7231
0
      {
7232
0
      PCRE2_SPTR pp;
7233
0
      uint32_t delimiter;
7234
0
      uint32_t length = pptr[3];
7235
0
      PCRE2_UCHAR *callout_string = code + (1 + 4*LINK_SIZE);
7236
7237
0
      code[0] = OP_CALLOUT_STR;
7238
0
      PUT(code, 1, pptr[1]);               /* Offset to next pattern item */
7239
0
      PUT(code, 1 + LINK_SIZE, pptr[2]);   /* Length of next pattern item */
7240
7241
0
      pptr += 3;
7242
0
      GETPLUSOFFSET(offset, pptr);         /* Offset to string in pattern */
7243
0
      pp = cb->start_pattern + offset;
7244
0
      delimiter = *callout_string++ = *pp++;
7245
0
      if (delimiter == CHAR_LEFT_CURLY_BRACKET)
7246
0
        delimiter = CHAR_RIGHT_CURLY_BRACKET;
7247
0
      PUT(code, 1 + 3*LINK_SIZE, (int)(offset + 1));  /* One after delimiter */
7248
7249
      /* The syntax of the pattern was checked in the parsing scan. The length
7250
      includes both delimiters, but we have passed the opening one just above,
7251
      so we reduce length before testing it. The test is for > 1 because we do
7252
      not want to copy the final delimiter. This also ensures that pp[1] is
7253
      accessible. */
7254
7255
0
      while (--length > 1)
7256
0
        {
7257
0
        if (*pp == delimiter && pp[1] == delimiter)
7258
0
          {
7259
0
          *callout_string++ = delimiter;
7260
0
          pp += 2;
7261
0
          length--;
7262
0
          }
7263
0
        else *callout_string++ = *pp++;
7264
0
        }
7265
0
      *callout_string++ = CHAR_NUL;
7266
7267
      /* Set the length of the entire item, then advance to its end. */
7268
7269
0
      PUT(code, 1 + 2*LINK_SIZE, (int)(callout_string - code));
7270
0
      code = callout_string;
7271
0
      }
7272
0
    break;
7273
7274
7275
    /* ===================================================================*/
7276
    /* Handle repetition. The different types are all sorted out in the parsing
7277
    pass. */
7278
7279
0
    case META_MINMAX_PLUS:
7280
0
    case META_MINMAX_QUERY:
7281
96
    case META_MINMAX:
7282
96
    repeat_min = *(++pptr);
7283
96
    repeat_max = *(++pptr);
7284
96
    goto REPEAT;
7285
7286
24
    case META_ASTERISK:
7287
24
    case META_ASTERISK_PLUS:
7288
24
    case META_ASTERISK_QUERY:
7289
24
    repeat_min = 0;
7290
24
    repeat_max = REPEAT_UNLIMITED;
7291
24
    goto REPEAT;
7292
7293
56
    case META_PLUS:
7294
56
    case META_PLUS_PLUS:
7295
56
    case META_PLUS_QUERY:
7296
56
    repeat_min = 1;
7297
56
    repeat_max = REPEAT_UNLIMITED;
7298
56
    goto REPEAT;
7299
7300
64
    case META_QUERY:
7301
64
    case META_QUERY_PLUS:
7302
64
    case META_QUERY_QUERY:
7303
64
    repeat_min = 0;
7304
64
    repeat_max = 1;
7305
7306
240
    REPEAT:
7307
240
    if (previous_matched_char && repeat_min > 0) matched_char = TRUE;
7308
7309
    /* Remember whether this is a variable length repeat, and default to
7310
    single-char opcodes. */
7311
7312
240
    reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
7313
7314
    /* Adjust first and required code units for a zero repeat. */
7315
7316
240
    if (repeat_min == 0)
7317
112
      {
7318
112
      firstcu = zerofirstcu;
7319
112
      firstcuflags = zerofirstcuflags;
7320
112
      reqcu = zeroreqcu;
7321
112
      reqcuflags = zeroreqcuflags;
7322
112
      }
7323
7324
    /* Note the greediness and possessiveness. */
7325
7326
240
    switch (meta)
7327
240
      {
7328
0
      case META_MINMAX_PLUS:
7329
0
      case META_ASTERISK_PLUS:
7330
0
      case META_PLUS_PLUS:
7331
0
      case META_QUERY_PLUS:
7332
0
      repeat_type = 0;                  /* Force greedy */
7333
0
      possessive_quantifier = TRUE;
7334
0
      break;
7335
7336
0
      case META_MINMAX_QUERY:
7337
0
      case META_ASTERISK_QUERY:
7338
0
      case META_PLUS_QUERY:
7339
0
      case META_QUERY_QUERY:
7340
0
      repeat_type = greedy_non_default;
7341
0
      possessive_quantifier = FALSE;
7342
0
      break;
7343
7344
240
      default:
7345
240
      repeat_type = greedy_default;
7346
240
      possessive_quantifier = FALSE;
7347
240
      break;
7348
240
      }
7349
7350
    /* Save start of previous item, in case we have to move it up in order to
7351
    insert something before it, and remember what it was. */
7352
7353
240
    PCRE2_ASSERT(previous != NULL);
7354
240
    tempcode = previous;
7355
240
    op_previous = *previous;
7356
7357
    /* Now handle repetition for the different types of item. If the repeat
7358
    minimum and the repeat maximum are both 1, we can ignore the quantifier for
7359
    non-parenthesized items, as they have only one alternative. For anything in
7360
    parentheses, we must not ignore if {1} is possessive. */
7361
7362
240
    switch (op_previous)
7363
240
      {
7364
      /* If previous was a character or negated character match, abolish the
7365
      item and generate a repeat item instead. If a char item has a minimum of
7366
      more than one, ensure that it is set in reqcu - it might not be if a
7367
      sequence such as x{3} is the first thing in a branch because the x will
7368
      have gone into firstcu instead.  */
7369
7370
88
      case OP_CHAR:
7371
88
      case OP_CHARI:
7372
88
      case OP_NOT:
7373
88
      case OP_NOTI:
7374
88
      if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7375
88
      op_type = chartypeoffset[op_previous - OP_CHAR];
7376
7377
      /* Deal with UTF characters that take up more than one code unit. */
7378
7379
88
#ifdef MAYBE_UTF_MULTI
7380
88
      if (utf && NOT_FIRSTCU(code[-1]))
7381
0
        {
7382
0
        PCRE2_UCHAR *lastchar = code - 1;
7383
0
        BACKCHAR(lastchar);
7384
0
        mclength = (uint32_t)(code - lastchar);   /* Length of UTF character */
7385
0
        memcpy(mcbuffer, lastchar, CU2BYTES(mclength));  /* Save the char */
7386
0
        }
7387
88
      else
7388
88
#endif  /* MAYBE_UTF_MULTI */
7389
7390
      /* Handle the case of a single code unit - either with no UTF support, or
7391
      with UTF disabled, or for a single-code-unit UTF character. In the latter
7392
      case, for a repeated positive match, get the caseless flag for the
7393
      required code unit from the previous character, because a class like [Aa]
7394
      sets a caseless A but by now the req_caseopt flag has been reset. */
7395
7396
88
        {
7397
88
        mcbuffer[0] = code[-1];
7398
88
        mclength = 1;
7399
88
        if (op_previous <= OP_CHARI && repeat_min > 1)
7400
0
          {
7401
0
          reqcu = mcbuffer[0];
7402
0
          reqcuflags = cb->req_varyopt;
7403
0
          if (op_previous == OP_CHARI) reqcuflags |= REQ_CASELESS;
7404
0
          }
7405
88
        }
7406
88
      goto OUTPUT_SINGLE_REPEAT;  /* Code shared with single character types */
7407
7408
      /* If previous was a character class or a back reference, we put the
7409
      repeat stuff after it, but just skip the item if the repeat was {0,0}. */
7410
7411
0
#ifdef SUPPORT_WIDE_CHARS
7412
8
      case OP_XCLASS:
7413
8
      case OP_ECLASS:
7414
8
#endif
7415
120
      case OP_CLASS:
7416
120
      case OP_NCLASS:
7417
120
      case OP_REF:
7418
120
      case OP_REFI:
7419
120
      case OP_DNREF:
7420
120
      case OP_DNREFI:
7421
7422
120
      if (repeat_max == 0)
7423
0
        {
7424
0
        code = previous;
7425
0
        goto END_REPEAT;
7426
0
        }
7427
120
      if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7428
7429
120
      if (repeat_min == 0 && repeat_max == REPEAT_UNLIMITED)
7430
8
        *code++ = OP_CRSTAR + repeat_type;
7431
112
      else if (repeat_min == 1 && repeat_max == REPEAT_UNLIMITED)
7432
40
        *code++ = OP_CRPLUS + repeat_type;
7433
72
      else if (repeat_min == 0 && repeat_max == 1)
7434
0
        *code++ = OP_CRQUERY + repeat_type;
7435
72
      else
7436
72
        {
7437
72
        *code++ = OP_CRRANGE + repeat_type;
7438
72
        PUT2INC(code, 0, repeat_min);
7439
72
        if (repeat_max == REPEAT_UNLIMITED) repeat_max = 0;  /* 2-byte encoding for max */
7440
72
        PUT2INC(code, 0, repeat_max);
7441
72
        }
7442
120
      break;
7443
7444
      /* Prior to 10.30, repeated recursions were wrapped in OP_ONCE brackets
7445
      because pcre2_match() could not handle backtracking into recursively
7446
      called groups. Now that this backtracking is available, we no longer need
7447
      to do this. However, we still need to replicate recursions as we do for
7448
      groups so as to have independent backtracking points. We can replicate
7449
      for the minimum number of repeats directly. For optional repeats we now
7450
      wrap the recursion in OP_BRA brackets and make use of the bracket
7451
      repetition. */
7452
7453
0
      case OP_RECURSE:
7454
0
      if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
7455
0
        goto END_REPEAT;
7456
7457
      /* Generate unwrapped repeats for a non-zero minimum, except when the
7458
      minimum is 1 and the maximum unlimited, because that can be handled with
7459
      OP_BRA terminated by OP_KETRMAX/MIN. When the maximum is equal to the
7460
      minimum, we just need to generate the appropriate additional copies.
7461
      Otherwise we need to generate one more, to simulate the situation when
7462
      the minimum is zero. */
7463
7464
0
      if (repeat_min > 0 && (repeat_min != 1 || repeat_max != REPEAT_UNLIMITED))
7465
0
        {
7466
0
        int replicate = repeat_min;
7467
7468
0
        if (repeat_min == repeat_max) replicate--;
7469
7470
        /* In the pre-compile phase, we don't actually do the replication. We
7471
        just adjust the length as if we had. Do some paranoid checks for
7472
        potential integer overflow. */
7473
7474
0
        if (lengthptr != NULL)
7475
0
          {
7476
0
          PCRE2_SIZE delta;
7477
0
          if (PRIV(ckd_smul)(&delta, replicate, (int)length_prevgroup) ||
7478
0
              OFLOW_MAX - *lengthptr < delta)
7479
0
            {
7480
0
            *errorcodeptr = ERR20;
7481
0
            return 0;
7482
0
            }
7483
0
          *lengthptr += delta;
7484
0
          }
7485
0
        else for (int i = 0; i < replicate; i++)
7486
0
          {
7487
0
          memcpy(code, previous, CU2BYTES(length_prevgroup));
7488
0
          previous = code;
7489
0
          code += length_prevgroup;
7490
0
          }
7491
7492
        /* If the number of repeats is fixed, we are done. Otherwise, adjust
7493
        the counts and fall through. */
7494
7495
0
        if (repeat_min == repeat_max) break;
7496
0
        if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7497
0
        repeat_min = 0;
7498
0
        }
7499
7500
      /* Wrap the recursion call in OP_BRA brackets. */
7501
0
        {
7502
0
        PCRE2_SIZE length = (lengthptr != NULL) ? 1 + LINK_SIZE : length_prevgroup;
7503
7504
0
        (void)memmove(previous + 1 + LINK_SIZE, previous, CU2BYTES(length));
7505
0
        op_previous = *previous = OP_BRA;
7506
0
        PUT(previous, 1, 1 + LINK_SIZE + length);
7507
0
        previous[1 + LINK_SIZE + length] = OP_KET;
7508
0
        PUT(previous, 2 + LINK_SIZE + length, 1 + LINK_SIZE + length);
7509
0
        }
7510
0
      code += 2 + 2 * LINK_SIZE;
7511
0
      length_prevgroup += 2 + 2 * LINK_SIZE;
7512
0
      group_return = -1;  /* Set "may match empty string" */
7513
7514
      /* Now treat as a repeated OP_BRA. */
7515
0
      PCRE2_FALLTHROUGH /* Fall through */
7516
7517
      /* If previous was a bracket group, we may have to replicate it in
7518
      certain cases. Note that at this point we can encounter only the "basic"
7519
      bracket opcodes such as BRA and CBRA, as this is the place where they get
7520
      converted into the more special varieties such as BRAPOS and SBRA.
7521
      Originally, PCRE did not allow repetition of assertions, but now it does,
7522
      for Perl compatibility. */
7523
7524
0
      case OP_ASSERT:
7525
0
      case OP_ASSERT_NOT:
7526
0
      case OP_ASSERT_NA:
7527
0
      case OP_ASSERTBACK:
7528
0
      case OP_ASSERTBACK_NOT:
7529
0
      case OP_ASSERTBACK_NA:
7530
0
      case OP_ASSERT_SCS:
7531
0
      case OP_ONCE:
7532
0
      case OP_SCRIPT_RUN:
7533
0
      case OP_BRA:
7534
16
      case OP_CBRA:
7535
16
      case OP_COND:
7536
16
        {
7537
16
        int len = (int)(code - previous);
7538
16
        PCRE2_UCHAR *bralink = NULL;
7539
16
        PCRE2_UCHAR *brazeroptr = NULL;
7540
7541
16
        if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
7542
0
          goto END_REPEAT;
7543
7544
        /* Repeating a DEFINE group (or any group where the condition is always
7545
        FALSE and there is only one branch) is pointless, but Perl allows the
7546
        syntax, so we just ignore the repeat. */
7547
7548
16
        if (op_previous == OP_COND && previous[LINK_SIZE+1] == OP_FALSE &&
7549
0
            previous[GET(previous, 1)] != OP_ALT)
7550
0
          goto END_REPEAT;
7551
7552
        /* Perl allows all assertions to be quantified, and when they contain
7553
        capturing parentheses and/or are optional there are potential uses for
7554
        this feature. PCRE2 used to force the maximum quantifier to 1 on the
7555
        invalid grounds that further repetition was never useful. This was
7556
        always a bit pointless, since an assertion could be wrapped with a
7557
        repeated group to achieve the effect. General repetition is now
7558
        permitted, but if the maximum is unlimited it is set to one more than
7559
        the minimum. */
7560
7561
16
        if (op_previous < OP_ONCE)    /* Assertion */
7562
0
          {
7563
0
          if (repeat_max == REPEAT_UNLIMITED) repeat_max = repeat_min + 1;
7564
0
          }
7565
7566
        /* The case of a zero minimum is special because of the need to stick
7567
        OP_BRAZERO in front of it, and because the group appears once in the
7568
        data, whereas in other cases it appears the minimum number of times. For
7569
        this reason, it is simplest to treat this case separately, as otherwise
7570
        the code gets far too messy. There are several special subcases when the
7571
        minimum is zero. */
7572
7573
16
        if (repeat_min == 0)
7574
16
          {
7575
          /* If the maximum is also zero, we used to just omit the group from
7576
          the output altogether, like this:
7577
7578
          ** if (repeat_max == 0)
7579
          **   {
7580
          **   code = previous;
7581
          **   goto END_REPEAT;
7582
          **   }
7583
7584
          However, that fails when a group or a subgroup within it is
7585
          referenced as a subroutine from elsewhere in the pattern, so now we
7586
          stick in OP_SKIPZERO in front of it so that it is skipped on
7587
          execution. As we don't have a list of which groups are referenced, we
7588
          cannot do this selectively.
7589
7590
          If the maximum is 1 or unlimited, we just have to stick in the
7591
          BRAZERO and do no more at this point. */
7592
7593
16
          if (repeat_max <= 1 || repeat_max == REPEAT_UNLIMITED)
7594
16
            {
7595
16
            (void)memmove(previous + 1, previous, CU2BYTES(len));
7596
16
            code++;
7597
16
            if (repeat_max == 0)
7598
0
              {
7599
0
              *previous++ = OP_SKIPZERO;
7600
0
              goto END_REPEAT;
7601
0
              }
7602
16
            brazeroptr = previous;    /* Save for possessive optimizing */
7603
16
            *previous++ = OP_BRAZERO + repeat_type;
7604
16
            }
7605
7606
          /* If the maximum is greater than 1 and limited, we have to replicate
7607
          in a nested fashion, sticking OP_BRAZERO before each set of brackets.
7608
          The first one has to be handled carefully because it's the original
7609
          copy, which has to be moved up. The remainder can be handled by code
7610
          that is common with the non-zero minimum case below. We have to
7611
          adjust the value of repeat_max, since one less copy is required. */
7612
7613
0
          else
7614
0
            {
7615
0
            int linkoffset;
7616
0
            (void)memmove(previous + 2 + LINK_SIZE, previous, CU2BYTES(len));
7617
0
            code += 2 + LINK_SIZE;
7618
0
            *previous++ = OP_BRAZERO + repeat_type;
7619
0
            *previous++ = OP_BRA;
7620
7621
            /* We chain together the bracket link offset fields that have to be
7622
            filled in later when the ends of the brackets are reached. */
7623
7624
0
            linkoffset = (bralink == NULL)? 0 : (int)(previous - bralink);
7625
0
            bralink = previous;
7626
0
            PUTINC(previous, 0, linkoffset);
7627
0
            }
7628
7629
16
          if (repeat_max != REPEAT_UNLIMITED) repeat_max--;
7630
16
          }
7631
7632
        /* If the minimum is greater than zero, replicate the group as many
7633
        times as necessary, and adjust the maximum to the number of subsequent
7634
        copies that we need. */
7635
7636
0
        else
7637
0
          {
7638
0
          if (repeat_min > 1)
7639
0
            {
7640
            /* In the pre-compile phase, we don't actually do the replication.
7641
            We just adjust the length as if we had. Do some paranoid checks for
7642
            potential integer overflow. */
7643
7644
0
            if (lengthptr != NULL)
7645
0
              {
7646
0
              PCRE2_SIZE delta;
7647
0
              if (PRIV(ckd_smul)(&delta, repeat_min - 1,
7648
0
                                 (int)length_prevgroup) ||
7649
0
                  OFLOW_MAX - *lengthptr < delta)
7650
0
                {
7651
0
                *errorcodeptr = ERR20;
7652
0
                return 0;
7653
0
                }
7654
0
              *lengthptr += delta;
7655
0
              }
7656
7657
            /* This is compiling for real. If there is a set first code unit
7658
            for the group, and we have not yet set a "required code unit", set
7659
            it. */
7660
7661
0
            else
7662
0
              {
7663
0
              if (groupsetfirstcu && reqcuflags >= REQ_NONE)
7664
0
                {
7665
0
                reqcu = firstcu;
7666
0
                reqcuflags = firstcuflags;
7667
0
                }
7668
0
              for (uint32_t i = 1; i < repeat_min; i++)
7669
0
                {
7670
0
                memcpy(code, previous, CU2BYTES(len));
7671
0
                code += len;
7672
0
                }
7673
0
              }
7674
0
            }
7675
7676
0
          if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7677
0
          }
7678
7679
        /* This code is common to both the zero and non-zero minimum cases. If
7680
        the maximum is limited, it replicates the group in a nested fashion,
7681
        remembering the bracket starts on a stack. In the case of a zero
7682
        minimum, the first one was set up above. In all cases the repeat_max
7683
        now specifies the number of additional copies needed. Again, we must
7684
        remember to replicate entries on the forward reference list. */
7685
7686
16
        if (repeat_max != REPEAT_UNLIMITED)
7687
16
          {
7688
          /* In the pre-compile phase, we don't actually do the replication. We
7689
          just adjust the length as if we had. For each repetition we must add
7690
          1 to the length for BRAZERO and for all but the last repetition we
7691
          must add 2 + 2*LINKSIZE to allow for the nesting that occurs. Do some
7692
          paranoid checks to avoid integer overflow. */
7693
7694
16
          if (lengthptr != NULL && repeat_max > 0)
7695
0
            {
7696
0
            PCRE2_SIZE delta;
7697
0
            if (PRIV(ckd_smul)(&delta, repeat_max,
7698
0
                               (int)length_prevgroup + 1 + 2 + 2*LINK_SIZE) ||
7699
0
                OFLOW_MAX + (2 + 2*LINK_SIZE) - *lengthptr < delta)
7700
0
              {
7701
0
              *errorcodeptr = ERR20;
7702
0
              return 0;
7703
0
              }
7704
0
            delta -= (2 + 2*LINK_SIZE);   /* Last one doesn't nest */
7705
0
            *lengthptr += delta;
7706
0
            }
7707
7708
          /* This is compiling for real */
7709
7710
16
          else for (uint32_t i = repeat_max; i >= 1; i--)
7711
0
            {
7712
0
            *code++ = OP_BRAZERO + repeat_type;
7713
7714
            /* All but the final copy start a new nesting, maintaining the
7715
            chain of brackets outstanding. */
7716
7717
0
            if (i != 1)
7718
0
              {
7719
0
              int linkoffset;
7720
0
              *code++ = OP_BRA;
7721
0
              linkoffset = (bralink == NULL)? 0 : (int)(code - bralink);
7722
0
              bralink = code;
7723
0
              PUTINC(code, 0, linkoffset);
7724
0
              }
7725
7726
0
            memcpy(code, previous, CU2BYTES(len));
7727
0
            code += len;
7728
0
            }
7729
7730
          /* Now chain through the pending brackets, and fill in their length
7731
          fields (which are holding the chain links pro tem). */
7732
7733
16
          while (bralink != NULL)
7734
0
            {
7735
0
            int oldlinkoffset;
7736
0
            int linkoffset = (int)(code - bralink + 1);
7737
0
            PCRE2_UCHAR *bra = code - linkoffset;
7738
0
            oldlinkoffset = GET(bra, 1);
7739
0
            bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
7740
0
            *code++ = OP_KET;
7741
0
            PUTINC(code, 0, linkoffset);
7742
0
            PUT(bra, 1, linkoffset);
7743
0
            }
7744
16
          }
7745
7746
        /* If the maximum is unlimited, set a repeater in the final copy. For
7747
        SCRIPT_RUN and ONCE brackets, that's all we need to do. However,
7748
        possessively repeated ONCE brackets can be converted into non-capturing
7749
        brackets, as the behaviour of (?:xx)++ is the same as (?>xx)++ and this
7750
        saves having to deal with possessive ONCEs specially.
7751
7752
        Otherwise, when we are doing the actual compile phase, check to see
7753
        whether this group is one that could match an empty string. If so,
7754
        convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
7755
        that runtime checking can be done. [This check is also applied to ONCE
7756
        and SCRIPT_RUN groups at runtime, but in a different way.]
7757
7758
        Then, if the quantifier was possessive and the bracket is not a
7759
        conditional, we convert the BRA code to the POS form, and the KET code
7760
        to KETRPOS. (It turns out to be convenient at runtime to detect this
7761
        kind of subpattern at both the start and at the end.) The use of
7762
        special opcodes makes it possible to reduce greatly the stack usage in
7763
        pcre2_match(). If the group is preceded by OP_BRAZERO, convert this to
7764
        OP_BRAPOSZERO.
7765
7766
        Then, if the minimum number of matches is 1 or 0, cancel the possessive
7767
        flag so that the default action below, of wrapping everything inside
7768
        atomic brackets, does not happen. When the minimum is greater than 1,
7769
        there will be earlier copies of the group, and so we still have to wrap
7770
        the whole thing. */
7771
7772
0
        else
7773
0
          {
7774
0
          PCRE2_UCHAR *ketcode = code - 1 - LINK_SIZE;
7775
0
          PCRE2_UCHAR *bracode = ketcode - GET(ketcode, 1);
7776
7777
          /* Convert possessive ONCE brackets to non-capturing */
7778
7779
0
          if (*bracode == OP_ONCE && possessive_quantifier) *bracode = OP_BRA;
7780
7781
          /* For non-possessive ONCE and for SCRIPT_RUN brackets, all we need
7782
          to do is to set the KET. */
7783
7784
0
          if (*bracode == OP_ONCE || *bracode == OP_SCRIPT_RUN)
7785
0
            *ketcode = OP_KETRMAX + repeat_type;
7786
7787
          /* Handle non-SCRIPT_RUN and non-ONCE brackets and possessive ONCEs
7788
          (which have been converted to non-capturing above). */
7789
7790
0
          else
7791
0
            {
7792
            /* In the compile phase, adjust the opcode if the group can match
7793
            an empty string. For a conditional group with only one branch, the
7794
            value of group_return will not show "could be empty", so we must
7795
            check that separately. */
7796
7797
0
            if (lengthptr == NULL)
7798
0
              {
7799
0
              if (group_return < 0) *bracode += OP_SBRA - OP_BRA;
7800
0
              if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
7801
0
                *bracode = OP_SCOND;
7802
0
              }
7803
7804
            /* Handle possessive quantifiers. */
7805
7806
0
            if (possessive_quantifier)
7807
0
              {
7808
              /* For COND brackets, we wrap the whole thing in a possessively
7809
              repeated non-capturing bracket, because we have not invented POS
7810
              versions of the COND opcodes. */
7811
7812
0
              if (*bracode == OP_COND || *bracode == OP_SCOND)
7813
0
                {
7814
0
                int nlen = (int)(code - bracode);
7815
0
                (void)memmove(bracode + 1 + LINK_SIZE, bracode, CU2BYTES(nlen));
7816
0
                code += 1 + LINK_SIZE;
7817
0
                nlen += 1 + LINK_SIZE;
7818
0
                *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
7819
0
                *code++ = OP_KETRPOS;
7820
0
                PUTINC(code, 0, nlen);
7821
0
                PUT(bracode, 1, nlen);
7822
0
                }
7823
7824
              /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
7825
7826
0
              else
7827
0
                {
7828
0
                *bracode += 1;              /* Switch to xxxPOS opcodes */
7829
0
                *ketcode = OP_KETRPOS;
7830
0
                }
7831
7832
              /* If the minimum is zero, mark it as possessive, then unset the
7833
              possessive flag when the minimum is 0 or 1. */
7834
7835
0
              if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
7836
0
              if (repeat_min < 2) possessive_quantifier = FALSE;
7837
0
              }
7838
7839
            /* Non-possessive quantifier */
7840
7841
0
            else *ketcode = OP_KETRMAX + repeat_type;
7842
0
            }
7843
0
          }
7844
16
        }
7845
16
      break;
7846
7847
      /* If previous was a character type match (\d or similar), abolish it and
7848
      create a suitable repeat item. The code is shared with single-character
7849
      repeats by setting op_type to add a suitable offset into repeat_type.
7850
      Note that the Unicode property types will be present only when
7851
      SUPPORT_UNICODE is defined, but we don't wrap the little bits of code
7852
      here because it just makes it horribly messy. */
7853
7854
16
      default:
7855
7856
      /* LCOV_EXCL_START */
7857
16
      if (op_previous >= OP_EODN || op_previous <= OP_WORD_BOUNDARY)
7858
0
        {
7859
0
        PCRE2_DEBUG_UNREACHABLE();
7860
0
        *errorcodeptr = ERR10;  /* Not a character type - internal error */
7861
0
        return 0;
7862
0
        }
7863
      /* LCOV_EXCL_STOP */
7864
7865
16
        {
7866
16
        int prop_type, prop_value;
7867
16
        PCRE2_UCHAR *oldcode;
7868
7869
16
        if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7870
7871
16
        op_type = OP_TYPESTAR - OP_STAR;      /* Use type opcodes */
7872
16
        mclength = 0;                         /* Not a character */
7873
7874
16
        if (op_previous == OP_PROP || op_previous == OP_NOTPROP)
7875
8
          {
7876
8
          prop_type = previous[1];
7877
8
          prop_value = previous[2];
7878
8
          }
7879
8
        else
7880
8
          {
7881
          /* Come here from just above with a character in mcbuffer/mclength.
7882
          You must also set op_type before the jump. */
7883
96
          OUTPUT_SINGLE_REPEAT:
7884
96
          prop_type = prop_value = -1;
7885
96
          }
7886
7887
        /* At this point, if prop_type == prop_value == -1 we either have a
7888
        character in mcbuffer when mclength is greater than zero, or we have
7889
        mclength zero, in which case there is a non-property character type in
7890
        op_previous. If prop_type/value are not negative, we have a property
7891
        character type in op_previous. */
7892
7893
104
        oldcode = code;                   /* Save where we were */
7894
104
        code = previous;                  /* Usually overwrite previous item */
7895
7896
        /* If the maximum is zero then the minimum must also be zero; Perl allows
7897
        this case, so we do too - by simply omitting the item altogether. */
7898
7899
104
        if (repeat_max == 0) goto END_REPEAT;
7900
7901
        /* Combine the op_type with the repeat_type */
7902
7903
104
        repeat_type += op_type;
7904
7905
        /* A minimum of zero is handled either as the special case * or ?, or as
7906
        an UPTO, with the maximum given. */
7907
7908
104
        if (repeat_min == 0)
7909
88
          {
7910
88
          if (repeat_max == REPEAT_UNLIMITED) *code++ = OP_STAR + repeat_type;
7911
72
            else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
7912
24
          else
7913
24
            {
7914
24
            *code++ = OP_UPTO + repeat_type;
7915
24
            PUT2INC(code, 0, repeat_max);
7916
24
            }
7917
88
          }
7918
7919
        /* A repeat minimum of 1 is optimized into some special cases. If the
7920
        maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
7921
        left in place and, if the maximum is greater than 1, we use OP_UPTO with
7922
        one less than the maximum. */
7923
7924
16
        else if (repeat_min == 1)
7925
16
          {
7926
16
          if (repeat_max == REPEAT_UNLIMITED)
7927
16
            *code++ = OP_PLUS + repeat_type;
7928
0
          else
7929
0
            {
7930
0
            code = oldcode;  /* Leave previous item in place */
7931
0
            if (repeat_max == 1) goto END_REPEAT;
7932
0
            *code++ = OP_UPTO + repeat_type;
7933
0
            PUT2INC(code, 0, repeat_max - 1);
7934
0
            }
7935
16
          }
7936
7937
        /* The case {n,n} is just an EXACT, while the general case {n,m} is
7938
        handled as an EXACT followed by an UPTO or STAR or QUERY. */
7939
7940
0
        else
7941
0
          {
7942
0
          *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
7943
0
          PUT2INC(code, 0, repeat_min);
7944
7945
          /* Unless repeat_max equals repeat_min, fill in the data for EXACT,
7946
          and then generate the second opcode. For a repeated Unicode property
7947
          match, there are two extra values that define the required property,
7948
          and mclength is set zero to indicate this. */
7949
7950
0
          if (repeat_max != repeat_min)
7951
0
            {
7952
0
            if (mclength > 0)
7953
0
              {
7954
0
              memcpy(code, mcbuffer, CU2BYTES(mclength));
7955
0
              code += mclength;
7956
0
              }
7957
0
            else
7958
0
              {
7959
0
              *code++ = op_previous;
7960
0
              if (prop_type >= 0)
7961
0
                {
7962
0
                *code++ = prop_type;
7963
0
                *code++ = prop_value;
7964
0
                }
7965
0
              }
7966
7967
            /* Now set up the following opcode */
7968
7969
0
            if (repeat_max == REPEAT_UNLIMITED)
7970
0
              *code++ = OP_STAR + repeat_type;
7971
0
            else
7972
0
              {
7973
0
              repeat_max -= repeat_min;
7974
0
              if (repeat_max == 1)
7975
0
                {
7976
0
                *code++ = OP_QUERY + repeat_type;
7977
0
                }
7978
0
              else
7979
0
                {
7980
0
                *code++ = OP_UPTO + repeat_type;
7981
0
                PUT2INC(code, 0, repeat_max);
7982
0
                }
7983
0
              }
7984
0
            }
7985
0
          }
7986
7987
        /* Fill in the character or character type for the final opcode. */
7988
7989
104
        if (mclength > 0)
7990
88
          {
7991
88
          memcpy(code, mcbuffer, CU2BYTES(mclength));
7992
88
          code += mclength;
7993
88
          }
7994
16
        else
7995
16
          {
7996
16
          *code++ = op_previous;
7997
16
          if (prop_type >= 0)
7998
8
            {
7999
8
            *code++ = prop_type;
8000
8
            *code++ = prop_value;
8001
8
            }
8002
16
          }
8003
104
        }
8004
0
      break;
8005
240
      }  /* End of switch on different op_previous values */
8006
8007
8008
    /* If the character following a repeat is '+', possessive_quantifier is
8009
    TRUE. For some opcodes, there are special alternative opcodes for this
8010
    case. For anything else, we wrap the entire repeated item inside OP_ONCE
8011
    brackets. Logically, the '+' notation is just syntactic sugar, taken from
8012
    Sun's Java package, but the special opcodes can optimize it.
8013
8014
    Some (but not all) possessively repeated subpatterns have already been
8015
    completely handled in the code just above. For them, possessive_quantifier
8016
    is always FALSE at this stage. Note that the repeated item starts at
8017
    tempcode, not at previous, which might be the first part of a string whose
8018
    (former) last char we repeated. */
8019
8020
240
    if (possessive_quantifier)
8021
0
      {
8022
0
      int len;
8023
8024
      /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
8025
      However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
8026
      {5,}, or {5,10}). We skip over an EXACT item; if the length of what
8027
      remains is greater than zero, there's a further opcode that can be
8028
      handled. If not, do nothing, leaving the EXACT alone. */
8029
8030
0
      switch(*tempcode)
8031
0
        {
8032
0
        case OP_TYPEEXACT:
8033
0
        tempcode += PRIV(OP_lengths)[*tempcode] +
8034
0
          ((tempcode[1 + IMM2_SIZE] == OP_PROP
8035
0
          || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
8036
0
        break;
8037
8038
        /* CHAR opcodes are used for exacts whose count is 1. */
8039
8040
0
        case OP_CHAR:
8041
0
        case OP_CHARI:
8042
0
        case OP_NOT:
8043
0
        case OP_NOTI:
8044
0
        case OP_EXACT:
8045
0
        case OP_EXACTI:
8046
0
        case OP_NOTEXACT:
8047
0
        case OP_NOTEXACTI:
8048
0
        tempcode += PRIV(OP_lengths)[*tempcode];
8049
0
#ifdef SUPPORT_UNICODE
8050
0
        if (utf && HAS_EXTRALEN(tempcode[-1]))
8051
0
          tempcode += GET_EXTRALEN(tempcode[-1]);
8052
0
#endif
8053
0
        break;
8054
8055
        /* For the class opcodes, the repeat operator appears at the end;
8056
        adjust tempcode to point to it. */
8057
8058
0
        case OP_CLASS:
8059
0
        case OP_NCLASS:
8060
0
        tempcode += 1 + 32/sizeof(PCRE2_UCHAR);
8061
0
        break;
8062
8063
0
#ifdef SUPPORT_WIDE_CHARS
8064
0
        case OP_XCLASS:
8065
0
        case OP_ECLASS:
8066
0
        tempcode += GET(tempcode, 1);
8067
0
        break;
8068
0
#endif
8069
0
        }
8070
8071
      /* If tempcode is equal to code (which points to the end of the repeated
8072
      item), it means we have skipped an EXACT item but there is no following
8073
      QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
8074
      all other cases, tempcode will be pointing to the repeat opcode, and will
8075
      be less than code, so the value of len will be greater than 0. */
8076
8077
0
      len = (int)(code - tempcode);
8078
0
      if (len > 0)
8079
0
        {
8080
0
        unsigned int repcode = *tempcode;
8081
8082
        /* There is a table for possessifying opcodes, all of which are less
8083
        than OP_CALLOUT. A zero entry means there is no possessified version.
8084
        */
8085
8086
0
        if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
8087
0
          *tempcode = opcode_possessify[repcode];
8088
8089
        /* For opcode without a special possessified version, wrap the item in
8090
        ONCE brackets. */
8091
8092
0
        else
8093
0
          {
8094
0
          (void)memmove(tempcode + 1 + LINK_SIZE, tempcode, CU2BYTES(len));
8095
0
          code += 1 + LINK_SIZE;
8096
0
          len += 1 + LINK_SIZE;
8097
0
          tempcode[0] = OP_ONCE;
8098
0
          *code++ = OP_KET;
8099
0
          PUTINC(code, 0, len);
8100
0
          PUT(tempcode, 1, len);
8101
0
          }
8102
0
        }
8103
0
      }
8104
8105
    /* We set the "follows varying string" flag for subsequently encountered
8106
    reqcus if it isn't already set and we have just passed a varying length
8107
    item. */
8108
8109
240
    END_REPEAT:
8110
240
    cb->req_varyopt |= reqvary;
8111
240
    break;
8112
8113
8114
    /* ===================================================================*/
8115
    /* Handle a 32-bit data character with a value greater than META_END. */
8116
8117
0
    case META_BIGVALUE:
8118
0
    pptr++;
8119
0
    goto NORMAL_CHAR;
8120
8121
8122
    /* ===============================================================*/
8123
    /* Handle a back reference by number, which is the meta argument. The
8124
    pattern offsets for back references to group numbers less than 10 are held
8125
    in a special vector, to avoid using more than two parsed pattern elements
8126
    in 64-bit environments. We only need the offset to the first occurrence,
8127
    because if that doesn't fail, subsequent ones will also be OK. */
8128
8129
0
    case META_BACKREF:
8130
0
    if (meta_arg < 10) offset = cb->small_ref_offset[meta_arg];
8131
0
      else GETPLUSOFFSET(offset, pptr);
8132
8133
0
    if (meta_arg > cb->bracount)
8134
0
      {
8135
0
      cb->erroroffset = offset;
8136
0
      *errorcodeptr = ERR15;  /* Non-existent subpattern */
8137
0
      return 0;
8138
0
      }
8139
8140
    /* Come here from named backref handling when the reference is to a
8141
    single group (that is, not to a duplicated name). The back reference
8142
    data will have already been updated. We must disable firstcu if not
8143
    set, to cope with cases like (?=(\w+))\1: which would otherwise set ':'
8144
    later. */
8145
8146
0
    HANDLE_SINGLE_REFERENCE:
8147
0
    if (firstcuflags == REQ_UNSET) zerofirstcuflags = firstcuflags = REQ_NONE;
8148
0
    *code++ = ((options & PCRE2_CASELESS) != 0)? OP_REFI : OP_REF;
8149
0
    PUT2INC(code, 0, meta_arg);
8150
0
    if ((options & PCRE2_CASELESS) != 0)
8151
0
      *code++ = (((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)?
8152
0
                 REFI_FLAG_CASELESS_RESTRICT : 0) |
8153
0
                (((xoptions & PCRE2_EXTRA_TURKISH_CASING) != 0)?
8154
0
                 REFI_FLAG_TURKISH_CASING : 0);
8155
8156
    /* Update the map of back references, and keep the highest one. We
8157
    could do this in parse_regex() for numerical back references, but not
8158
    for named back references, because we don't know the numbers to which
8159
    named back references refer. So we do it all in this function. */
8160
8161
0
    cb->backref_map |= (meta_arg < 32)? (1u << meta_arg) : 1;
8162
0
    if (meta_arg > cb->top_backref) cb->top_backref = meta_arg;
8163
0
    break;
8164
8165
8166
    /* ===============================================================*/
8167
    /* Handle recursion by inserting the number of the called group (which is
8168
    the meta argument) after OP_RECURSE. At the end of compiling the pattern is
8169
    scanned and these numbers are replaced by offsets within the pattern. It is
8170
    done like this to avoid problems with forward references and adjusting
8171
    offsets when groups are duplicated and moved (as discovered in previous
8172
    implementations). Note that a recursion does not have a set first
8173
    character. */
8174
8175
0
    case META_RECURSE:
8176
0
    GETPLUSOFFSET(offset, pptr);
8177
0
    if (meta_arg > cb->bracount)
8178
0
      {
8179
0
      cb->erroroffset = offset;
8180
0
      *errorcodeptr = ERR15;  /* Non-existent subpattern */
8181
0
      return 0;
8182
0
      }
8183
0
    HANDLE_NUMERICAL_RECURSION:
8184
0
    *code = OP_RECURSE;
8185
0
    PUT(code, 1, meta_arg);
8186
0
    code += 1 + LINK_SIZE;
8187
    /* Repeat processing requires this information to
8188
    determine the real length in pre-compile phase. */
8189
0
    length_prevgroup = 1 + LINK_SIZE;
8190
8191
0
    if (META_CODE(pptr[1]) == META_OFFSET ||
8192
0
        META_CODE(pptr[1]) == META_CAPTURE_NAME ||
8193
0
        META_CODE(pptr[1]) == META_CAPTURE_NUMBER)
8194
0
      {
8195
0
      recurse_arguments *args;
8196
8197
0
      if (lengthptr != NULL)
8198
0
        {
8199
0
        if (!PRIV(compile_parse_recurse_args)(pptr, offset, errorcodeptr, cb))
8200
0
          return 0;
8201
8202
0
        args = (recurse_arguments*)cb->last_data;
8203
0
        length_prevgroup += (args->size * (1 + IMM2_SIZE));
8204
0
        *lengthptr += (args->size * (1 + IMM2_SIZE));
8205
0
        pptr += args->skip_size;
8206
0
        }
8207
0
      else
8208
0
        {
8209
0
        uint16_t *current, *end;
8210
8211
0
        args = (recurse_arguments*)cb->first_data;
8212
0
        PCRE2_ASSERT(args != NULL && args->header.type == CDATA_RECURSE_ARGS);
8213
8214
0
        current = (uint16_t*)(args + 1);
8215
0
        end = current + args->size;
8216
0
        PCRE2_ASSERT(end > current);
8217
8218
0
        do
8219
0
          {
8220
0
          code[0] = OP_CREF;
8221
0
          PUT2(code, 1, *current);
8222
0
          code += 1 + IMM2_SIZE;
8223
0
          }
8224
0
        while (++current < end);
8225
8226
0
        length_prevgroup += (args->size * (1 + IMM2_SIZE));
8227
0
        pptr += args->skip_size;
8228
0
        cb->first_data = args->header.next;
8229
0
        cb->cx->memctl.free(args, cb->cx->memctl.memory_data);
8230
0
        }
8231
0
      }
8232
8233
0
    groupsetfirstcu = FALSE;
8234
0
    cb->had_recurse = TRUE;
8235
0
    if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
8236
0
    zerofirstcu = firstcu;
8237
0
    zerofirstcuflags = firstcuflags;
8238
0
    break;
8239
8240
8241
    /* ===============================================================*/
8242
    /* Handle capturing parentheses; the number is the meta argument. */
8243
8244
16
    case META_CAPTURE:
8245
16
    bravalue = OP_CBRA;
8246
16
    skipunits = IMM2_SIZE;
8247
16
    PUT2(code, 1+LINK_SIZE, meta_arg);
8248
16
    cb->lastcapture = meta_arg;
8249
16
    goto GROUP_PROCESS_NOTE_EMPTY;
8250
8251
8252
    /* ===============================================================*/
8253
    /* Handle escape sequence items. For ones like \d, the ESC_values are
8254
    arranged to be the same as the corresponding OP_values in the default case
8255
    when PCRE2_UCP is not set (which is the only case in which they will appear
8256
    here).
8257
8258
    Note: \Q and \E are never seen here, as they were dealt with in
8259
    parse_pattern(). Neither are numerical back references or recursions, which
8260
    were turned into META_BACKREF or META_RECURSE items, respectively. \k and
8261
    \g, when followed by names, are turned into META_BACKREF_BYNAME or
8262
    META_RECURSE_BYNAME. */
8263
8264
8
    case META_ESCAPE:
8265
8266
    /* We can test for escape sequences that consume a character because their
8267
    values lie between ESC_b and ESC_Z; this may have to change if any new ones
8268
    are ever created. For these sequences, we disable the setting of a first
8269
    character if it hasn't already been set. */
8270
8271
8
    if (meta_arg > ESC_b && meta_arg < ESC_Z)
8272
8
      {
8273
8
      matched_char = TRUE;
8274
8
      if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
8275
8
      }
8276
8277
    /* Set values to reset to if this is followed by a zero repeat. */
8278
8279
8
    zerofirstcu = firstcu;
8280
8
    zerofirstcuflags = firstcuflags;
8281
8
    zeroreqcu = reqcu;
8282
8
    zeroreqcuflags = reqcuflags;
8283
8284
    /* If Unicode is not supported, \P and \p are not allowed and are
8285
    faulted at parse time, so will never appear here. */
8286
8287
8
#ifdef SUPPORT_UNICODE
8288
8
    if (meta_arg == ESC_P || meta_arg == ESC_p)
8289
8
      {
8290
8
      uint32_t ptype = *(++pptr) >> 16;
8291
8
      uint32_t pdata = *pptr & 0xffff;
8292
8293
      /* In caseless matching, particular characteristics Lu, Ll, and Lt get
8294
      converted to the general characteristic L&. That is, upper, lower, and
8295
      title case letters are all conflated. */
8296
8297
8
      if ((options & PCRE2_CASELESS) != 0 && ptype == PT_PC &&
8298
0
          (pdata == ucp_Lu || pdata == ucp_Ll || pdata == ucp_Lt))
8299
0
        {
8300
0
        ptype = PT_LAMP;
8301
0
        pdata = 0;
8302
0
        }
8303
8304
      /* The special case of \p{Any} is compiled to OP_ALLANY and \P{Any}
8305
      is compiled to [] so as to benefit from the auto-anchoring code. */
8306
8307
8
      if (ptype == PT_ANY)
8308
0
        {
8309
0
        if (meta_arg == ESC_P)
8310
0
          {
8311
0
          *code++ = OP_CLASS;
8312
0
          memset(code, 0, 32);
8313
0
          code += 32 / sizeof(PCRE2_UCHAR);
8314
0
          }
8315
0
        else
8316
0
          *code++ = OP_ALLANY;
8317
0
        }
8318
8
      else
8319
8
        {
8320
8
        *code++ = (meta_arg == ESC_p)? OP_PROP : OP_NOTPROP;
8321
8
        *code++ = ptype;
8322
8
        *code++ = pdata;
8323
8
        }
8324
8
      break;  /* End META_ESCAPE */
8325
8
      }
8326
0
#endif
8327
8328
    /* \K is forbidden in lookarounds since 10.38 because that's what Perl has
8329
    done. However, there's an option, in case anyone was relying on it. */
8330
8331
0
    if (cb->assert_depth > 0 && meta_arg == ESC_K &&
8332
0
        (xoptions & PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK) == 0)
8333
0
      {
8334
0
      *errorcodeptr = ERR99;
8335
0
      return 0;
8336
0
      }
8337
8338
    /* For the rest (including \X when Unicode is supported - if not it's
8339
    faulted at parse time), the OP value is the escape value when PCRE2_UCP is
8340
    not set; if it is set, most of them do not show up here because they are
8341
    converted into Unicode property tests in parse_regex().
8342
8343
    In non-UTF mode, and for both 32-bit modes, we turn \C into OP_ALLANY
8344
    instead of OP_ANYBYTE so that it works in DFA mode and in lookbehinds.
8345
    There are special UCP codes for \B and \b which are used in UCP mode unless
8346
    "word" matching is being forced to ASCII.
8347
8348
    Note that \b and \B do a one-character lookbehind, and \A also behaves as
8349
    if it does. */
8350
8351
0
    switch(meta_arg)
8352
0
      {
8353
0
      case ESC_C:
8354
0
      cb->external_flags |= PCRE2_HASBKC;  /* Record */
8355
#if PCRE2_CODE_UNIT_WIDTH == 32
8356
      meta_arg = OP_ALLANY;
8357
      (void)utf; /* Avoid compiler warning. */
8358
#else
8359
0
      if (!utf) meta_arg = OP_ALLANY;
8360
0
#endif
8361
0
      break;
8362
8363
0
      case ESC_B:
8364
0
      case ESC_b:
8365
0
      if ((options & PCRE2_UCP) != 0 && (xoptions & PCRE2_EXTRA_ASCII_BSW) == 0)
8366
0
        meta_arg = (meta_arg == ESC_B)? OP_NOT_UCP_WORD_BOUNDARY :
8367
0
          OP_UCP_WORD_BOUNDARY;
8368
0
      PCRE2_FALLTHROUGH /* Fall through */
8369
8370
0
      case ESC_A:
8371
0
      if (cb->max_lookbehind == 0) cb->max_lookbehind = 1;
8372
0
      break;
8373
8374
0
      case ESC_K:
8375
0
      cb->external_flags |= PCRE2_HASBSK;  /* Record */
8376
0
      break;
8377
0
      }
8378
8379
0
    *code++ = meta_arg;
8380
0
    break;  /* End META_ESCAPE */
8381
8382
8383
    /* ===================================================================*/
8384
    /* Handle an unrecognized meta value. A parsed pattern value less than
8385
    META_END is a literal. Otherwise we have a problem. */
8386
8387
288
    default:
8388
    /* LCOV_EXCL_START */
8389
288
    if (meta >= META_END)
8390
0
      {
8391
0
      PCRE2_DEBUG_UNREACHABLE();
8392
0
      *errorcodeptr = ERR89;  /* Internal error - unrecognized. */
8393
0
      return 0;
8394
0
      }
8395
    /* LCOV_EXCL_STOP */
8396
8397
    /* Handle a literal character. We come here by goto in the case of a
8398
    32-bit, non-UTF character whose value is greater than META_END. */
8399
8400
288
    NORMAL_CHAR:
8401
288
    meta = *pptr;     /* Get the full 32 bits */
8402
296
    NORMAL_CHAR_SET:  /* Character is already in meta */
8403
296
    matched_char = TRUE;
8404
8405
    /* For caseless UTF or UCP mode, check whether this character has more than
8406
    one other case. If so, generate a special OP_PROP item instead of OP_CHARI.
8407
    When casing restrictions apply, ignore caseless sets that start with an
8408
    ASCII character. If the character is affected by the special Turkish rules,
8409
    hardcode the matching characters using a caseset. */
8410
8411
296
#ifdef SUPPORT_UNICODE
8412
296
    if ((utf||ucp) && (options & PCRE2_CASELESS) != 0)
8413
0
      {
8414
0
      uint32_t caseset;
8415
8416
0
      if ((xoptions & (PCRE2_EXTRA_TURKISH_CASING|PCRE2_EXTRA_CASELESS_RESTRICT)) ==
8417
0
            PCRE2_EXTRA_TURKISH_CASING &&
8418
0
          UCD_ANY_I(meta))
8419
0
        {
8420
0
        caseset = PRIV(ucd_turkish_dotted_i_caseset) + (UCD_DOTTED_I(meta)? 0 : 3);
8421
0
        }
8422
0
      else if ((caseset = UCD_CASESET(meta)) != 0 &&
8423
0
               (xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0 &&
8424
0
               PRIV(ucd_caseless_sets)[caseset] < 128)
8425
0
        {
8426
0
        caseset = 0;  /* Ignore the caseless set if it's restricted. */
8427
0
        }
8428
8429
0
      if (caseset != 0)
8430
0
        {
8431
0
        *code++ = OP_PROP;
8432
0
        *code++ = PT_CLIST;
8433
0
        *code++ = caseset;
8434
0
        if (firstcuflags == REQ_UNSET)
8435
0
          firstcuflags = zerofirstcuflags = REQ_NONE;
8436
0
        break;  /* End handling this meta item */
8437
0
        }
8438
0
      }
8439
296
#endif
8440
8441
    /* Caseful matches, or caseless and not one of the multicase characters. We
8442
    come here by goto in the case of a positive class that contains only
8443
    case-partners of a character with just two cases; matched_char has already
8444
    been set TRUE and options fudged if necessary. */
8445
8446
296
    CLASS_CASELESS_CHAR:
8447
8448
    /* Get the character's code units into mcbuffer, with the length in
8449
    mclength. When not in UTF mode, the length is always 1. */
8450
8451
296
#ifdef SUPPORT_UNICODE
8452
296
    if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
8453
224
#endif
8454
224
      {
8455
224
      mclength = 1;
8456
224
      mcbuffer[0] = meta;
8457
224
      }
8458
8459
    /* Generate the appropriate code */
8460
8461
296
    *code++ = ((options & PCRE2_CASELESS) != 0)? OP_CHARI : OP_CHAR;
8462
296
    memcpy(code, mcbuffer, CU2BYTES(mclength));
8463
296
    code += mclength;
8464
8465
    /* Remember if \r or \n were seen */
8466
8467
296
    if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
8468
24
      cb->external_flags |= PCRE2_HASCRORLF;
8469
8470
    /* Set the first and required code units appropriately. If no previous
8471
    first code unit, set it from this character, but revert to none on a zero
8472
    repeat. Otherwise, leave the firstcu value alone, and don't change it on
8473
    a zero repeat. */
8474
8475
296
    if (firstcuflags == REQ_UNSET)
8476
36
      {
8477
36
      zerofirstcuflags = REQ_NONE;
8478
36
      zeroreqcu = reqcu;
8479
36
      zeroreqcuflags = reqcuflags;
8480
8481
      /* If the character is more than one code unit long, we can set a single
8482
      firstcu only if it is not to be matched caselessly. Multiple possible
8483
      starting code units may be picked up later in the studying code. */
8484
8485
36
      if (mclength == 1 || req_caseopt == 0)
8486
36
        {
8487
36
        firstcu = mcbuffer[0];
8488
36
        firstcuflags = req_caseopt;
8489
36
        if (mclength != 1)
8490
8
          {
8491
8
          reqcu = code[-1];
8492
8
          reqcuflags = cb->req_varyopt;
8493
8
          }
8494
36
        }
8495
0
      else firstcuflags = reqcuflags = REQ_NONE;
8496
36
      }
8497
8498
    /* firstcu was previously set; we can set reqcu only if the length is
8499
    1 or the matching is caseful. */
8500
8501
260
    else
8502
260
      {
8503
260
      zerofirstcu = firstcu;
8504
260
      zerofirstcuflags = firstcuflags;
8505
260
      zeroreqcu = reqcu;
8506
260
      zeroreqcuflags = reqcuflags;
8507
260
      if (mclength == 1 || req_caseopt == 0)
8508
260
        {
8509
260
        reqcu = code[-1];
8510
260
        reqcuflags = req_caseopt | cb->req_varyopt;
8511
260
        }
8512
260
      }
8513
8514
    /* If caselessness was temporarily instated, reset it. */
8515
8516
296
    if (reset_caseful)
8517
0
      {
8518
0
      options &= ~PCRE2_CASELESS;
8519
0
      req_caseopt = 0;
8520
0
      reset_caseful = FALSE;
8521
0
      }
8522
8523
296
    break;    /* End literal character handling */
8524
792
    }         /* End of big switch */
8525
792
  }           /* End of big loop */
8526
8527
/* LCOV_EXCL_START */
8528
0
PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */
8529
0
return 0;                  /* Avoid compiler warnings */
8530
/* LCOV_EXCL_STOP */
8531
48
}
8532
8533
8534
8535
/*************************************************
8536
*   Compile regex: a sequence of alternatives    *
8537
*************************************************/
8538
8539
/* On entry, pptr is pointing past the bracket meta, but on return it points to
8540
the closing bracket or META_END. The code variable is pointing at the code unit
8541
into which the BRA operator has been stored. This function is used during the
8542
pre-compile phase when we are trying to find out the amount of memory needed,
8543
as well as during the real compile phase. The value of lengthptr distinguishes
8544
the two phases.
8545
8546
Arguments:
8547
  options           option bits, including any changes for this subpattern
8548
  xoptions          extra option bits, ditto
8549
  codeptr           -> the address of the current code pointer
8550
  pptrptr           -> the address of the current parsed pattern pointer
8551
  errorcodeptr      -> pointer to error code variable
8552
  skipunits         skip this many code units at start (for brackets and OP_COND)
8553
  firstcuptr        place to put the first required code unit
8554
  firstcuflagsptr   place to put the first code unit flags
8555
  reqcuptr          place to put the last required code unit
8556
  reqcuflagsptr     place to put the last required code unit flags
8557
  bcptr             pointer to the chain of currently open branches
8558
  cb                points to the data block with tables pointers etc.
8559
  lengthptr         NULL during the real compile phase
8560
                    points to length accumulator during pre-compile phase
8561
8562
Returns:            0 There has been an error
8563
                   +1 Success, this group must match at least one character
8564
                   -1 Success, this group may match an empty string
8565
*/
8566
8567
static int
8568
compile_regex(uint32_t options, uint32_t xoptions, PCRE2_UCHAR **codeptr,
8569
  uint32_t **pptrptr, int *errorcodeptr, uint32_t skipunits,
8570
  uint32_t *firstcuptr, uint32_t *firstcuflagsptr, uint32_t *reqcuptr,
8571
  uint32_t *reqcuflagsptr, branch_chain *bcptr, open_capitem *open_caps,
8572
  compile_block *cb, PCRE2_SIZE *lengthptr)
8573
48
{
8574
48
PCRE2_UCHAR *code = *codeptr;
8575
48
PCRE2_UCHAR *last_branch = code;
8576
48
PCRE2_UCHAR *start_bracket = code;
8577
48
BOOL lookbehind;
8578
48
open_capitem capitem;
8579
48
int capnumber = 0;
8580
48
int okreturn = 1;
8581
48
uint32_t *pptr = *pptrptr;
8582
48
uint32_t firstcu, reqcu;
8583
48
uint32_t lookbehindlength;
8584
48
uint32_t lookbehindminlength;
8585
48
uint32_t firstcuflags, reqcuflags;
8586
48
PCRE2_SIZE length;
8587
48
branch_chain bc;
8588
8589
/* If set, call the external function that checks for stack availability. */
8590
8591
48
if (cb->cx->stack_guard != NULL &&
8592
0
    cb->cx->stack_guard(cb->parens_depth, cb->cx->stack_guard_data))
8593
0
  {
8594
0
  *errorcodeptr= ERR33;
8595
0
  cb->erroroffset = 0;
8596
0
  return 0;
8597
0
  }
8598
8599
/* Miscellaneous initialization */
8600
8601
48
bc.outer = bcptr;
8602
48
bc.current_branch = code;
8603
8604
48
firstcu = reqcu = 0;
8605
48
firstcuflags = reqcuflags = REQ_UNSET;
8606
8607
/* Accumulate the length for use in the pre-compile phase. Start with the
8608
length of the BRA and KET and any extra code units that are required at the
8609
beginning. We accumulate in a local variable to save frequent testing of
8610
lengthptr for NULL. We cannot do this by looking at the value of 'code' at the
8611
start and end of each alternative, because compiled items are discarded during
8612
the pre-compile phase so that the workspace is not exceeded. */
8613
8614
48
length = 2 + 2*LINK_SIZE + skipunits;
8615
8616
/* Remember if this is a lookbehind assertion, and if it is, save its length
8617
and skip over the pattern offset. */
8618
8619
48
lookbehind = *code == OP_ASSERTBACK ||
8620
48
             *code == OP_ASSERTBACK_NOT ||
8621
48
             *code == OP_ASSERTBACK_NA;
8622
8623
48
if (lookbehind)
8624
0
  {
8625
0
  lookbehindlength = META_DATA(pptr[-1]);
8626
0
  lookbehindminlength = *pptr;
8627
0
  pptr += SIZEOFFSET;
8628
0
  }
8629
48
else lookbehindlength = lookbehindminlength = 0;
8630
8631
/* If this is a capturing subpattern, add to the chain of open capturing items
8632
so that we can detect them if (*ACCEPT) is encountered. Note that only OP_CBRA
8633
need be tested here; changing this opcode to one of its variants, e.g.
8634
OP_SCBRAPOS, happens later, after the group has been compiled. */
8635
8636
48
if (*code == OP_CBRA)
8637
16
  {
8638
16
  capnumber = GET2(code, 1 + LINK_SIZE);
8639
16
  capitem.number = capnumber;
8640
16
  capitem.next = open_caps;
8641
16
  capitem.assert_depth = cb->assert_depth;
8642
16
  open_caps = &capitem;
8643
16
  }
8644
8645
/* Offset is set zero to mark that this bracket is still open */
8646
8647
48
PUT(code, 1, 0);
8648
48
code += 1 + LINK_SIZE + skipunits;
8649
8650
/* Loop for each alternative branch */
8651
8652
48
for (;;)
8653
48
  {
8654
48
  int branch_return;
8655
48
  uint32_t branchfirstcu = 0, branchreqcu = 0;
8656
48
  uint32_t branchfirstcuflags = REQ_UNSET, branchreqcuflags = REQ_UNSET;
8657
8658
  /* Insert OP_REVERSE or OP_VREVERSE if this is a lookbehind assertion. There
8659
  is only a single minimum length for the whole assertion. When the minimum
8660
  length is LOOKBEHIND_MAX it means that all branches are of fixed length,
8661
  though not necessarily the same length. In this case, the original OP_REVERSE
8662
  can be used. It can also be used if a branch in a variable length lookbehind
8663
  has the same maximum and minimum. Otherwise, use OP_VREVERSE, which has both
8664
  maximum and minimum values. */
8665
8666
48
  if (lookbehind && lookbehindlength > 0)
8667
0
    {
8668
0
    if (lookbehindminlength == LOOKBEHIND_MAX ||
8669
0
        lookbehindminlength == lookbehindlength)
8670
0
      {
8671
0
      *code++ = OP_REVERSE;
8672
0
      PUT2INC(code, 0, lookbehindlength);
8673
0
      length += 1 + IMM2_SIZE;
8674
0
      }
8675
0
    else
8676
0
      {
8677
0
      *code++ = OP_VREVERSE;
8678
0
      PUT2INC(code, 0, lookbehindminlength);
8679
0
      PUT2INC(code, 0, lookbehindlength);
8680
0
      length += 1 + 2*IMM2_SIZE;
8681
0
      }
8682
0
    }
8683
8684
  /* Now compile the branch; in the pre-compile phase its length gets added
8685
  into the length. */
8686
8687
48
  if ((branch_return =
8688
48
        compile_branch(&options, &xoptions, &code, &pptr, errorcodeptr,
8689
48
          &branchfirstcu, &branchfirstcuflags, &branchreqcu, &branchreqcuflags,
8690
48
          &bc, open_caps, cb, (lengthptr == NULL)? NULL : &length)) == 0)
8691
0
    return 0;
8692
8693
  /* If a branch can match an empty string, so can the whole group. */
8694
8695
48
  if (branch_return < 0) okreturn = -1;
8696
8697
  /* In the real compile phase, there is some post-processing to be done. */
8698
8699
48
  if (lengthptr == NULL)
8700
24
    {
8701
    /* If this is the first branch, the firstcu and reqcu values for the
8702
    branch become the values for the regex. */
8703
8704
24
    if (*last_branch != OP_ALT)
8705
24
      {
8706
24
      firstcu = branchfirstcu;
8707
24
      firstcuflags = branchfirstcuflags;
8708
24
      reqcu = branchreqcu;
8709
24
      reqcuflags = branchreqcuflags;
8710
24
      }
8711
8712
    /* If this is not the first branch, the first char and reqcu have to
8713
    match the values from all the previous branches, except that if the
8714
    previous value for reqcu didn't have REQ_VARY set, it can still match,
8715
    and we set REQ_VARY for the group from this branch's value. */
8716
8717
0
    else
8718
0
      {
8719
      /* If we previously had a firstcu, but it doesn't match the new branch,
8720
      we have to abandon the firstcu for the regex, but if there was
8721
      previously no reqcu, it takes on the value of the old firstcu. */
8722
8723
0
      if (firstcuflags != branchfirstcuflags || firstcu != branchfirstcu)
8724
0
        {
8725
0
        if (firstcuflags < REQ_NONE)
8726
0
          {
8727
0
          if (reqcuflags >= REQ_NONE)
8728
0
            {
8729
0
            reqcu = firstcu;
8730
0
            reqcuflags = firstcuflags;
8731
0
            }
8732
0
          }
8733
0
        firstcuflags = REQ_NONE;
8734
0
        }
8735
8736
      /* If we (now or from before) have no firstcu, a firstcu from the
8737
      branch becomes a reqcu if there isn't a branch reqcu. */
8738
8739
0
      if (firstcuflags >= REQ_NONE && branchfirstcuflags < REQ_NONE &&
8740
0
          branchreqcuflags >= REQ_NONE)
8741
0
        {
8742
0
        branchreqcu = branchfirstcu;
8743
0
        branchreqcuflags = branchfirstcuflags;
8744
0
        }
8745
8746
      /* Now ensure that the reqcus match */
8747
8748
0
      if (((reqcuflags & ~REQ_VARY) != (branchreqcuflags & ~REQ_VARY)) ||
8749
0
          reqcu != branchreqcu)
8750
0
        reqcuflags = REQ_NONE;
8751
0
      else
8752
0
        {
8753
0
        reqcu = branchreqcu;
8754
0
        reqcuflags |= branchreqcuflags; /* To "or" REQ_VARY if present */
8755
0
        }
8756
0
      }
8757
24
    }
8758
8759
  /* Handle reaching the end of the expression, either ')' or end of pattern.
8760
  In the real compile phase, go back through the alternative branches and
8761
  reverse the chain of offsets, with the field in the BRA item now becoming an
8762
  offset to the first alternative. If there are no alternatives, it points to
8763
  the end of the group. The length in the terminating ket is always the length
8764
  of the whole bracketed item. Return leaving the pointer at the terminating
8765
  char. */
8766
8767
48
  if (META_CODE(*pptr) != META_ALT)
8768
48
    {
8769
48
    if (lengthptr == NULL)
8770
24
      {
8771
24
      uint32_t branch_length = (uint32_t)(code - last_branch);
8772
24
      do
8773
24
        {
8774
24
        uint32_t prev_length = GET(last_branch, 1);
8775
24
        PUT(last_branch, 1, branch_length);
8776
24
        branch_length = prev_length;
8777
24
        last_branch -= branch_length;
8778
24
        }
8779
24
      while (branch_length > 0);
8780
24
      }
8781
8782
    /* Fill in the ket */
8783
8784
48
    *code = OP_KET;
8785
48
    PUT(code, 1, (uint32_t)(code - start_bracket));
8786
48
    code += 1 + LINK_SIZE;
8787
8788
    /* Set values to pass back */
8789
8790
48
    *codeptr = code;
8791
48
    *pptrptr = pptr;
8792
48
    *firstcuptr = firstcu;
8793
48
    *firstcuflagsptr = firstcuflags;
8794
48
    *reqcuptr = reqcu;
8795
48
    *reqcuflagsptr = reqcuflags;
8796
48
    if (lengthptr != NULL)
8797
24
      {
8798
24
      if (OFLOW_MAX - *lengthptr < length)
8799
0
        {
8800
0
        *errorcodeptr = ERR20;
8801
0
        return 0;
8802
0
        }
8803
24
      *lengthptr += length;
8804
24
      }
8805
48
    return okreturn;
8806
48
    }
8807
8808
  /* Another branch follows. In the pre-compile phase, we can move the code
8809
  pointer back to where it was for the start of the first branch. (That is,
8810
  pretend that each branch is the only one.)
8811
8812
  In the real compile phase, insert an ALT node. Its length field points back
8813
  to the previous branch while the bracket remains open. At the end the chain
8814
  is reversed. It's done like this so that the start of the bracket has a
8815
  zero offset until it is closed, making it possible to detect recursion. */
8816
8817
0
  if (lengthptr != NULL)
8818
0
    {
8819
0
    code = *codeptr + 1 + LINK_SIZE + skipunits;
8820
0
    length += 1 + LINK_SIZE;
8821
0
    }
8822
0
  else
8823
0
    {
8824
0
    *code = OP_ALT;
8825
0
    PUT(code, 1, (int)(code - last_branch));
8826
0
    bc.current_branch = last_branch = code;
8827
0
    code += 1 + LINK_SIZE;
8828
0
    }
8829
8830
  /* Set the maximum lookbehind length for the next branch (if not in a
8831
  lookbehind the value will be zero) and then advance past the vertical bar. */
8832
8833
0
  lookbehindlength = META_DATA(*pptr);
8834
0
  pptr++;
8835
0
  }
8836
8837
/* LCOV_EXCL_START */
8838
0
PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */
8839
0
return 0;                  /* Avoid compiler warnings */
8840
/* LCOV_EXCL_STOP */
8841
48
}
8842
8843
8844
8845
/*************************************************
8846
*          Check for anchored pattern            *
8847
*************************************************/
8848
8849
/* Try to find out if this is an anchored regular expression. Consider each
8850
alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
8851
all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
8852
it's anchored. However, if this is a multiline pattern, then only OP_SOD will
8853
be found, because ^ generates OP_CIRCM in that mode.
8854
8855
We can also consider a regex to be anchored if OP_SOM starts all its branches.
8856
This is the code for \G, which means "match at start of match position, taking
8857
into account the match offset".
8858
8859
A branch is also implicitly anchored if it starts with .* and DOTALL is set,
8860
because that will try the rest of the pattern at all possible matching points,
8861
so there is no point trying again.... er ....
8862
8863
.... except when the .* appears inside capturing parentheses, and there is a
8864
subsequent back reference to those parentheses. We haven't enough information
8865
to catch that case precisely.
8866
8867
At first, the best we could do was to detect when .* was in capturing brackets
8868
and the highest back reference was greater than or equal to that level.
8869
However, by keeping a bitmap of the first 31 back references, we can catch some
8870
of the more common cases more precisely.
8871
8872
... A second exception is when the .* appears inside an atomic group, because
8873
this prevents the number of characters it matches from being adjusted.
8874
8875
Arguments:
8876
  code           points to start of the compiled pattern
8877
  bracket_map    a bitmap of which brackets we are inside while testing; this
8878
                   handles up to substring 31; after that we just have to take
8879
                   the less precise approach
8880
  cb             points to the compile data block
8881
  atomcount      atomic group level
8882
  inassert       TRUE if in an assertion
8883
  dotstar_anchor TRUE if automatic anchoring optimization is enabled
8884
8885
Returns:     TRUE or FALSE
8886
*/
8887
8888
static BOOL
8889
is_anchored(PCRE2_SPTR code, uint32_t bracket_map, compile_block *cb,
8890
  int atomcount, BOOL inassert, BOOL dotstar_anchor)
8891
16
{
8892
16
do {
8893
16
   PCRE2_SPTR scode = first_significant_code(
8894
16
     code + PRIV(OP_lengths)[*code], FALSE);
8895
16
   int op = *scode;
8896
8897
   /* Non-capturing brackets */
8898
8899
16
   if (op == OP_BRA  || op == OP_BRAPOS ||
8900
16
       op == OP_SBRA || op == OP_SBRAPOS)
8901
0
     {
8902
0
     if (!is_anchored(scode, bracket_map, cb, atomcount, inassert, dotstar_anchor))
8903
0
       return FALSE;
8904
0
     }
8905
8906
   /* Capturing brackets */
8907
8908
16
   else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8909
16
            op == OP_SCBRA || op == OP_SCBRAPOS)
8910
0
     {
8911
0
     int n = GET2(scode, 1+LINK_SIZE);
8912
0
     uint32_t new_map = bracket_map | ((n < 32)? (1u << n) : 1);
8913
0
     if (!is_anchored(scode, new_map, cb, atomcount, inassert, dotstar_anchor)) return FALSE;
8914
0
     }
8915
8916
   /* Positive forward assertion */
8917
8918
16
   else if (op == OP_ASSERT || op == OP_ASSERT_NA)
8919
0
     {
8920
0
     if (!is_anchored(scode, bracket_map, cb, atomcount, TRUE, dotstar_anchor)) return FALSE;
8921
0
     }
8922
8923
   /* Condition. If there is no second branch, it can't be anchored. */
8924
8925
16
   else if (op == OP_COND || op == OP_SCOND)
8926
0
     {
8927
0
     if (scode[GET(scode,1)] != OP_ALT) return FALSE;
8928
0
     if (!is_anchored(scode, bracket_map, cb, atomcount, inassert, dotstar_anchor))
8929
0
       return FALSE;
8930
0
     }
8931
8932
   /* Atomic groups */
8933
8934
16
   else if (op == OP_ONCE)
8935
0
     {
8936
0
     if (!is_anchored(scode, bracket_map, cb, atomcount + 1, inassert, dotstar_anchor))
8937
0
       return FALSE;
8938
0
     }
8939
8940
   /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
8941
   it isn't in brackets that are or may be referenced or inside an atomic
8942
   group or an assertion. Also the pattern must not contain *PRUNE or *SKIP,
8943
   because these break the feature. Consider, for example, /(?s).*?(*PRUNE)b/
8944
   with the subject "aab", which matches "b", i.e. not at the start of a line.
8945
   There is also an option that disables auto-anchoring. */
8946
8947
16
   else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
8948
16
             op == OP_TYPEPOSSTAR))
8949
0
     {
8950
0
     if (scode[1] != OP_ALLANY || (bracket_map & cb->backref_map) != 0 ||
8951
0
         atomcount > 0 || cb->had_pruneorskip || inassert || !dotstar_anchor)
8952
0
       return FALSE;
8953
0
     }
8954
8955
   /* Check for explicit anchoring */
8956
8957
16
   else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
8958
8959
16
   code += GET(code, 1);
8960
16
   }
8961
16
while (*code == OP_ALT);   /* Loop for each alternative */
8962
16
return TRUE;
8963
16
}
8964
8965
8966
8967
/*************************************************
8968
*         Check for starting with ^ or .*        *
8969
*************************************************/
8970
8971
/* This is called to find out if every branch starts with ^ or .* so that
8972
"first char" processing can be done to speed things up in multiline
8973
matching and for non-DOTALL patterns that start with .* (which must start at
8974
the beginning or after \n). As in the case of is_anchored() (see above), we
8975
have to take account of back references to capturing brackets that contain .*
8976
because in that case we can't make the assumption. Also, the appearance of .*
8977
inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE
8978
or *SKIP does not count, because once again the assumption no longer holds.
8979
8980
Arguments:
8981
  code           points to start of the compiled pattern or a group
8982
  bracket_map    a bitmap of which brackets we are inside while testing; this
8983
                   handles up to substring 31; after that we just have to take
8984
                   the less precise approach
8985
  cb             points to the compile data
8986
  atomcount      atomic group level
8987
  inassert       TRUE if in an assertion
8988
  dotstar_anchor TRUE if automatic anchoring optimization is enabled
8989
8990
Returns:         TRUE or FALSE
8991
*/
8992
8993
static BOOL
8994
is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
8995
  int atomcount, BOOL inassert, BOOL dotstar_anchor)
8996
0
{
8997
0
do {
8998
0
   PCRE2_SPTR scode = first_significant_code(
8999
0
     code + PRIV(OP_lengths)[*code], FALSE);
9000
0
   int op = *scode;
9001
9002
   /* If we are at the start of a conditional assertion group, *both* the
9003
   conditional assertion *and* what follows the condition must satisfy the test
9004
   for start of line. Other kinds of condition fail. Note that there may be an
9005
   auto-callout at the start of a condition. */
9006
9007
0
   if (op == OP_COND)
9008
0
     {
9009
0
     scode += 1 + LINK_SIZE;
9010
9011
0
     if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
9012
0
       else if (*scode == OP_CALLOUT_STR) scode += GET(scode, 1 + 2*LINK_SIZE);
9013
9014
0
     switch (*scode)
9015
0
       {
9016
0
       case OP_CREF:
9017
0
       case OP_DNCREF:
9018
0
       case OP_RREF:
9019
0
       case OP_DNRREF:
9020
0
       case OP_FAIL:
9021
0
       case OP_FALSE:
9022
0
       case OP_TRUE:
9023
0
       return FALSE;
9024
9025
0
       default:     /* Assertion */
9026
0
       if (!is_startline(scode, bracket_map, cb, atomcount, TRUE, dotstar_anchor))
9027
0
         return FALSE;
9028
0
       do scode += GET(scode, 1); while (*scode == OP_ALT);
9029
0
       scode += 1 + LINK_SIZE;
9030
0
       break;
9031
0
       }
9032
0
     scode = first_significant_code(scode, FALSE);
9033
0
     op = *scode;
9034
0
     }
9035
9036
   /* Non-capturing brackets */
9037
9038
0
   if (op == OP_BRA  || op == OP_BRAPOS ||
9039
0
       op == OP_SBRA || op == OP_SBRAPOS)
9040
0
     {
9041
0
     if (!is_startline(scode, bracket_map, cb, atomcount, inassert, dotstar_anchor))
9042
0
       return FALSE;
9043
0
     }
9044
9045
   /* Capturing brackets */
9046
9047
0
   else if (op == OP_CBRA  || op == OP_CBRAPOS ||
9048
0
            op == OP_SCBRA || op == OP_SCBRAPOS)
9049
0
     {
9050
0
     int n = GET2(scode, 1+LINK_SIZE);
9051
0
     unsigned int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
9052
0
     if (!is_startline(scode, new_map, cb, atomcount, inassert, dotstar_anchor))
9053
0
       return FALSE;
9054
0
     }
9055
9056
   /* Positive forward assertions */
9057
9058
0
   else if (op == OP_ASSERT || op == OP_ASSERT_NA)
9059
0
     {
9060
0
     if (!is_startline(scode, bracket_map, cb, atomcount, TRUE, dotstar_anchor))
9061
0
       return FALSE;
9062
0
     }
9063
9064
   /* Atomic brackets */
9065
9066
0
   else if (op == OP_ONCE)
9067
0
     {
9068
0
     if (!is_startline(scode, bracket_map, cb, atomcount + 1, inassert, dotstar_anchor))
9069
0
       return FALSE;
9070
0
     }
9071
9072
   /* .* means "start at start or after \n" if it isn't in atomic brackets or
9073
   brackets that may be referenced or an assertion, and as long as the pattern
9074
   does not contain *PRUNE or *SKIP, because these break the feature. Consider,
9075
   for example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab",
9076
   i.e. not at the start of a line. There is also an option that disables this
9077
   optimization. */
9078
9079
0
   else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
9080
0
     {
9081
0
     if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 ||
9082
0
         atomcount > 0 || cb->had_pruneorskip || inassert || !dotstar_anchor)
9083
0
       return FALSE;
9084
0
     }
9085
9086
   /* Check for explicit circumflex; anything else gives a FALSE result. Note
9087
   in particular that this includes atomic brackets OP_ONCE because the number
9088
   of characters matched by .* cannot be adjusted inside them. */
9089
9090
0
   else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
9091
9092
   /* Move on to the next alternative */
9093
9094
0
   code += GET(code, 1);
9095
0
   }
9096
0
while (*code == OP_ALT);  /* Loop for each alternative */
9097
0
return TRUE;
9098
0
}
9099
9100
9101
9102
/*************************************************
9103
*   Scan compiled regex for recursion reference  *
9104
*************************************************/
9105
9106
/* This function scans through a compiled pattern until it finds an instance of
9107
OP_RECURSE.
9108
9109
Arguments:
9110
  code        points to start of expression
9111
  utf         TRUE in UTF mode
9112
9113
Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
9114
*/
9115
9116
static PCRE2_UCHAR *
9117
find_recurse(PCRE2_UCHAR *code, BOOL utf)
9118
0
{
9119
0
for (;;)
9120
0
  {
9121
0
  PCRE2_UCHAR c = *code;
9122
0
  if (c == OP_END) return NULL;
9123
0
  if (c == OP_RECURSE) return code;
9124
9125
  /* XCLASS is used for classes that cannot be represented just by a bit map.
9126
  This includes negated single high-valued characters. ECLASS is used for
9127
  classes that use set operations internally. CALLOUT_STR is used for
9128
  callouts with string arguments. In each case the length in the table is
9129
  zero; the actual length is stored in the compiled code. */
9130
9131
0
  if (c == OP_XCLASS || c == OP_ECLASS) code += GET(code, 1);
9132
0
  else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE);
9133
9134
  /* Otherwise, we can get the item's length from the table, except that for
9135
  repeated character types, we have to test for \p and \P, which have an extra
9136
  two code units of parameters, and for MARK/PRUNE/SKIP/THEN with an argument,
9137
  we must add in its length. */
9138
9139
0
  else
9140
0
    {
9141
0
    switch(c)
9142
0
      {
9143
0
      case OP_TYPESTAR:
9144
0
      case OP_TYPEMINSTAR:
9145
0
      case OP_TYPEPLUS:
9146
0
      case OP_TYPEMINPLUS:
9147
0
      case OP_TYPEQUERY:
9148
0
      case OP_TYPEMINQUERY:
9149
0
      case OP_TYPEPOSSTAR:
9150
0
      case OP_TYPEPOSPLUS:
9151
0
      case OP_TYPEPOSQUERY:
9152
0
      if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
9153
0
      break;
9154
9155
0
      case OP_TYPEPOSUPTO:
9156
0
      case OP_TYPEUPTO:
9157
0
      case OP_TYPEMINUPTO:
9158
0
      case OP_TYPEEXACT:
9159
0
      if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
9160
0
        code += 2;
9161
0
      break;
9162
9163
0
      case OP_MARK:
9164
0
      case OP_COMMIT_ARG:
9165
0
      case OP_PRUNE_ARG:
9166
0
      case OP_SKIP_ARG:
9167
0
      case OP_THEN_ARG:
9168
0
      code += code[1];
9169
0
      break;
9170
0
      }
9171
9172
    /* Add in the fixed length from the table */
9173
9174
0
    code += PRIV(OP_lengths)[c];
9175
9176
    /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may
9177
    be followed by a multi-unit character. The length in the table is a
9178
    minimum, so we have to arrange to skip the extra units. */
9179
9180
0
#ifdef MAYBE_UTF_MULTI
9181
0
    if (utf) switch(c)
9182
0
      {
9183
0
      case OP_CHAR:
9184
0
      case OP_CHARI:
9185
0
      case OP_NOT:
9186
0
      case OP_NOTI:
9187
0
      case OP_EXACT:
9188
0
      case OP_EXACTI:
9189
0
      case OP_NOTEXACT:
9190
0
      case OP_NOTEXACTI:
9191
0
      case OP_UPTO:
9192
0
      case OP_UPTOI:
9193
0
      case OP_NOTUPTO:
9194
0
      case OP_NOTUPTOI:
9195
0
      case OP_MINUPTO:
9196
0
      case OP_MINUPTOI:
9197
0
      case OP_NOTMINUPTO:
9198
0
      case OP_NOTMINUPTOI:
9199
0
      case OP_POSUPTO:
9200
0
      case OP_POSUPTOI:
9201
0
      case OP_NOTPOSUPTO:
9202
0
      case OP_NOTPOSUPTOI:
9203
0
      case OP_STAR:
9204
0
      case OP_STARI:
9205
0
      case OP_NOTSTAR:
9206
0
      case OP_NOTSTARI:
9207
0
      case OP_MINSTAR:
9208
0
      case OP_MINSTARI:
9209
0
      case OP_NOTMINSTAR:
9210
0
      case OP_NOTMINSTARI:
9211
0
      case OP_POSSTAR:
9212
0
      case OP_POSSTARI:
9213
0
      case OP_NOTPOSSTAR:
9214
0
      case OP_NOTPOSSTARI:
9215
0
      case OP_PLUS:
9216
0
      case OP_PLUSI:
9217
0
      case OP_NOTPLUS:
9218
0
      case OP_NOTPLUSI:
9219
0
      case OP_MINPLUS:
9220
0
      case OP_MINPLUSI:
9221
0
      case OP_NOTMINPLUS:
9222
0
      case OP_NOTMINPLUSI:
9223
0
      case OP_POSPLUS:
9224
0
      case OP_POSPLUSI:
9225
0
      case OP_NOTPOSPLUS:
9226
0
      case OP_NOTPOSPLUSI:
9227
0
      case OP_QUERY:
9228
0
      case OP_QUERYI:
9229
0
      case OP_NOTQUERY:
9230
0
      case OP_NOTQUERYI:
9231
0
      case OP_MINQUERY:
9232
0
      case OP_MINQUERYI:
9233
0
      case OP_NOTMINQUERY:
9234
0
      case OP_NOTMINQUERYI:
9235
0
      case OP_POSQUERY:
9236
0
      case OP_POSQUERYI:
9237
0
      case OP_NOTPOSQUERY:
9238
0
      case OP_NOTPOSQUERYI:
9239
0
      if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
9240
0
      break;
9241
0
      }
9242
#else
9243
    (void)(utf);  /* Keep compiler happy by referencing function argument */
9244
#endif  /* MAYBE_UTF_MULTI */
9245
0
    }
9246
0
  }
9247
0
}
9248
9249
9250
9251
/*************************************************
9252
*    Check for asserted fixed first code unit    *
9253
*************************************************/
9254
9255
/* During compilation, the "first code unit" settings from forward assertions
9256
are discarded, because they can cause conflicts with actual literals that
9257
follow. However, if we end up without a first code unit setting for an
9258
unanchored pattern, it is worth scanning the regex to see if there is an
9259
initial asserted first code unit. If all branches start with the same asserted
9260
code unit, or with a non-conditional bracket all of whose alternatives start
9261
with the same asserted code unit (recurse ad lib), then we return that code
9262
unit, with the flags set to zero or REQ_CASELESS; otherwise return zero with
9263
REQ_NONE in the flags.
9264
9265
Arguments:
9266
  code       points to start of compiled pattern
9267
  flags      points to the first code unit flags
9268
  inassert   non-zero if in an assertion
9269
9270
Returns:     the fixed first code unit, or 0 with REQ_NONE in flags
9271
*/
9272
9273
static uint32_t
9274
find_firstassertedcu(PCRE2_SPTR code, uint32_t *flags, uint32_t inassert)
9275
8
{
9276
8
uint32_t c = 0;
9277
8
uint32_t cflags = REQ_NONE;
9278
9279
8
*flags = REQ_NONE;
9280
8
do {
9281
8
   uint32_t d;
9282
8
   uint32_t dflags;
9283
8
   int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
9284
8
             *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
9285
8
   PCRE2_SPTR scode = first_significant_code(code + 1+LINK_SIZE + xl, TRUE);
9286
8
   PCRE2_UCHAR op = *scode;
9287
9288
8
   switch(op)
9289
8
     {
9290
8
     default:
9291
8
     return 0;
9292
9293
0
     case OP_BRA:
9294
0
     case OP_BRAPOS:
9295
0
     case OP_CBRA:
9296
0
     case OP_SCBRA:
9297
0
     case OP_CBRAPOS:
9298
0
     case OP_SCBRAPOS:
9299
0
     case OP_ASSERT:
9300
0
     case OP_ASSERT_NA:
9301
0
     case OP_ONCE:
9302
0
     case OP_SCRIPT_RUN:
9303
0
     d = find_firstassertedcu(scode, &dflags, inassert +
9304
0
       ((op == OP_ASSERT || op == OP_ASSERT_NA)?1:0));
9305
0
     if (dflags >= REQ_NONE) return 0;
9306
0
     if (cflags >= REQ_NONE) { c = d; cflags = dflags; }
9307
0
       else if (c != d || cflags != dflags) return 0;
9308
0
     break;
9309
9310
0
     case OP_EXACT:
9311
0
     scode += IMM2_SIZE;
9312
0
     PCRE2_FALLTHROUGH /* Fall through */
9313
9314
0
     case OP_CHAR:
9315
0
     case OP_PLUS:
9316
0
     case OP_MINPLUS:
9317
0
     case OP_POSPLUS:
9318
0
     if (inassert == 0) return 0;
9319
0
     if (cflags >= REQ_NONE) { c = scode[1]; cflags = 0; }
9320
0
       else if (c != scode[1]) return 0;
9321
0
     break;
9322
9323
0
     case OP_EXACTI:
9324
0
     scode += IMM2_SIZE;
9325
0
     PCRE2_FALLTHROUGH /* Fall through */
9326
9327
0
     case OP_CHARI:
9328
0
     case OP_PLUSI:
9329
0
     case OP_MINPLUSI:
9330
0
     case OP_POSPLUSI:
9331
0
     if (inassert == 0) return 0;
9332
9333
     /* If the character is more than one code unit long, we cannot set its
9334
     first code unit when matching caselessly. Later scanning may pick up
9335
     multiple code units. */
9336
9337
0
#ifdef SUPPORT_UNICODE
9338
0
#if PCRE2_CODE_UNIT_WIDTH == 8
9339
0
     if (scode[1] >= 0x80) return 0;
9340
#elif PCRE2_CODE_UNIT_WIDTH == 16
9341
     if (scode[1] >= 0xd800 && scode[1] <= 0xdfff) return 0;
9342
#endif
9343
0
#endif
9344
9345
0
     if (cflags >= REQ_NONE) { c = scode[1]; cflags = REQ_CASELESS; }
9346
0
       else if (c != scode[1]) return 0;
9347
0
     break;
9348
8
     }
9349
9350
0
   code += GET(code, 1);
9351
0
   }
9352
8
while (*code == OP_ALT);
9353
9354
0
*flags = cflags;
9355
0
return c;
9356
8
}
9357
9358
9359
9360
/*************************************************
9361
*             Skip in parsed pattern             *
9362
*************************************************/
9363
9364
/* This function is called to skip parts of the parsed pattern when finding the
9365
length of a lookbehind branch. It is called after (*ACCEPT) and (*FAIL) to find
9366
the end of the branch, it is called to skip over an internal lookaround or
9367
(DEFINE) group, and it is also called to skip to the end of a class, during
9368
which it will never encounter nested groups (but there's no need to have
9369
special code for that).
9370
9371
When called to find the end of a branch or group, pptr must point to the first
9372
meta code inside the branch, not the branch-starting code. In other cases it
9373
can point to the item that causes the function to be called.
9374
9375
Arguments:
9376
  pptr       current pointer to skip from
9377
  skiptype   PSKIP_CLASS when skipping to end of class
9378
             PSKIP_ALT when META_ALT ends the skip
9379
             PSKIP_KET when only META_KET ends the skip
9380
9381
Returns:     new value of pptr
9382
             NULL if META_END is reached - should never occur
9383
               or for an unknown meta value - likewise
9384
*/
9385
9386
static uint32_t *
9387
parsed_skip(uint32_t *pptr, uint32_t skiptype)
9388
0
{
9389
0
uint32_t nestlevel = 0;
9390
9391
0
for (;; pptr++)
9392
0
  {
9393
0
  uint32_t meta = META_CODE(*pptr);
9394
9395
0
  switch(meta)
9396
0
    {
9397
0
    default:  /* Just skip over most items */
9398
0
    if (meta < META_END) continue;  /* Literal */
9399
0
    break;
9400
9401
    /* The parsed regex is malformed; we have reached the end and did
9402
    not find the end of the construct which we are skipping over. */
9403
9404
    /* LCOV_EXCL_START */
9405
0
    case META_END:
9406
0
    PCRE2_DEBUG_UNREACHABLE();
9407
0
    return NULL;
9408
    /* LCOV_EXCL_STOP */
9409
9410
    /* The data for these items is variable in length. */
9411
9412
0
    case META_BACKREF:  /* Offset is present only if group >= 10 */
9413
0
    if (META_DATA(*pptr) >= 10) pptr += SIZEOFFSET;
9414
0
    break;
9415
9416
0
    case META_ESCAPE:
9417
0
    if (*pptr - META_ESCAPE == ESC_P || *pptr - META_ESCAPE == ESC_p)
9418
0
      pptr += 1;     /* Skip prop data */
9419
0
    break;
9420
9421
0
    case META_MARK:     /* Add the length of the name. */
9422
0
    case META_COMMIT_ARG:
9423
0
    case META_PRUNE_ARG:
9424
0
    case META_SKIP_ARG:
9425
0
    case META_THEN_ARG:
9426
0
    pptr += pptr[1];
9427
0
    break;
9428
9429
    /* These are the "active" items in this loop. */
9430
9431
0
    case META_CLASS_END:
9432
0
    if (skiptype == PSKIP_CLASS) return pptr;
9433
0
    break;
9434
9435
0
    case META_ATOMIC:
9436
0
    case META_CAPTURE:
9437
0
    case META_COND_ASSERT:
9438
0
    case META_COND_DEFINE:
9439
0
    case META_COND_NAME:
9440
0
    case META_COND_NUMBER:
9441
0
    case META_COND_RNAME:
9442
0
    case META_COND_RNUMBER:
9443
0
    case META_COND_VERSION:
9444
0
    case META_SCS:
9445
0
    case META_LOOKAHEAD:
9446
0
    case META_LOOKAHEADNOT:
9447
0
    case META_LOOKAHEAD_NA:
9448
0
    case META_LOOKBEHIND:
9449
0
    case META_LOOKBEHINDNOT:
9450
0
    case META_LOOKBEHIND_NA:
9451
0
    case META_NOCAPTURE:
9452
0
    case META_SCRIPT_RUN:
9453
0
    nestlevel++;
9454
0
    break;
9455
9456
0
    case META_ALT:
9457
0
    if (nestlevel == 0 && skiptype == PSKIP_ALT) return pptr;
9458
0
    break;
9459
9460
0
    case META_KET:
9461
0
    if (nestlevel == 0) return pptr;
9462
0
    nestlevel--;
9463
0
    break;
9464
0
    }
9465
9466
  /* The extra data item length for each meta is in a table. */
9467
9468
0
  meta = (meta >> 16) & 0x7fff;
9469
0
  if (meta >= sizeof(meta_extra_lengths)) return NULL;
9470
0
  pptr += meta_extra_lengths[meta];
9471
0
  }
9472
9473
/* LCOV_EXCL_START */
9474
0
PCRE2_UNREACHABLE(); /* Control never reaches here */
9475
/* LCOV_EXCL_STOP */
9476
0
}
9477
9478
9479
9480
/*************************************************
9481
*       Find length of a parsed group            *
9482
*************************************************/
9483
9484
/* This is called for nested groups within a branch of a lookbehind whose
9485
length is being computed. On entry, the pointer must be at the first element
9486
after the group initializing code. On exit it points to OP_KET. Caching is used
9487
to improve processing speed when the same capturing group occurs many times.
9488
9489
Arguments:
9490
  pptrptr     pointer to pointer in the parsed pattern
9491
  minptr      where to return the minimum length
9492
  isinline    FALSE if a reference or recursion; TRUE for inline group
9493
  errcodeptr  pointer to the errorcode
9494
  lcptr       pointer to the loop counter
9495
  group       number of captured group or -1 for a non-capturing group
9496
  recurses    chain of recurse_check to catch mutual recursion
9497
  cb          pointer to the compile data
9498
9499
Returns:      the maximum group length or a negative number
9500
*/
9501
9502
static int
9503
get_grouplength(uint32_t **pptrptr, int *minptr, BOOL isinline, int *errcodeptr,
9504
  int *lcptr, int group, parsed_recurse_check *recurses, compile_block *cb)
9505
0
{
9506
0
uint32_t *gi = cb->groupinfo + 2 * group;
9507
0
int branchlength, branchminlength;
9508
0
int grouplength = -1;
9509
0
int groupminlength = INT_MAX;
9510
9511
/* The cache can be used only if there is no possibility of there being two
9512
groups with the same number. We do not need to set the end pointer for a group
9513
that is being processed as a back reference or recursion, but we must do so for
9514
an inline group. */
9515
9516
0
if (group > 0 && (cb->external_flags & PCRE2_DUPCAPUSED) == 0)
9517
0
  {
9518
0
  uint32_t groupinfo = gi[0];
9519
0
  if ((groupinfo & GI_NOT_FIXED_LENGTH) != 0) return -1;
9520
0
  if ((groupinfo & GI_SET_FIXED_LENGTH) != 0)
9521
0
    {
9522
0
    if (isinline) *pptrptr = parsed_skip(*pptrptr, PSKIP_KET);
9523
0
    *minptr = gi[1];
9524
0
    return groupinfo & GI_FIXED_LENGTH_MASK;
9525
0
    }
9526
0
  }
9527
9528
/* Scan the group. In this case we find the end pointer of necessity. */
9529
9530
0
for(;;)
9531
0
  {
9532
0
  branchlength = get_branchlength(pptrptr, &branchminlength, errcodeptr, lcptr,
9533
0
    recurses, cb);
9534
0
  if (branchlength < 0) goto ISNOTFIXED;
9535
0
  if (branchlength > grouplength) grouplength = branchlength;
9536
0
  if (branchminlength < groupminlength) groupminlength = branchminlength;
9537
0
  if (**pptrptr == META_KET) break;
9538
0
  *pptrptr += 1;   /* Skip META_ALT */
9539
0
  }
9540
9541
0
if (group > 0)
9542
0
  {
9543
0
  gi[0] |= (uint32_t)(GI_SET_FIXED_LENGTH | grouplength);
9544
0
  gi[1] = groupminlength;
9545
0
  }
9546
9547
0
*minptr = groupminlength;
9548
0
return grouplength;
9549
9550
0
ISNOTFIXED:
9551
0
if (group > 0) gi[0] |= GI_NOT_FIXED_LENGTH;
9552
0
return -1;
9553
0
}
9554
9555
9556
9557
/*************************************************
9558
*        Find length of a parsed branch          *
9559
*************************************************/
9560
9561
/* Return fixed maximum and minimum lengths for a branch in a lookbehind,
9562
giving an error if the length is not limited. On entry, *pptrptr points to the
9563
first element inside the branch. On exit it is set to point to the ALT or KET.
9564
9565
Arguments:
9566
  pptrptr     pointer to pointer in the parsed pattern
9567
  minptr      where to return the minimum length
9568
  errcodeptr  pointer to error code
9569
  lcptr       pointer to loop counter
9570
  recurses    chain of recurse_check to catch mutual recursion
9571
  cb          pointer to compile block
9572
9573
Returns:      the maximum length, or a negative value on error
9574
*/
9575
9576
static int
9577
get_branchlength(uint32_t **pptrptr, int *minptr, int *errcodeptr, int *lcptr,
9578
  parsed_recurse_check *recurses, compile_block *cb)
9579
0
{
9580
0
int branchlength = 0;
9581
0
int branchminlength = 0;
9582
0
int grouplength, groupminlength;
9583
0
uint32_t lastitemlength = 0;
9584
0
uint32_t lastitemminlength = 0;
9585
0
uint32_t *pptr = *pptrptr;
9586
0
PCRE2_SIZE offset;
9587
0
parsed_recurse_check this_recurse;
9588
9589
/* A large and/or complex regex can take too long to process. This can happen
9590
more often when (?| groups are present in the pattern because their length
9591
cannot be cached. */
9592
9593
0
if ((*lcptr)++ > 2000)
9594
0
  {
9595
0
  *errcodeptr = ERR35;  /* Lookbehind is too complicated */
9596
0
  return -1;
9597
0
  }
9598
9599
/* Scan the branch, accumulating the length. */
9600
9601
0
for (;; pptr++)
9602
0
  {
9603
0
  parsed_recurse_check *r;
9604
0
  uint32_t *gptr, *gptrend;
9605
0
  uint32_t escape;
9606
0
  uint32_t min, max;
9607
0
  uint32_t group = 0;
9608
0
  uint32_t itemlength = 0;
9609
0
  uint32_t itemminlength = 0;
9610
9611
0
  if (*pptr < META_END)
9612
0
    {
9613
0
    itemlength = itemminlength = 1;
9614
0
    }
9615
9616
0
  else switch (META_CODE(*pptr))
9617
0
    {
9618
0
    case META_KET:
9619
0
    case META_ALT:
9620
0
    goto EXIT;
9621
9622
    /* (*ACCEPT) and (*FAIL) terminate the branch, but we must skip to the
9623
    actual termination. */
9624
9625
0
    case META_ACCEPT:
9626
0
    case META_FAIL:
9627
0
    pptr = parsed_skip(pptr, PSKIP_ALT);
9628
0
    if (pptr == NULL) goto PARSED_SKIP_FAILED;
9629
0
    goto EXIT;
9630
9631
0
    case META_MARK:
9632
0
    case META_COMMIT_ARG:
9633
0
    case META_PRUNE_ARG:
9634
0
    case META_SKIP_ARG:
9635
0
    case META_THEN_ARG:
9636
0
    pptr += pptr[1] + 1;
9637
0
    break;
9638
9639
0
    case META_CIRCUMFLEX:
9640
0
    case META_COMMIT:
9641
0
    case META_DOLLAR:
9642
0
    case META_PRUNE:
9643
0
    case META_SKIP:
9644
0
    case META_THEN:
9645
0
    break;
9646
9647
0
    case META_OPTIONS:
9648
0
    pptr += 2;
9649
0
    break;
9650
9651
0
    case META_BIGVALUE:
9652
0
    itemlength = itemminlength = 1;
9653
0
    pptr += 1;
9654
0
    break;
9655
9656
0
    case META_CLASS:
9657
0
    case META_CLASS_NOT:
9658
0
    itemlength = itemminlength = 1;
9659
0
    pptr = parsed_skip(pptr, PSKIP_CLASS);
9660
0
    if (pptr == NULL) goto PARSED_SKIP_FAILED;
9661
0
    break;
9662
9663
0
    case META_CLASS_EMPTY_NOT:
9664
0
    case META_DOT:
9665
0
    itemlength = itemminlength = 1;
9666
0
    break;
9667
9668
0
    case META_CALLOUT_NUMBER:
9669
0
    pptr += 3;
9670
0
    break;
9671
9672
0
    case META_CALLOUT_STRING:
9673
0
    pptr += 3 + SIZEOFFSET;
9674
0
    break;
9675
9676
    /* Only some escapes consume a character. Of those, \R can match one or two
9677
    characters, but \X is never allowed because it matches an unknown number of
9678
    characters. \C is allowed only in 32-bit and non-UTF 8/16-bit modes. */
9679
9680
0
    case META_ESCAPE:
9681
0
    escape = META_DATA(*pptr);
9682
0
    if (escape == ESC_X) return -1;
9683
0
    if (escape == ESC_R)
9684
0
      {
9685
0
      itemminlength = 1;
9686
0
      itemlength = 2;
9687
0
      }
9688
0
    else if (escape > ESC_b && escape < ESC_Z)
9689
0
      {
9690
0
#if PCRE2_CODE_UNIT_WIDTH != 32
9691
0
      if ((cb->external_options & PCRE2_UTF) != 0 && escape == ESC_C)
9692
0
        {
9693
0
        *errcodeptr = ERR36;
9694
0
        return -1;
9695
0
        }
9696
0
#endif
9697
0
      itemlength = itemminlength = 1;
9698
0
      if (escape == ESC_p || escape == ESC_P) pptr++;  /* Skip prop data */
9699
0
      }
9700
0
    break;
9701
9702
    /* Lookaheads do not contribute to the length of this branch, but they may
9703
    contain lookbehinds within them whose lengths need to be set. */
9704
9705
0
    case META_LOOKAHEAD:
9706
0
    case META_LOOKAHEADNOT:
9707
0
    case META_LOOKAHEAD_NA:
9708
0
    case META_SCS:
9709
0
    *errcodeptr = check_lookbehinds(pptr + 1, &pptr, recurses, cb, lcptr);
9710
0
    if (*errcodeptr != 0) return -1;
9711
9712
    /* Ignore any qualifiers that follow a lookahead assertion. */
9713
9714
0
    switch (pptr[1])
9715
0
      {
9716
0
      case META_ASTERISK:
9717
0
      case META_ASTERISK_PLUS:
9718
0
      case META_ASTERISK_QUERY:
9719
0
      case META_PLUS:
9720
0
      case META_PLUS_PLUS:
9721
0
      case META_PLUS_QUERY:
9722
0
      case META_QUERY:
9723
0
      case META_QUERY_PLUS:
9724
0
      case META_QUERY_QUERY:
9725
0
      pptr++;
9726
0
      break;
9727
9728
0
      case META_MINMAX:
9729
0
      case META_MINMAX_PLUS:
9730
0
      case META_MINMAX_QUERY:
9731
0
      pptr += 3;
9732
0
      break;
9733
9734
0
      default:
9735
0
      break;
9736
0
      }
9737
0
    break;
9738
9739
    /* A nested lookbehind does not contribute any length to this lookbehind,
9740
    but must itself be checked and have its lengths set. Note that
9741
    set_lookbehind_lengths() updates pptr, leaving it pointing to the final ket
9742
    of the group, so no need to update it here. */
9743
9744
0
    case META_LOOKBEHIND:
9745
0
    case META_LOOKBEHINDNOT:
9746
0
    case META_LOOKBEHIND_NA:
9747
0
    if (!set_lookbehind_lengths(&pptr, errcodeptr, lcptr, recurses, cb))
9748
0
      return -1;
9749
0
    break;
9750
9751
    /* Back references and recursions are handled by very similar code. At this
9752
    stage, the names generated in the parsing pass are available, but the main
9753
    name table has not yet been created. So for the named varieties, scan the
9754
    list of names in order to get the number of the first one in the pattern,
9755
    and whether or not this name is duplicated. */
9756
9757
0
    case META_BACKREF_BYNAME:
9758
0
    if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0)
9759
0
      goto ISNOTFIXED;
9760
0
    PCRE2_FALLTHROUGH /* Fall through */
9761
0
9762
0
    case META_RECURSE_BYNAME:
9763
0
      {
9764
0
      PCRE2_SPTR name;
9765
0
      BOOL is_dupname = FALSE;
9766
0
      named_group *ng;
9767
0
      uint32_t meta_code = META_CODE(*pptr);
9768
0
      uint32_t length = *(++pptr);
9769
9770
0
      GETPLUSOFFSET(offset, pptr);
9771
0
      name = cb->start_pattern + offset;
9772
0
      ng = PRIV(compile_find_named_group)(name, length, cb);
9773
9774
0
      if (ng == NULL)
9775
0
        {
9776
0
        *errcodeptr = ERR15;  /* Non-existent subpattern */
9777
0
        cb->erroroffset = offset;
9778
0
        return -1;
9779
0
        }
9780
9781
0
      group = ng->number;
9782
0
      is_dupname = (ng->hash_dup & NAMED_GROUP_IS_DUPNAME) != 0;
9783
9784
      /* A numerical back reference can be fixed length if duplicate capturing
9785
      groups are not being used. A non-duplicate named back reference can also
9786
      be handled. */
9787
9788
0
      if (meta_code == META_RECURSE_BYNAME ||
9789
0
          (!is_dupname && (cb->external_flags & PCRE2_DUPCAPUSED) == 0))
9790
0
        goto RECURSE_OR_BACKREF_LENGTH;  /* Handle as a numbered version. */
9791
0
      }
9792
0
    goto ISNOTFIXED;                     /* Duplicate name or number */
9793
9794
    /* The offset values for back references < 10 are in a separate vector
9795
    because otherwise they would use more than two parsed pattern elements on
9796
    64-bit systems. */
9797
9798
0
    case META_BACKREF:
9799
0
    if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0 ||
9800
0
        (cb->external_flags & PCRE2_DUPCAPUSED) != 0)
9801
0
      goto ISNOTFIXED;
9802
0
    group = META_DATA(*pptr);
9803
0
    if (group < 10)
9804
0
      {
9805
0
      offset = cb->small_ref_offset[group];
9806
0
      goto RECURSE_OR_BACKREF_LENGTH;
9807
0
      }
9808
9809
0
    PCRE2_FALLTHROUGH /* Fall through */
9810
0
    /* For groups >= 10 - picking up group twice does no harm. */
9811
0
9812
0
    /* A true recursion implies not fixed length, but a subroutine call may
9813
0
    be OK. Back reference "recursions" are also failed. */
9814
0
9815
0
    case META_RECURSE:
9816
0
    group = META_DATA(*pptr);
9817
0
    GETPLUSOFFSET(offset, pptr);
9818
9819
0
    RECURSE_OR_BACKREF_LENGTH:
9820
0
    if (group > cb->bracount)
9821
0
      {
9822
0
      cb->erroroffset = offset;
9823
0
      *errcodeptr = ERR15;  /* Non-existent subpattern */
9824
0
      return -1;
9825
0
      }
9826
0
    if (group == 0) goto ISNOTFIXED;  /* Local recursion */
9827
0
    for (gptr = cb->parsed_pattern; *gptr != META_END; gptr++)
9828
0
      {
9829
0
      if (META_CODE(*gptr) == META_BIGVALUE) gptr++;
9830
0
        else if (*gptr == (META_CAPTURE | group)) break;
9831
0
      }
9832
9833
    /* We must start the search for the end of the group at the first meta code
9834
    inside the group. Otherwise it will be treated as an enclosed group. */
9835
9836
0
    gptrend = parsed_skip(gptr + 1, PSKIP_KET);
9837
0
    if (gptrend == NULL) goto PARSED_SKIP_FAILED;
9838
0
    if (pptr > gptr && pptr < gptrend) goto ISNOTFIXED;  /* Local recursion */
9839
0
    for (r = recurses; r != NULL; r = r->prev) if (r->groupptr == gptr) break;
9840
0
    if (r != NULL) goto ISNOTFIXED;   /* Mutual recursion */
9841
0
    this_recurse.prev = recurses;
9842
0
    this_recurse.groupptr = gptr;
9843
9844
    /* We do not need to know the position of the end of the group, that is,
9845
    gptr is not used after the call to get_grouplength(). Setting the second
9846
    argument FALSE stops it scanning for the end when the length can be found
9847
    in the cache. */
9848
9849
0
    gptr++;
9850
0
    grouplength = get_grouplength(&gptr, &groupminlength, FALSE, errcodeptr,
9851
0
      lcptr, group, &this_recurse, cb);
9852
0
    if (grouplength < 0)
9853
0
      {
9854
0
      if (*errcodeptr == 0) goto ISNOTFIXED;
9855
0
      return -1;  /* Error already set */
9856
0
      }
9857
0
    itemlength = grouplength;
9858
0
    itemminlength = groupminlength;
9859
0
    break;
9860
9861
    /* A (DEFINE) group is never obeyed inline and so it does not contribute to
9862
    the length of this branch. Skip from the following item to the next
9863
    unpaired ket. */
9864
9865
0
    case META_COND_DEFINE:
9866
0
    pptr = parsed_skip(pptr + 1, PSKIP_KET);
9867
0
    break;
9868
9869
    /* Check other nested groups - advance past the initial data for each type
9870
    and then seek a fixed length with get_grouplength(). */
9871
9872
0
    case META_COND_NAME:
9873
0
    case META_COND_NUMBER:
9874
0
    case META_COND_RNAME:
9875
0
    case META_COND_RNUMBER:
9876
0
    pptr += 2 + SIZEOFFSET;
9877
0
    goto CHECK_GROUP;
9878
9879
0
    case META_COND_ASSERT:
9880
0
    pptr += 1;
9881
0
    goto CHECK_GROUP;
9882
9883
0
    case META_COND_VERSION:
9884
0
    pptr += 4;
9885
0
    goto CHECK_GROUP;
9886
9887
0
    case META_CAPTURE:
9888
0
    group = META_DATA(*pptr);
9889
0
    PCRE2_FALLTHROUGH /* Fall through */
9890
9891
0
    case META_ATOMIC:
9892
0
    case META_NOCAPTURE:
9893
0
    case META_SCRIPT_RUN:
9894
0
    pptr++;
9895
0
    CHECK_GROUP:
9896
0
    grouplength = get_grouplength(&pptr, &groupminlength, TRUE, errcodeptr,
9897
0
      lcptr, group, recurses, cb);
9898
0
    if (grouplength < 0) return -1;
9899
0
    itemlength = grouplength;
9900
0
    itemminlength = groupminlength;
9901
0
    break;
9902
9903
0
    case META_QUERY:
9904
0
    case META_QUERY_PLUS:
9905
0
    case META_QUERY_QUERY:
9906
0
    min = 0;
9907
0
    max = 1;
9908
0
    goto REPETITION;
9909
9910
    /* Exact repetition is OK; variable repetition is not. A repetition of zero
9911
    must subtract the length that has already been added. */
9912
9913
0
    case META_MINMAX:
9914
0
    case META_MINMAX_PLUS:
9915
0
    case META_MINMAX_QUERY:
9916
0
    min = pptr[1];
9917
0
    max = pptr[2];
9918
0
    pptr += 2;
9919
9920
0
    REPETITION:
9921
0
    if (max != REPEAT_UNLIMITED)
9922
0
      {
9923
0
      if (lastitemlength != 0 &&  /* Should not occur, but just in case */
9924
0
          max != 0 &&
9925
0
          (INT_MAX - branchlength)/lastitemlength < max - 1)
9926
0
        {
9927
0
        *errcodeptr = ERR87;  /* Integer overflow; lookbehind too big */
9928
0
        return -1;
9929
0
        }
9930
0
      if (min == 0) branchminlength -= lastitemminlength;
9931
0
        else itemminlength = (min - 1) * lastitemminlength;
9932
0
      if (max == 0) branchlength -= lastitemlength;
9933
0
        else itemlength = (max - 1) * lastitemlength;
9934
0
      break;
9935
0
      }
9936
0
    PCRE2_FALLTHROUGH /* Fall through */
9937
0
9938
0
    /* Any other item means this branch does not have a fixed length. */
9939
0
9940
0
    default:
9941
0
    ISNOTFIXED:
9942
0
    *errcodeptr = ERR25;   /* Not fixed length */
9943
0
    return -1;
9944
0
    }
9945
9946
  /* Add the item length to the branchlength, checking for integer overflow and
9947
  for the branch length exceeding the overall limit. Later, if there is at
9948
  least one variable-length branch in the group, there is a test for the
9949
  (smaller) variable-length branch length limit. */
9950
9951
0
  if (INT_MAX - branchlength < (int)itemlength ||
9952
0
      (branchlength += itemlength) > LOOKBEHIND_MAX)
9953
0
    {
9954
0
    *errcodeptr = ERR87;
9955
0
    return -1;
9956
0
    }
9957
9958
0
  branchminlength += itemminlength;
9959
9960
  /* Save this item length for use if the next item is a quantifier. */
9961
9962
0
  lastitemlength = itemlength;
9963
0
  lastitemminlength = itemminlength;
9964
0
  }
9965
9966
0
EXIT:
9967
0
*pptrptr = pptr;
9968
0
*minptr = branchminlength;
9969
0
return branchlength;
9970
9971
/* LCOV_EXCL_START */
9972
0
PARSED_SKIP_FAILED:
9973
0
PCRE2_DEBUG_UNREACHABLE();
9974
0
*errcodeptr = ERR90;  /* Unhandled META code - internal error */
9975
0
return -1;
9976
/* LCOV_EXCL_STOP */
9977
0
}
9978
9979
9980
9981
/*************************************************
9982
*        Set lengths in a lookbehind             *
9983
*************************************************/
9984
9985
/* This function is called for each lookbehind, to set the lengths in its
9986
branches. An error occurs if any branch does not have a limited maximum length
9987
that is less than the limit (65535). On exit, the pointer must be left on the
9988
final ket.
9989
9990
The function also maintains the max_lookbehind value. Any lookbehind branch
9991
that contains a nested lookbehind may actually look further back than the
9992
length of the branch. The additional amount is passed back from
9993
get_branchlength() as an "extra" value.
9994
9995
Arguments:
9996
  pptrptr     pointer to pointer in the parsed pattern
9997
  errcodeptr  pointer to error code
9998
  lcptr       pointer to loop counter
9999
  recurses    chain of recurse_check to catch mutual recursion
10000
  cb          pointer to compile block
10001
10002
Returns:      TRUE if all is well
10003
              FALSE otherwise, with error code and offset set
10004
*/
10005
10006
static BOOL
10007
set_lookbehind_lengths(uint32_t **pptrptr, int *errcodeptr, int *lcptr,
10008
  parsed_recurse_check *recurses, compile_block *cb)
10009
0
{
10010
0
PCRE2_SIZE offset;
10011
0
/* bptr tracks the item that opens the branch currently being measured; it
starts at the lookbehind meta item itself. */
uint32_t *bptr = *pptrptr;
10012
0
/* gbptr remembers the lookbehind meta item so that the word after it can be
filled in once every branch has been measured. */
uint32_t *gbptr = bptr;
10013
0
int maxlength = 0;
10014
0
int minlength = INT_MAX;
10015
0
BOOL variable = FALSE;
10016
10017
0
READPLUSOFFSET(offset, bptr);  /* Offset for error messages */
10018
0
/* Step the caller's pointer over the offset words. Together with the "+= 1"
at the top of the loop this leaves it on the first item of the first branch. */
*pptrptr += SIZEOFFSET;
10019
10020
/* Each branch can have a different maximum length, but we can keep only a
10021
single minimum for the whole group, because there's nowhere to save individual
10022
values in the META_ALT item. */
10023
10024
0
do
10025
0
  {
10026
0
  int branchlength, branchminlength;
10027
10028
0
  /* Move past the item that opens this branch: the lookbehind start on the
  first pass, a META_ALT on later passes. */
  *pptrptr += 1;
10029
0
  branchlength = get_branchlength(pptrptr, &branchminlength, errcodeptr, lcptr,
10030
0
    recurses, cb);
10031
10032
0
  /* A negative result means the branch could not be measured. Fill in a
  default error code and offset only if nothing has been recorded yet. */
  if (branchlength < 0)
10033
0
    {
10034
    /* The errorcode and offset may already be set from a nested lookbehind. */
10035
0
    if (*errcodeptr == 0) *errcodeptr = ERR25;
10036
0
    if (cb->erroroffset == PCRE2_UNSET) cb->erroroffset = offset;
10037
0
    return FALSE;
10038
0
    }
10039
10040
0
  /* Accumulate the per-group statistics: whether any branch has differing
  minimum and maximum lengths, the smallest minimum, the largest maximum, and
  the largest branch length seen so far for the whole pattern. */
  if (branchlength != branchminlength) variable = TRUE;
10041
0
  if (branchminlength < minlength) minlength = branchminlength;
10042
0
  if (branchlength > maxlength) maxlength = branchlength;
10043
0
  if (branchlength > cb->max_lookbehind) cb->max_lookbehind = branchlength;
10044
0
  /* OR this branch's maximum length into the item that opened the branch, then
  continue from wherever get_branchlength() left the caller's pointer. */
  *bptr |= branchlength;  /* branchlength never more than 65535 */
10045
0
  bptr = *pptrptr;
10046
0
  }
10047
0
/* Another branch follows only if the scan stopped on a META_ALT. */
while (META_CODE(*bptr) == META_ALT);
10048
10049
/* If any branch is of variable length, the whole lookbehind is of variable
10050
length. If the maximum length of any branch exceeds the maximum for variable
10051
lookbehinds, give an error. Otherwise, the minimum length is set in the word
10052
that follows the original group META value. For a fixed-length lookbehind, this
10053
is set to LOOKBEHIND_MAX, to indicate that each branch is of a fixed (but
10054
possibly different) length. */
10055
10056
0
/* NOTE(review): gbptr[1] looks like the position the pattern offset was read
from by READPLUSOFFSET above, so that word is overwritten here - confirm
against the macro definition. */
if (variable)
10057
0
  {
10058
0
  gbptr[1] = minlength;
10059
0
  if ((PCRE2_SIZE)maxlength > cb->max_varlookbehind)
10060
0
    {
10061
0
    *errcodeptr = ERR100;
10062
0
    cb->erroroffset = offset;
10063
0
    return FALSE;
10064
0
    }
10065
0
  }
10066
0
else gbptr[1] = LOOKBEHIND_MAX;
10067
10068
0
return TRUE;
10069
0
}
10070
10071
10072
10073
/*************************************************
10074
*         Check parsed pattern lookbehinds       *
10075
*************************************************/
10076
10077
/* This function is called at the end of parsing a pattern if any lookbehinds
10078
were encountered. It scans the parsed pattern for them, calling
10079
set_lookbehind_lengths() for each one. At the start, the errorcode is zero and
10080
the error offset is marked unset. This enables the functions above not to
10081
override settings from deeper nestings.
10082
10083
This function is called recursively from get_branchlength() for lookaheads in
10084
order to process any lookbehinds that they may contain. It stops when it hits a
10085
non-nested closing parenthesis in this case, returning a pointer to it.
10086
10087
Arguments
10088
  pptr      points to where to start (start of pattern or start of lookahead)
10089
  retptr    if not NULL, return the ket pointer here
10090
  recurses  chain of recurse_check to catch mutual recursion
10091
  cb        points to the compile block
10092
  lcptr     points to loop counter
10093
10094
Returns:    0 on success, or an errorcode (cb->erroroffset will be set)
10095
*/
10096
10097
static int
10098
check_lookbehinds(uint32_t *pptr, uint32_t **retptr,
10099
  parsed_recurse_check *recurses, compile_block *cb, int *lcptr)
10100
0
{
10101
0
int errorcode = 0;
10102
0
/* Number of groups opened since the starting point and not yet closed. It
goes negative at a ket that has no matching opening item in this scan. */
int nestlevel = 0;
10103
10104
0
/* Mark the error offset as unset; set_lookbehind_lengths() fills it in on
failure only when it still holds this value. */
cb->erroroffset = PCRE2_UNSET;
10105
10106
0
/* Walk the parsed pattern one item at a time. Each case below skips only the
data words that follow its meta item; the loop increment then steps over the
meta item itself. */
for (; *pptr != META_END; pptr++)
10107
0
  {
10108
0
  if (*pptr < META_END) continue;  /* Literal */
10109
10110
0
  switch (META_CODE(*pptr))
10111
0
    {
10112
    /* The following erroroffset is a bogus but safe value. This branch should
10113
    be avoided by providing a proper implementation for all supported cases
10114
    below. */
10115
10116
    /* LCOV_EXCL_START */
10117
0
    default:
10118
0
    PCRE2_DEBUG_UNREACHABLE();
10119
0
    cb->erroroffset = 0;
10120
0
    return ERR70;  /* Unrecognized meta code */
10121
    /* LCOV_EXCL_STOP */
10122
10123
0
    /* Only the \p and \P escapes carry an extra data word. */
    case META_ESCAPE:
10124
0
    if (*pptr - META_ESCAPE == ESC_P || *pptr - META_ESCAPE == ESC_p)
10125
0
      pptr += 1;    /* Skip prop data */
10126
0
    break;
10127
10128
0
    /* A ket with no matching opening item in this scan ends the scan: its
    position is passed back through retptr (when supplied) and the function
    returns success. */
    case META_KET:
10129
0
    if (--nestlevel < 0)
10130
0
      {
10131
0
      if (retptr != NULL) *retptr = pptr;
10132
0
      return 0;
10133
0
      }
10134
0
    break;
10135
10136
0
    /* Group-opening items for which no data words are skipped here; just
    record the extra nesting level. */
    case META_ATOMIC:
10137
0
    case META_CAPTURE:
10138
0
    case META_COND_ASSERT:
10139
0
    case META_SCS:
10140
0
    case META_LOOKAHEAD:
10141
0
    case META_LOOKAHEADNOT:
10142
0
    case META_LOOKAHEAD_NA:
10143
0
    case META_NOCAPTURE:
10144
0
    case META_SCRIPT_RUN:
10145
0
    nestlevel++;
10146
0
    break;
10147
10148
0
    /* Items for which nothing is skipped and nesting is unchanged.
    NOTE(review): get_branchlength() reads an offset after META_BACKREF when
    the group number is >= 10, but nothing is skipped for META_BACKREF here;
    any such words are presumably passed over by the literal test at the top
    of the loop - confirm. */
    case META_ACCEPT:
10149
0
    case META_ALT:
10150
0
    case META_ASTERISK:
10151
0
    case META_ASTERISK_PLUS:
10152
0
    case META_ASTERISK_QUERY:
10153
0
    case META_BACKREF:
10154
0
    case META_CIRCUMFLEX:
10155
0
    case META_CLASS:
10156
0
    case META_CLASS_EMPTY:
10157
0
    case META_CLASS_EMPTY_NOT:
10158
0
    case META_CLASS_END:
10159
0
    case META_CLASS_NOT:
10160
0
    case META_COMMIT:
10161
0
    case META_DOLLAR:
10162
0
    case META_DOT:
10163
0
    case META_FAIL:
10164
0
    case META_PLUS:
10165
0
    case META_PLUS_PLUS:
10166
0
    case META_PLUS_QUERY:
10167
0
    case META_PRUNE:
10168
0
    case META_QUERY:
10169
0
    case META_QUERY_PLUS:
10170
0
    case META_QUERY_QUERY:
10171
0
    case META_RANGE_ESCAPED:
10172
0
    case META_RANGE_LITERAL:
10173
0
    case META_SKIP:
10174
0
    case META_THEN:
10175
0
    break;
10176
10177
0
    /* Items followed by SIZEOFFSET data words. */
    case META_OFFSET:
10178
0
    case META_RECURSE:
10179
0
    pptr += SIZEOFFSET;
10180
0
    break;
10181
10182
0
    /* By-name references: one data word plus SIZEOFFSET words follow. */
    case META_BACKREF_BYNAME:
10183
0
    case META_RECURSE_BYNAME:
10184
0
    pptr += 1 + SIZEOFFSET;
10185
0
    break;
10186
10187
0
    /* The conditional-group starts below skip their data words and also count
    as opening a group. */
    case META_COND_DEFINE:
10188
0
    pptr += SIZEOFFSET;
10189
0
    nestlevel++;
10190
0
    break;
10191
10192
0
    case META_COND_NAME:
10193
0
    case META_COND_NUMBER:
10194
0
    case META_COND_RNAME:
10195
0
    case META_COND_RNUMBER:
10196
0
    pptr += 1 + SIZEOFFSET;
10197
0
    nestlevel++;
10198
0
    break;
10199
10200
0
    case META_COND_VERSION:
10201
0
    pptr += 3;
10202
0
    nestlevel++;
10203
0
    break;
10204
10205
0
    /* The remaining cases, down to the lookbehinds, skip a number of data
    words and leave the nesting level unchanged. */
    case META_CALLOUT_STRING:
10206
0
    pptr += 3 + SIZEOFFSET;
10207
0
    break;
10208
10209
0
    case META_BIGVALUE:
10210
0
    case META_POSIX:
10211
0
    case META_POSIX_NEG:
10212
0
    case META_CAPTURE_NAME:
10213
0
    case META_CAPTURE_NUMBER:
10214
0
    pptr += 1;
10215
0
    break;
10216
10217
0
    case META_MINMAX:
10218
0
    case META_MINMAX_QUERY:
10219
0
    case META_MINMAX_PLUS:
10220
0
    case META_OPTIONS:
10221
0
    pptr += 2;
10222
0
    break;
10223
10224
0
    case META_CALLOUT_NUMBER:
10225
0
    pptr += 3;
10226
0
    break;
10227
10228
0
    /* The word at pptr[1] is a count: skip that word and the pptr[1] words
    that follow it. */
    case META_MARK:
10229
0
    case META_COMMIT_ARG:
10230
0
    case META_PRUNE_ARG:
10231
0
    case META_SKIP_ARG:
10232
0
    case META_THEN_ARG:
10233
0
    pptr += 1 + pptr[1];
10234
0
    break;
10235
10236
    /* Note that set_lookbehind_lengths() updates pptr, leaving it pointing to
10237
    the final ket of the group, so no need to update it here. */
10238
10239
0
    case META_LOOKBEHIND:
10240
0
    case META_LOOKBEHINDNOT:
10241
0
    case META_LOOKBEHIND_NA:
10242
0
    if (!set_lookbehind_lengths(&pptr, &errorcode, lcptr, recurses, cb))
10243
0
      return errorcode;
10244
0
    break;
10245
0
    }
10246
0
  }
10247
10248
0
/* META_END was reached without meeting an unmatched ket; retptr is left
untouched on this path. */
return 0;
10249
0
}
10250
10251
10252
10253
/*************************************************
10254
*     External function to compile a pattern     *
10255
*************************************************/
10256
10257
/* This function reads a regular expression in the form of a string and returns
10258
a pointer to a block of store holding a compiled version of the expression.
10259
10260
Arguments:
10261
  pattern       the regular expression
10262
  patlen        the length of the pattern, or PCRE2_ZERO_TERMINATED
10263
  options       option bits
10264
  errorptr      pointer to errorcode
10265
  erroroffset   pointer to error offset
10266
  ccontext      points to a compile context or is NULL
10267
10268
Returns:        pointer to compiled data block, or NULL on error,
10269
                with errorcode and erroroffset set
10270
*/
10271
10272
PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
10273
pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options,
10274
   int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext)
10275
16
{
10276
16
BOOL utf;                             /* Set TRUE for UTF mode */
10277
16
BOOL ucp;                             /* Set TRUE for UCP mode */
10278
16
BOOL has_lookbehind = FALSE;          /* Set TRUE if a lookbehind is found */
10279
16
BOOL zero_terminated;                 /* Set TRUE for zero-terminated pattern */
10280
16
pcre2_real_code *re = NULL;           /* What we will return */
10281
16
compile_block cb;                     /* "Static" compile-time data */
10282
16
const uint8_t *tables;                /* Char tables base pointer */
10283
10284
16
PCRE2_UCHAR null_str[1] = { 0xcd };   /* Dummy for handling null inputs */
10285
16
PCRE2_UCHAR *code;                    /* Current pointer in compiled code */
10286
16
PCRE2_UCHAR *codestart;               /* Start of compiled code */
10287
16
PCRE2_SPTR ptr;                       /* Current pointer in pattern */
10288
16
uint32_t *pptr;                       /* Current pointer in parsed pattern */
10289
10290
16
PCRE2_SIZE length = 1;                /* Allow for final END opcode */
10291
16
PCRE2_SIZE usedlength;                /* Actual length used */
10292
16
PCRE2_SIZE re_blocksize;              /* Size of memory block */
10293
16
PCRE2_SIZE parsed_size_needed;        /* Needed for parsed pattern */
10294
10295
16
uint32_t firstcuflags, reqcuflags;    /* Type of first/req code unit */
10296
16
uint32_t firstcu, reqcu;              /* Value of first/req code unit */
10297
16
uint32_t setflags = 0;                /* NL and BSR set flags */
10298
16
uint32_t xoptions;                    /* Flags from context, modified */
10299
10300
16
uint32_t skipatstart;                 /* When checking (*UTF) etc */
10301
16
uint32_t limit_heap  = UINT32_MAX;
10302
16
uint32_t limit_match = UINT32_MAX;    /* Unset match limits */
10303
16
uint32_t limit_depth = UINT32_MAX;
10304
10305
16
int newline = 0;                      /* Unset; can be set by the pattern */
10306
16
int bsr = 0;                          /* Unset; can be set by the pattern */
10307
16
int errorcode = 0;                    /* Initialize to avoid compiler warn */
10308
16
int regexrc;                          /* Return from compile */
10309
10310
16
uint32_t i;                           /* Local loop counter */
10311
10312
/* Enable all optimizations by default. */
10313
16
uint32_t optim_flags = ccontext != NULL ? ccontext->optimization_flags :
10314
16
                                          PCRE2_OPTIMIZATION_ALL;
10315
10316
/* Comments at the head of this file explain about these variables. */
10317
10318
16
uint32_t stack_groupinfo[GROUPINFO_DEFAULT_SIZE];
10319
16
uint32_t stack_parsed_pattern[PARSED_PATTERN_DEFAULT_SIZE];
10320
16
named_group named_groups[NAMED_GROUP_LIST_SIZE];
10321
10322
/* The workspace is used in different ways in the different compiling phases.
10323
It needs to be 16-bit aligned for the preliminary parsing scan. */
10324
10325
16
uint32_t c16workspace[C16_WORK_SIZE];
10326
16
PCRE2_UCHAR *cworkspace = (PCRE2_UCHAR *)c16workspace;
10327
10328
10329
/* -------------- Check arguments and set up the pattern ----------------- */
10330
10331
/* There must be error code and offset pointers. */
10332
10333
16
if (errorptr == NULL)
10334
0
  {
10335
0
  if (erroroffset != NULL) *erroroffset = 0;
10336
0
  return NULL;
10337
0
  }
10338
16
if (erroroffset == NULL)
10339
0
  {
10340
0
  if (errorptr != NULL) *errorptr = ERR120;
10341
0
  return NULL;
10342
0
  }
10343
16
*errorptr = ERR0;
10344
16
*erroroffset = 0;
10345
10346
/* There must be a pattern, but NULL is allowed with zero length. */
10347
10348
16
if (pattern == NULL)
10349
0
  {
10350
0
  if (patlen == 0)
10351
0
    pattern = null_str;
10352
0
  else
10353
0
    {
10354
0
    *errorptr = ERR16;
10355
0
    return NULL;
10356
0
    }
10357
0
  }
10358
10359
/* A NULL compile context means "use a default context" */
10360
10361
16
if (ccontext == NULL)
10362
0
  ccontext = (pcre2_compile_context *)(&PRIV(default_compile_context));
10363
10364
/* PCRE2_MATCH_INVALID_UTF implies UTF */
10365
10366
16
if ((options & PCRE2_MATCH_INVALID_UTF) != 0) options |= PCRE2_UTF;
10367
10368
/* Check that all undefined public option bits are zero. */
10369
10370
16
if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0 ||
10371
16
    (ccontext->extra_options & ~PUBLIC_COMPILE_EXTRA_OPTIONS) != 0)
10372
0
  {
10373
0
  *errorptr = ERR17;
10374
0
  return NULL;
10375
0
  }
10376
10377
16
if ((options & PCRE2_LITERAL) != 0 &&
10378
0
    ((options & ~PUBLIC_LITERAL_COMPILE_OPTIONS) != 0 ||
10379
0
     (ccontext->extra_options & ~PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS) != 0))
10380
0
  {
10381
0
  *errorptr = ERR92;
10382
0
  return NULL;
10383
0
  }
10384
10385
/* A zero-terminated pattern is indicated by the special length value
10386
PCRE2_ZERO_TERMINATED. Check for an overlong pattern. */
10387
10388
16
if ((zero_terminated = (patlen == PCRE2_ZERO_TERMINATED)))
10389
16
  patlen = PRIV(strlen)(pattern);
10390
16
(void)zero_terminated; /* Silence compiler; only used if Valgrind enabled */
10391
10392
16
if (patlen > ccontext->max_pattern_length)
10393
0
  {
10394
0
  *errorptr = ERR88;
10395
0
  return NULL;
10396
0
  }
10397
10398
/* Optimization flags in 'options' can override those in the compile context.
10399
This is because some options to disable optimizations were added before the
10400
optimization flags word existed, and we need to continue supporting them
10401
for backwards compatibility. */
10402
10403
16
if ((options & PCRE2_NO_AUTO_POSSESS) != 0)
10404
0
  optim_flags &= ~PCRE2_OPTIM_AUTO_POSSESS;
10405
16
if ((options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
10406
0
  optim_flags &= ~PCRE2_OPTIM_DOTSTAR_ANCHOR;
10407
16
if ((options & PCRE2_NO_START_OPTIMIZE) != 0)
10408
0
  optim_flags &= ~PCRE2_OPTIM_START_OPTIMIZE;
10409
10410
/* From here on, all returns from this function should end up going via the
10411
EXIT label. */
10412
10413
10414
/* ------------ Initialize the "static" compile data -------------- */
10415
10416
16
tables = (ccontext->tables != NULL)? ccontext->tables : PRIV(default_tables);
10417
10418
16
cb.lcc = tables + lcc_offset;          /* Individual */
10419
16
cb.fcc = tables + fcc_offset;          /*   character */
10420
16
cb.cbits = tables + cbits_offset;      /*      tables */
10421
16
cb.ctypes = tables + ctypes_offset;
10422
10423
16
cb.assert_depth = 0;
10424
16
cb.bracount = 0;
10425
16
cb.cx = ccontext;
10426
16
cb.dupnames = FALSE;
10427
16
cb.end_pattern = pattern + patlen;
10428
16
cb.erroroffset = 0;
10429
16
cb.external_flags = 0;
10430
16
cb.external_options = options;
10431
16
cb.groupinfo = stack_groupinfo;
10432
16
cb.had_recurse = FALSE;
10433
16
cb.lastcapture = 0;
10434
16
cb.max_lookbehind = 0;                               /* Max encountered */
10435
16
cb.max_varlookbehind = ccontext->max_varlookbehind;  /* Limit */
10436
16
cb.name_entry_size = 0;
10437
16
cb.name_table = NULL;
10438
16
cb.named_groups = named_groups;
10439
16
cb.named_group_list_size = NAMED_GROUP_LIST_SIZE;
10440
16
cb.names_found = 0;
10441
16
cb.parens_depth = 0;
10442
16
cb.parsed_pattern = stack_parsed_pattern;
10443
16
cb.req_varyopt = 0;
10444
16
cb.start_code = cworkspace;
10445
16
cb.start_pattern = pattern;
10446
16
cb.start_workspace = cworkspace;
10447
16
cb.workspace_size = COMPILE_WORK_SIZE;
10448
16
cb.first_data = NULL;
10449
16
cb.last_data = NULL;
10450
16
#ifdef SUPPORT_WIDE_CHARS
10451
16
cb.char_lists_size = 0;
10452
16
#endif
10453
10454
/* Maximum back reference and backref bitmap. The bitmap records up to 31 back
10455
references to help in deciding whether (.*) can be treated as anchored or not.
10456
*/
10457
10458
16
cb.top_backref = 0;
10459
16
cb.backref_map = 0;
10460
10461
/* Escape sequences \1 to \9 are always back references, but as they are only
10462
two characters long, only two elements can be used in the parsed_pattern
10463
vector. The first contains the reference, and we'd like to use the second to
10464
record the offset in the pattern, so that forward references to non-existent
10465
groups can be diagnosed later with an offset. However, on 64-bit systems,
10466
PCRE2_SIZE won't fit. Instead, we have a vector of offsets for the first
10467
occurrence of \1 to \9, indexed by the second parsed_pattern value. All other
10468
references have enough space for the offset to be put into the parsed pattern.
10469
*/
10470
10471
176
for (i = 0; i < 10; i++) cb.small_ref_offset[i] = PCRE2_UNSET;
10472
10473
10474
/* --------------- Start looking at the pattern --------------- */
10475
10476
/* Unless PCRE2_LITERAL is set, check for global one-time option settings at
10477
the start of the pattern, and remember the offset to the actual regex. With
10478
valgrind support, make the terminator of a zero-terminated pattern
10479
inaccessible. This catches bugs that would otherwise only show up for
10480
non-zero-terminated patterns. */
10481
10482
#ifdef SUPPORT_VALGRIND
10483
if (zero_terminated) VALGRIND_MAKE_MEM_NOACCESS(pattern + patlen, CU2BYTES(1));
10484
#endif
10485
10486
16
xoptions = ccontext->extra_options;
10487
16
ptr = pattern;
10488
16
skipatstart = 0;
10489
10490
16
if ((options & PCRE2_LITERAL) == 0)
10491
16
  {
10492
16
  while (patlen - skipatstart >= 2 &&
10493
16
         ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
10494
0
         ptr[skipatstart+1] == CHAR_ASTERISK)
10495
0
    {
10496
0
    for (i = 0; i < sizeof(pso_list)/sizeof(pso); i++)
10497
0
      {
10498
0
      const pso *p = pso_list + i;
10499
10500
0
      if (patlen - skipatstart - 2 >= p->length &&
10501
0
          PRIV(strncmp_c8)(ptr + skipatstart + 2, p->name, p->length) == 0)
10502
0
        {
10503
0
        uint32_t c, pp;
10504
10505
0
        skipatstart += p->length + 2;
10506
0
        switch(p->type)
10507
0
          {
10508
0
          case PSO_OPT:
10509
0
          cb.external_options |= p->value;
10510
0
          break;
10511
10512
0
          case PSO_XOPT:
10513
0
          xoptions |= p->value;
10514
0
          break;
10515
10516
0
          case PSO_FLG:
10517
0
          setflags |= p->value;
10518
0
          break;
10519
10520
0
          case PSO_NL:
10521
0
          newline = p->value;
10522
0
          setflags |= PCRE2_NL_SET;
10523
0
          break;
10524
10525
0
          case PSO_BSR:
10526
0
          bsr = p->value;
10527
0
          setflags |= PCRE2_BSR_SET;
10528
0
          break;
10529
10530
0
          case PSO_LIMM:
10531
0
          case PSO_LIMD:
10532
0
          case PSO_LIMH:
10533
0
          c = 0;
10534
0
          pp = skipatstart;
10535
0
          while (pp < patlen && IS_DIGIT(ptr[pp]))
10536
0
            {
10537
0
            if (c > UINT32_MAX / 10 - 1) break;   /* Integer overflow */
10538
0
            c = c*10 + (ptr[pp++] - CHAR_0);
10539
0
            }
10540
0
          if (pp >= patlen || pp == skipatstart || ptr[pp] != CHAR_RIGHT_PARENTHESIS)
10541
0
            {
10542
0
            errorcode = ERR60;
10543
0
            ptr += pp;
10544
0
            utf = FALSE;  /* Used by HAD_EARLY_ERROR */
10545
0
            goto HAD_EARLY_ERROR;
10546
0
            }
10547
0
          if (p->type == PSO_LIMH) limit_heap = c;
10548
0
            else if (p->type == PSO_LIMM) limit_match = c;
10549
0
            else limit_depth = c;
10550
0
          skipatstart = ++pp;
10551
0
          break;
10552
10553
0
          case PSO_OPTMZ:
10554
0
          optim_flags &= ~(p->value);
10555
10556
          /* For backward compatibility the three original VERBs to disable
10557
          optimizations need to also update the corresponding bit in the
10558
          external options. */
10559
10560
0
          switch(p->value)
10561
0
            {
10562
0
            case PCRE2_OPTIM_AUTO_POSSESS:
10563
0
            cb.external_options |= PCRE2_NO_AUTO_POSSESS;
10564
0
            break;
10565
10566
0
            case PCRE2_OPTIM_DOTSTAR_ANCHOR:
10567
0
            cb.external_options |= PCRE2_NO_DOTSTAR_ANCHOR;
10568
0
            break;
10569
10570
0
            case PCRE2_OPTIM_START_OPTIMIZE:
10571
0
            cb.external_options |= PCRE2_NO_START_OPTIMIZE;
10572
0
            break;
10573
0
            }
10574
10575
0
          break;
10576
10577
          /* LCOV_EXCL_START */
10578
0
          default:
10579
          /* All values in the enum need an explicit entry for this switch
10580
          but until a better way to prevent coding mistakes is invented keep
10581
          a catch all that triggers a debug build assert as a failsafe */
10582
0
          PCRE2_DEBUG_UNREACHABLE();
10583
          /* LCOV_EXCL_STOP */
10584
0
          }
10585
0
        break;   /* Out of the table scan loop */
10586
0
        }
10587
0
      }
10588
0
    if (i >= sizeof(pso_list)/sizeof(pso)) break;   /* Out of pso loop */
10589
0
    }
10590
16
    PCRE2_ASSERT(skipatstart <= patlen);
10591
16
  }
10592
10593
/* End of pattern-start options; advance to start of real regex. */
10594
10595
16
ptr += skipatstart;
10596
10597
/* Can't support UTF or UCP if PCRE2 was built without Unicode support. */
10598
10599
#ifndef SUPPORT_UNICODE
10600
if ((cb.external_options & (PCRE2_UTF|PCRE2_UCP)) != 0)
10601
  {
10602
  errorcode = ERR32;
10603
  goto HAD_EARLY_ERROR;
10604
  }
10605
#endif
10606
10607
/* Check UTF. We have the original options in 'options', with that value as
10608
modified by (*UTF) etc in cb.external_options. The extra option
10609
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not permitted in UTF-16 mode because the
10610
surrogate code points cannot be represented in UTF-16. */
10611
10612
16
utf = (cb.external_options & PCRE2_UTF) != 0;
10613
16
if (utf)
10614
4
  {
10615
4
  if ((options & PCRE2_NEVER_UTF) != 0)
10616
0
    {
10617
0
    errorcode = ERR74;
10618
0
    goto HAD_EARLY_ERROR;
10619
0
    }
10620
4
  if ((options & PCRE2_NO_UTF_CHECK) == 0 &&
10621
0
       (errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset)) != 0)
10622
0
    goto HAD_ERROR;  /* Offset was set by valid_utf() */
10623
10624
#if PCRE2_CODE_UNIT_WIDTH == 16
10625
  if ((ccontext->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0)
10626
    {
10627
    errorcode = ERR91;
10628
    goto HAD_EARLY_ERROR;
10629
    }
10630
#endif
10631
4
  }
10632
10633
/* Check UCP lockout. */
10634
10635
16
ucp = (cb.external_options & PCRE2_UCP) != 0;
10636
16
if (ucp && (cb.external_options & PCRE2_NEVER_UCP) != 0)
10637
0
  {
10638
0
  errorcode = ERR75;
10639
0
  goto HAD_EARLY_ERROR;
10640
0
  }
10641
10642
/* PCRE2_EXTRA_TURKISH_CASING checks */
10643
10644
16
if ((xoptions & PCRE2_EXTRA_TURKISH_CASING) != 0)
10645
0
  {
10646
0
  if (!utf && !ucp)
10647
0
    {
10648
0
    errorcode = ERR104;
10649
0
    goto HAD_EARLY_ERROR;
10650
0
    }
10651
10652
0
#if PCRE2_CODE_UNIT_WIDTH == 8
10653
0
  if (!utf)
10654
0
    {
10655
0
    errorcode = ERR105;
10656
0
    goto HAD_EARLY_ERROR;
10657
0
    }
10658
0
#endif
10659
10660
0
  if ((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)
10661
0
    {
10662
0
    errorcode = ERR106;
10663
0
    goto HAD_EARLY_ERROR;
10664
0
    }
10665
0
  }
10666
10667
/* Process the BSR setting. */
10668
10669
16
if (bsr == 0) bsr = ccontext->bsr_convention;
10670
10671
/* Process the newline setting. */
10672
10673
16
if (newline == 0) newline = ccontext->newline_convention;
10674
16
cb.nltype = NLTYPE_FIXED;
10675
16
switch(newline)
10676
16
  {
10677
0
  case PCRE2_NEWLINE_CR:
10678
0
  cb.nllen = 1;
10679
0
  cb.nl[0] = CHAR_CR;
10680
0
  break;
10681
10682
0
  case PCRE2_NEWLINE_LF:
10683
0
  cb.nllen = 1;
10684
0
  cb.nl[0] = CHAR_NL;
10685
0
  break;
10686
10687
0
  case PCRE2_NEWLINE_NUL:
10688
0
  cb.nllen = 1;
10689
0
  cb.nl[0] = CHAR_NUL;
10690
0
  break;
10691
10692
0
  case PCRE2_NEWLINE_CRLF:
10693
0
  cb.nllen = 2;
10694
0
  cb.nl[0] = CHAR_CR;
10695
0
  cb.nl[1] = CHAR_NL;
10696
0
  break;
10697
10698
16
  case PCRE2_NEWLINE_ANY:
10699
16
  cb.nltype = NLTYPE_ANY;
10700
16
  break;
10701
10702
0
  case PCRE2_NEWLINE_ANYCRLF:
10703
0
  cb.nltype = NLTYPE_ANYCRLF;
10704
0
  break;
10705
10706
  /* LCOV_EXCL_START */
10707
0
  default:
10708
0
  PCRE2_DEBUG_UNREACHABLE();
10709
0
  errorcode = ERR56;
10710
0
  goto HAD_EARLY_ERROR;
10711
  /* LCOV_EXCL_STOP */
10712
16
  }
10713
10714
/* Pre-scan the pattern to do two things: (1) Discover the named groups and
10715
their numerical equivalents, so that this information is always available for
10716
the remaining processing. (2) At the same time, parse the pattern and put a
10717
processed version into the parsed_pattern vector. This has escapes interpreted
10718
and comments removed (amongst other things). */
10719
10720
/* Ensure that the parsed pattern buffer is big enough. For many smaller
10721
patterns the vector on the stack (which was set up above) can be used. */
10722
10723
16
parsed_size_needed = max_parsed_pattern(ptr, cb.end_pattern, utf, options);
10724
10725
/* Allow for 2x uint32_t at the start and 2 at the end, for
10726
PCRE2_EXTRA_MATCH_WORD or PCRE2_EXTRA_MATCH_LINE (which are exclusive). */
10727
10728
16
if ((ccontext->extra_options &
10729
16
     (PCRE2_EXTRA_MATCH_WORD|PCRE2_EXTRA_MATCH_LINE)) != 0)
10730
0
  parsed_size_needed += 4;
10731
10732
/* When PCRE2_AUTO_CALLOUT is set we allow for one callout at the end. */
10733
10734
16
if ((options & PCRE2_AUTO_CALLOUT) != 0)
10735
0
  parsed_size_needed += 4;
10736
10737
16
parsed_size_needed += 1;  /* For the final META_END */
10738
10739
16
if (parsed_size_needed > PARSED_PATTERN_DEFAULT_SIZE)
10740
0
  {
10741
0
  uint32_t *heap_parsed_pattern = ccontext->memctl.malloc(
10742
0
    parsed_size_needed * sizeof(uint32_t), ccontext->memctl.memory_data);
10743
0
  if (heap_parsed_pattern == NULL)
10744
0
    {
10745
0
    *errorptr = ERR21;
10746
0
    goto EXIT;
10747
0
    }
10748
0
  cb.parsed_pattern = heap_parsed_pattern;
10749
0
  }
10750
16
cb.parsed_pattern_end = cb.parsed_pattern + parsed_size_needed;
10751
10752
/* Do the parsing scan. */
10753
10754
16
errorcode = parse_regex(ptr, cb.external_options, xoptions, &has_lookbehind, &cb);
10755
16
if (errorcode != 0) goto HAD_CB_ERROR;
10756
10757
/* If there are any lookbehinds, scan the parsed pattern to figure out their
10758
lengths. Workspace is needed to remember whether numbered groups are or are not
10759
of limited length, and if limited, what the minimum and maximum lengths are.
10760
This caching saves re-computing the length of any group that is referenced more
10761
than once, which is particularly relevant when recursion is involved.
10762
Unnumbered groups do not have this exposure because they cannot be referenced.
10763
If there are sufficiently few groups, the default index vector on the stack, as
10764
set up above, can be used. Otherwise we have to get/free some heap memory. The
10765
vector must be initialized to zero. */
10766
10767
16
if (has_lookbehind)
10768
0
  {
10769
0
  int loopcount = 0;
10770
0
  if (cb.bracount >= GROUPINFO_DEFAULT_SIZE/2)
10771
0
    {
10772
0
    cb.groupinfo = ccontext->memctl.malloc(
10773
0
      (2 * (cb.bracount + 1))*sizeof(uint32_t), ccontext->memctl.memory_data);
10774
0
    if (cb.groupinfo == NULL)
10775
0
      {
10776
0
      errorcode = ERR21;
10777
0
      cb.erroroffset = 0;
10778
0
      goto HAD_CB_ERROR;
10779
0
      }
10780
0
    }
10781
0
  memset(cb.groupinfo, 0, (2 * cb.bracount + 1) * sizeof(uint32_t));
10782
0
  errorcode = check_lookbehinds(cb.parsed_pattern, NULL, NULL, &cb, &loopcount);
10783
0
  if (errorcode != 0) goto HAD_CB_ERROR;
10784
0
  }
10785
10786
/* For debugging, there is a function that shows the parsed pattern vector. */
10787
10788
#ifdef DEBUG_SHOW_PARSED
10789
fprintf(stderr, "+++ Pre-scan complete:\n");
10790
show_parsed(&cb);
10791
#endif
10792
10793
/* For debugging capturing information this code can be enabled. */
10794
10795
#ifdef DEBUG_SHOW_CAPTURES
10796
  {
10797
  named_group *ng = cb.named_groups;
10798
  fprintf(stderr, "+++Captures: %d\n", cb.bracount);
10799
  for (i = 0; i < cb.names_found; i++, ng++)
10800
    {
10801
    fprintf(stderr, "+++%3d %.*s\n", ng->number, ng->length, ng->name);
10802
    }
10803
  }
10804
#endif
10805
10806
/* Pretend to compile the pattern while actually just accumulating the amount
10807
of memory required in the 'length' variable. This behaviour is triggered by
10808
passing a non-NULL final argument to compile_regex(). We pass a block of
10809
workspace (cworkspace) for it to compile parts of the pattern into; the
10810
compiled code is discarded when it is no longer needed, so hopefully this
10811
workspace will never overflow, though there is a test for its doing so.
10812
10813
On error, errorcode will be set non-zero, so we don't need to look at the
10814
result of the function. The initial options have been put into the cb block,
10815
but we still have to pass a separate options variable (the first argument)
10816
because the options may change as the pattern is processed. */
10817
10818
16
cb.erroroffset = patlen;   /* For any subsequent errors that do not set it */
10819
16
pptr = cb.parsed_pattern;
10820
16
code = cworkspace;
10821
16
*code = OP_BRA;
10822
10823
16
(void)compile_regex(cb.external_options, xoptions, &code, &pptr,
10824
16
   &errorcode, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, NULL,
10825
16
   &cb, &length);
10826
10827
16
if (errorcode != 0) goto HAD_CB_ERROR;  /* Offset is in cb.erroroffset */
10828
10829
/* This should be caught in compile_regex(), but just in case... */
10830
10831
16
#if defined SUPPORT_WIDE_CHARS
10832
16
PCRE2_ASSERT((cb.char_lists_size & 0x3) == 0);
10833
16
if (length > MAX_PATTERN_SIZE ||
10834
16
    MAX_PATTERN_SIZE - length < (cb.char_lists_size / sizeof(PCRE2_UCHAR)))
10835
#else
10836
if (length > MAX_PATTERN_SIZE)
10837
#endif
10838
0
  {
10839
0
  errorcode = ERR20;
10840
0
  cb.erroroffset = 0;
10841
0
  goto HAD_CB_ERROR;
10842
0
  }
10843
10844
/* Compute the size of, then, if not too large, get and initialize the data
10845
block for storing the compiled pattern and names table. Integer overflow should
10846
no longer be possible because nowadays we limit the maximum value of
10847
cb.names_found and cb.name_entry_size. */
10848
10849
16
re_blocksize =
10850
16
  CU2BYTES((PCRE2_SIZE)cb.names_found * (PCRE2_SIZE)cb.name_entry_size);
10851
10852
16
#if defined SUPPORT_WIDE_CHARS
10853
16
if (cb.char_lists_size != 0)
10854
0
  {
10855
0
#if PCRE2_CODE_UNIT_WIDTH != 32
10856
  /* Align to 32 bit first. This ensures the
10857
  allocated area will also be 32 bit aligned. */
10858
0
  re_blocksize = (PCRE2_SIZE)CLIST_ALIGN_TO(re_blocksize, sizeof(uint32_t));
10859
0
#endif
10860
0
  re_blocksize += cb.char_lists_size;
10861
0
  }
10862
16
#endif
10863
10864
16
re_blocksize += CU2BYTES(length);
10865
10866
16
if (re_blocksize > ccontext->max_pattern_compiled_length)
10867
0
  {
10868
0
  errorcode = ERR101;
10869
0
  cb.erroroffset = 0;
10870
0
  goto HAD_CB_ERROR;
10871
0
  }
10872
10873
16
re_blocksize += sizeof(pcre2_real_code);
10874
16
re = (pcre2_real_code *)
10875
16
  ccontext->memctl.malloc(re_blocksize, ccontext->memctl.memory_data);
10876
16
if (re == NULL)
10877
0
  {
10878
0
  errorcode = ERR21;
10879
0
  cb.erroroffset = 0;
10880
0
  goto HAD_CB_ERROR;
10881
0
  }
10882
10883
/* The compiler may put padding at the end of the pcre2_real_code structure in
10884
order to round it up to a multiple of 4 or 8 bytes. This means that when a
10885
compiled pattern is copied (for example, when serialized) undefined bytes are
10886
read, and this annoys debuggers such as valgrind. To avoid this, we explicitly
10887
write to the last 8 bytes of the structure before setting the fields. */
10888
10889
16
memset((char *)re + sizeof(pcre2_real_code) - 8, 0, 8);
10890
16
re->memctl = ccontext->memctl;
10891
16
re->tables = tables;
10892
16
re->executable_jit = NULL;
10893
16
memset(re->start_bitmap, 0, 32 * sizeof(uint8_t));
10894
16
re->blocksize = re_blocksize;
10895
16
re->code_start = re_blocksize - CU2BYTES(length);
10896
16
re->magic_number = MAGIC_NUMBER;
10897
16
re->compile_options = options;
10898
16
re->overall_options = cb.external_options;
10899
16
re->extra_options = xoptions;
10900
16
re->flags = PCRE2_CODE_UNIT_WIDTH/8 | cb.external_flags | setflags;
10901
16
re->limit_heap = limit_heap;
10902
16
re->limit_match = limit_match;
10903
16
re->limit_depth = limit_depth;
10904
16
re->first_codeunit = 0;
10905
16
re->last_codeunit = 0;
10906
16
re->bsr_convention = bsr;
10907
16
re->newline_convention = newline;
10908
16
re->max_lookbehind = 0;
10909
16
re->minlength = 0;
10910
16
re->top_bracket = 0;
10911
16
re->top_backref = 0;
10912
16
re->name_entry_size = cb.name_entry_size;
10913
16
re->name_count = cb.names_found;
10914
16
re->optimization_flags = optim_flags;
10915
10916
/* The basic block is immediately followed by the name table, and the compiled
10917
code follows after that. */
10918
10919
16
codestart = (PCRE2_UCHAR *)((uint8_t *)re + re->code_start);
10920
10921
/* Update the compile data block for the actual compile. The starting points of
10922
the name/number translation table and of the code are passed around in the
10923
compile data block. The start/end pattern and initial options are already set
10924
from the pre-compile phase, as is the name_entry_size field. */
10925
10926
16
cb.parens_depth = 0;
10927
16
cb.assert_depth = 0;
10928
16
cb.lastcapture = 0;
10929
16
cb.name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code));
10930
16
cb.start_code = codestart;
10931
16
cb.req_varyopt = 0;
10932
16
cb.had_accept = FALSE;
10933
16
cb.had_pruneorskip = FALSE;
10934
16
#ifdef SUPPORT_WIDE_CHARS
10935
16
cb.char_lists_size = 0;
10936
16
#endif
10937
10938
10939
/* If any named groups were found, create the name/number table from the list
10940
created in the pre-pass. */
10941
10942
16
if (cb.names_found > 0)
10943
0
  {
10944
0
  named_group *ng = cb.named_groups;
10945
0
  uint32_t tablecount = 0;
10946
10947
  /* Length 0 represents duplicates, and they have already been handled. */
10948
0
  for (i = 0; i < cb.names_found; i++, ng++)
10949
0
    if (ng->length > 0)
10950
0
      tablecount = PRIV(compile_add_name_to_table)(&cb, ng, tablecount);
10951
10952
0
  PCRE2_ASSERT(tablecount == cb.names_found);
10953
0
  }
10954
10955
/* Set up a starting, non-extracting bracket, then compile the expression. On
10956
error, errorcode will be set non-zero, so we don't need to look at the result
10957
of the function here. */
10958
10959
16
pptr = cb.parsed_pattern;
10960
16
code = (PCRE2_UCHAR *)codestart;
10961
16
*code = OP_BRA;
10962
16
regexrc = compile_regex(re->overall_options, re->extra_options, &code,
10963
16
  &pptr, &errorcode, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL,
10964
16
  NULL, &cb, NULL);
10965
16
if (regexrc < 0) re->flags |= PCRE2_MATCH_EMPTY;
10966
16
re->top_bracket = cb.bracount;
10967
16
re->top_backref = cb.top_backref;
10968
16
re->max_lookbehind = cb.max_lookbehind;
10969
10970
16
if (cb.had_accept)
10971
0
  {
10972
0
  reqcu = 0;                     /* Must disable after (*ACCEPT) */
10973
0
  reqcuflags = REQ_NONE;
10974
0
  re->flags |= PCRE2_HASACCEPT;  /* Disables minimum length */
10975
0
  }
10976
10977
/* Fill in the final opcode and check for disastrous overflow. If no overflow,
10978
but the estimated length exceeds the really used length, adjust the value of
10979
re->blocksize, and if valgrind support is configured, mark the extra allocated
10980
memory as unaddressable, so that any out-of-bound reads can be detected. */
10981
10982
16
*code++ = OP_END;
10983
16
usedlength = code - codestart;
10984
/* LCOV_EXCL_START */
10985
16
if (usedlength > length)
10986
0
  {
10987
0
  PCRE2_DEBUG_UNREACHABLE();
10988
0
  errorcode = ERR23;  /* Overflow of code block - internal error */
10989
0
  cb.erroroffset = 0;
10990
0
  goto HAD_CB_ERROR;
10991
0
  }
10992
/* LCOV_EXCL_STOP */
10993
10994
16
re->blocksize -= CU2BYTES(length - usedlength);
10995
#ifdef SUPPORT_VALGRIND
10996
VALGRIND_MAKE_MEM_NOACCESS(code, CU2BYTES(length - usedlength));
10997
#endif
10998
10999
/* Scan the pattern for recursion/subroutine calls and convert the group
11000
numbers into offsets. Maintain a small cache so that repeated groups containing
11001
recursions are efficiently handled. */
11002
11003
16
#define RSCAN_CACHE_SIZE 8
11004
11005
16
if (errorcode == 0 && cb.had_recurse)
11006
0
  {
11007
0
  PCRE2_UCHAR *rcode;
11008
0
  PCRE2_SPTR rgroup;
11009
0
  unsigned int ccount = 0;
11010
0
  int start = RSCAN_CACHE_SIZE;
11011
0
  recurse_cache rc[RSCAN_CACHE_SIZE];
11012
11013
0
  for (rcode = find_recurse(codestart, utf);
11014
0
       rcode != NULL;
11015
0
       rcode = find_recurse(rcode + 1 + LINK_SIZE, utf))
11016
0
    {
11017
0
    int p, groupnumber;
11018
11019
0
    groupnumber = (int)GET(rcode, 1);
11020
0
    if (groupnumber == 0) rgroup = codestart; else
11021
0
      {
11022
0
      PCRE2_SPTR search_from = codestart;
11023
0
      rgroup = NULL;
11024
0
      for (i = 0, p = start; i < ccount; i++, p = (p + 1) & 7)
11025
0
        {
11026
0
        if (groupnumber == rc[p].groupnumber)
11027
0
          {
11028
0
          rgroup = rc[p].group;
11029
0
          break;
11030
0
          }
11031
11032
        /* Group n+1 must always start to the right of group n, so we can save
11033
        search time below when the new group number is greater than any of the
11034
        previously found groups. */
11035
11036
0
        if (groupnumber > rc[p].groupnumber) search_from = rc[p].group;
11037
0
        }
11038
11039
0
      if (rgroup == NULL)
11040
0
        {
11041
0
        rgroup = PRIV(find_bracket)(search_from, utf, groupnumber);
11042
        /* LCOV_EXCL_START */
11043
0
        if (rgroup == NULL)
11044
0
          {
11045
0
          PCRE2_DEBUG_UNREACHABLE();
11046
0
          errorcode = ERR53;
11047
0
          break;
11048
0
          }
11049
        /* LCOV_EXCL_STOP */
11050
11051
0
        if (--start < 0) start = RSCAN_CACHE_SIZE - 1;
11052
0
        rc[start].groupnumber = groupnumber;
11053
0
        rc[start].group = rgroup;
11054
0
        if (ccount < RSCAN_CACHE_SIZE) ccount++;
11055
0
        }
11056
0
      }
11057
11058
0
    PUT(rcode, 1, (uint32_t)(rgroup - codestart));
11059
0
    }
11060
0
  }
11061
11062
/* In rare debugging situations we sometimes need to look at the compiled code
11063
at this stage. */
11064
11065
#ifdef DEBUG_CALL_PRINTINT
11066
pcre2_printint(re, stderr, TRUE);
11067
fprintf(stderr, "Length=%lu Used=%lu\n", length, usedlength);
11068
#endif
11069
11070
/* Unless disabled, check whether any single character iterators can be
11071
auto-possessified. The function overwrites the appropriate opcode values, so
11072
the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
11073
used in this code because at least one compiler gives a warning about loss of
11074
"const" attribute if the cast (PCRE2_UCHAR *)codestart is used directly in the
11075
function call. */
11076
11077
16
if (errorcode == 0 && (optim_flags & PCRE2_OPTIM_AUTO_POSSESS) != 0)
11078
16
  {
11079
16
  PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;
11080
16
  int possessify_rc = PRIV(auto_possessify)(temp, &cb);
11081
  /* LCOV_EXCL_START */
11082
16
  if (possessify_rc != 0)
11083
0
    {
11084
0
    PCRE2_DEBUG_UNREACHABLE();
11085
0
    errorcode = ERR80;
11086
0
    cb.erroroffset = 0;
11087
0
    }
11088
  /* LCOV_EXCL_STOP */
11089
16
  }
11090
11091
/* Failed to compile, or error while post-processing. */
11092
11093
16
if (errorcode != 0) goto HAD_CB_ERROR;
11094
11095
/* Successful compile. If the anchored option was not passed, set it if
11096
we can determine that the pattern is anchored by virtue of ^ characters or \A
11097
or anything else, such as starting with non-atomic .* when DOTALL is set and
11098
there are no occurrences of *PRUNE or *SKIP (though there is an option to
11099
disable this case). */
11100
11101
16
if ((re->overall_options & PCRE2_ANCHORED) == 0)
11102
16
  {
11103
16
  BOOL dotstar_anchor = ((optim_flags & PCRE2_OPTIM_DOTSTAR_ANCHOR) != 0);
11104
16
  if (is_anchored(codestart, 0, &cb, 0, FALSE, dotstar_anchor))
11105
16
    re->overall_options |= PCRE2_ANCHORED;
11106
16
  }
11107
11108
/* Set up the first code unit or startline flag, the required code unit, and
11109
then study the pattern. This code need not be obeyed if PCRE2_OPTIM_START_OPTIMIZE
11110
is disabled, as the data it would create will not be used. Note that a first code
11111
unit (but not the startline flag) is useful for anchored patterns because it
11112
can still give a quick "no match" and also avoid searching for a last code
11113
unit. */
11114
11115
16
if ((optim_flags & PCRE2_OPTIM_START_OPTIMIZE) != 0)
11116
16
  {
11117
16
  int minminlength = 0;  /* For minimal minlength from first/required CU */
11118
16
  int study_rc;
11119
11120
  /* If we do not have a first code unit, see if there is one that is asserted
11121
  (these are not saved during the compile because they can cause conflicts with
11122
  actual literals that follow). */
11123
11124
16
  if (firstcuflags >= REQ_NONE) {
11125
8
    uint32_t assertedcuflags = 0;
11126
8
    uint32_t assertedcu = find_firstassertedcu(codestart, &assertedcuflags, 0);
11127
    /* It would be wrong to use the asserted first code unit as `firstcu` for
11128
     * regexes which are able to match a 1-character string (e.g. /(?=a)b?a/)
11129
     * For that example, if we set both firstcu and reqcu to 'a', it would mean
11130
     * the subject string needs to be at least 2 characters long, which is wrong.
11131
     * With more analysis, we would be able to set firstcu in more cases. */
11132
8
    if (assertedcuflags < REQ_NONE && assertedcu != reqcu) {
11133
0
      firstcu = assertedcu;
11134
0
      firstcuflags = assertedcuflags;
11135
0
    }
11136
8
  }
11137
11138
  /* Save the data for a first code unit. The existence of one means the
11139
  minimum length must be at least 1. */
11140
11141
16
  if (firstcuflags < REQ_NONE)
11142
8
    {
11143
8
    re->first_codeunit = firstcu;
11144
8
    re->flags |= PCRE2_FIRSTSET;
11145
8
    minminlength++;
11146
11147
    /* Handle caseless first code units. */
11148
11149
8
    if ((firstcuflags & REQ_CASELESS) != 0)
11150
0
      {
11151
0
      if (firstcu < 128 || (!utf && !ucp && firstcu < 255))
11152
0
        {
11153
0
        if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS;
11154
0
        }
11155
11156
      /* The first code unit is >= 128 in UTF or UCP mode, or >= 255 otherwise.
11157
      In 8-bit UTF mode, code units in the range 128-255 are introductory code
11158
      units and cannot have another case, but if UCP is set they may do. */
11159
11160
0
#ifdef SUPPORT_UNICODE
11161
0
#if PCRE2_CODE_UNIT_WIDTH == 8
11162
0
      else if (ucp && !utf && UCD_OTHERCASE(firstcu) != firstcu)
11163
0
        re->flags |= PCRE2_FIRSTCASELESS;
11164
#else
11165
      else if ((utf || ucp) && firstcu <= MAX_UTF_CODE_POINT &&
11166
               UCD_OTHERCASE(firstcu) != firstcu)
11167
        re->flags |= PCRE2_FIRSTCASELESS;
11168
#endif
11169
0
#endif  /* SUPPORT_UNICODE */
11170
0
      }
11171
8
    }
11172
11173
  /* When there is no first code unit, for non-anchored patterns, see if we can
11174
  set the PCRE2_STARTLINE flag. This is helpful for multiline matches when all
11175
  branches start with ^ and also when all branches start with non-atomic .* for
11176
  non-DOTALL matches when *PRUNE and SKIP are not present. (There is an option
11177
  that disables this case.) */
11178
11179
8
  else if ((re->overall_options & PCRE2_ANCHORED) == 0)
11180
0
    {
11181
0
    BOOL dotstar_anchor = ((optim_flags & PCRE2_OPTIM_DOTSTAR_ANCHOR) != 0);
11182
0
    if (is_startline(codestart, 0, &cb, 0, FALSE, dotstar_anchor))
11183
0
      re->flags |= PCRE2_STARTLINE;
11184
0
    }
11185
11186
  /* Handle the "required code unit", if one is set. In the UTF case we can
11187
  increment the minimum minimum length only if we are sure this really is a
11188
  different character and not a non-starting code unit of the first character,
11189
  because the minimum length count is in characters, not code units. */
11190
11191
16
  if (reqcuflags < REQ_NONE)
11192
16
    {
11193
#if PCRE2_CODE_UNIT_WIDTH == 16
11194
    if ((re->overall_options & PCRE2_UTF) == 0 ||   /* Not UTF */
11195
        firstcuflags >= REQ_NONE ||                 /* First not set */
11196
        (firstcu & 0xf800) != 0xd800 ||             /* First not surrogate */
11197
        (reqcu & 0xfc00) != 0xdc00)                 /* Req not low surrogate */
11198
#elif PCRE2_CODE_UNIT_WIDTH == 8
11199
16
    if ((re->overall_options & PCRE2_UTF) == 0 ||   /* Not UTF */
11200
4
        firstcuflags >= REQ_NONE ||                 /* First not set */
11201
0
        (firstcu & 0x80) == 0 ||                    /* First is ASCII */
11202
0
        (reqcu & 0x80) == 0)                        /* Req is ASCII */
11203
16
#endif
11204
16
      {
11205
16
      minminlength++;
11206
16
      }
11207
11208
    /* In the case of an anchored pattern, set up the value only if it follows
11209
    a variable length item in the pattern. */
11210
11211
16
    if ((re->overall_options & PCRE2_ANCHORED) == 0 ||
11212
16
        (reqcuflags & REQ_VARY) != 0)
11213
16
      {
11214
16
      re->last_codeunit = reqcu;
11215
16
      re->flags |= PCRE2_LASTSET;
11216
11217
      /* Handle caseless required code units as for first code units (above). */
11218
11219
16
      if ((reqcuflags & REQ_CASELESS) != 0)
11220
0
        {
11221
0
        if (reqcu < 128 || (!utf && !ucp && reqcu < 255))
11222
0
          {
11223
0
          if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
11224
0
          }
11225
0
#ifdef SUPPORT_UNICODE
11226
0
#if PCRE2_CODE_UNIT_WIDTH == 8
11227
0
      else if (ucp && !utf && UCD_OTHERCASE(reqcu) != reqcu)
11228
0
        re->flags |= PCRE2_LASTCASELESS;
11229
#else
11230
      else if ((utf || ucp) && reqcu <= MAX_UTF_CODE_POINT &&
11231
               UCD_OTHERCASE(reqcu) != reqcu)
11232
        re->flags |= PCRE2_LASTCASELESS;
11233
#endif
11234
0
#endif  /* SUPPORT_UNICODE */
11235
0
        }
11236
16
      }
11237
16
    }
11238
11239
  /* Study the compiled pattern to set up information such as a bitmap of
11240
  starting code units and a minimum matching length. */
11241
11242
16
  study_rc = PRIV(study)(re);
11243
  /* LCOV_EXCL_START */
11244
16
  if (study_rc != 0)
11245
0
    {
11246
0
    PCRE2_DEBUG_UNREACHABLE();
11247
0
    errorcode = ERR31;
11248
0
    cb.erroroffset = 0;
11249
0
    goto HAD_CB_ERROR;
11250
0
    }
11251
  /* LCOV_EXCL_STOP */
11252
11253
  /* If study() set a bitmap of starting code units, it implies a minimum
11254
  length of at least one. */
11255
11256
16
  if ((re->flags & PCRE2_FIRSTMAPSET) != 0 && minminlength == 0)
11257
0
    minminlength = 1;
11258
11259
  /* If the minimum length set (or not set) by study() is less than the minimum
11260
  implied by required code units, override it. */
11261
11262
16
  if (re->minlength < minminlength) re->minlength = minminlength;
11263
16
  }   /* End of start-of-match optimizations. */
11264
11265
/* Control ends up here in all cases. When running under valgrind, make a
11266
pattern's terminating zero defined again. If memory was obtained for the parsed
11267
version of the pattern, free it before returning. Also free the list of named
11268
groups if a larger one had to be obtained, and likewise the group information
11269
vector. */
11270
11271
16
#ifdef SUPPORT_UNICODE
11272
/* All items must be freed. */
11273
16
PCRE2_ASSERT(cb.first_data == NULL);
11274
16
#endif
11275
11276
16
EXIT:
11277
#ifdef SUPPORT_VALGRIND
11278
if (zero_terminated) VALGRIND_MAKE_MEM_DEFINED(pattern + patlen, CU2BYTES(1));
11279
#endif
11280
16
if (cb.parsed_pattern != stack_parsed_pattern)
11281
0
  ccontext->memctl.free(cb.parsed_pattern, ccontext->memctl.memory_data);
11282
16
if (cb.named_group_list_size > NAMED_GROUP_LIST_SIZE)
11283
0
  ccontext->memctl.free((void *)cb.named_groups, ccontext->memctl.memory_data);
11284
16
if (cb.groupinfo != stack_groupinfo)
11285
0
  ccontext->memctl.free((void *)cb.groupinfo, ccontext->memctl.memory_data);
11286
11287
16
return re;    /* Will be NULL after an error */
11288
11289
/* Errors discovered in parse_regex() set the offset value in the compile
11290
block. Errors discovered before it is called must compute it from the ptr
11291
value. After parse_regex() is called, the offset in the compile block is set to
11292
the end of the pattern, but certain errors in compile_regex() may reset it if
11293
an offset is available in the parsed pattern. */
11294
11295
0
HAD_CB_ERROR:
11296
0
ptr = pattern + cb.erroroffset;
11297
11298
0
HAD_EARLY_ERROR:
11299
/* Ensure we don't return out-of-range erroroffset. */
11300
0
PCRE2_ASSERT(ptr >= pattern);
11301
0
PCRE2_ASSERT(ptr <= (pattern + patlen));
11302
/* Ensure that the erroroffset never slices a UTF-encoded character in half.
11303
If the input is invalid, then we return an offset just before the first invalid
11304
character, so the text to the left of the offset must always be valid. */
11305
#if defined PCRE2_DEBUG && defined SUPPORT_UNICODE
11306
if (ptr > pattern && utf)
11307
  {
11308
  PCRE2_SPTR prev = ptr - 1;
11309
  PCRE2_SIZE dummyoffset;
11310
  BACKCHAR(prev);
11311
  PCRE2_ASSERT(prev >= pattern);
11312
  PCRE2_ASSERT(PRIV(valid_utf)(prev, ptr - prev, &dummyoffset) == 0);
11313
  }
11314
#endif
11315
0
*erroroffset = ptr - pattern;
11316
11317
0
HAD_ERROR:
11318
0
*errorptr = errorcode;
11319
0
pcre2_code_free(re);
11320
0
re = NULL;
11321
11322
0
if (cb.first_data != NULL)
11323
0
  {
11324
0
  compile_data* current_data = cb.first_data;
11325
0
  do
11326
0
    {
11327
0
    compile_data* next_data = current_data->next;
11328
0
    cb.cx->memctl.free(current_data, cb.cx->memctl.memory_data);
11329
0
    current_data = next_data;
11330
0
    }
11331
0
  while (current_data != NULL);
11332
0
  }
11333
11334
0
goto EXIT;
11335
0
}
11336
11337
/* These #undefs are here to enable unity builds with CMake. */
11338
11339
#undef NLBLOCK /* Block containing newline information */
11340
#undef PSSTART /* Field containing processed string start */
11341
#undef PSEND   /* Field containing processed string end */
11342
11343
/* End of pcre2_compile.c */