Coverage Report

Created: 2025-11-16 06:23

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/php-src/ext/pcre/pcre2lib/pcre2_compile.c
Line
Count
Source
1
/*************************************************
2
*      Perl-Compatible Regular Expressions       *
3
*************************************************/
4
5
/* PCRE is a library of functions to support regular expressions whose syntax
6
and semantics are as close as possible to those of the Perl 5 language.
7
8
                       Written by Philip Hazel
9
     Original API code Copyright (c) 1997-2012 University of Cambridge
10
          New API code Copyright (c) 2016-2024 University of Cambridge
11
12
-----------------------------------------------------------------------------
13
Redistribution and use in source and binary forms, with or without
14
modification, are permitted provided that the following conditions are met:
15
16
    * Redistributions of source code must retain the above copyright notice,
17
      this list of conditions and the following disclaimer.
18
19
    * Redistributions in binary form must reproduce the above copyright
20
      notice, this list of conditions and the following disclaimer in the
21
      documentation and/or other materials provided with the distribution.
22
23
    * Neither the name of the University of Cambridge nor the names of its
24
      contributors may be used to endorse or promote products derived from
25
      this software without specific prior written permission.
26
27
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37
POSSIBILITY OF SUCH DAMAGE.
38
-----------------------------------------------------------------------------
39
*/
40
41
42
#ifdef HAVE_CONFIG_H
43
#include "config.h"
44
#endif
45
46
0
#define NLBLOCK cb             /* Block containing newline information */
47
#define PSSTART start_pattern  /* Field containing processed string start */
48
0
#define PSEND   end_pattern    /* Field containing processed string end */
49
50
#include "pcre2_internal.h"
51
52
/* In rare error cases debugging might require calling pcre2_printint(). */
53
54
#if 0
55
#ifdef EBCDIC
56
#define PRINTABLE(c) ((c) >= 64 && (c) < 255)
57
#else
58
#define PRINTABLE(c) ((c) >= 32 && (c) < 127)
59
#endif
60
#include "pcre2_printint.c"
61
#define DEBUG_CALL_PRINTINT
62
#endif
63
64
/* Other debugging code can be enabled by these defines. */
65
66
/* #define DEBUG_SHOW_CAPTURES */
67
/* #define DEBUG_SHOW_PARSED */
68
69
/* There are a few things that vary with different code unit sizes. Handle them
70
by defining macros in order to minimize #if usage. */
71
72
#if PCRE2_CODE_UNIT_WIDTH == 8
73
#define STRING_UTFn_RIGHTPAR     STRING_UTF8_RIGHTPAR, 5
74
26
#define XDIGIT(c)                xdigitab[c]
75
76
#else  /* Either 16-bit or 32-bit */
77
#define XDIGIT(c)                (MAX_255(c)? xdigitab[c] : 0xff)
78
79
#if PCRE2_CODE_UNIT_WIDTH == 16
80
#define STRING_UTFn_RIGHTPAR     STRING_UTF16_RIGHTPAR, 6
81
82
#else  /* 32-bit */
83
#define STRING_UTFn_RIGHTPAR     STRING_UTF32_RIGHTPAR, 6
84
#endif
85
#endif
86
87
/* Macros to store and retrieve a PCRE2_SIZE value in the parsed pattern, which
88
consists of uint32_t elements. Assume that if uint32_t can't hold it, two of
89
them will be able to (i.e. assume a 64-bit world). */
90
91
#if PCRE2_SIZE_MAX <= UINT32_MAX
92
#define PUTOFFSET(s,p) *p++ = s
93
#define GETOFFSET(s,p) s = *p++
94
#define GETPLUSOFFSET(s,p) s = *(++p)
95
#define READPLUSOFFSET(s,p) s = p[1]
96
#define SKIPOFFSET(p) p++
97
#define SIZEOFFSET 1
98
#else
99
#define PUTOFFSET(s,p) \
100
167
  { *p++ = (uint32_t)(s >> 32); *p++ = (uint32_t)(s & 0xffffffff); }
101
#define GETOFFSET(s,p) \
102
  { s = ((PCRE2_SIZE)p[0] << 32) | (PCRE2_SIZE)p[1]; p += 2; }
103
#define GETPLUSOFFSET(s,p) \
104
107
  { s = ((PCRE2_SIZE)p[1] << 32) | (PCRE2_SIZE)p[2]; p += 2; }
105
#define READPLUSOFFSET(s,p) \
106
2
  { s = ((PCRE2_SIZE)p[1] << 32) | (PCRE2_SIZE)p[2]; }
107
0
#define SKIPOFFSET(p) p += 2
108
2
#define SIZEOFFSET 2
109
#endif
110
111
/* Macros for manipulating elements of the parsed pattern vector. */
112
113
456k
#define META_CODE(x)   ((x) & 0xffff0000u)  /* Top 16 bits: the item code */
114
452k
#define META_DATA(x)   ((x) & 0x0000ffffu)  /* Low 16 bits: the item data */
115
#define META_DIFF(x,y) (((x)-(y))>>16)  /* x minus y, shifted down 16 bits */
116
117
/* Forward declarations (prototypes) to allow mutual recursion */
118
119
#ifdef SUPPORT_UNICODE
120
static unsigned int
121
  add_list_to_class_internal(uint8_t *, PCRE2_UCHAR **, uint32_t, uint32_t,
122
    compile_block *, const uint32_t *, unsigned int);
123
#endif
124
125
static int
126
  compile_regex(uint32_t, uint32_t, PCRE2_UCHAR **, uint32_t **, int *,
127
    uint32_t, uint32_t *, uint32_t *, uint32_t *, uint32_t *, branch_chain *,
128
    open_capitem *, compile_block *, PCRE2_SIZE *);
129
130
static int
131
  get_branchlength(uint32_t **, int *, int *, int *, parsed_recurse_check *,
132
    compile_block *);
133
134
static BOOL
135
  set_lookbehind_lengths(uint32_t **, int *, int *, parsed_recurse_check *,
136
    compile_block *);
137
138
static int
139
  check_lookbehinds(uint32_t *, uint32_t **, parsed_recurse_check *,
140
    compile_block *, int *);
141
142
143
/*************************************************
144
*      Code parameters and static tables         *
145
*************************************************/
146
147
5.00k
#define MAX_GROUP_NUMBER   65535u
148
60.7k
#define MAX_REPEAT_COUNT   65535u
149
60.7k
#define REPEAT_UNLIMITED   (MAX_REPEAT_COUNT+1)
150
151
/* COMPILE_WORK_SIZE specifies the size of stack workspace, which is used in
152
different ways in the different pattern scans. The parsing and group-
153
identifying pre-scan uses it to handle nesting, and needs it to be 16-bit
154
aligned for this. Having defined the size in code units, we set up
155
C16_WORK_SIZE as the number of elements in the 16-bit vector.
156
157
During the first compiling phase, when determining how much memory is required,
158
the regex is partly compiled into this space, but the compiled parts are
159
discarded as soon as they can be, so that hopefully there will never be an
160
overrun. The code does, however, check for an overrun, which can occur for
161
pathological patterns. The size of the workspace depends on LINK_SIZE because
162
the length of compiled items varies with this.
163
164
In the real compile phase, this workspace is not currently used. */
165
166
1.52k
#define COMPILE_WORK_SIZE (3000*LINK_SIZE)   /* Size in code units */
167
168
#define C16_WORK_SIZE \
169
  ((COMPILE_WORK_SIZE * sizeof(PCRE2_UCHAR))/sizeof(uint16_t))
170
171
/* A uint32_t vector is used for caching information about the size of
172
capturing groups, to improve performance. A default is created on the stack of
173
this size. */
174
175
2
#define GROUPINFO_DEFAULT_SIZE 256
176
177
/* The overrun tests check for a slightly smaller size so that they detect the
178
overrun before it actually does run off the end of the data block. */
179
180
227k
#define WORK_SIZE_SAFETY_MARGIN (100)
181
182
/* This value determines the size of the initial vector that is used for
183
remembering named groups during the pre-compile. It is allocated on the stack,
184
but if it is too small, it is expanded, in a similar way to the workspace. The
185
value is the number of slots in the list. */
186
187
3.05k
#define NAMED_GROUP_LIST_SIZE  20
188
189
/* The pre-compiling pass over the pattern creates a parsed pattern in a vector
190
of uint32_t. For short patterns this lives on the stack, with this size. Heap
191
memory is used for longer patterns. */
192
193
1.50k
#define PARSED_PATTERN_DEFAULT_SIZE 1024
194
195
/* Maximum length value to check against when making sure that the variable
196
that holds the compiled pattern length does not overflow. We make it a bit less
197
than INT_MAX to allow for adding in group terminating code units, so that we
198
don't have to check them every time. */
199
200
210k
#define OFLOW_MAX (INT_MAX - 20)
201
202
/* Code values for parsed patterns, which are stored in a vector of 32-bit
203
unsigned ints. Values less than META_END are literal data values. The coding
204
for identifying the item is in the top 16-bits, leaving 16 bits for the
205
additional data that some of them need. The META_CODE, META_DATA, and META_DIFF
206
macros are used to manipulate parsed pattern elements.
207
208
NOTE: When these definitions are changed, the table of extra lengths for each
209
code (meta_extra_lengths, just below) must be updated to remain in step. */
210
211
539k
#define META_END              0x80000000u  /* End of pattern */
212
213
46.6k
#define META_ALT              0x80010000u  /* alternation */
214
24
#define META_ATOMIC           0x80020000u  /* atomic group */
215
1.76k
#define META_BACKREF          0x80030000u  /* Back ref */
216
0
#define META_BACKREF_BYNAME   0x80040000u  /* \k'name' */
217
163k
#define META_BIGVALUE         0x80050000u  /* Next is a literal > META_END */
218
0
#define META_CALLOUT_NUMBER   0x80060000u  /* (?C with numerical argument */
219
0
#define META_CALLOUT_STRING   0x80070000u  /* (?C with string argument */
220
6.53k
#define META_CAPTURE          0x80080000u  /* Capturing parenthesis */
221
6.68k
#define META_CIRCUMFLEX       0x80090000u  /* ^ metacharacter */
222
36.9k
#define META_CLASS            0x800a0000u  /* start non-empty class */
223
20
#define META_CLASS_EMPTY      0x800b0000u  /* empty class */
224
20
#define META_CLASS_EMPTY_NOT  0x800c0000u  /* negative empty class */
225
190k
#define META_CLASS_END        0x800d0000u  /* end of non-empty class */
226
13.7k
#define META_CLASS_NOT        0x800e0000u  /* start non-empty negative class */
227
4
#define META_COND_ASSERT      0x800f0000u  /* (?(?assertion)... */
228
0
#define META_COND_DEFINE      0x80100000u  /* (?(DEFINE)... */
229
0
#define META_COND_NAME        0x80110000u  /* (?(<name>)... */
230
0
#define META_COND_NUMBER      0x80120000u  /* (?(digits)... */
231
0
#define META_COND_RNAME       0x80130000u  /* (?(R&name)... */
232
0
#define META_COND_RNUMBER     0x80140000u  /* (?(Rdigits)... */
233
0
#define META_COND_VERSION     0x80150000u  /* (?(VERSION<op>x.y)... */
234
1.56k
#define META_DOLLAR           0x80160000u  /* $ metacharacter */
235
11.5k
#define META_DOT              0x80170000u  /* . metacharacter */
236
43.1k
#define META_ESCAPE           0x80180000u  /* \d and friends */
237
21.3k
#define META_KET              0x80190000u  /* closing parenthesis */
238
31
#define META_NOCAPTURE        0x801a0000u  /* no capture parens */
239
0
#define META_OPTIONS          0x801b0000u  /* (?i) and friends */
240
325k
#define META_POSIX            0x801c0000u  /* POSIX class item */
241
162k
#define META_POSIX_NEG        0x801d0000u  /* negative POSIX class item */
242
161k
#define META_RANGE_ESCAPED    0x801e0000u  /* range with at least one escape */
243
323k
#define META_RANGE_LITERAL    0x801f0000u  /* range defined literally */
244
24
#define META_RECURSE          0x80200000u  /* Recursion */
245
0
#define META_RECURSE_BYNAME   0x80210000u  /* (?&name) */
246
4
#define META_SCRIPT_RUN       0x80220000u  /* (*script_run:...) */
247
248
/* These must be kept together to make it easy to check that an assertion
249
is present where expected in a conditional group. */
250
251
13
#define META_LOOKAHEAD        0x80230000u  /* (?= */
252
7
#define META_LOOKAHEADNOT     0x80240000u  /* (?! */
253
4
#define META_LOOKBEHIND       0x80250000u  /* (?<= */
254
2
#define META_LOOKBEHINDNOT    0x80260000u  /* (?<! */
255
256
/* These cannot be conditions */
257
258
13
#define META_LOOKAHEAD_NA     0x80270000u  /* (*napla: */
259
4
#define META_LOOKBEHIND_NA    0x80280000u  /* (*naplb: */
260
261
/* These must be kept in this order, with consecutive values, and the _ARG
262
versions of COMMIT, PRUNE, SKIP, and THEN immediately after their non-argument
263
versions. */
264
265
0
#define META_MARK             0x80290000u  /* (*MARK) */
266
32.7k
#define META_ACCEPT           0x802a0000u  /* (*ACCEPT) */
267
22
#define META_FAIL             0x802b0000u  /* (*FAIL) */
268
20
#define META_COMMIT           0x802c0000u  /* These               */
269
0
#define META_COMMIT_ARG       0x802d0000u  /*   pairs             */
270
26
#define META_PRUNE            0x802e0000u  /*     must            */
271
0
#define META_PRUNE_ARG        0x802f0000u  /*       be            */
272
26
#define META_SKIP             0x80300000u  /*         kept        */
273
0
#define META_SKIP_ARG         0x80310000u  /*           in        */
274
26
#define META_THEN             0x80320000u  /*             this    */
275
0
#define META_THEN_ARG         0x80330000u  /*               order */
276
277
/* These must be kept in groups of 3 adjacent values, and all together. */
278
279
1.33M
#define META_ASTERISK         0x80340000u  /* *  */
280
5.03k
#define META_ASTERISK_PLUS    0x80350000u  /* *+ */
281
8.06k
#define META_ASTERISK_QUERY   0x80360000u  /* *? */
282
16.4k
#define META_PLUS             0x80370000u  /* +  */
283
9.97k
#define META_PLUS_PLUS        0x80380000u  /* ++ */
284
15.2k
#define META_PLUS_QUERY       0x80390000u  /* +? */
285
37.7k
#define META_QUERY            0x803a0000u  /* ?  */
286
21.8k
#define META_QUERY_PLUS       0x803b0000u  /* ?+ */
287
29.2k
#define META_QUERY_QUERY      0x803c0000u  /* ?? */
288
7.42k
#define META_MINMAX           0x803d0000u  /* {n,m}  repeat */
289
0
#define META_MINMAX_PLUS      0x803e0000u  /* {n,m}+ repeat */
290
61.1k
#define META_MINMAX_QUERY     0x803f0000u  /* {n,m}? repeat */
291
292
#define META_FIRST_QUANTIFIER META_ASTERISK
293
#define META_LAST_QUANTIFIER  META_MINMAX_QUERY
294
295
/* This is a special "meta code" that is used only to distinguish (*asr: from
296
(*sr: in the table of alphabetic assertions. It is never stored in the parsed
297
pattern because (*asr: is turned into (*sr:(*atomic: at that stage. There is
298
therefore no need for it to have a length entry, so use a high value. */
299
300
0
#define META_ATOMIC_SCRIPT_RUN 0x8fff0000u
301
302
/* Table of extra lengths for each of the meta codes. Must be kept in step with
303
the definitions above. For some items these values are a basic length to which
304
a variable amount has to be added. */
305
306
static unsigned char meta_extra_lengths[] = {
307
  0,             /* META_END */
308
  0,             /* META_ALT */
309
  0,             /* META_ATOMIC */
310
  0,             /* META_BACKREF - more if group is >= 10 */
311
  1+SIZEOFFSET,  /* META_BACKREF_BYNAME */
312
  1,             /* META_BIGVALUE */
313
  3,             /* META_CALLOUT_NUMBER */
314
  3+SIZEOFFSET,  /* META_CALLOUT_STRING */
315
  0,             /* META_CAPTURE */
316
  0,             /* META_CIRCUMFLEX */
317
  0,             /* META_CLASS */
318
  0,             /* META_CLASS_EMPTY */
319
  0,             /* META_CLASS_EMPTY_NOT */
320
  0,             /* META_CLASS_END */
321
  0,             /* META_CLASS_NOT */
322
  0,             /* META_COND_ASSERT */
323
  SIZEOFFSET,    /* META_COND_DEFINE */
324
  1+SIZEOFFSET,  /* META_COND_NAME */
325
  1+SIZEOFFSET,  /* META_COND_NUMBER */
326
  1+SIZEOFFSET,  /* META_COND_RNAME */
327
  1+SIZEOFFSET,  /* META_COND_RNUMBER */
328
  3,             /* META_COND_VERSION */
329
  0,             /* META_DOLLAR */
330
  0,             /* META_DOT */
331
  0,             /* META_ESCAPE - more for ESC_P, ESC_p, ESC_g, ESC_k */
332
  0,             /* META_KET */
333
  0,             /* META_NOCAPTURE */
334
  1,             /* META_OPTIONS */
335
  1,             /* META_POSIX */
336
  1,             /* META_POSIX_NEG */
337
  0,             /* META_RANGE_ESCAPED */
338
  0,             /* META_RANGE_LITERAL */
339
  SIZEOFFSET,    /* META_RECURSE */
340
  1+SIZEOFFSET,  /* META_RECURSE_BYNAME */
341
  0,             /* META_SCRIPT_RUN */
342
  0,             /* META_LOOKAHEAD */
343
  0,             /* META_LOOKAHEADNOT */
344
  SIZEOFFSET,    /* META_LOOKBEHIND */
345
  SIZEOFFSET,    /* META_LOOKBEHINDNOT */
346
  0,             /* META_LOOKAHEAD_NA */
347
  SIZEOFFSET,    /* META_LOOKBEHIND_NA */
348
  1,             /* META_MARK - plus the string length */
349
  0,             /* META_ACCEPT */
350
  0,             /* META_FAIL */
351
  0,             /* META_COMMIT */
352
  1,             /* META_COMMIT_ARG - plus the string length */
353
  0,             /* META_PRUNE */
354
  1,             /* META_PRUNE_ARG - plus the string length */
355
  0,             /* META_SKIP */
356
  1,             /* META_SKIP_ARG - plus the string length */
357
  0,             /* META_THEN */
358
  1,             /* META_THEN_ARG - plus the string length */
359
  0,             /* META_ASTERISK */
360
  0,             /* META_ASTERISK_PLUS */
361
  0,             /* META_ASTERISK_QUERY */
362
  0,             /* META_PLUS */
363
  0,             /* META_PLUS_PLUS */
364
  0,             /* META_PLUS_QUERY */
365
  0,             /* META_QUERY */
366
  0,             /* META_QUERY_PLUS */
367
  0,             /* META_QUERY_QUERY */
368
  2,             /* META_MINMAX */
369
  2,             /* META_MINMAX_PLUS */
370
  2              /* META_MINMAX_QUERY */
371
};
372
373
/* Types for skipping parts of a parsed pattern. */
374
375
/* Selectors for the kind of skip to perform over a parsed pattern. The
enumerators take the values 0, 1 and 2 in the order listed. */
enum {
  PSKIP_ALT,
  PSKIP_CLASS,
  PSKIP_KET
};
376
377
/* Macro for setting individual bits in class bitmaps. It took some
378
experimenting to figure out how to stop gcc 5.3.0 from warning with
379
-Wconversion. This version gets a warning:
380
381
  #define SETBIT(a,b) a[(b)/8] |= (uint8_t)(1u << ((b)&7))
382
383
Let's hope the apparently less efficient version isn't actually so bad if the
384
compiler is clever with identical subexpressions. */
385
386
389k
/* a: byte-addressed bitmap, b: bit number. Note that b is evaluated twice. */
#define SETBIT(a,b) (a)[(b)/8] = (uint8_t)((a)[(b)/8] | (1u << ((b)&7)))
387
388
/* Values and flags for the unsigned xxcuflags variables that accompany xxcu
389
variables, which are concerned with first and required code units. A value
390
greater than or equal to REQ_NONE means "no code unit set"; otherwise the
391
matching xxcu variable is set, and the low valued bits are relevant. */
392
393
395k
#define REQ_UNSET     0xffffffffu  /* Not yet found anything */
394
68.7k
#define REQ_NONE      0xfffffffeu  /* Found not fixed character */
395
5.13k
#define REQ_CASELESS  0x00000001u  /* Code unit in xxcu is caseless */
396
94.6k
#define REQ_VARY      0x00000002u  /* Code unit is followed by non-literal */
397
398
/* These flags are used in the groupinfo vector. */
399
400
0
#define GI_SET_FIXED_LENGTH    0x80000000u
401
0
#define GI_NOT_FIXED_LENGTH    0x40000000u
402
0
#define GI_FIXED_LENGTH_MASK   0x0000ffffu
403
404
/* This simple test for a decimal digit works for both ASCII/Unicode and EBCDIC
405
and is fast (a good compiler can turn it into a subtraction and unsigned
406
comparison). */
407
408
6.58k
#define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
409
410
/* Table to identify hex digits. The tables in chartables are dependent on the
411
locale, and may mark arbitrary characters as digits. We want to recognize only
412
0-9, a-f, and A-F as hex digits, which is why we have a private table here. It
413
costs 256 bytes, but it is a lot faster than doing character value tests (at
414
least in some simple cases I timed), and in some applications one wants PCRE2
415
to compile efficiently as well as match efficiently. The value in the table is
416
the binary hex digit value, or 0xff for non-hex digits. */
417
418
/* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
419
UTF-8 mode. */
420
421
#ifndef EBCDIC
422
/* Lookup indexed by a code value in the range 0-255: yields the numeric value
of a hex digit (0x00-0x0f), or 0xff when the code is not a hex digit. */
static const uint8_t xdigitab[] =
423
  {
424
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   0-  7 */
425
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   8- 15 */
426
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  16- 23 */
427
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  24- 31 */
428
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*    - '  */
429
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  ( - /  */
430
  0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /*  0 - 7  */
431
  0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff, /*  8 - ?  */
432
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  @ - G  */
433
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  H - O  */
434
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  P - W  */
435
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  X - _  */
436
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  ` - g  */
437
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  h - o  */
438
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  p - w  */
439
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  x -127 */
440
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 128-135 */
441
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 136-143 */
442
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144-151 */
443
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 152-159 */
444
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160-167 */
445
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 168-175 */
446
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 176-183 */
447
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191 */
448
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 192-199 */
449
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 200-207 */
450
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 208-215 */
451
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 216-223 */
452
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 224-231 */
453
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 232-239 */
454
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 240-247 */
455
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};/* 248-255 */
456
457
#else
458
459
/* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
460
461
static const uint8_t xdigitab[] =
462
  {
463
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   0-  7  0 */
464
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   8- 15    */
465
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  16- 23 10 */
466
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  24- 31    */
467
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  32- 39 20 */
468
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  40- 47    */
469
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  48- 55 30 */
470
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  56- 63    */
471
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*    - 71 40 */
472
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  72- |     */
473
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  & - 87 50 */
474
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  88- 95    */
475
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  - -103 60 */
476
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 104- ?     */
477
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 112-119 70 */
478
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 120- "     */
479
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* 128- g  80 */
480
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  h -143    */
481
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144- p  90 */
482
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  q -159    */
483
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160- x  A0 */
484
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  y -175    */
485
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  ^ -183 B0 */
486
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191    */
487
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  { - G  C0 */
488
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  H -207    */
489
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  } - P  D0 */
490
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  Q -223    */
491
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  \ - X  E0 */
492
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  Y -239    */
493
  0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /*  0 - 7  F0 */
494
  0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff};/*  8 -255    */
495
#endif  /* EBCDIC */
496
497
498
/* Table for handling alphanumeric escaped characters. Positive returns are
499
simple data values; negative values are for special things like \d and so on.
500
Zero means further processing is needed (for things like \x), or the escape is
501
invalid. */
502
503
/* This is the "normal" table for ASCII systems or for EBCDIC systems running
504
in UTF-8 mode. It runs from '0' to 'z'. */
505
506
#ifndef EBCDIC
507
54.4k
#define ESCAPES_FIRST       CHAR_0
508
27.2k
#define ESCAPES_LAST        CHAR_z
509
24
#define UPPER_CASE(c)       ((c)-32)  /* ASCII: upper case is 32 below lower */
510
511
static const short int escapes[] = {
512
     0,                       0,
513
     0,                       0,
514
     0,                       0,
515
     0,                       0,
516
     0,                       0,
517
     CHAR_COLON,              CHAR_SEMICOLON,
518
     CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,
519
     CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,
520
     CHAR_COMMERCIAL_AT,      -ESC_A,
521
     -ESC_B,                  -ESC_C,
522
     -ESC_D,                  -ESC_E,
523
     0,                       -ESC_G,
524
     -ESC_H,                  0,
525
     0,                       -ESC_K,
526
     0,                       0,
527
     -ESC_N,                  0,
528
     -ESC_P,                  -ESC_Q,
529
     -ESC_R,                  -ESC_S,
530
     0,                       0,
531
     -ESC_V,                  -ESC_W,
532
     -ESC_X,                  0,
533
     -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,
534
     CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,
535
     CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,
536
     CHAR_GRAVE_ACCENT,       CHAR_BEL,
537
     -ESC_b,                  0,
538
     -ESC_d,                  CHAR_ESC,
539
     CHAR_FF,                 0,
540
     -ESC_h,                  0,
541
     0,                       -ESC_k,
542
     0,                       0,
543
     CHAR_LF,                 0,
544
     -ESC_p,                  0,
545
     CHAR_CR,                 -ESC_s,
546
     CHAR_HT,                 0,
547
     -ESC_v,                  -ESC_w,
548
     0,                       0,
549
     -ESC_z
550
};
551
552
#else
553
554
/* This is the "abnormal" table for EBCDIC systems without UTF-8 support.
555
It runs from 'a' to '9'. For some minimal testing of EBCDIC features, the code
556
is sometimes compiled on an ASCII system. In this case, we must not use CHAR_a
557
because it is defined as 'a', which of course picks up the ASCII value. */
558
559
#if 'a' == 0x81                    /* Check for a real EBCDIC environment */
560
#define ESCAPES_FIRST       CHAR_a
561
#define ESCAPES_LAST        CHAR_9
562
#define UPPER_CASE(c)       ((c)+64)  /* EBCDIC: upper case is 64 above lower */
563
#else                              /* Testing in an ASCII environment */
564
#define ESCAPES_FIRST  ((unsigned char)'\x81')   /* EBCDIC 'a' */
565
#define ESCAPES_LAST   ((unsigned char)'\xf9')   /* EBCDIC '9' */
566
#define UPPER_CASE(c)  ((c)-32)  /* ASCII host: upper case is 32 below lower */
567
#endif
568
569
static const short int escapes[] = {
570
/*  80 */         CHAR_BEL, -ESC_b,       0, -ESC_d, CHAR_ESC, CHAR_FF,      0,
571
/*  88 */ -ESC_h,        0,      0,     '{',      0,        0,       0,      0,
572
/*  90 */      0,        0, -ESC_k,       0,      0,  CHAR_LF,       0, -ESC_p,
573
/*  98 */      0,  CHAR_CR,      0,     '}',      0,        0,       0,      0,
574
/*  A0 */      0,      '~', -ESC_s, CHAR_HT,      0,   -ESC_v,  -ESC_w,      0,
575
/*  A8 */      0,   -ESC_z,      0,       0,      0,      '[',       0,      0,
576
/*  B0 */      0,        0,      0,       0,      0,        0,       0,      0,
577
/*  B8 */      0,        0,      0,       0,      0,      ']',     '=',    '-',
578
/*  C0 */    '{',   -ESC_A, -ESC_B,  -ESC_C, -ESC_D,   -ESC_E,       0, -ESC_G,
579
/*  C8 */ -ESC_H,        0,      0,       0,      0,        0,       0,      0,
580
/*  D0 */    '}',        0, -ESC_K,       0,      0,   -ESC_N,       0, -ESC_P,
581
/*  D8 */ -ESC_Q,   -ESC_R,      0,       0,      0,        0,       0,      0,
582
/*  E0 */   '\\',        0, -ESC_S,       0,      0,   -ESC_V,  -ESC_W, -ESC_X,
583
/*  E8 */      0,   -ESC_Z,      0,       0,      0,        0,       0,      0,
584
/*  F0 */      0,        0,      0,       0,      0,        0,       0,      0,
585
/*  F8 */      0,        0
586
};
587
588
/* We also need a table of characters that may follow \c in an EBCDIC
589
environment for characters 0-31. */
590
591
static unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";
592
593
#endif   /* EBCDIC */
594
595
596
/* Table of special "verbs" like (*PRUNE). This is a short table, so it is
597
searched linearly. Put all the names into a single string, in order to reduce
598
the number of relocations when a shared library is dynamically linked. The
599
string is built from string macros so that it works in UTF-8 mode on EBCDIC
600
platforms. */
601
602
typedef struct verbitem {
603
  unsigned int len;          /* Length of verb name */
604
  uint32_t meta;             /* Base META_ code */
605
  int has_arg;               /* Argument requirement */
606
} verbitem;
607
608
static const char verbnames[] =
609
  "\0"                       /* Empty name is a shorthand for MARK */
610
  STRING_MARK0
611
  STRING_ACCEPT0
612
  STRING_F0
613
  STRING_FAIL0
614
  STRING_COMMIT0
615
  STRING_PRUNE0
616
  STRING_SKIP0
617
  STRING_THEN;
618
619
static const verbitem verbs[] = {
620
  { 0, META_MARK,   +1 },  /* > 0 => must have an argument */
621
  { 4, META_MARK,   +1 },
622
  { 6, META_ACCEPT, -1 },  /* < 0 => Optional argument, convert to pre-MARK */
623
  { 1, META_FAIL,   -1 },
624
  { 4, META_FAIL,   -1 },
625
  { 6, META_COMMIT,  0 },
626
  { 5, META_PRUNE,   0 },  /* Optional argument; bump META code if found */
627
  { 4, META_SKIP,    0 },
628
  { 4, META_THEN,    0 }
629
};
630
631
static const int verbcount = sizeof(verbs)/sizeof(verbs[0]);  /* Entries in verbs[] */
632
633
/* Verb opcodes, indexed by their META code offset from META_MARK. */
634
635
static const uint32_t verbops[] = {
636
  OP_MARK, OP_ACCEPT, OP_FAIL, OP_COMMIT, OP_COMMIT_ARG, OP_PRUNE,
637
  OP_PRUNE_ARG, OP_SKIP, OP_SKIP_ARG, OP_THEN, OP_THEN_ARG };
638
639
/* Table of "alpha assertions" like (*pla:...), similar to the (*VERB) table. */
640
641
typedef struct alasitem {
642
  unsigned int len;          /* Length of name */
643
  uint32_t meta;             /* Base META_ code */
644
} alasitem;
645
646
static const char alasnames[] =
647
  STRING_pla0
648
  STRING_plb0
649
  STRING_napla0
650
  STRING_naplb0
651
  STRING_nla0
652
  STRING_nlb0
653
  STRING_positive_lookahead0
654
  STRING_positive_lookbehind0
655
  STRING_non_atomic_positive_lookahead0
656
  STRING_non_atomic_positive_lookbehind0
657
  STRING_negative_lookahead0
658
  STRING_negative_lookbehind0
659
  STRING_atomic0
660
  STRING_sr0
661
  STRING_asr0
662
  STRING_script_run0
663
  STRING_atomic_script_run;
664
665
/* Name length and META_ code for each alpha assertion, in the same order as
the names in alasnames. A short name and its long form map to the same META_
code (for example, both the 3-character and the 18-character positive
lookahead names give META_LOOKAHEAD). */
static const alasitem alasmeta[] = {
666
  {  3, META_LOOKAHEAD         }, /* pla */
667
  {  3, META_LOOKBEHIND        }, /* plb */
668
  {  5, META_LOOKAHEAD_NA      }, /* napla */
669
  {  5, META_LOOKBEHIND_NA     }, /* naplb */
670
  {  3, META_LOOKAHEADNOT      }, /* nla */
671
  {  3, META_LOOKBEHINDNOT     }, /* nlb */
672
  { 18, META_LOOKAHEAD         }, /* positive_lookahead */
673
  { 19, META_LOOKBEHIND        }, /* positive_lookbehind */
674
  { 29, META_LOOKAHEAD_NA      }, /* non_atomic_positive_lookahead */
675
  { 30, META_LOOKBEHIND_NA     }, /* non_atomic_positive_lookbehind */
676
  { 18, META_LOOKAHEADNOT      }, /* negative_lookahead */
677
  { 19, META_LOOKBEHINDNOT     }, /* negative_lookbehind */
678
  {  6, META_ATOMIC            }, /* atomic */
679
  {  2, META_SCRIPT_RUN        }, /* sr = script run */
680
  {  3, META_ATOMIC_SCRIPT_RUN }, /* asr = atomic script run */
681
  { 10, META_SCRIPT_RUN        }, /* script run */
682
  { 17, META_ATOMIC_SCRIPT_RUN }  /* atomic script run */
683
};
684
685
/* Number of entries in alasmeta[]. */
static const int alascount = sizeof(alasmeta)/sizeof(alasitem);
686
687
/* Offsets from OP_STAR for case-independent and negative repeat opcodes. The
four entries are, in order: the plain (caseful) repeats, the caseless repeats,
the negated repeats, and the negated caseless repeats. Adding an entry to a
repeat opcode from the OP_STAR group gives the corresponding opcode in the
selected group.
NOTE(review): nothing in this part of the file writes to the table, so it
looks as if it could be declared const - confirm against all users. */
688
689
static uint32_t chartypeoffset[] = {
690
  OP_STAR - OP_STAR,    OP_STARI - OP_STAR,
691
  OP_NOTSTAR - OP_STAR, OP_NOTSTARI - OP_STAR };
692
693
/* Tables of names of POSIX character classes and their lengths. The names are
now all in a single string, to reduce the number of relocations when a shared
library is dynamically loaded. The list of lengths is terminated by a zero
length entry. The first three must be alpha, lower, upper, as this is assumed
for handling case independence. The indices for several classes are needed, so
identify them with the PC_ macros below; each macro value is the zero-based
position of the class in posix_names. */
699
700
static const char posix_names[] =
701
  STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
702
  STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
703
  STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
704
  STRING_word0  STRING_xdigit;
705
706
/* One length per name above, in the same order (only "word" is 4 and "xdigit"
is 6; the rest are 5), followed by the terminating zero. */
static const uint8_t posix_name_lengths[] = {
707
  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
708
709
0
#define PC_DIGIT   7    /* Index of "digit" */
710
0
#define PC_GRAPH   8    /* Index of "graph" */
711
0
#define PC_PRINT   9    /* Index of "print" */
712
0
#define PC_PUNCT  10    /* Index of "punct" */
713
0
#define PC_XDIGIT 13    /* Index of "xdigit" */
714
715
/* Table of class bit maps for each POSIX class. Each class is formed from a
base map, with an optional addition or removal of another map. Then, for some
classes, there is some additional tweaking: for [:blank:] the vertical space
characters are removed, and for [:alpha:] and [:alnum:] the underscore
character is removed. The triples in the table consist of the base map offset,
second map offset or -1 if no second map, and a non-negative value for map
addition or a negative value for map subtraction (if there are two maps). The
absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
remove vertical space characters, 2 => remove underscore.

The rows are in the same order as the names in posix_names, so the triple for
class number n starts at index 3*n. */
724
725
static const int posix_class_maps[] = {
726
  cbit_word,   cbit_digit, -2,            /* alpha - word minus digit, without underscore */
727
  cbit_lower,  -1,          0,            /* lower */
728
  cbit_upper,  -1,          0,            /* upper */
729
  cbit_word,   -1,          2,            /* alnum - word without underscore */
730
  cbit_print,  cbit_cntrl,  0,            /* ascii - print plus cntrl */
731
  cbit_space,  -1,          1,            /* blank - a GNU extension; space without vertical space */
732
  cbit_cntrl,  -1,          0,            /* cntrl */
733
  cbit_digit,  -1,          0,            /* digit */
734
  cbit_graph,  -1,          0,            /* graph */
735
  cbit_print,  -1,          0,            /* print */
736
  cbit_punct,  -1,          0,            /* punct */
737
  cbit_space,  -1,          0,            /* space */
738
  cbit_word,   -1,          0,            /* word - a Perl extension */
739
  cbit_xdigit, -1,          0             /* xdigit */
740
};
741
742
#ifdef SUPPORT_UNICODE

/* The POSIX class Unicode property substitutes that are used in UCP mode must
be in the order of the POSIX class names, defined above. Each class has a pair
of values. When the first value is non-negative, the pair is a property type
(PT_xxx) and its associated value. When the first value is -1 there is no
property substitute and the second value selects the special handling noted
against the entry. */

static int posix_substitutes[] = {
  PT_GC, ucp_L,     /* alpha */
  PT_PC, ucp_Ll,    /* lower */
  PT_PC, ucp_Lu,    /* upper */
  PT_ALNUM, 0,      /* alnum */
  -1, 0,            /* ascii, treat as non-UCP */
  -1, 1,            /* blank, treat as \h */
  PT_PC, ucp_Cc,    /* cntrl */
  PT_PC, ucp_Nd,    /* digit */
  PT_PXGRAPH, 0,    /* graph */
  PT_PXPRINT, 0,    /* print */
  PT_PXPUNCT, 0,    /* punct */
  PT_PXSPACE, 0,    /* space - Xps is POSIX space, but from 8.34 Perl and
                       POSIX space are the same */
  PT_WORD, 0,       /* word */
  PT_PXXDIGIT, 0    /* xdigit - Perl has additional hex digits */
};

/* Number of pairs (that is, of POSIX classes) in posix_substitutes. The
divisor is taken from the array's own element type so that the count stays
correct whatever the width of that type. */

#define POSIX_SUBSIZE \
  (sizeof(posix_substitutes) / (2*sizeof(posix_substitutes[0])))
#endif  /* SUPPORT_UNICODE */
765
766
/* Masks for checking option settings. When PCRE2_LITERAL is set, only a subset
are allowed. There are two pairs of masks: one pair for the main compile
options and one pair for the extra options. In each pair the LITERAL mask is
the subset, and the full mask is built by OR-ing further bits onto it. */
768
769
/* Main options that are accepted together with PCRE2_LITERAL. */
#define PUBLIC_LITERAL_COMPILE_OPTIONS \
770
1.52k
  (PCRE2_ANCHORED|PCRE2_AUTO_CALLOUT|PCRE2_CASELESS|PCRE2_ENDANCHORED| \
771
1.52k
   PCRE2_FIRSTLINE|PCRE2_LITERAL|PCRE2_MATCH_INVALID_UTF| \
772
1.52k
   PCRE2_NO_START_OPTIMIZE|PCRE2_NO_UTF_CHECK|PCRE2_USE_OFFSET_LIMIT|PCRE2_UTF)
773
774
/* All main options: the literal subset plus the remaining option bits. */
#define PUBLIC_COMPILE_OPTIONS \
775
1.52k
  (PUBLIC_LITERAL_COMPILE_OPTIONS| \
776
1.52k
   PCRE2_ALLOW_EMPTY_CLASS|PCRE2_ALT_BSUX|PCRE2_ALT_CIRCUMFLEX| \
777
1.52k
   PCRE2_ALT_VERBNAMES|PCRE2_DOLLAR_ENDONLY|PCRE2_DOTALL|PCRE2_DUPNAMES| \
778
1.52k
   PCRE2_EXTENDED|PCRE2_EXTENDED_MORE|PCRE2_MATCH_UNSET_BACKREF| \
779
1.52k
   PCRE2_MULTILINE|PCRE2_NEVER_BACKSLASH_C|PCRE2_NEVER_UCP| \
780
1.52k
   PCRE2_NEVER_UTF|PCRE2_NO_AUTO_CAPTURE|PCRE2_NO_AUTO_POSSESS| \
781
1.52k
   PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_UCP|PCRE2_UNGREEDY)
782
783
/* Extra options that are accepted together with PCRE2_LITERAL. */
#define PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS \
784
1.52k
   (PCRE2_EXTRA_MATCH_LINE|PCRE2_EXTRA_MATCH_WORD|PCRE2_EXTRA_CASELESS_RESTRICT)
785
786
/* All extra options: the literal subset plus the remaining extra bits. */
#define PUBLIC_COMPILE_EXTRA_OPTIONS \
787
1.52k
   (PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS| \
788
1.52k
    PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES|PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL| \
789
1.52k
    PCRE2_EXTRA_ESCAPED_CR_IS_LF|PCRE2_EXTRA_ALT_BSUX| \
790
1.52k
    PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK|PCRE2_EXTRA_ASCII_BSD| \
791
1.52k
    PCRE2_EXTRA_ASCII_BSS|PCRE2_EXTRA_ASCII_BSW|PCRE2_EXTRA_ASCII_POSIX| \
792
1.52k
    PCRE2_EXTRA_ASCII_DIGIT)
793
794
/* Compile time error code numbers. They are given names so that they can more
easily be tracked. When a new number is added, the tables called eint1 and
eint2 in pcre2posix.c may need to be updated, and a new error text must be
added to compile_error_texts in pcre2_error.c. Also, the error codes in
pcre2.h.in must be updated - their values are exactly 100 greater than these
values.

ERR0 is set to COMPILE_ERROR_BASE and the remaining names take consecutive
values, so ERRn has the value COMPILE_ERROR_BASE + n, up to ERR101. */
800
801
enum { ERR0 = COMPILE_ERROR_BASE,
802
       ERR1,  ERR2,  ERR3,  ERR4,  ERR5,  ERR6,  ERR7,  ERR8,  ERR9,  ERR10,
803
       ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, ERR20,
804
       ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29, ERR30,
805
       ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, ERR40,
806
       ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR50,
807
       ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60,
808
       ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
809
       ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80,
810
       ERR81, ERR82, ERR83, ERR84, ERR85, ERR86, ERR87, ERR88, ERR89, ERR90,
811
       ERR91, ERR92, ERR93, ERR94, ERR95, ERR96, ERR97, ERR98, ERR99, ERR100,
812
       ERR101 };
813
814
/* This is a table of start-of-pattern options such as (*UTF) and settings such
as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
compatibility, (*UTFn) is supported in the relevant libraries, but (*UTF) is
generic and always supported.

The enum below classifies the table entries: it says how the value field of an
entry is to be interpreted, or, for the LIM types, that an integer follows the
name in the pattern and which limit it sets. */
818
819
enum { PSO_OPT,     /* Value is an option bit */
820
       PSO_FLG,     /* Value is a flag bit */
821
       PSO_NL,      /* Value is a newline type */
822
       PSO_BSR,     /* Value is a \R type */
823
       PSO_LIMH,    /* Read integer value for heap limit */
824
       PSO_LIMM,    /* Read integer value for match limit */
825
       PSO_LIMD     /* Read integer value for depth limit */
826
     };
827
828
/* One start-of-pattern option or setting. */
typedef struct pso {
829
  const uint8_t *name;   /* Text of the item as it appears after "(*" */
830
  uint16_t length;       /* Length of name; in pso_list this counts the final
                            ")" or "=" as well */
831
  uint16_t type;         /* One of the PSO_ values */
832
  uint32_t value;        /* Meaning depends on type; zero for the LIM types */
833
} pso;
834
835
/* The list of recognized start-of-pattern items. Each initializer gives the
name, its length, its PSO_ type, and its value.

NB: STRING_UTFn_RIGHTPAR contains the length as well, which is why the first
entry appears to have only three fields. */
836
837
static const pso pso_list[] = {
838
  { (uint8_t *)STRING_UTFn_RIGHTPAR,                  PSO_OPT, PCRE2_UTF },
839
  { (uint8_t *)STRING_UTF_RIGHTPAR,                4, PSO_OPT, PCRE2_UTF },
840
  { (uint8_t *)STRING_UCP_RIGHTPAR,                4, PSO_OPT, PCRE2_UCP },
841
  { (uint8_t *)STRING_NOTEMPTY_RIGHTPAR,           9, PSO_FLG, PCRE2_NOTEMPTY_SET },
842
  { (uint8_t *)STRING_NOTEMPTY_ATSTART_RIGHTPAR,  17, PSO_FLG, PCRE2_NE_ATST_SET },
843
  { (uint8_t *)STRING_NO_AUTO_POSSESS_RIGHTPAR,   16, PSO_OPT, PCRE2_NO_AUTO_POSSESS },
844
  { (uint8_t *)STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR, 18, PSO_OPT, PCRE2_NO_DOTSTAR_ANCHOR },
845
  { (uint8_t *)STRING_NO_JIT_RIGHTPAR,             7, PSO_FLG, PCRE2_NOJIT },
846
  { (uint8_t *)STRING_NO_START_OPT_RIGHTPAR,      13, PSO_OPT, PCRE2_NO_START_OPTIMIZE },
847
  { (uint8_t *)STRING_LIMIT_HEAP_EQ,              11, PSO_LIMH, 0 },
848
  { (uint8_t *)STRING_LIMIT_MATCH_EQ,             12, PSO_LIMM, 0 },
849
  { (uint8_t *)STRING_LIMIT_DEPTH_EQ,             12, PSO_LIMD, 0 },
850
  { (uint8_t *)STRING_LIMIT_RECURSION_EQ,         16, PSO_LIMD, 0 },  /* Same type as LIMIT_DEPTH */
851
  { (uint8_t *)STRING_CR_RIGHTPAR,                 3, PSO_NL,  PCRE2_NEWLINE_CR },
852
  { (uint8_t *)STRING_LF_RIGHTPAR,                 3, PSO_NL,  PCRE2_NEWLINE_LF },
853
  { (uint8_t *)STRING_CRLF_RIGHTPAR,               5, PSO_NL,  PCRE2_NEWLINE_CRLF },
854
  { (uint8_t *)STRING_ANY_RIGHTPAR,                4, PSO_NL,  PCRE2_NEWLINE_ANY },
855
  { (uint8_t *)STRING_NUL_RIGHTPAR,                4, PSO_NL,  PCRE2_NEWLINE_NUL },
856
  { (uint8_t *)STRING_ANYCRLF_RIGHTPAR,            8, PSO_NL,  PCRE2_NEWLINE_ANYCRLF },
857
  { (uint8_t *)STRING_BSR_ANYCRLF_RIGHTPAR,       12, PSO_BSR, PCRE2_BSR_ANYCRLF },
858
  { (uint8_t *)STRING_BSR_UNICODE_RIGHTPAR,       12, PSO_BSR, PCRE2_BSR_UNICODE }
859
};
860
861
/* This table is used when converting repeating opcodes into possessified
versions as a result of an explicit possessive quantifier such as ++. A zero
value means there is no possessified version - in those cases the item in
question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
because all relevant opcodes are less than that.

The table is indexed by opcode number. Within each group of repeat opcodes,
only the greedy form (the first of each pair below) has a possessive
equivalent; the minimizing form, the EXACT form, and the forms that are already
possessive all map to zero. */
866
867
static const uint8_t opcode_possessify[] = {
868
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0 - 15  */
869
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 16 - 31 */
870
871
  0,                       /* NOTI */
872
  OP_POSSTAR, 0,           /* STAR, MINSTAR */
873
  OP_POSPLUS, 0,           /* PLUS, MINPLUS */
874
  OP_POSQUERY, 0,          /* QUERY, MINQUERY */
875
  OP_POSUPTO, 0,           /* UPTO, MINUPTO */
876
  0,                       /* EXACT */
877
  0, 0, 0, 0,              /* POS{STAR,PLUS,QUERY,UPTO} */
878
879
  OP_POSSTARI, 0,          /* STARI, MINSTARI */
880
  OP_POSPLUSI, 0,          /* PLUSI, MINPLUSI */
881
  OP_POSQUERYI, 0,         /* QUERYI, MINQUERYI */
882
  OP_POSUPTOI, 0,          /* UPTOI, MINUPTOI */
883
  0,                       /* EXACTI */
884
  0, 0, 0, 0,              /* POS{STARI,PLUSI,QUERYI,UPTOI} */
885
886
  OP_NOTPOSSTAR, 0,        /* NOTSTAR, NOTMINSTAR */
887
  OP_NOTPOSPLUS, 0,        /* NOTPLUS, NOTMINPLUS */
888
  OP_NOTPOSQUERY, 0,       /* NOTQUERY, NOTMINQUERY */
889
  OP_NOTPOSUPTO, 0,        /* NOTUPTO, NOTMINUPTO */
890
  0,                       /* NOTEXACT */
891
  0, 0, 0, 0,              /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
892
893
  OP_NOTPOSSTARI, 0,       /* NOTSTARI, NOTMINSTARI */
894
  OP_NOTPOSPLUSI, 0,       /* NOTPLUSI, NOTMINPLUSI */
895
  OP_NOTPOSQUERYI, 0,      /* NOTQUERYI, NOTMINQUERYI */
896
  OP_NOTPOSUPTOI, 0,       /* NOTUPTOI, NOTMINUPTOI */
897
  0,                       /* NOTEXACTI */
898
  0, 0, 0, 0,              /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
899
900
  OP_TYPEPOSSTAR, 0,       /* TYPESTAR, TYPEMINSTAR */
901
  OP_TYPEPOSPLUS, 0,       /* TYPEPLUS, TYPEMINPLUS */
902
  OP_TYPEPOSQUERY, 0,      /* TYPEQUERY, TYPEMINQUERY */
903
  OP_TYPEPOSUPTO, 0,       /* TYPEUPTO, TYPEMINUPTO */
904
  0,                       /* TYPEEXACT */
905
  0, 0, 0, 0,              /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
906
907
  OP_CRPOSSTAR, 0,         /* CRSTAR, CRMINSTAR */
908
  OP_CRPOSPLUS, 0,         /* CRPLUS, CRMINPLUS */
909
  OP_CRPOSQUERY, 0,        /* CRQUERY, CRMINQUERY */
910
  OP_CRPOSRANGE, 0,        /* CRRANGE, CRMINRANGE */
911
  0, 0, 0, 0,              /* CRPOS{STAR,PLUS,QUERY,RANGE} */
912
913
  0, 0, 0,                 /* CLASS, NCLASS, XCLASS */
914
  0, 0,                    /* REF, REFI */
915
  0, 0,                    /* DNREF, DNREFI */
916
  0, 0                     /* RECURSE, CALLOUT */
917
};
918
919
920
#ifdef DEBUG_SHOW_PARSED
/*************************************************
*     Show the parsed pattern for debugging      *
*************************************************/

/* For debugging the pre-scan, this code, which outputs the parsed data vector,
can be enabled by defining DEBUG_SHOW_PARSED. Each element of the vector is
written to stderr on a line of its own, in the form

  +++ <index> <value in hex> <description>

Elements below META_END are literal code points; the character itself is shown
when it is a printing ASCII character. All other elements are META items, and
any data items that follow a META item in the vector are consumed and shown on
the same line. Output stops after META_END, or at the first META code that is
not recognized.

Unsigned values (uint32_t vector items and PCRE2_SIZE offsets) are printed with
unsigned conversions so that large values are not shown as negative numbers.

Argument:
  cb          the compile block; cb->parsed_pattern is the vector that is shown

Returns:      nothing
*/

static void show_parsed(compile_block *cb)
{
uint32_t *pptr = cb->parsed_pattern;

for (;;)
  {
  int max, min;
  PCRE2_SIZE offset;
  uint32_t i;
  uint32_t length;
  uint32_t meta_arg = META_DATA(*pptr);

  fprintf(stderr, "+++ %02d %.8x ", (int)(pptr - cb->parsed_pattern), *pptr);

  if (*pptr < META_END)
    {
    if (*pptr > 32 && *pptr < 128) fprintf(stderr, "%c", (int)*pptr);
    pptr++;
    }

  else switch (META_CODE(*pptr++))
    {
    default:
    fprintf(stderr, "**** OOPS - unknown META value - giving up ****\n");
    return;

    case META_END:
    fprintf(stderr, "META_END\n");
    return;

    case META_CAPTURE:
    fprintf(stderr, "META_CAPTURE %u", meta_arg);
    break;

    case META_RECURSE:
    GETOFFSET(offset, pptr);
    fprintf(stderr, "META_RECURSE %u %zu", meta_arg, offset);
    break;

    /* The offsets for small group numbers are held in the compile block
    instead of following the item in the vector. */

    case META_BACKREF:
    if (meta_arg < 10)
      {
      offset = cb->small_ref_offset[meta_arg];
      }
    else
      {
      GETOFFSET(offset, pptr);
      }
    fprintf(stderr, "META_BACKREF %u %zu", meta_arg, offset);
    break;

    case META_ESCAPE:
    if (meta_arg == ESC_P || meta_arg == ESC_p)
      {
      uint32_t ptype = *pptr >> 16;
      uint32_t pvalue = *pptr++ & 0xffff;
      fprintf(stderr, "META \\%c %u %u", (meta_arg == ESC_P)? 'P':'p',
        ptype, pvalue);
      }
    else
      {
      uint32_t cc;
      /* There's just one escape we might have here that isn't negated in the
      escapes table. */
      if (meta_arg == ESC_g) cc = CHAR_g;
      else for (cc = ESCAPES_FIRST; cc <= ESCAPES_LAST; cc++)
        {
        if (meta_arg == (uint32_t)(-escapes[cc - ESCAPES_FIRST])) break;
        }
      if (cc > ESCAPES_LAST) cc = CHAR_QUESTION_MARK;
      fprintf(stderr, "META \\%c", (int)cc);
      }
    break;

    case META_MINMAX:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}", min, max);
    else
      fprintf(stderr, "META {%d,}", min);
    break;

    case META_MINMAX_QUERY:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}?", min, max);
    else
      fprintf(stderr, "META {%d,}?", min);
    break;

    case META_MINMAX_PLUS:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}+", min, max);
    else
      fprintf(stderr, "META {%d,}+", min);
    break;

    case META_BIGVALUE: fprintf(stderr, "META_BIGVALUE %.8x", *pptr++); break;
    case META_CIRCUMFLEX: fprintf(stderr, "META_CIRCUMFLEX"); break;
    case META_COND_ASSERT: fprintf(stderr, "META_COND_ASSERT"); break;
    case META_DOLLAR: fprintf(stderr, "META_DOLLAR"); break;
    case META_DOT: fprintf(stderr, "META_DOT"); break;
    case META_ASTERISK: fprintf(stderr, "META *"); break;
    case META_ASTERISK_QUERY: fprintf(stderr, "META *?"); break;
    case META_ASTERISK_PLUS: fprintf(stderr, "META *+"); break;
    case META_PLUS: fprintf(stderr, "META +"); break;
    case META_PLUS_QUERY: fprintf(stderr, "META +?"); break;
    case META_PLUS_PLUS: fprintf(stderr, "META ++"); break;
    case META_QUERY: fprintf(stderr, "META ?"); break;
    case META_QUERY_QUERY: fprintf(stderr, "META ??"); break;
    case META_QUERY_PLUS: fprintf(stderr, "META ?+"); break;

    case META_ATOMIC: fprintf(stderr, "META (?>"); break;
    case META_NOCAPTURE: fprintf(stderr, "META (?:"); break;
    case META_LOOKAHEAD: fprintf(stderr, "META (?="); break;
    case META_LOOKAHEADNOT: fprintf(stderr, "META (?!"); break;
    case META_LOOKAHEAD_NA: fprintf(stderr, "META (*napla:"); break;
    case META_SCRIPT_RUN: fprintf(stderr, "META (*sr:"); break;
    case META_KET: fprintf(stderr, "META )"); break;
    case META_ALT: fprintf(stderr, "META | %u", meta_arg); break;

    case META_CLASS: fprintf(stderr, "META ["); break;
    case META_CLASS_NOT: fprintf(stderr, "META [^"); break;
    case META_CLASS_END: fprintf(stderr, "META ]"); break;
    case META_CLASS_EMPTY: fprintf(stderr, "META []"); break;
    case META_CLASS_EMPTY_NOT: fprintf(stderr, "META [^]"); break;

    case META_RANGE_LITERAL: fprintf(stderr, "META - (literal)"); break;
    case META_RANGE_ESCAPED: fprintf(stderr, "META - (escaped)"); break;

    case META_POSIX: fprintf(stderr, "META_POSIX %u", *pptr++); break;
    case META_POSIX_NEG: fprintf(stderr, "META_POSIX_NEG %u", *pptr++); break;

    case META_ACCEPT: fprintf(stderr, "META (*ACCEPT)"); break;
    case META_FAIL: fprintf(stderr, "META (*FAIL)"); break;
    case META_COMMIT: fprintf(stderr, "META (*COMMIT)"); break;
    case META_PRUNE: fprintf(stderr, "META (*PRUNE)"); break;
    case META_SKIP: fprintf(stderr, "META (*SKIP)"); break;
    case META_THEN: fprintf(stderr, "META (*THEN)"); break;

    case META_OPTIONS:
    fprintf(stderr, "META_OPTIONS 0x%08x 0x%08x", pptr[0], pptr[1]);
    pptr += 2;
    break;

    /* Each lookbehind item is followed by two data items, of which only the
    first is displayed. */

    case META_LOOKBEHIND:
    fprintf(stderr, "META (?<= %u %u", meta_arg, *pptr);
    pptr += 2;
    break;

    case META_LOOKBEHIND_NA:
    fprintf(stderr, "META (*naplb: %u %u", meta_arg, *pptr);
    pptr += 2;
    break;

    case META_LOOKBEHINDNOT:
    fprintf(stderr, "META (?<! %u %u", meta_arg, *pptr);
    pptr += 2;
    break;

    case META_CALLOUT_NUMBER:
    fprintf(stderr, "META (?C%u) next=%u/%u", pptr[2], pptr[0],
       pptr[1]);
    pptr += 3;
    break;

    case META_CALLOUT_STRING:
      {
      uint32_t patoffset = *pptr++;    /* Offset of next pattern item */
      uint32_t patlength = *pptr++;    /* Length of next pattern item */
      fprintf(stderr, "META (?Cstring) length=%u offset=", *pptr++);
      GETOFFSET(offset, pptr);
      fprintf(stderr, "%zu next=%u/%u", offset, patoffset, patlength);
      }
    break;

    case META_RECURSE_BYNAME:
    fprintf(stderr, "META (?&name) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_BACKREF_BYNAME:
    fprintf(stderr, "META_BACKREF_BYNAME length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* Here the offset comes first in the vector and the number follows it, so
    the number is picked up by indexing past the offset before the offset
    itself is read. */

    case META_COND_NUMBER:
    fprintf(stderr, "META_COND_NUMBER %u offset=", pptr[SIZEOFFSET]);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    pptr++;
    break;

    case META_COND_DEFINE:
    fprintf(stderr, "META (?(DEFINE) offset=");
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_COND_VERSION:
    fprintf(stderr, "META (?(VERSION%s", (*pptr++ == 0)? "=" : ">=");
    fprintf(stderr, "%u.", *pptr++);
    fprintf(stderr, "%u)", *pptr++);
    break;

    case META_COND_NAME:
    fprintf(stderr, "META (?(<name>) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_COND_RNAME:
    fprintf(stderr, "META (?(R&name) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* This is kept as a name, because it might be. */

    case META_COND_RNUMBER:
    fprintf(stderr, "META (?(Rnumber) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* The verbs that have an argument share the code that shows it. */

    case META_MARK:
    fprintf(stderr, "META (*MARK:");
    goto SHOWARG;

    case META_COMMIT_ARG:
    fprintf(stderr, "META (*COMMIT:");
    goto SHOWARG;

    case META_PRUNE_ARG:
    fprintf(stderr, "META (*PRUNE:");
    goto SHOWARG;

    case META_SKIP_ARG:
    fprintf(stderr, "META (*SKIP:");
    goto SHOWARG;

    case META_THEN_ARG:
    fprintf(stderr, "META (*THEN:");
    SHOWARG:
    length = *pptr++;
    for (i = 0; i < length; i++)
      {
      uint32_t cc = *pptr++;
      if (cc > 32 && cc < 128) fprintf(stderr, "%c", (int)cc);
        else fprintf(stderr, "\\x{%x}", cc);
      }
    fprintf(stderr, ") length=%u", length);
    break;
    }
  fprintf(stderr, "\n");
  }
}
#endif  /* DEBUG_SHOW_PARSED */
1189
1190
1191
1192
/*************************************************
1193
*               Copy compiled code               *
1194
*************************************************/
1195
1196
/* Compiled JIT code cannot be copied, so the new compiled block has no
1197
associated JIT data. */
1198
1199
PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
1200
pcre2_code_copy(const pcre2_code *code)
1201
0
{
1202
0
PCRE2_SIZE* ref_count;
1203
0
pcre2_code *newcode;
1204
1205
0
if (code == NULL) return NULL;
1206
0
newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
1207
0
if (newcode == NULL) return NULL;
1208
0
memcpy(newcode, code, code->blocksize);
1209
0
newcode->executable_jit = NULL;
1210
1211
/* If the code is one that has been deserialized, increment the reference count
1212
in the decoded tables. */
1213
1214
0
if ((code->flags & PCRE2_DEREF_TABLES) != 0)
1215
0
  {
1216
0
  ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
1217
0
  (*ref_count)++;
1218
0
  }
1219
1220
0
return newcode;
1221
0
}
1222
1223
1224
1225
/*************************************************
1226
*     Copy compiled code and character tables    *
1227
*************************************************/
1228
1229
/* Compiled JIT code cannot be copied, so the new compiled block has no
1230
associated JIT data. This version of code_copy also makes a separate copy of
1231
the character tables. */
1232
1233
PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
1234
pcre2_code_copy_with_tables(const pcre2_code *code)
1235
0
{
1236
0
PCRE2_SIZE* ref_count;
1237
0
pcre2_code *newcode;
1238
0
uint8_t *newtables;
1239
1240
0
if (code == NULL) return NULL;
1241
0
newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
1242
0
if (newcode == NULL) return NULL;
1243
0
memcpy(newcode, code, code->blocksize);
1244
0
newcode->executable_jit = NULL;
1245
1246
0
newtables = code->memctl.malloc(TABLES_LENGTH + sizeof(PCRE2_SIZE),
1247
0
  code->memctl.memory_data);
1248
0
if (newtables == NULL)
1249
0
  {
1250
0
  code->memctl.free((void *)newcode, code->memctl.memory_data);
1251
0
  return NULL;
1252
0
  }
1253
0
memcpy(newtables, code->tables, TABLES_LENGTH);
1254
0
ref_count = (PCRE2_SIZE *)(newtables + TABLES_LENGTH);
1255
0
*ref_count = 1;
1256
1257
0
newcode->tables = newtables;
1258
0
newcode->flags |= PCRE2_DEREF_TABLES;
1259
0
return newcode;
1260
0
}
1261
1262
1263
1264
/*************************************************
1265
*               Free compiled code               *
1266
*************************************************/
1267
1268
PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
1269
pcre2_code_free(pcre2_code *code)
1270
566
{
1271
566
PCRE2_SIZE* ref_count;
1272
1273
566
if (code != NULL)
1274
0
  {
1275
#ifdef SUPPORT_JIT
1276
  if (code->executable_jit != NULL)
1277
    PRIV(jit_free)(code->executable_jit, &code->memctl);
1278
#endif
1279
1280
0
  if ((code->flags & PCRE2_DEREF_TABLES) != 0)
1281
0
    {
1282
    /* Decoded tables belong to the codes after deserialization, and they must
1283
    be freed when there are no more references to them. The *ref_count should
1284
    always be > 0. */
1285
1286
0
    ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
1287
0
    if (*ref_count > 0)
1288
0
      {
1289
0
      (*ref_count)--;
1290
0
      if (*ref_count == 0)
1291
0
        code->memctl.free((void *)code->tables, code->memctl.memory_data);
1292
0
      }
1293
0
    }
1294
1295
0
  code->memctl.free(code, code->memctl.memory_data);
1296
0
  }
1297
566
}
1298
1299
1300
1301
/*************************************************
1302
*         Read a number, possibly signed         *
1303
*************************************************/
1304
1305
/* This function is used to read numbers in the pattern. The initial pointer
1306
must be at the sign or first digit of the number. When relative values
1307
(introduced by + or -) are allowed, they are relative group numbers, and the
1308
result must be greater than zero.
1309
1310
Arguments:
1311
  ptrptr      points to the character pointer variable
1312
  ptrend      points to the end of the input string
1313
  allow_sign  if < 0, sign not allowed; if >= 0, sign is relative to this
1314
  max_value   the largest number allowed
1315
  max_error   the error to give for an over-large number
1316
  intptr      where to put the result
1317
  errcodeptr  where to put an error code
1318
1319
Returns:      TRUE  - a number was read
1320
              FALSE - errorcode == 0 => no number was found
1321
                      errorcode != 0 => an error occurred
1322
*/
1323
1324
static BOOL
1325
read_number(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, int32_t allow_sign,
1326
  uint32_t max_value, uint32_t max_error, int *intptr, int *errorcodeptr)
1327
1.28k
{
1328
1.28k
int sign = 0;
1329
1.28k
uint32_t n = 0;
1330
1.28k
PCRE2_SPTR ptr = *ptrptr;
1331
1.28k
BOOL yield = FALSE;
1332
1333
1.28k
*errorcodeptr = 0;
1334
1335
1.28k
if (allow_sign >= 0 && ptr < ptrend)
1336
6
  {
1337
6
  if (*ptr == CHAR_PLUS)
1338
3
    {
1339
3
    sign = +1;
1340
3
    max_value -= allow_sign;
1341
3
    ptr++;
1342
3
    }
1343
3
  else if (*ptr == CHAR_MINUS)
1344
0
    {
1345
0
    sign = -1;
1346
0
    ptr++;
1347
0
    }
1348
6
  }
1349
1350
1.28k
if (ptr >= ptrend || !IS_DIGIT(*ptr)) return FALSE;
1351
3.40k
while (ptr < ptrend && IS_DIGIT(*ptr))
1352
2.12k
  {
1353
2.12k
  n = n * 10 + *ptr++ - CHAR_0;
1354
2.12k
  if (n > max_value)
1355
8
    {
1356
8
    *errorcodeptr = max_error;
1357
8
    goto EXIT;
1358
8
    }
1359
2.12k
  }
1360
1361
1.27k
if (allow_sign >= 0 && sign != 0)
1362
0
  {
1363
0
  if (n == 0)
1364
0
    {
1365
0
    *errorcodeptr = ERR26;  /* +0 and -0 are not allowed */
1366
0
    goto EXIT;
1367
0
    }
1368
1369
0
  if (sign > 0) n += allow_sign;
1370
0
  else if ((int)n > allow_sign)
1371
0
    {
1372
0
    *errorcodeptr = ERR15;  /* Non-existent subpattern */
1373
0
    goto EXIT;
1374
0
    }
1375
0
  else n = allow_sign + 1 - n;
1376
0
  }
1377
1378
1.27k
yield = TRUE;
1379
1380
1.28k
EXIT:
1381
1.28k
*intptr = n;
1382
1.28k
*ptrptr = ptr;
1383
1.28k
return yield;
1384
1.27k
}
1385
1386
1387
1388
/*************************************************
1389
*         Read repeat counts                     *
1390
*************************************************/
1391
1392
/* Read an item of the form {n,m} and return the values when non-NULL pointers
1393
are supplied. Repeat counts must be less than 65536 (MAX_REPEAT_COUNT); a
1394
larger value is used for "unlimited". We have to use signed arguments for
1395
read_number() because it is capable of returning a signed value. As of Perl
1396
5.34.0 either n or m may be absent, but not both. Perl also allows spaces and
1397
tabs after { and before } and between the numbers and the comma, so we do too.
1398
1399
Arguments:
1400
  ptrptr         points to pointer to character after '{'
1401
  ptrend         pointer to end of input
1402
  minp           if not NULL, pointer to int for min
1403
  maxp           if not NULL, pointer to int for max
1404
  errorcodeptr   points to error code variable
1405
1406
Returns:         FALSE if not a repeat quantifier, errorcode set zero
1407
                 FALSE on error, with errorcode set non-zero
1408
                 TRUE on success, with pointer updated to point after '}'
1409
*/
1410
1411
static BOOL
1412
read_repeat_counts(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *minp,
1413
  uint32_t *maxp, int *errorcodeptr)
1414
1.44k
{
1415
1.44k
PCRE2_SPTR p = *ptrptr;
1416
1.44k
PCRE2_SPTR pp;
1417
1.44k
BOOL yield = FALSE;
1418
1.44k
BOOL had_minimum = FALSE;
1419
1.44k
int32_t min = 0;
1420
1.44k
int32_t max = REPEAT_UNLIMITED; /* This value is larger than MAX_REPEAT_COUNT */
1421
1422
1.44k
*errorcodeptr = 0;
1423
1.52k
while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1424
1425
/* Check the syntax before interpreting. Otherwise, a non-quantifier sequence
1426
such as "X{123456ABC" would incorrectly give a "number too big in quantifier"
1427
error. */
1428
1429
1.44k
pp = p;
1430
1.44k
if (pp < ptrend && IS_DIGIT(*pp))
1431
150
  {
1432
150
  had_minimum = TRUE;
1433
416
  while (++pp < ptrend && IS_DIGIT(*pp)) {}
1434
150
  }
1435
1436
1.45k
while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;
1437
1.44k
if (pp >= ptrend) return FALSE;
1438
1439
1.42k
if (*pp == CHAR_RIGHT_CURLY_BRACKET)
1440
190
  {
1441
190
  if (!had_minimum) return FALSE;
1442
190
  }
1443
1.23k
else
1444
1.23k
  {
1445
1.23k
  if (*pp++ != CHAR_COMMA) return FALSE;
1446
44
  while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;
1447
6
  if (pp >= ptrend) return FALSE;
1448
6
  if (IS_DIGIT(*pp))
1449
0
    {
1450
0
    while (++pp < ptrend && IS_DIGIT(*pp)) {}
1451
0
    }
1452
6
  else if (!had_minimum) return FALSE;
1453
2
  while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;
1454
2
  if (pp >= ptrend || *pp != CHAR_RIGHT_CURLY_BRACKET) return FALSE;
1455
2
  }
1456
1457
/* Now process the quantifier for real. We know it must be {n} or (n,} or {,m}
1458
or {n,m}. The only error that read_number() can return is for a number that is
1459
too big. If *errorcodeptr is returned as zero it means no number was found. */
1460
1461
/* Deal with {,m} or n too big. If we successfully read m there is no need to
1462
check m >= n because n defaults to zero. */
1463
1464
4
if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &min, errorcodeptr))
1465
0
  {
1466
0
  if (*errorcodeptr != 0) goto EXIT;    /* n too big */
1467
0
  p++;  /* Skip comma and subsequent spaces */
1468
0
  while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1469
0
  if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max, errorcodeptr))
1470
0
    {
1471
0
    if (*errorcodeptr != 0) goto EXIT;  /* m too big */
1472
0
    }
1473
0
  }
1474
1475
/* Have read one number. Deal with {n} or {n,} or {n,m} */
1476
1477
4
else
1478
4
  {
1479
4
  while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1480
4
  if (*p == CHAR_RIGHT_CURLY_BRACKET)
1481
4
    {
1482
4
    max = min;
1483
4
    }
1484
0
  else   /* Handle {n,} or {n,m} */
1485
0
    {
1486
0
    p++;    /* Skip comma and subsequent spaces */
1487
0
    while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1488
0
    if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max, errorcodeptr))
1489
0
      {
1490
0
      if (*errorcodeptr != 0) goto EXIT;   /* m too big */
1491
0
      }
1492
1493
0
    if (max < min)
1494
0
      {
1495
0
      *errorcodeptr = ERR4;
1496
0
      goto EXIT;
1497
0
      }
1498
0
    }
1499
4
  }
1500
1501
/* Valid quantifier exists */
1502
1503
4
while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1504
4
p++;
1505
4
yield = TRUE;
1506
4
if (minp != NULL) *minp = (uint32_t)min;
1507
4
if (maxp != NULL) *maxp = (uint32_t)max;
1508
1509
/* Update the pattern pointer */
1510
1511
4
EXIT:
1512
4
*ptrptr = p;
1513
4
return yield;
1514
4
}
1515
1516
1517
1518
/*************************************************
1519
*            Handle escapes                      *
1520
*************************************************/
1521
1522
/* This function is called when a \ has been encountered. It either returns a
1523
positive value for a simple escape such as \d, or 0 for a data character, which
1524
is placed in chptr. A backreference to group n is returned as negative n. On
1525
entry, ptr is pointing at the character after \. On exit, it points after the
1526
final code unit of the escape sequence.
1527
1528
This function is also called from pcre2_substitute() to handle escape sequences
1529
in replacement strings. In this case, the cb argument is NULL, and in the case
1530
of escapes that have further processing, only sequences that define a data
1531
character are recognised. The isclass argument is not relevant; the options
1532
argument is the final value of the compiled pattern's options.
1533
1534
Arguments:
1535
  ptrptr         points to the input position pointer
1536
  ptrend         points to the end of the input
1537
  chptr          points to a returned data character
1538
  errorcodeptr   points to the errorcode variable (containing zero)
1539
  options        the current options bits
1540
  xoptions       the current extra options bits
1541
  isclass        TRUE if inside a character class
1542
  cb             compile data block or NULL when called from pcre2_substitute()
1543
1544
Returns:         zero => a data character
1545
                 positive => a special escape sequence
1546
                 negative => a numerical back reference
1547
                 on error, errorcodeptr is set non-zero
1548
*/
1549
1550
int
1551
PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr,
1552
  int *errorcodeptr, uint32_t options, uint32_t xoptions, BOOL isclass,
1553
  compile_block *cb)
1554
30.7k
{
1555
30.7k
/* utf is TRUE when PCRE2_UTF is set in options; alt_bsux is TRUE when either
PCRE2_ALT_BSUX (in options) or PCRE2_EXTRA_ALT_BSUX (in xoptions) is set. */
BOOL utf = (options & PCRE2_UTF) != 0;
1556
30.7k
BOOL alt_bsux =
1557
30.7k
  ((options & PCRE2_ALT_BSUX) | (xoptions & PCRE2_EXTRA_ALT_BSUX)) != 0;
1558
30.7k
PCRE2_SPTR ptr = *ptrptr;
1559
30.7k
uint32_t c, cc;
1560
30.7k
/* The function's return value; it stays zero when a data character is
returned in *chptr. */
int escape = 0;
1561
30.7k
int i;
1562
1563
/* If backslash is at the end of the string, it's an error. */
1564
1565
30.7k
if (ptr >= ptrend)
1566
0
  {
1567
0
  *errorcodeptr = ERR1;
1568
0
  return 0;
1569
0
  }
1570
1571
30.7k
GETCHARINCTEST(c, ptr);         /* Get character value, increment pointer */
1572
30.7k
*errorcodeptr = 0;              /* Be optimistic */
1573
1574
/* Non-alphanumerics are literals, so we just leave the value in c. An initial
1575
value test saves a memory lookup for code points outside the alphanumeric
1576
range. */
1577
1578
30.7k
if (c < ESCAPES_FIRST || c > ESCAPES_LAST) {}  /* Definitely literal */
1579
1580
/* Otherwise, do a table lookup. Non-zero values need little processing here. A
1581
positive value is a literal value for something like \n. A negative value is
1582
the negation of one of the ESC_ macros that is passed back for handling by the
1583
calling function. Some extra checking is needed for \N because only \N{U+dddd}
1584
is supported. If the value is zero, further processing is handled below. */
1585
1586
23.7k
else if ((i = escapes[c - ESCAPES_FIRST]) != 0)
1587
21.9k
  {
1588
21.9k
  if (i > 0)
1589
4.11k
    {
1590
4.11k
    c = (uint32_t)i;
1591
4.11k
    if (c == CHAR_CR && (xoptions & PCRE2_EXTRA_ESCAPED_CR_IS_LF) != 0)
1592
0
      c = CHAR_LF;
1593
4.11k
    }
1594
17.8k
  else  /* Negative table entry */
1595
17.8k
    {
1596
17.8k
    escape = -i;                    /* Else return a special escape */
1597
17.8k
    if (cb != NULL && (escape == ESC_P || escape == ESC_p || escape == ESC_X))
1598
803
      cb->external_flags |= PCRE2_HASBKPORX;   /* Note \P, \p, or \X */
1599
1600
    /* Perl supports \N{name} for character names and \N{U+dddd} for numerical
1601
    Unicode code points, as well as plain \N for "not newline". PCRE does not
1602
    support \N{name}. However, it does support quantification such as \N{2,3},
1603
    so if \N{ is not followed by U+dddd we check for a quantifier. */
1604
1605
17.8k
    if (escape == ESC_N && ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
1606
0
      {
1607
0
      PCRE2_SPTR p = ptr + 1;
1608
1609
      /* Perl ignores spaces and tabs after { */
1610
1611
0
      while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1612
1613
      /* \N{U+ can be handled by the \x{ code. However, this construction is
1614
      not valid in EBCDIC environments because it specifies a Unicode
1615
      character, not a codepoint in the local code. For example \N{U+0041}
1616
      must be "A" in all environments. Also, in Perl, \N{U+ forces Unicode
1617
      casing semantics for the entire pattern, so allow it only in UTF (i.e.
1618
      Unicode) mode. */
1619
1620
0
      if (ptrend - p > 1 && *p == CHAR_U && p[1] == CHAR_PLUS)
1621
0
        {
1622
#ifdef EBCDIC
1623
        *errorcodeptr = ERR93;
1624
#else
1625
0
        if (utf)
1626
0
          {
1627
0
          ptr = p + 2;
1628
0
          escape = 0;   /* Not a fancy escape after all */
1629
0
          goto COME_FROM_NU;
1630
0
          }
1631
0
        else *errorcodeptr = ERR93;
1632
0
#endif
1633
0
        }
1634
1635
      /* Give an error if what follows is not a quantifier, but don't override
1636
      an error set by the quantifier reader (e.g. number overflow). */
1637
1638
0
      else
1639
0
        {
1640
0
        if (!read_repeat_counts(&p, ptrend, NULL, NULL, errorcodeptr) &&
1641
0
             *errorcodeptr == 0)
1642
0
          *errorcodeptr = ERR37;
1643
0
        }
1644
0
      }
1645
17.8k
    }
1646
21.9k
  }
1647
1648
/* Escapes that need further processing, including those that are unknown, have
1649
a zero entry in the lookup table. When called from pcre2_substitute(), only \c,
1650
\o, and \x are recognized (\u and \U can never appear as they are used for case
1651
forcing). */
1652
1653
1.80k
else
1654
1.80k
  {
1655
1.80k
  int s;
1656
1.80k
  PCRE2_SPTR oldptr;
1657
1.80k
  BOOL overflow;
1658
1659
  /* Filter calls from pcre2_substitute(). */
1660
1661
1.80k
  if (cb == NULL)
1662
0
    {
1663
0
    if (c != CHAR_c && c != CHAR_o && c != CHAR_x)
1664
0
      {
1665
0
      *errorcodeptr = ERR3;
1666
0
      return 0;
1667
0
      }
1668
0
    alt_bsux = FALSE;   /* Do not modify \x handling */
1669
0
    }
1670
1671
1.80k
  switch (c)
1672
1.80k
    {
1673
    /* A number of Perl escapes are not handled by PCRE. We give an explicit
1674
    error. */
1675
1676
4
    case CHAR_F:
1677
6
    case CHAR_l:
1678
10
    case CHAR_L:
1679
10
    *errorcodeptr = ERR37;
1680
10
    break;
1681
1682
    /* \u is unrecognized when neither PCRE2_ALT_BSUX nor PCRE2_EXTRA_ALT_BSUX
1683
    is set. Otherwise, \u must be followed by exactly four hex digits or, if
1684
    PCRE2_EXTRA_ALT_BSUX is set, by any number of hex digits in braces.
1685
    Otherwise it is a lowercase u letter. This gives some compatibility with
1686
    ECMAScript (aka JavaScript). Unlike other braced items, white space is NOT
1687
    allowed. When \u{ is not followed by hex digits, a special return is given
1688
    because otherwise \u{ 12} (for example) would be treated as u{12}. */
1689
1690
5
    case CHAR_u:
1691
5
    if (!alt_bsux) *errorcodeptr = ERR37; else
1692
0
      {
1693
0
      uint32_t xc;
1694
1695
0
      /* Nothing follows \u: c is still CHAR_u and is returned as a literal. */
      if (ptr >= ptrend) break;
1696
0
      if (*ptr == CHAR_LEFT_CURLY_BRACKET &&
1697
0
          (xoptions & PCRE2_EXTRA_ALT_BSUX) != 0)
1698
0
        {
1699
0
        PCRE2_SPTR hptr = ptr + 1;
1700
1701
0
        cc = 0;
1702
0
        while (hptr < ptrend && (xc = XDIGIT(*hptr)) != 0xff)
1703
0
          {
1704
0
          if ((cc & 0xf0000000) != 0)  /* Test for 32-bit overflow */
1705
0
            {
1706
0
            *errorcodeptr = ERR77;
1707
0
            ptr = hptr;   /* Show where */
1708
0
            break;        /* *hptr != } will cause another break below */
1709
0
            }
1710
0
          cc = (cc << 4) | xc;
1711
0
          hptr++;
1712
0
          }
1713
1714
0
        if (hptr == ptr + 1 ||   /* No hex digits */
1715
0
            hptr >= ptrend ||    /* Hit end of input */
1716
0
            *hptr != CHAR_RIGHT_CURLY_BRACKET)  /* No } terminator */
1717
0
          {
1718
0
          escape = ESC_ub;    /* Special return */
1719
0
          ptr++;              /* Skip { */
1720
0
          break;              /* Hex escape not recognized */
1721
0
          }
1722
1723
0
        c = cc;          /* Accept the code point */
1724
0
        ptr = hptr + 1;
1725
0
        }
1726
1727
0
      else  /* Must be exactly 4 hex digits */
1728
0
        {
1729
0
        if (ptrend - ptr < 4) break;               /* Less than 4 chars */
1730
0
        if ((cc = XDIGIT(ptr[0])) == 0xff) break;  /* Not a hex digit */
1731
0
        if ((xc = XDIGIT(ptr[1])) == 0xff) break;  /* Not a hex digit */
1732
0
        cc = (cc << 4) | xc;
1733
0
        if ((xc = XDIGIT(ptr[2])) == 0xff) break;  /* Not a hex digit */
1734
0
        cc = (cc << 4) | xc;
1735
0
        if ((xc = XDIGIT(ptr[3])) == 0xff) break;  /* Not a hex digit */
1736
0
        c = (cc << 4) | xc;
1737
0
        ptr += 4;
1738
0
        }
1739
1740
0
      if (utf)
1741
0
        {
1742
0
        if (c > 0x10ffffU) *errorcodeptr = ERR77;
1743
0
        else
1744
0
          if (c >= 0xd800 && c <= 0xdfff &&
1745
0
              (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1746
0
                *errorcodeptr = ERR73;
1747
0
        }
1748
0
      else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77;
1749
0
      }
1750
5
    break;
1751
1752
    /* \U is unrecognized unless PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set,
1753
    in which case it is an upper case letter. */
1754
1755
5
    case CHAR_U:
1756
0
    if (!alt_bsux) *errorcodeptr = ERR37;
1757
0
    break;
1758
1759
    /* In a character class, \g is just a literal "g". Outside a character
1760
    class, \g must be followed by one of a number of specific things:
1761
1762
    (1) A number, either plain or braced. If positive, it is an absolute
1763
    backreference. If negative, it is a relative backreference. This is a Perl
1764
    5.10 feature.
1765
1766
    (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1767
    is part of Perl's movement towards a unified syntax for back references. As
1768
    this is synonymous with \k{name}, we fudge it up by pretending it really
1769
    was \k{name}.
1770
1771
    (3) For Oniguruma compatibility we also support \g followed by a name or a
1772
    number either in angle brackets or in single quotes. However, these are
1773
    (possibly recursive) subroutine calls, _not_ backreferences. We return
1774
    the ESC_g code.
1775
1776
    Summary: Return a negative number for a numerical back reference, ESC_k for
1777
    a named back reference, and ESC_g for a named or numbered subroutine call.
1778
    */
1779
1780
3
    case CHAR_g:
1781
3
    if (isclass) break;
1782
1783
3
    if (ptr >= ptrend)
1784
0
      {
1785
0
      *errorcodeptr = ERR57;
1786
0
      break;
1787
0
      }
1788
1789
3
    if (*ptr == CHAR_LESS_THAN_SIGN || *ptr == CHAR_APOSTROPHE)
1790
0
      {
1791
0
      escape = ESC_g;
1792
0
      break;
1793
0
      }
1794
1795
    /* If there is a brace delimiter, try to read a numerical reference. If
1796
    there isn't one, assume we have a name and treat it as \k. */
1797
1798
3
    if (*ptr == CHAR_LEFT_CURLY_BRACKET)
1799
0
      {
1800
0
      PCRE2_SPTR p = ptr + 1;
1801
1802
0
      while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1803
0
      if (!read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s,
1804
0
          errorcodeptr))
1805
0
        {
1806
0
        if (*errorcodeptr == 0) escape = ESC_k;  /* No number found */
1807
0
        break;
1808
0
        }
1809
0
      while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1810
1811
0
      if (p >= ptrend || *p != CHAR_RIGHT_CURLY_BRACKET)
1812
0
        {
1813
0
        *errorcodeptr = ERR57;
1814
0
        break;
1815
0
        }
1816
0
      ptr = p + 1;
1817
0
      }
1818
1819
    /* Read an undelimited number */
1820
1821
3
    else
1822
3
      {
1823
3
      if (!read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s,
1824
3
          errorcodeptr))
1825
3
        {
1826
3
        if (*errorcodeptr == 0) *errorcodeptr = ERR57;  /* No number found */
1827
3
        break;
1828
3
        }
1829
3
      }
1830
1831
0
    if (s <= 0)
1832
0
      {
1833
0
      *errorcodeptr = ERR15;
1834
0
      break;
1835
0
      }
1836
1837
0
    escape = -s;
1838
0
    break;
1839
1840
    /* The handling of escape sequences consisting of a string of digits
1841
    starting with one that is not zero is not straightforward. Perl has changed
1842
    over the years. Nowadays \g{} for backreferences and \o{} for octal are
1843
    recommended to avoid the ambiguities in the old syntax.
1844
1845
    Outside a character class, the digits are read as a decimal number. If the
1846
    number is less than 10, or if there are that many previous extracting left
1847
    brackets, it is a back reference. Otherwise, up to three octal digits are
1848
    read to form an escaped character code. Thus \123 is likely to be octal 123
1849
    (cf \0123, which is octal 012 followed by the literal 3).
1850
1851
    Inside a character class, \ followed by a digit is always either a literal
1852
    8 or 9 or an octal number. */
1853
1854
902
    case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1855
1.29k
    case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
1856
1857
1.29k
    if (!isclass)
1858
1.25k
      {
1859
1.25k
      oldptr = ptr;
1860
1.25k
      ptr--;   /* Back to the digit */
1861
1862
      /* As we know we are at a digit, the only possible error from
1863
      read_number() is a number that is too large to be a group number. In this
1864
      case we fall through and handle this as not a group reference. If we have
1865
      read a small enough number, check for a back reference.
1866
1867
      \1 to \9 are always back references. \8x and \9x are too; \1x to \7x
1868
      are octal escapes if there are not that many previous captures. */
1869
1870
1.25k
      if (read_number(&ptr, ptrend, -1, INT_MAX/10 - 1, 0, &s, errorcodeptr) &&
1871
1.24k
          (s < 10 || oldptr[-1] >= CHAR_8 || s <= (int)cb->bracount))
1872
899
        {
1873
899
        if (s > (int)MAX_GROUP_NUMBER) *errorcodeptr = ERR61;
1874
899
          else escape = -s;     /* Indicates a back reference */
1875
899
        break;
1876
899
        }
1877
1878
356
      ptr = oldptr;      /* Put the pointer back and fall through */
1879
356
      }
1880
1881
    /* Handle a digit following \ when the number is not a back reference, or
1882
    we are within a character class. If the first digit is 8 or 9, Perl used to
1883
    generate a binary zero and then treat the digit as a following literal. At
1884
    least by Perl 5.18 this changed so as not to insert the binary zero. */
1885
1886
395
    /* For 8 or 9, c still holds the digit character and escape is still zero,
    so the digit itself is returned as a data character. */
    if (c >= CHAR_8) break;
1887
1888
    /* Fall through */
1889
1890
    /* \0 always starts an octal number, but we may drop through to here with a
1891
    larger first octal digit. The original code used just to take the least
1892
    significant 8 bits of octal numbers (I think this is what early Perls used
1893
    to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1894
    but no more than 3 octal digits. */
1895
1896
819
    case CHAR_0:
1897
819
    c -= CHAR_0;
1898
2.08k
    /* i is zero here, because this branch of the function is reached only when
    the escapes[] lookup above yielded zero. The loop therefore consumes at
    most two further octal digits. */
    while(i++ < 2 && ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1899
1.27k
        c = c * 8 + *ptr++ - CHAR_0;
1900
819
#if PCRE2_CODE_UNIT_WIDTH == 8
1901
819
    if (!utf && c > 0xff) *errorcodeptr = ERR51;
1902
819
#endif
1903
819
    break;
1904
1905
    /* \o is a relatively new Perl feature, supporting a more general way of
1906
    specifying character codes in octal. The only supported form is \o{ddd},
1907
    with optional spaces or tabs after { and before }. */
1908
1909
3
    case CHAR_o:
1910
3
    /* NOTE(review): when ptr >= ptrend the test short-circuits and ptr is not
    advanced, yet the decrement below is still applied. */
    if (ptr >= ptrend || *ptr++ != CHAR_LEFT_CURLY_BRACKET)
1911
3
      {
1912
3
      ptr--;
1913
3
      *errorcodeptr = ERR55;
1914
3
      break;
1915
3
      }
1916
1917
0
    while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
1918
0
    if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
1919
0
      {
1920
0
      *errorcodeptr = ERR78;
1921
0
      break;
1922
0
      }
1923
1924
0
    c = 0;
1925
0
    overflow = FALSE;
1926
0
    while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1927
0
      {
1928
0
      cc = *ptr++;
1929
0
      if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1930
#if PCRE2_CODE_UNIT_WIDTH == 32
1931
      if (c >= 0x20000000l) { overflow = TRUE; break; }
1932
#endif
1933
0
      c = (c << 3) + (cc - CHAR_0);
1934
0
#if PCRE2_CODE_UNIT_WIDTH == 8
1935
0
      if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1936
#elif PCRE2_CODE_UNIT_WIDTH == 16
1937
      if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1938
#elif PCRE2_CODE_UNIT_WIDTH == 32
1939
      if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1940
#endif
1941
0
      }
1942
1943
0
    while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
1944
1945
0
    if (overflow)
1946
0
      {
1947
0
      while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1948
0
      *errorcodeptr = ERR34;
1949
0
      }
1950
0
    else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
1951
0
      {
1952
0
      if (utf && c >= 0xd800 && c <= 0xdfff &&
1953
0
          (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1954
0
        {
1955
0
        ptr--;
1956
0
        *errorcodeptr = ERR73;
1957
0
        }
1958
0
      }
1959
0
    else
1960
0
      {
1961
0
      ptr--;
1962
0
      *errorcodeptr = ERR64;
1963
0
      }
1964
0
    break;
1965
1966
    /* When PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set, \x must be followed
1967
    by two hexadecimal digits. Otherwise it is a lowercase x letter. */
1968
1969
14
    case CHAR_x:
1970
14
    if (alt_bsux)
1971
0
      {
1972
0
      uint32_t xc;
1973
0
      /* On any of the breaks below c is still CHAR_x, so a literal x is
      returned and ptr is not advanced. */
      if (ptrend - ptr < 2) break;               /* Less than 2 characters */
1974
0
      if ((cc = XDIGIT(ptr[0])) == 0xff) break;  /* Not a hex digit */
1975
0
      if ((xc = XDIGIT(ptr[1])) == 0xff) break;  /* Not a hex digit */
1976
0
      c = (cc << 4) | xc;
1977
0
      ptr += 2;
1978
0
      }
1979
1980
    /* Handle \x in Perl's style. \x{ddd} is a character code which can be
1981
    greater than 0xff in UTF-8 or non-8bit mode, but only if the ddd are hex
1982
    digits. If not, { used to be treated as a data character. However, Perl
1983
    seems to read hex digits up to the first non-such, and ignore the rest, so
1984
    that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
1985
    now gives an error. */
1986
1987
14
    else
1988
14
      {
1989
14
      if (ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
1990
0
        {
1991
0
        ptr++;
1992
0
        while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
1993
1994
0
        /* The \N{U+ handling near the top of this function jumps to the label
        below with ptr pointing just after "U+" and escape reset to zero. */
#ifndef EBCDIC
1995
0
        COME_FROM_NU:
1996
0
#endif
1997
0
        if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
1998
0
          {
1999
0
          *errorcodeptr = ERR78;
2000
0
          break;
2001
0
          }
2002
0
        c = 0;
2003
0
        overflow = FALSE;
2004
2005
0
        while (ptr < ptrend && (cc = XDIGIT(*ptr)) != 0xff)
2006
0
          {
2007
0
          ptr++;
2008
0
          if (c == 0 && cc == 0) continue;   /* Leading zeroes */
2009
#if PCRE2_CODE_UNIT_WIDTH == 32
2010
          if (c >= 0x10000000l) { overflow = TRUE; break; }
2011
#endif
2012
0
          c = (c << 4) | cc;
2013
0
          if ((utf && c > 0x10ffffU) || (!utf && c > MAX_NON_UTF_CHAR))
2014
0
            {
2015
0
            overflow = TRUE;
2016
0
            break;
2017
0
            }
2018
0
          }
2019
2020
        /* Perl ignores spaces and tabs before } */
2021
2022
0
        while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2023
2024
        /* On overflow, skip remaining hex digits */
2025
2026
0
        if (overflow)
2027
0
          {
2028
0
          while (ptr < ptrend && XDIGIT(*ptr) != 0xff) ptr++;
2029
0
          *errorcodeptr = ERR34;
2030
0
          }
2031
0
        else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
2032
0
          {
2033
0
          if (utf && c >= 0xd800 && c <= 0xdfff &&
2034
0
              (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
2035
0
            {
2036
0
            ptr--;
2037
0
            *errorcodeptr = ERR73;
2038
0
            }
2039
0
          }
2040
2041
        /* If the sequence of hex digits (followed by optional space) does not
2042
        end with '}', give an error. We used just to recognize this construct
2043
        and fall through to the normal \x handling, but nowadays Perl gives an
2044
        error, which seems much more sensible, so we do too. */
2045
2046
0
        else
2047
0
          {
2048
0
          ptr--;
2049
0
          *errorcodeptr = ERR67;
2050
0
          }
2051
0
        }   /* End of \x{} processing */
2052
2053
      /* Read up to two hex digits after \x. With no hex digit the value is
      zero; with only one, the value is that single digit. */
2054
2055
14
      else
2056
14
        {
2057
14
        c = 0;
2058
14
        if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break;  /* Not a hex digit */
2059
13
        ptr++;
2060
13
        c = cc;
2061
13
        if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break;  /* Not a hex digit */
2062
5
        ptr++;
2063
5
        c = (c << 4) | cc;
2064
5
        }     /* End of \xdd handling */
2065
14
      }       /* End of Perl-style \x handling */
2066
5
    break;
2067
2068
    /* The handling of \c is different in ASCII and EBCDIC environments. In an
2069
    ASCII (or Unicode) environment, an error is given if the character
2070
    following \c is not a printable ASCII character. Otherwise, the following
2071
    character is upper-cased if it is a letter, and after that the 0x40 bit is
2072
    flipped. The result is the value of the escape.
2073
2074
    In an EBCDIC environment the handling of \c is compatible with the
2075
    specification in the perlebcdic document. The following character must be
2076
    a letter or one of small number of special characters. These provide a
2077
    means of defining the character values 0-31.
2078
2079
    For testing the EBCDIC handling of \c in an ASCII environment, recognize
2080
    the EBCDIC value of 'c' explicitly. */
2081
2082
#if defined EBCDIC && 'a' != 0x81
2083
    case 0x83:
2084
#else
2085
32
    case CHAR_c:
2086
32
#endif
2087
32
    if (ptr >= ptrend)
2088
0
      {
2089
0
      *errorcodeptr = ERR2;
2090
0
      break;
2091
0
      }
2092
32
    c = *ptr;
2093
32
    if (c >= CHAR_a && c <= CHAR_z) c = UPPER_CASE(c);
2094
2095
    /* Handle \c in an ASCII/Unicode environment. */
2096
2097
32
#ifndef EBCDIC    /* ASCII/UTF-8 coding */
2098
32
    if (c < 32 || c > 126)  /* Excludes all non-printable ASCII */
2099
0
      {
2100
0
      *errorcodeptr = ERR68;
2101
0
      break;
2102
0
      }
2103
32
    c ^= 0x40;
2104
2105
    /* Handle \c in an EBCDIC environment. The special case \c? is converted to
2106
    255 (0xff) or 95 (0x5f) if other characters suggest we are using the
2107
    POSIX-BC encoding. (This is the way Perl indicates that it handles \c?.)
2108
    The other valid sequences correspond to a list of specific characters. */
2109
2110
#else
2111
    if (c == CHAR_QUESTION_MARK)
2112
      c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;
2113
    else
2114
      {
2115
      for (i = 0; i < 32; i++)
2116
        {
2117
        if (c == ebcdic_escape_c[i]) break;
2118
        }
2119
      if (i < 32) c = i; else *errorcodeptr = ERR68;
2120
      }
2121
#endif  /* EBCDIC */
2122
2123
32
    ptr++;
2124
32
    break;
2125
2126
    /* Any other alphanumeric following \ is an error. Perl gives an error only
2127
    if in warning mode, but PCRE doesn't have a warning mode. */
2128
2129
12
    default:
2130
12
    /* This path returns directly, bypassing the common exit code below, so
    *ptrptr is updated here and *chptr is left unset. */
    *errorcodeptr = ERR3;
2131
12
    *ptrptr = ptr - 1;     /* Point to the character at fault */
2132
12
    return 0;
2133
1.80k
    }
2134
1.80k
  }
2135
2136
/* Set the pointer to the next character before returning. */
2137
2138
30.6k
*ptrptr = ptr;
2139
30.6k
*chptr = c;
2140
30.6k
return escape;
2141
30.7k
}
2142
2143
2144
2145
#ifdef SUPPORT_UNICODE
2146
/*************************************************
2147
*               Handle \P and \p                 *
2148
*************************************************/
2149
2150
/* This function is called after \P or \p has been encountered, provided that
2151
PCRE2 is compiled with support for UTF and Unicode properties. On entry, the
2152
contents of ptrptr are pointing after the P or p. On exit, it is left pointing
2153
after the final code unit of the escape sequence.
2154
2155
Arguments:
2156
  ptrptr         the pattern position pointer
2157
  negptr         a boolean that is set TRUE for negation else FALSE
2158
  ptypeptr       an unsigned int that is set to the type value
2159
  pdataptr       an unsigned int that is set to the detailed property value
2160
  errorcodeptr   the error code variable
2161
  cb             the compile data
2162
2163
Returns:         TRUE if the type value was found, or FALSE for an invalid type
2164
*/
2165
2166
static BOOL
2167
get_ucp(PCRE2_SPTR *ptrptr, BOOL *negptr, uint16_t *ptypeptr,
2168
  uint16_t *pdataptr, int *errorcodeptr, compile_block *cb)
2169
447
{
2170
447
PCRE2_UCHAR c;
2171
447
PCRE2_SIZE i, bot, top;
2172
447
PCRE2_SPTR ptr = *ptrptr;
2173
447
PCRE2_UCHAR name[50];
2174
447
PCRE2_UCHAR *vptr = NULL;
2175
447
uint16_t ptscript = PT_NOTSCRIPT;
2176
2177
447
if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2178
447
c = *ptr++;
2179
447
*negptr = FALSE;
2180
2181
/* \P or \p can be followed by a name in {}, optionally preceded by ^ for
2182
negation. */
2183
2184
447
if (c == CHAR_LEFT_CURLY_BRACKET)
2185
3
  {
2186
3
  if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2187
2188
3
  if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
2189
0
    {
2190
0
    *negptr = TRUE;
2191
0
    ptr++;
2192
0
    }
2193
2194
43
  for (i = 0; i < (int)(sizeof(name) / sizeof(PCRE2_UCHAR)) - 1; i++)
2195
43
    {
2196
43
    if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2197
41
    c = *ptr++;
2198
#if PCRE2_CODE_UNIT_WIDTH != 8
2199
    while (c == '_' || c == '-' || (c <= 0xff && isspace(c)))
2200
#else
2201
54
    while (c == '_' || c == '-' || isspace(c))
2202
13
#endif
2203
13
      {
2204
13
      if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2205
13
      c = *ptr++;
2206
13
      }
2207
41
    if (c == CHAR_NUL) goto ERROR_RETURN;
2208
41
    if (c == CHAR_RIGHT_CURLY_BRACKET) break;
2209
40
    name[i] = tolower(c);
2210
40
    if ((c == ':' || c == '=') && vptr == NULL) vptr = name + i;
2211
40
    }
2212
2213
1
  if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
2214
1
  name[i] = 0;
2215
1
  }
2216
2217
/* If { doesn't follow \p or \P there is just one following character, which
2218
must be an ASCII letter. */
2219
2220
444
else if (MAX_255(c) && (cb->ctypes[c] & ctype_letter) != 0)
2221
425
  {
2222
425
  name[0] = tolower(c);
2223
425
  name[1] = 0;
2224
425
  }
2225
19
else goto ERROR_RETURN;
2226
2227
426
*ptrptr = ptr;
2228
2229
/* If the property contains ':' or '=' we have class name and value separately
2230
specified. The following are supported:
2231
2232
  . Bidi_Class (synonym bc), for which the property names are "bidi<name>".
2233
  . Script (synonym sc) for which the property name is the script name
2234
  . Script_Extensions (synonym scx), ditto
2235
2236
As this is a small number, we currently just check the names directly. If this
2237
grows, a sorted table and a switch will be neater.
2238
2239
For both the script properties, set a PT_xxx value so that (1) they can be
2240
distinguished and (2) invalid script names that happen to be the name of
2241
another property can be diagnosed. */
2242
2243
426
if (vptr != NULL)
2244
1
  {
2245
1
  int offset = 0;
2246
1
  PCRE2_UCHAR sname[8];
2247
2248
1
  *vptr = 0;   /* Terminate property name */
2249
1
  if (PRIV(strcmp_c8)(name, STRING_bidiclass) == 0 ||
2250
1
      PRIV(strcmp_c8)(name, STRING_bc) == 0)
2251
0
    {
2252
0
    offset = 4;
2253
0
    sname[0] = CHAR_b;
2254
0
    sname[1] = CHAR_i;  /* There is no strcpy_c8 function */
2255
0
    sname[2] = CHAR_d;
2256
0
    sname[3] = CHAR_i;
2257
0
    }
2258
2259
1
  else if (PRIV(strcmp_c8)(name, STRING_script) == 0 ||
2260
1
           PRIV(strcmp_c8)(name, STRING_sc) == 0)
2261
0
    ptscript = PT_SC;
2262
2263
1
  else if (PRIV(strcmp_c8)(name, STRING_scriptextensions) == 0 ||
2264
1
           PRIV(strcmp_c8)(name, STRING_scx) == 0)
2265
0
    ptscript = PT_SCX;
2266
2267
1
  else
2268
1
    {
2269
1
    *errorcodeptr = ERR47;
2270
1
    return FALSE;
2271
1
    }
2272
2273
  /* Adjust the string in name[] as needed */
2274
2275
0
  memmove(name + offset, vptr + 1, (name + i - vptr)*sizeof(PCRE2_UCHAR));
2276
0
  if (offset != 0) memmove(name, sname, offset*sizeof(PCRE2_UCHAR));
2277
0
  }
2278
2279
/* Search for a recognized property using binary chop. */
2280
2281
425
bot = 0;
2282
425
top = PRIV(utt_size);
2283
2284
3.05k
while (bot < top)
2285
3.04k
  {
2286
3.04k
  int r;
2287
3.04k
  i = (bot + top) >> 1;
2288
3.04k
  r = PRIV(strcmp_c8)(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
2289
2290
  /* When a matching property is found, some extra checking is needed when the
2291
  \p{xx:yy} syntax is used and xx is either sc or scx. */
2292
2293
3.04k
  if (r == 0)
2294
418
    {
2295
418
    *pdataptr = PRIV(utt)[i].value;
2296
418
    if (vptr == NULL || ptscript == PT_NOTSCRIPT)
2297
418
      {
2298
418
      *ptypeptr = PRIV(utt)[i].type;
2299
418
      return TRUE;
2300
418
      }
2301
2302
0
    switch (PRIV(utt)[i].type)
2303
0
      {
2304
0
      case PT_SC:
2305
0
      *ptypeptr = PT_SC;
2306
0
      return TRUE;
2307
2308
0
      case PT_SCX:
2309
0
      *ptypeptr = ptscript;
2310
0
      return TRUE;
2311
0
      }
2312
2313
0
    break;  /* Non-script found */
2314
0
    }
2315
2316
2.62k
  if (r > 0) bot = i + 1; else top = i;
2317
2.62k
  }
2318
2319
7
*errorcodeptr = ERR47;   /* Unrecognized property */
2320
7
return FALSE;
2321
2322
21
ERROR_RETURN:            /* Malformed \P or \p */
2323
21
*errorcodeptr = ERR46;
2324
21
*ptrptr = ptr;
2325
21
return FALSE;
2326
425
}
2327
#endif
2328
2329
2330
2331
/*************************************************
2332
*           Check for POSIX class syntax         *
2333
*************************************************/
2334
2335
/* This function is called when the sequence "[:" or "[." or "[=" is
2336
encountered in a character class. It checks whether this is followed by a
2337
sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2338
reach an unescaped ']' without the special preceding character, return FALSE.
2339
2340
Originally, this function only recognized a sequence of letters between the
2341
terminators, but it seems that Perl recognizes any sequence of characters,
2342
though of course unknown POSIX names are subsequently rejected. Perl gives an
2343
"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2344
didn't consider this to be a POSIX class. Likewise for [:1234:].
2345
2346
The problem in trying to be exactly like Perl is in the handling of escapes. We
2347
have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2348
class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2349
below handles the special cases \\ and \], but does not try to do any other
2350
escape processing. This makes it different from Perl for cases such as
2351
[:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
2352
not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
2353
when Perl does, I think.
2354
2355
A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2356
It seems that the appearance of a nested POSIX class supersedes an apparent
2357
external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2358
a digit. This is handled by returning FALSE if the start of a new group with
2359
the same terminator is encountered, since the next closing sequence must close
2360
the nested group, not the outer one.
2361
2362
In Perl, unescaped square brackets may also appear as part of class names. For
2363
example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2364
[:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2365
seem right at all. PCRE does not allow closing square brackets in POSIX class
2366
names.
2367
2368
Arguments:
2369
  ptr      pointer to the character after the initial [ (colon, dot, equals)
2370
  ptrend   pointer to the end of the pattern
2371
  endptr   where to return a pointer to the terminating ':', '.', or '='
2372
2373
Returns:   TRUE or FALSE
2374
*/
2375
2376
static BOOL
2377
check_posix_syntax(PCRE2_SPTR ptr, PCRE2_SPTR ptrend, PCRE2_SPTR *endptr)
2378
271
{
2379
271
PCRE2_UCHAR terminator;  /* Don't combine these lines; the Solaris cc */
2380
271
terminator = *ptr++;     /* compiler warns about "non-constant" initializer. */
2381
2382
12.8k
for (; ptrend - ptr >= 2; ptr++)
2383
12.8k
  {
2384
12.8k
  if (*ptr == CHAR_BACKSLASH &&
2385
528
      (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET || ptr[1] == CHAR_BACKSLASH))
2386
193
    ptr++;
2387
2388
12.6k
  else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
2389
12.5k
            *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2390
2391
12.3k
  else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2392
1
    {
2393
1
    *endptr = ptr;
2394
1
    return TRUE;
2395
1
    }
2396
12.8k
  }
2397
2398
29
return FALSE;
2399
271
}
2400
2401
2402
2403
/*************************************************
2404
*          Check POSIX class name                *
2405
*************************************************/
2406
2407
/* This function is called to check the name given in a POSIX-style class entry
2408
such as [:alnum:].
2409
2410
Arguments:
2411
  ptr        points to the first letter
2412
  len        the length of the name
2413
2414
Returns:     a value representing the name, or -1 if unknown
2415
*/
2416
2417
static int
2418
check_posix_name(PCRE2_SPTR ptr, int len)
2419
0
{
2420
0
const char *pn = posix_names;
2421
0
int yield = 0;
2422
0
while (posix_name_lengths[yield] != 0)
2423
0
  {
2424
0
  if (len == posix_name_lengths[yield] &&
2425
0
    PRIV(strncmp_c8)(ptr, pn, (unsigned int)len) == 0) return yield;
2426
0
  pn += posix_name_lengths[yield] + 1;
2427
0
  yield++;
2428
0
  }
2429
0
return -1;
2430
0
}
2431
2432
2433
2434
/*************************************************
2435
*       Read a subpattern or VERB name           *
2436
*************************************************/
2437
2438
/* This function is called from parse_regex() below whenever it needs to read
2439
the name of a subpattern or a (*VERB) or an (*alpha_assertion). The initial
2440
pointer must be to the preceding character. If that character is '*' we are
2441
reading a verb or alpha assertion name. The pointer is updated to point after
2442
the name, for a VERB or alpha assertion name, or after the name's terminator
2443
for a subpattern name. Returning both the offset and the name pointer is
2444
redundant information, but some callers use one and some the other, so it is
2445
simplest just to return both. When the name is in braces, spaces and tabs are
2446
allowed (and ignored) at either end.
2447
2448
Arguments:
2449
  ptrptr      points to the character pointer variable
2450
  ptrend      points to the end of the input string
2451
  utf         true if the input is UTF-encoded
2452
  terminator  the terminator of a subpattern name must be this
2453
  offsetptr   where to put the offset from the start of the pattern
2454
  nameptr     where to put a pointer to the name in the input
2455
  namelenptr  where to put the length of the name
2456
  errorcodeptr where to put an error code
2457
  cb          pointer to the compile data block
2458
2459
Returns:    TRUE if a name was read
2460
            FALSE otherwise, with error code set
2461
*/
2462
2463
static BOOL
2464
read_name(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, BOOL utf, uint32_t terminator,
2465
  PCRE2_SIZE *offsetptr, PCRE2_SPTR *nameptr, uint32_t *namelenptr,
2466
  int *errorcodeptr, compile_block *cb)
2467
14
{
2468
14
PCRE2_SPTR ptr = *ptrptr;
2469
14
BOOL is_group = (*ptr++ != CHAR_ASTERISK);
2470
14
BOOL is_braced = terminator == CHAR_RIGHT_CURLY_BRACKET;
2471
2472
14
if (is_braced)
2473
0
  while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2474
2475
14
if (ptr >= ptrend)                 /* No characters in name */
2476
0
  {
2477
0
  *errorcodeptr = is_group? ERR62: /* Subpattern name expected */
2478
0
                            ERR60; /* Verb not recognized or malformed */
2479
0
  goto FAILED;
2480
0
  }
2481
2482
14
*nameptr = ptr;
2483
14
*offsetptr = (PCRE2_SIZE)(ptr - cb->start_pattern);
2484
2485
/* In UTF mode, a group name may contain letters and decimal digits as defined
2486
by Unicode properties, and underscores, but must not start with a digit. */
2487
2488
14
#ifdef SUPPORT_UNICODE
2489
14
if (utf && is_group)
2490
0
  {
2491
0
  uint32_t c, type;
2492
2493
0
  GETCHAR(c, ptr);
2494
0
  type = UCD_CHARTYPE(c);
2495
2496
0
  if (type == ucp_Nd)
2497
0
    {
2498
0
    *errorcodeptr = ERR44;
2499
0
    goto FAILED;
2500
0
    }
2501
2502
0
  for(;;)
2503
0
    {
2504
0
    if (type != ucp_Nd && PRIV(ucp_gentype)[type] != ucp_L &&
2505
0
        c != CHAR_UNDERSCORE) break;
2506
0
    ptr++;
2507
0
    FORWARDCHARTEST(ptr, ptrend);
2508
0
    if (ptr >= ptrend) break;
2509
0
    GETCHAR(c, ptr);
2510
0
    type = UCD_CHARTYPE(c);
2511
0
    }
2512
0
  }
2513
14
else
2514
#else
2515
(void)utf;  /* Avoid compiler warning */
2516
#endif      /* SUPPORT_UNICODE */
2517
2518
/* Handle non-group names and group names in non-UTF modes. A group name must
2519
not start with a digit. If either of the others start with a digit it just
2520
won't be recognized. */
2521
2522
14
  {
2523
14
  if (is_group && IS_DIGIT(*ptr))
2524
0
    {
2525
0
    *errorcodeptr = ERR44;
2526
0
    goto FAILED;
2527
0
    }
2528
2529
31
  while (ptr < ptrend && MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_word) != 0)
2530
17
    {
2531
17
    ptr++;
2532
17
    }
2533
14
  }
2534
2535
/* Check name length */
2536
2537
14
if (ptr > *nameptr + MAX_NAME_SIZE)
2538
0
  {
2539
0
  *errorcodeptr = ERR48;
2540
0
  goto FAILED;
2541
0
  }
2542
14
*namelenptr = (uint32_t)(ptr - *nameptr);
2543
2544
/* Subpattern names must not be empty, and their terminator is checked here.
2545
(What follows a verb or alpha assertion name is checked separately.) */
2546
2547
14
if (is_group)
2548
8
  {
2549
8
  if (ptr == *nameptr)
2550
8
    {
2551
8
    *errorcodeptr = ERR62;   /* Subpattern name expected */
2552
8
    goto FAILED;
2553
8
    }
2554
0
  if (is_braced)
2555
0
    while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2556
0
  if (ptr >= ptrend || *ptr != (PCRE2_UCHAR)terminator)
2557
0
    {
2558
0
    *errorcodeptr = ERR42;
2559
0
    goto FAILED;
2560
0
    }
2561
0
  ptr++;
2562
0
  }
2563
2564
6
*ptrptr = ptr;
2565
6
return TRUE;
2566
2567
8
FAILED:
2568
8
*ptrptr = ptr;
2569
8
return FALSE;
2570
14
}
2571
2572
2573
2574
/*************************************************
2575
*          Manage callouts at start of cycle     *
2576
*************************************************/
2577
2578
/* At the start of a new item in parse_regex() we are able to record the
2579
details of the previous item in a prior callout, and also to set up an
2580
automatic callout if enabled. Avoid having two adjacent automatic callouts,
2581
which would otherwise happen for items such as \Q that contribute nothing to
2582
the parsed pattern.
2583
2584
Arguments:
2585
  ptr              current pattern pointer
2586
  pcalloutptr      points to a pointer to previous callout, or NULL
2587
  auto_callout     TRUE if auto_callouts are enabled
2588
  parsed_pattern   the parsed pattern pointer
2589
  cb               compile block
2590
2591
Returns: possibly updated parsed_pattern pointer.
2592
*/
2593
2594
static uint32_t *
2595
manage_callouts(PCRE2_SPTR ptr, uint32_t **pcalloutptr, BOOL auto_callout,
2596
  uint32_t *parsed_pattern, compile_block *cb)
2597
358k
{
2598
358k
uint32_t *previous_callout = *pcalloutptr;
2599
2600
358k
if (previous_callout != NULL) previous_callout[2] = (uint32_t)(ptr -
2601
0
  cb->start_pattern - (PCRE2_SIZE)previous_callout[1]);
2602
2603
358k
if (!auto_callout) previous_callout = NULL; else
2604
0
  {
2605
0
  if (previous_callout == NULL ||
2606
0
      previous_callout != parsed_pattern - 4 ||
2607
0
      previous_callout[3] != 255)
2608
0
    {
2609
0
    previous_callout = parsed_pattern;  /* Set up new automatic callout */
2610
0
    parsed_pattern += 4;
2611
0
    previous_callout[0] = META_CALLOUT_NUMBER;
2612
0
    previous_callout[2] = 0;
2613
0
    previous_callout[3] = 255;
2614
0
    }
2615
0
  previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
2616
0
  }
2617
2618
358k
*pcalloutptr = previous_callout;
2619
358k
return parsed_pattern;
2620
358k
}
2621
2622
2623
2624
/*************************************************
2625
*          Handle \d, \D, \s, \S, \w, \W         *
2626
*************************************************/
2627
2628
/* This function is called from parse_regex() below, both for freestanding
2629
escapes, and those within classes, to handle those escapes that may change when
2630
Unicode property support is requested. Note that PCRE2_UCP will never be set
2631
without Unicode support because that is checked when pcre2_compile() is called.
2632
2633
Arguments:
2634
  escape          the ESC_... value
2635
  parsed_pattern  where to add the code
2636
  options         options bits
2637
  xoptions        extra options bits
2638
2639
Returns:          updated value of parsed_pattern
2640
*/
2641
static uint32_t *
2642
handle_escdsw(int escape, uint32_t *parsed_pattern, uint32_t options,
2643
  uint32_t xoptions)
2644
9.49k
{
2645
9.49k
uint32_t ascii_option = 0;
2646
9.49k
uint32_t prop = ESC_p;
2647
2648
9.49k
switch(escape)
2649
9.49k
  {
2650
90
  case ESC_D:
2651
90
  prop = ESC_P;
2652
  /* Fall through */
2653
1.61k
  case ESC_d:
2654
1.61k
  ascii_option = PCRE2_EXTRA_ASCII_BSD;
2655
1.61k
  break;
2656
2657
265
  case ESC_S:
2658
265
  prop = ESC_P;
2659
  /* Fall through */
2660
1.28k
  case ESC_s:
2661
1.28k
  ascii_option = PCRE2_EXTRA_ASCII_BSS;
2662
1.28k
  break;
2663
2664
789
  case ESC_W:
2665
789
  prop = ESC_P;
2666
  /* Fall through */
2667
6.60k
  case ESC_w:
2668
6.60k
  ascii_option = PCRE2_EXTRA_ASCII_BSW;
2669
6.60k
  break;
2670
9.49k
  }
2671
2672
9.49k
if ((options & PCRE2_UCP) == 0 || (xoptions & ascii_option) != 0)
2673
8.90k
  {
2674
8.90k
  *parsed_pattern++ = META_ESCAPE + escape;
2675
8.90k
  }
2676
596
else
2677
596
  {
2678
596
  *parsed_pattern++ = META_ESCAPE + prop;
2679
596
  switch(escape)
2680
596
    {
2681
34
    case ESC_d:
2682
38
    case ESC_D:
2683
38
    *parsed_pattern++ = (PT_PC << 16) | ucp_Nd;
2684
38
    break;
2685
2686
16
    case ESC_s:
2687
190
    case ESC_S:
2688
190
    *parsed_pattern++ = PT_SPACE << 16;
2689
190
    break;
2690
2691
347
    case ESC_w:
2692
368
    case ESC_W:
2693
368
    *parsed_pattern++ = PT_WORD << 16;
2694
368
    break;
2695
596
    }
2696
596
  }
2697
2698
9.49k
return parsed_pattern;
2699
9.49k
}
2700
2701
2702
2703
/*************************************************
2704
*      Parse regex and identify named groups     *
2705
*************************************************/
2706
2707
/* This function is called first of all. It scans the pattern and does two
2708
things: (1) It identifies capturing groups and makes a table of named capturing
2709
groups so that information about them is fully available to both the compiling
2710
scans. (2) It writes a parsed version of the pattern with comments omitted and
2711
escapes processed into the parsed_pattern vector.
2712
2713
Arguments:
2714
  ptr             points to the start of the pattern
2715
  options         compiling dynamic options (may change during the scan)
2716
  has_lookbehind  points to a boolean, set TRUE if a lookbehind is found
2717
  cb              pointer to the compile data block
2718
2719
Returns:   zero on success or a non-zero error code, with the
2720
             error offset placed in the cb field
2721
*/
2722
2723
/* A structure and some flags for dealing with nested groups. */
2724
2725
/* One saved record per nesting level. NOTE(review): the meaning of each field
is not visible from this declaration alone; "flags" presumably holds NSF_* bits
and "options"/"xoptions" presumably hold saved option words -- confirm against
their use in parse_regex(). */
typedef struct nest_save {
2726
  uint16_t  nest_depth;
2727
  uint16_t  reset_group;
2728
  uint16_t  max_group;
2729
  uint16_t  flags;
2730
  uint32_t  options;
2731
  uint32_t  xoptions;
2732
} nest_save;
2733
2734
11
/* Each NSF_* value is a distinct single bit, so the values can be combined in
one flags word. */
#define NSF_RESET          0x0001u
2735
9
#define NSF_CONDASSERT     0x0002u
2736
9
#define NSF_ATOMICSR       0x0004u
2737
2738
/* Options that are changeable within the pattern must be tracked during
2739
parsing. Some (e.g. PCRE2_EXTENDED) are implemented entirely during parsing,
2740
but all must be tracked so that META_OPTIONS items set the correct values for
2741
the main compiling phase. */
2742
2743
21
#define PARSE_TRACKED_OPTIONS (PCRE2_CASELESS|PCRE2_DOTALL|PCRE2_DUPNAMES| \
2744
21
  PCRE2_EXTENDED|PCRE2_EXTENDED_MORE|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE| \
2745
21
  PCRE2_UNGREEDY)
2746
2747
21
#define PARSE_TRACKED_EXTRA_OPTIONS (PCRE2_EXTRA_CASELESS_RESTRICT| \
2748
21
  PCRE2_EXTRA_ASCII_BSD|PCRE2_EXTRA_ASCII_BSS|PCRE2_EXTRA_ASCII_BSW| \
2749
21
  PCRE2_EXTRA_ASCII_DIGIT|PCRE2_EXTRA_ASCII_POSIX)
2750
2751
/* States used for analyzing ranges in character classes. The two OK values
2752
must be last. */
2753
2754
enum { RANGE_NO, RANGE_STARTED, RANGE_OK_ESCAPED, RANGE_OK_LITERAL };
2755
2756
/* Only in 32-bit mode can there be literals > META_END. A macro encapsulates
2757
the storing of literal values in the main parsed pattern, where they can always
2758
be quantified. */
2759
2760
/* Both variants store c through p (advancing p) and set okquantifier to TRUE,
so a variable named okquantifier must be in scope wherever the macro is used.
The 32-bit variant expands to a brace block and the other variant to two
separate statements, so the macro should only be used as a complete statement
inside braces, never as the unbraced body of an if/else or a loop. */
#if PCRE2_CODE_UNIT_WIDTH == 32
2761
#define PARSED_LITERAL(c, p) \
2762
  { \
2763
  if (c >= META_END) *p++ = META_BIGVALUE; \
2764
  *p++ = c; \
2765
  okquantifier = TRUE; \
2766
  }
2767
#else
2768
453k
#define PARSED_LITERAL(c, p) *p++ = c; okquantifier = TRUE;
2769
#endif
2770
2771
/* Here's the actual function. */
2772
2773
static int parse_regex(PCRE2_SPTR ptr, uint32_t options, BOOL *has_lookbehind,
2774
  compile_block *cb)
2775
1.50k
{
2776
1.50k
uint32_t c;
2777
1.50k
uint32_t delimiter;
2778
1.50k
uint32_t namelen;
2779
1.50k
uint32_t class_range_state;
2780
1.50k
uint32_t *verblengthptr = NULL;     /* Value avoids compiler warning */
2781
1.50k
uint32_t *verbstartptr = NULL;
2782
1.50k
uint32_t *previous_callout = NULL;
2783
1.50k
uint32_t *parsed_pattern = cb->parsed_pattern;
2784
1.50k
uint32_t *parsed_pattern_end = cb->parsed_pattern_end;
2785
1.50k
uint32_t *this_parsed_item = NULL;
2786
1.50k
uint32_t *prev_parsed_item = NULL;
2787
1.50k
uint32_t meta_quantifier = 0;
2788
1.50k
uint32_t add_after_mark = 0;
2789
1.50k
uint32_t xoptions = cb->cx->extra_options;
2790
1.50k
uint16_t nest_depth = 0;
2791
1.50k
int after_manual_callout = 0;
2792
1.50k
int expect_cond_assert = 0;
2793
1.50k
int errorcode = 0;
2794
1.50k
int escape;
2795
1.50k
int i;
2796
1.50k
BOOL inescq = FALSE;
2797
1.50k
BOOL inverbname = FALSE;
2798
1.50k
BOOL utf = (options & PCRE2_UTF) != 0;
2799
1.50k
BOOL auto_callout = (options & PCRE2_AUTO_CALLOUT) != 0;
2800
1.50k
BOOL isdupname;
2801
1.50k
BOOL negate_class;
2802
1.50k
BOOL okquantifier = FALSE;
2803
1.50k
PCRE2_SPTR thisptr;
2804
1.50k
PCRE2_SPTR name;
2805
1.50k
PCRE2_SPTR ptrend = cb->end_pattern;
2806
1.50k
PCRE2_SPTR verbnamestart = NULL;    /* Value avoids compiler warning */
2807
1.50k
named_group *ng;
2808
1.50k
nest_save *top_nest, *end_nests;
2809
2810
/* Insert leading items for word and line matching (features provided for the
2811
benefit of pcre2grep). */
2812
2813
1.50k
if ((xoptions & PCRE2_EXTRA_MATCH_LINE) != 0)
2814
0
  {
2815
0
  *parsed_pattern++ = META_CIRCUMFLEX;
2816
0
  *parsed_pattern++ = META_NOCAPTURE;
2817
0
  }
2818
1.50k
else if ((xoptions & PCRE2_EXTRA_MATCH_WORD) != 0)
2819
0
  {
2820
0
  *parsed_pattern++ = META_ESCAPE + ESC_b;
2821
0
  *parsed_pattern++ = META_NOCAPTURE;
2822
0
  }
2823
2824
/* If the pattern is actually a literal string, process it separately to avoid
2825
cluttering up the main loop. */
2826
2827
1.50k
if ((options & PCRE2_LITERAL) != 0)
2828
0
  {
2829
0
  while (ptr < ptrend)
2830
0
    {
2831
0
    if (parsed_pattern >= parsed_pattern_end)
2832
0
      {
2833
0
      errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
2834
0
      goto FAILED;
2835
0
      }
2836
0
    thisptr = ptr;
2837
0
    GETCHARINCTEST(c, ptr);
2838
0
    if (auto_callout)
2839
0
      parsed_pattern = manage_callouts(thisptr, &previous_callout,
2840
0
        auto_callout, parsed_pattern, cb);
2841
0
    PARSED_LITERAL(c, parsed_pattern);
2842
0
    }
2843
0
  goto PARSED_END;
2844
0
  }
2845
2846
/* Process a real regex which may contain meta-characters. */
2847
2848
1.50k
top_nest = NULL;
2849
1.50k
end_nests = (nest_save *)(cb->start_workspace + cb->workspace_size);
2850
2851
/* The size of the nest_save structure might not be a factor of the size of the
2852
workspace. Therefore we must round down end_nests so as to correctly avoid
2853
creating a nest_save that spans the end of the workspace. */
2854
2855
1.50k
end_nests = (nest_save *)((char *)end_nests -
2856
1.50k
  ((cb->workspace_size * sizeof(PCRE2_UCHAR)) % sizeof(nest_save)));
2857
2858
/* PCRE2_EXTENDED_MORE implies PCRE2_EXTENDED */
2859
2860
1.50k
if ((options & PCRE2_EXTENDED_MORE) != 0) options |= PCRE2_EXTENDED;
2861
2862
/* Now scan the pattern */
2863
2864
398k
while (ptr < ptrend)
2865
397k
  {
2866
397k
  int prev_expect_cond_assert;
2867
397k
  uint32_t min_repeat = 0, max_repeat = 0;
2868
397k
  uint32_t set, unset, *optset;
2869
397k
  uint32_t xset, xunset, *xoptset;
2870
397k
  uint32_t terminator;
2871
397k
  uint32_t prev_meta_quantifier;
2872
397k
  BOOL prev_okquantifier;
2873
397k
  PCRE2_SPTR tempptr;
2874
397k
  PCRE2_SIZE offset;
2875
2876
397k
  if (parsed_pattern >= parsed_pattern_end)
2877
0
    {
2878
0
    errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
2879
0
    goto FAILED;
2880
0
    }
2881
2882
397k
  if (nest_depth > cb->cx->parens_nest_limit)
2883
2
    {
2884
2
    errorcode = ERR19;
2885
2
    goto FAILED;        /* Parentheses too deeply nested */
2886
2
    }
2887
2888
  /* If the last time round this loop something was added, parsed_pattern will
2889
  no longer be equal to this_parsed_item. Remember where the previous item
2890
  started and reset for the next item. Note that sometimes round the loop,
2891
  nothing gets added (e.g. for ignored white space). */
2892
2893
397k
  if (this_parsed_item != parsed_pattern)
2894
389k
    {
2895
389k
    prev_parsed_item = this_parsed_item;
2896
389k
    this_parsed_item = parsed_pattern;
2897
389k
    }
2898
2899
  /* Get next input character, save its position for callout handling. */
2900
2901
397k
  thisptr = ptr;
2902
397k
  GETCHARINCTEST(c, ptr);
2903
2904
  /* Copy quoted literals until \E, allowing for the possibility of automatic
2905
  callouts, except when processing a (*VERB) "name".  */
2906
2907
397k
  if (inescq)
2908
11.7k
    {
2909
11.7k
    if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
2910
0
      {
2911
0
      inescq = FALSE;
2912
0
      ptr++;   /* Skip E */
2913
0
      }
2914
11.7k
    else
2915
11.7k
      {
2916
11.7k
      if (expect_cond_assert > 0)   /* A literal is not allowed if we are */
2917
0
        {                           /* expecting a conditional assertion, */
2918
0
        ptr--;                      /* but an empty \Q\E sequence is OK.  */
2919
0
        errorcode = ERR28;
2920
0
        goto FAILED;
2921
0
        }
2922
11.7k
      if (inverbname)
2923
0
        {                          /* Don't use PARSED_LITERAL() because it */
2924
#if PCRE2_CODE_UNIT_WIDTH == 32    /* sets okquantifier. */
2925
        if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
2926
#endif
2927
0
        *parsed_pattern++ = c;
2928
0
        }
2929
11.7k
      else
2930
11.7k
        {
2931
11.7k
        if (after_manual_callout-- <= 0)
2932
11.7k
          parsed_pattern = manage_callouts(thisptr, &previous_callout,
2933
11.7k
            auto_callout, parsed_pattern, cb);
2934
11.7k
        PARSED_LITERAL(c, parsed_pattern);
2935
11.7k
        }
2936
11.7k
      meta_quantifier = 0;
2937
11.7k
      }
2938
11.7k
    continue;  /* Next character */
2939
11.7k
    }
2940
2941
  /* If we are processing the "name" part of a (*VERB:NAME) item, all
2942
  characters up to the closing parenthesis are literals except when
2943
  PCRE2_ALT_VERBNAMES is set. That causes backslash interpretation, but only \Q
2944
  and \E and escaped characters are allowed (no character types such as \d). If
2945
  PCRE2_EXTENDED is also set, we must ignore white space and # comments. Do
2946
  this by not entering the special (*VERB:NAME) processing - they are then
2947
  picked up below. Note that c is a character, not a code unit, so we must not
2948
  use MAX_255 to test its size because MAX_255 tests code units and is assumed
2949
  TRUE in 8-bit mode. */
2950
2951
385k
  if (inverbname &&
2952
0
       (
2953
        /* EITHER: not both options set */
2954
0
        ((options & (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) !=
2955
0
                    (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) ||
2956
0
#ifdef SUPPORT_UNICODE
2957
        /* OR: character > 255 AND not Unicode Pattern White Space */
2958
0
        (c > 255 && (c|1) != 0x200f && (c|1) != 0x2029) ||
2959
0
#endif
2960
        /* OR: not a # comment or isspace() white space */
2961
0
        (c < 256 && c != CHAR_NUMBER_SIGN && (cb->ctypes[c] & ctype_space) == 0
2962
0
#ifdef SUPPORT_UNICODE
2963
        /* and not CHAR_NEL when Unicode is supported */
2964
0
          && c != CHAR_NEL
2965
0
#endif
2966
0
       )))
2967
0
    {
2968
0
    PCRE2_SIZE verbnamelength;
2969
2970
0
    switch(c)
2971
0
      {
2972
0
      default:                     /* Don't use PARSED_LITERAL() because it */
2973
#if PCRE2_CODE_UNIT_WIDTH == 32    /* sets okquantifier. */
2974
      if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
2975
#endif
2976
0
      *parsed_pattern++ = c;
2977
0
      break;
2978
2979
0
      case CHAR_RIGHT_PARENTHESIS:
2980
0
      inverbname = FALSE;
2981
      /* This is the length in characters */
2982
0
      verbnamelength = (PCRE2_SIZE)(parsed_pattern - verblengthptr - 1);
2983
      /* But the limit on the length is in code units */
2984
0
      if (ptr - verbnamestart - 1 > (int)MAX_MARK)
2985
0
        {
2986
0
        ptr--;
2987
0
        errorcode = ERR76;
2988
0
        goto FAILED;
2989
0
        }
2990
0
      *verblengthptr = (uint32_t)verbnamelength;
2991
2992
      /* If this name was on a verb such as (*ACCEPT) which does not continue,
2993
      a (*MARK) was generated for the name. We now add the original verb as the
2994
      next item. */
2995
2996
0
      if (add_after_mark != 0)
2997
0
        {
2998
0
        *parsed_pattern++ = add_after_mark;
2999
0
        add_after_mark = 0;
3000
0
        }
3001
0
      break;
3002
3003
0
      case CHAR_BACKSLASH:
3004
0
      if ((options & PCRE2_ALT_VERBNAMES) != 0)
3005
0
        {
3006
0
        escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3007
0
          xoptions, FALSE, cb);
3008
0
        if (errorcode != 0) goto FAILED;
3009
0
        }
3010
0
      else escape = 0;   /* Treat all as literal */
3011
3012
0
      switch(escape)
3013
0
        {
3014
0
        case 0:                    /* Don't use PARSED_LITERAL() because it */
3015
#if PCRE2_CODE_UNIT_WIDTH == 32    /* sets okquantifier. */
3016
        if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
3017
#endif
3018
0
        *parsed_pattern++ = c;
3019
0
        break;
3020
3021
0
        case ESC_ub:
3022
0
        *parsed_pattern++ = CHAR_u;
3023
0
        PARSED_LITERAL(CHAR_LEFT_CURLY_BRACKET, parsed_pattern);
3024
0
        break;
3025
3026
0
        case ESC_Q:
3027
0
        inescq = TRUE;
3028
0
        break;
3029
3030
0
        case ESC_E:           /* Ignore */
3031
0
        break;
3032
3033
0
        default:
3034
0
        errorcode = ERR40;    /* Invalid in verb name */
3035
0
        goto FAILED;
3036
0
        }
3037
0
      }
3038
0
    continue;   /* Next character in pattern */
3039
0
    }
3040
3041
  /* Not a verb name character. At this point we must process everything that
3042
  must not change the quantification state. This is mainly comments, but we
3043
  handle \Q and \E here as well, so that an item such as A\Q\E+ is treated as
3044
  A+, as in Perl. An isolated \E is ignored. */
3045
3046
385k
  if (c == CHAR_BACKSLASH && ptr < ptrend)
3047
26.6k
    {
3048
26.6k
    if (*ptr == CHAR_Q || *ptr == CHAR_E)
3049
41
      {
3050
41
      inescq = *ptr == CHAR_Q;
3051
41
      ptr++;
3052
41
      continue;
3053
41
      }
3054
26.6k
    }
3055
3056
  /* Skip over whitespace and # comments in extended mode. Note that c is a
3057
  character, not a code unit, so we must not use MAX_255 to test its size
3058
  because MAX_255 tests code units and is assumed TRUE in 8-bit mode. The
3059
  whitespace characters are those designated as "Pattern White Space" by
3060
  Unicode, which are the isspace() characters plus CHAR_NEL (newline), which is
3061
  U+0085 in Unicode, plus U+200E, U+200F, U+2028, and U+2029. These are a
3062
  subset of space characters that match \h and \v. */
3063
3064
385k
  if ((options & PCRE2_EXTENDED) != 0)
3065
11
    {
3066
11
    if (c < 256 && (cb->ctypes[c] & ctype_space) != 0) continue;
3067
10
#ifdef SUPPORT_UNICODE
3068
10
    if (c == CHAR_NEL || (c|1) == 0x200f || (c|1) == 0x2029) continue;
3069
10
#endif
3070
10
    if (c == CHAR_NUMBER_SIGN)
3071
0
      {
3072
0
      while (ptr < ptrend)
3073
0
        {
3074
0
        if (IS_NEWLINE(ptr))      /* For non-fixed-length newline cases, */
3075
0
          {                       /* IS_NEWLINE sets cb->nllen. */
3076
0
          ptr += cb->nllen;
3077
0
          break;
3078
0
          }
3079
0
        ptr++;
3080
0
#ifdef SUPPORT_UNICODE
3081
0
        if (utf) FORWARDCHARTEST(ptr, ptrend);
3082
0
#endif
3083
0
        }
3084
0
      continue;  /* Next character in pattern */
3085
0
      }
3086
10
    }
3087
3088
  /* Skip over bracketed comments */
3089
3090
385k
  if (c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 2 &&
3091
4.16k
      ptr[0] == CHAR_QUESTION_MARK && ptr[1] == CHAR_NUMBER_SIGN)
3092
0
    {
3093
0
    while (++ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS);
3094
0
    if (ptr >= ptrend)
3095
0
      {
3096
0
      errorcode = ERR18;  /* A special error for missing ) in a comment */
3097
0
      goto FAILED;        /* to make it easier to debug. */
3098
0
      }
3099
0
    ptr++;
3100
0
    continue;  /* Next character in pattern */
3101
0
    }
3102
3103
  /* If the next item is not a quantifier, fill in length of any previous
3104
  callout and create an auto callout if required. */
3105
3106
385k
  if (c != CHAR_ASTERISK && c != CHAR_PLUS && c != CHAR_QUESTION_MARK &&
3107
345k
       (c != CHAR_LEFT_CURLY_BRACKET ||
3108
721
         (tempptr = ptr,
3109
721
         !read_repeat_counts(&tempptr, ptrend, NULL, NULL, &errorcode))))
3110
345k
    {
3111
345k
    if (after_manual_callout-- <= 0)
3112
345k
      {
3113
345k
      parsed_pattern = manage_callouts(thisptr, &previous_callout, auto_callout,
3114
345k
        parsed_pattern, cb);
3115
345k
      this_parsed_item = parsed_pattern;  /* New start for current item */
3116
345k
      }
3117
345k
    }
3118
3119
  /* If expect_cond_assert is 2, we have just passed (?( and are expecting an
3120
  assertion, possibly preceded by a callout. If the value is 1, we have just
3121
  had the callout and expect an assertion. There must be at least 3 more
3122
  characters in all cases. When expect_cond_assert is 2, we know that the
3123
  current character is an opening parenthesis, as otherwise we wouldn't be
3124
  here. However, when it is 1, we need to check, and it's easiest just to check
3125
  always. Note that expect_cond_assert may be negative, since all callouts just
3126
  decrement it. */
3127
3128
385k
  if (expect_cond_assert > 0)
3129
0
    {
3130
0
    BOOL ok = c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 3 &&
3131
0
              (ptr[0] == CHAR_QUESTION_MARK || ptr[0] == CHAR_ASTERISK);
3132
0
    if (ok)
3133
0
      {
3134
0
      if (ptr[0] == CHAR_ASTERISK)  /* New alpha assertion format, possibly */
3135
0
        {
3136
0
        ok = MAX_255(ptr[1]) && (cb->ctypes[ptr[1]] & ctype_lcletter) != 0;
3137
0
        }
3138
0
      else switch(ptr[1])  /* Traditional symbolic format */
3139
0
        {
3140
0
        case CHAR_C:
3141
0
        ok = expect_cond_assert == 2;
3142
0
        break;
3143
3144
0
        case CHAR_EQUALS_SIGN:
3145
0
        case CHAR_EXCLAMATION_MARK:
3146
0
        break;
3147
3148
0
        case CHAR_LESS_THAN_SIGN:
3149
0
        ok = ptr[2] == CHAR_EQUALS_SIGN || ptr[2] == CHAR_EXCLAMATION_MARK;
3150
0
        break;
3151
3152
0
        default:
3153
0
        ok = FALSE;
3154
0
        }
3155
0
      }
3156
3157
0
    if (!ok)
3158
0
      {
3159
0
      ptr--;   /* Adjust error offset */
3160
0
      errorcode = ERR28;
3161
0
      goto FAILED;
3162
0
      }
3163
0
    }
3164
3165
  /* Remember whether we are expecting a conditional assertion, and set the
3166
  default for this item. */
3167
3168
385k
  prev_expect_cond_assert = expect_cond_assert;
3169
385k
  expect_cond_assert = 0;
3170
3171
  /* Remember quantification status for the previous significant item, then set
3172
  default for this item. */
3173
3174
385k
  prev_okquantifier = okquantifier;
3175
385k
  prev_meta_quantifier = meta_quantifier;
3176
385k
  okquantifier = FALSE;
3177
385k
  meta_quantifier = 0;
3178
3179
  /* If the previous significant item was a quantifier, adjust the parsed code
3180
  if there is a following modifier. The base meta value is always followed by
3181
  the PLUS and QUERY values, in that order. We do this here rather than after
3182
  reading a quantifier so that intervening comments and /x whitespace can be
3183
  ignored without having to replicate code. */
3184
3185
385k
  if (prev_meta_quantifier != 0 && (c == CHAR_QUESTION_MARK || c == CHAR_PLUS))
3186
7.42k
    {
3187
7.42k
    parsed_pattern[(prev_meta_quantifier == META_MINMAX)? -3 : -1] =
3188
7.42k
      prev_meta_quantifier + ((c == CHAR_QUESTION_MARK)?
3189
5.71k
        0x00020000u : 0x00010000u);
3190
7.42k
    continue;  /* Next character in pattern */
3191
7.42k
    }
3192
3193
  /* Process the next item in the main part of a pattern. */
3194
3195
378k
  switch(c)
3196
378k
    {
3197
283k
    default:              /* Non-special character */
3198
283k
    PARSED_LITERAL(c, parsed_pattern);
3199
283k
    break;
3200
3201
3202
    /* ---- Escape sequence ---- */
3203
3204
26.6k
    case CHAR_BACKSLASH:
3205
26.6k
    tempptr = ptr;
3206
26.6k
    escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3207
26.6k
      xoptions, FALSE, cb);
3208
26.6k
    if (errorcode != 0)
3209
31
      {
3210
59
      ESCAPE_FAILED:
3211
59
      if ((xoptions & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
3212
59
        goto FAILED;
3213
0
      ptr = tempptr;
3214
0
      if (ptr >= ptrend) c = CHAR_BACKSLASH; else
3215
0
        {
3216
0
        GETCHARINCTEST(c, ptr);   /* Get character value, increment pointer */
3217
0
        }
3218
0
      escape = 0;                 /* Treat as literal character */
3219
0
      }
3220
3221
    /* The escape was a data escape or literal character. */
3222
3223
26.6k
    if (escape == 0)
3224
9.48k
      {
3225
9.48k
      PARSED_LITERAL(c, parsed_pattern);
3226
9.48k
      }
3227
3228
    /* The escape was a back (or forward) reference. We keep the offset in
3229
    order to give a more useful diagnostic for a bad forward reference. For
3230
    references to groups numbered less than 10 we can't use more than two items
3231
    in parsed_pattern because they may be just two characters in the input (and
3232
    in a 64-bit world an offset may need two elements). So for them, the offset
3233
    of the first occurrence is held in a special vector. */
3234
3235
17.1k
    else if (escape < 0)
3236
899
      {
3237
899
      offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 1);
3238
899
      escape = -escape;
3239
899
      *parsed_pattern++ = META_BACKREF | (uint32_t)escape;
3240
899
      if (escape < 10)
3241
755
        {
3242
755
        if (cb->small_ref_offset[escape] == PCRE2_UNSET)
3243
235
          cb->small_ref_offset[escape] = offset;
3244
755
        }
3245
144
      else
3246
144
        {
3247
144
        PUTOFFSET(offset, parsed_pattern);
3248
144
        }
3249
899
      okquantifier = TRUE;
3250
899
      }
3251
3252
    /* The escape was a character class such as \d etc. or other special
3253
    escape indicator such as \A or \X. Most of them generate just a single
3254
    parsed item, but \P and \p are followed by a 16-bit type and a 16-bit
3255
    value. They are supported only when Unicode is available. The type and
3256
    value are packed into a single 32-bit value so that the whole sequence
3257
    uses only two elements in the parsed_vector. This is because the same
3258
    coding is used if \d (for example) is turned into \p{Nd} when PCRE2_UCP is
3259
    set.
3260
3261
    There are also some cases where the escape sequence is followed by a name:
3262
    \k{name}, \k<name>, and \k'name' are backreferences by name, and \g<name>
3263
    and \g'name' are subroutine calls by name; \g{name} is a synonym for
3264
    \k{name}. Note that \g<number> and \g'number' are handled by check_escape()
3265
    and returned as a negative value (handled above). A name is coded as an
3266
    offset into the pattern and a length. */
3267
3268
16.2k
    else switch (escape)
3269
16.2k
      {
3270
125
      case ESC_C:
3271
#ifdef NEVER_BACKSLASH_C
3272
      errorcode = ERR85;
3273
      goto ESCAPE_FAILED;
3274
#else
3275
125
      if ((options & PCRE2_NEVER_BACKSLASH_C) != 0)
3276
0
        {
3277
0
        errorcode = ERR83;
3278
0
        goto ESCAPE_FAILED;
3279
0
        }
3280
125
#endif
3281
125
      okquantifier = TRUE;
3282
125
      *parsed_pattern++ = META_ESCAPE + escape;
3283
125
      break;
3284
3285
      /* This is a special return that happens only in EXTRA_ALT_BSUX mode,
3286
      when \u{ is not followed by hex digits and }. It requests two literal
3287
      characters, u and { and we need this, as otherwise \u{ 12} (for example)
3288
      would be treated as u{12} now that spaces are allowed in quantifiers. */
3289
3290
0
      case ESC_ub:
3291
0
      *parsed_pattern++ = CHAR_u;
3292
0
      PARSED_LITERAL(CHAR_LEFT_CURLY_BRACKET, parsed_pattern);
3293
0
      break;
3294
3295
356
      case ESC_X:
3296
#ifndef SUPPORT_UNICODE
3297
      errorcode = ERR45;   /* Supported only with Unicode support */
3298
      goto ESCAPE_FAILED;
3299
#endif
3300
787
      case ESC_H:
3301
857
      case ESC_h:
3302
3.15k
      case ESC_N:
3303
6.28k
      case ESC_R:
3304
6.73k
      case ESC_V:
3305
7.11k
      case ESC_v:
3306
7.11k
      okquantifier = TRUE;
3307
7.11k
      *parsed_pattern++ = META_ESCAPE + escape;
3308
7.11k
      break;
3309
3310
521
      default:  /* \A, \B, \b, \G, \K, \Z, \z cannot be quantified. */
3311
521
      *parsed_pattern++ = META_ESCAPE + escape;
3312
521
      break;
3313
3314
      /* Escapes that may change in UCP mode. */
3315
3316
1.06k
      case ESC_d:
3317
1.13k
      case ESC_D:
3318
2.13k
      case ESC_s:
3319
2.38k
      case ESC_S:
3320
7.94k
      case ESC_w:
3321
8.15k
      case ESC_W:
3322
8.15k
      okquantifier = TRUE;
3323
8.15k
      parsed_pattern = handle_escdsw(escape, parsed_pattern, options,
3324
8.15k
        xoptions);
3325
8.15k
      break;
3326
3327
      /* Unicode property matching */
3328
3329
215
      case ESC_P:
3330
318
      case ESC_p:
3331
318
#ifdef SUPPORT_UNICODE
3332
318
        {
3333
318
        BOOL negated;
3334
318
        uint16_t ptype = 0, pdata = 0;
3335
318
        if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
3336
28
          goto ESCAPE_FAILED;
3337
290
        if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
3338
290
        *parsed_pattern++ = META_ESCAPE + escape;
3339
290
        *parsed_pattern++ = (ptype << 16) | pdata;
3340
290
        okquantifier = TRUE;
3341
290
        }
3342
#else
3343
      errorcode = ERR45;
3344
      goto ESCAPE_FAILED;
3345
#endif
3346
0
      break;  /* End \P and \p */
3347
3348
      /* When \g is used with quotes or angle brackets as delimiters, it is a
3349
      numerical or named subroutine call, and control comes here. When used
3350
      with brace delimiters it is a numerical back reference and does not come
3351
      here because check_escape() returns it directly as a reference. \k is
3352
      always a named back reference. */
3353
3354
0
      case ESC_g:
3355
0
      case ESC_k:
3356
0
      if (ptr >= ptrend || (*ptr != CHAR_LEFT_CURLY_BRACKET &&
3357
0
          *ptr != CHAR_LESS_THAN_SIGN && *ptr != CHAR_APOSTROPHE))
3358
0
        {
3359
0
        errorcode = (escape == ESC_g)? ERR57 : ERR69;
3360
0
        goto ESCAPE_FAILED;
3361
0
        }
3362
0
      terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
3363
0
        CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
3364
0
        CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
3365
3366
      /* For a non-braced \g, check for a numerical recursion. */
3367
3368
0
      if (escape == ESC_g && terminator != CHAR_RIGHT_CURLY_BRACKET)
3369
0
        {
3370
0
        PCRE2_SPTR p = ptr + 1;
3371
3372
0
        if (read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
3373
0
            &errorcode))
3374
0
          {
3375
0
          if (p >= ptrend || *p != terminator)
3376
0
            {
3377
0
            errorcode = ERR57;
3378
0
            goto ESCAPE_FAILED;
3379
0
            }
3380
0
          ptr = p;
3381
0
          goto SET_RECURSION;
3382
0
          }
3383
0
        if (errorcode != 0) goto ESCAPE_FAILED;
3384
0
        }
3385
3386
      /* Not a numerical recursion. Perl allows spaces and tabs after { and
3387
      before } but not for other delimiters. */
3388
3389
0
      if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
3390
0
          &errorcode, cb)) goto ESCAPE_FAILED;
3391
3392
      /* \k and \g when used with braces are back references, whereas \g used
3393
      with quotes or angle brackets is a recursion */
3394
3395
0
      *parsed_pattern++ =
3396
0
        (escape == ESC_k || terminator == CHAR_RIGHT_CURLY_BRACKET)?
3397
0
          META_BACKREF_BYNAME : META_RECURSE_BYNAME;
3398
0
      *parsed_pattern++ = namelen;
3399
3400
0
      PUTOFFSET(offset, parsed_pattern);
3401
0
      okquantifier = TRUE;
3402
0
      break;  /* End special escape processing */
3403
16.2k
      }
3404
26.5k
    break;    /* End escape sequence processing */
3405
3406
3407
    /* ---- Single-character special items ---- */
3408
3409
26.5k
    case CHAR_CIRCUMFLEX_ACCENT:
3410
2.50k
    *parsed_pattern++ = META_CIRCUMFLEX;
3411
2.50k
    break;
3412
3413
757
    case CHAR_DOLLAR_SIGN:
3414
757
    *parsed_pattern++ = META_DOLLAR;
3415
757
    break;
3416
3417
4.66k
    case CHAR_DOT:
3418
4.66k
    *parsed_pattern++ = META_DOT;
3419
4.66k
    okquantifier = TRUE;
3420
4.66k
    break;
3421
3422
3423
    /* ---- Single-character quantifiers ---- */
3424
3425
6.06k
    case CHAR_ASTERISK:
3426
6.06k
    meta_quantifier = META_ASTERISK;
3427
6.06k
    goto CHECK_QUANTIFIER;
3428
3429
8.12k
    case CHAR_PLUS:
3430
8.12k
    meta_quantifier = META_PLUS;
3431
8.12k
    goto CHECK_QUANTIFIER;
3432
3433
18.6k
    case CHAR_QUESTION_MARK:
3434
18.6k
    meta_quantifier = META_QUERY;
3435
18.6k
    goto CHECK_QUANTIFIER;
3436
3437
3438
    /* ---- Potential {n,m} quantifier ---- */
3439
3440
721
    case CHAR_LEFT_CURLY_BRACKET:
3441
721
    if (!read_repeat_counts(&ptr, ptrend, &min_repeat, &max_repeat,
3442
721
        &errorcode))
3443
719
      {
3444
719
      if (errorcode != 0) goto FAILED;     /* Error in quantifier. */
3445
719
      PARSED_LITERAL(c, parsed_pattern);   /* Not a quantifier */
3446
719
      break;                               /* No more quantifier processing */
3447
719
      }
3448
2
    meta_quantifier = META_MINMAX;
3449
    /* Fall through */
3450
3451
3452
    /* ---- Quantifier post-processing ---- */
3453
3454
    /* Check that a quantifier is allowed after the previous item. This
3455
    guarantees that there is a previous item. */
3456
3457
32.8k
    CHECK_QUANTIFIER:
3458
32.8k
    if (!prev_okquantifier)
3459
16
      {
3460
16
      errorcode = ERR9;
3461
16
      goto FAILED_BACK;
3462
16
      }
3463
3464
    /* Most (*VERB)s are not allowed to be quantified, but an ungreedy
3465
    quantifier can be useful for (*ACCEPT) - meaning "succeed on backtrack", a
3466
    sort of negated (*COMMIT). We therefore allow (*ACCEPT) to be quantified by
3467
    wrapping it in non-capturing brackets, but we have to allow for a preceding
3468
    (*MARK) for when (*ACCEPT) has an argument. */
3469
3470
32.7k
    if (*prev_parsed_item == META_ACCEPT)
3471
0
      {
3472
0
      uint32_t *p;
3473
0
      for (p = parsed_pattern - 1; p >= verbstartptr; p--) p[1] = p[0];
3474
0
      *verbstartptr = META_NOCAPTURE;
3475
0
      parsed_pattern[1] = META_KET;
3476
0
      parsed_pattern += 2;
3477
0
      }
3478
3479
    /* Now we can put the quantifier into the parsed pattern vector. At this
3480
    stage, we have only the basic quantifier. The check for a following + or ?
3481
    modifier happens at the top of the loop, after any intervening comments
3482
    have been removed. */
3483
3484
32.7k
    *parsed_pattern++ = meta_quantifier;
3485
32.7k
    if (c == CHAR_LEFT_CURLY_BRACKET)
3486
2
      {
3487
2
      *parsed_pattern++ = min_repeat;
3488
2
      *parsed_pattern++ = max_repeat;
3489
2
      }
3490
32.7k
    break;
3491
3492
3493
    /* ---- Character class ---- */
3494
3495
7.28k
    case CHAR_LEFT_SQUARE_BRACKET:
3496
7.28k
    okquantifier = TRUE;
3497
3498
    /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
3499
    used for "start of word" and "end of word". As these are otherwise illegal
3500
    sequences, we don't break anything by recognizing them. They are replaced
3501
    by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
3502
    erroneous and are handled by the normal code below. */
3503
3504
7.28k
    if (ptrend - ptr >= 6 &&
3505
7.27k
         (PRIV(strncmp_c8)(ptr, STRING_WEIRD_STARTWORD, 6) == 0 ||
3506
7.27k
          PRIV(strncmp_c8)(ptr, STRING_WEIRD_ENDWORD, 6) == 0))
3507
0
      {
3508
0
      *parsed_pattern++ = META_ESCAPE + ESC_b;
3509
3510
0
      if (ptr[2] == CHAR_LESS_THAN_SIGN)
3511
0
        {
3512
0
        *parsed_pattern++ = META_LOOKAHEAD;
3513
0
        }
3514
0
      else
3515
0
        {
3516
0
        *parsed_pattern++ = META_LOOKBEHIND;
3517
0
        *has_lookbehind = TRUE;
3518
3519
        /* The offset is used only for the "non-fixed length" error; this won't
3520
        occur here, so just store zero. */
3521
3522
0
        PUTOFFSET((PCRE2_SIZE)0, parsed_pattern);
3523
0
        }
3524
3525
0
      if ((options & PCRE2_UCP) == 0)
3526
0
        *parsed_pattern++ = META_ESCAPE + ESC_w;
3527
0
      else
3528
0
        {
3529
0
        *parsed_pattern++ = META_ESCAPE + ESC_p;
3530
0
        *parsed_pattern++ = PT_WORD << 16;
3531
0
        }
3532
0
      *parsed_pattern++ = META_KET;
3533
0
      ptr += 6;
3534
0
      break;
3535
0
      }
3536
3537
    /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3538
    they are encountered at the top level, so we'll do that too. */
3539
3540
7.28k
    if (ptr < ptrend && (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
3541
7.21k
         *ptr == CHAR_EQUALS_SIGN) &&
3542
197
        check_posix_syntax(ptr, ptrend, &tempptr))
3543
1
      {
3544
1
      errorcode = (*ptr-- == CHAR_COLON)? ERR12 : ERR13;
3545
1
      goto FAILED;
3546
1
      }
3547
3548
    /* Process a regular character class. If the first character is '^', set
3549
    the negation flag. If the first few characters (either before or after ^)
3550
    are \Q\E or \E or space or tab in extended-more mode, we skip them too.
3551
    This makes for compatibility with Perl. */
3552
3553
7.28k
    negate_class = FALSE;
3554
9.48k
    while (ptr < ptrend)
3555
9.48k
      {
3556
9.48k
      GETCHARINCTEST(c, ptr);
3557
9.48k
      if (c == CHAR_BACKSLASH)
3558
196
        {
3559
196
        if (ptr < ptrend && *ptr == CHAR_E) ptr++;
3560
141
        else if (ptrend - ptr >= 3 &&
3561
141
             PRIV(strncmp_c8)(ptr, STR_Q STR_BACKSLASH STR_E, 3) == 0)
3562
0
          ptr += 3;
3563
141
        else
3564
141
          break;
3565
196
        }
3566
9.29k
      else if ((options & PCRE2_EXTENDED_MORE) != 0 &&
3567
0
               (c == CHAR_SPACE || c == CHAR_HT))  /* Note: just these two */
3568
0
        continue;
3569
9.29k
      else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3570
2.15k
        negate_class = TRUE;
3571
7.13k
      else break;
3572
9.48k
      }
3573
3574
    /* Now the real contents of the class; c has the first "real" character.
3575
    Empty classes are permitted only if the option is set. */
3576
3577
7.28k
    if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3578
257
        (cb->external_options & PCRE2_ALLOW_EMPTY_CLASS) != 0)
3579
0
      {
3580
0
      *parsed_pattern++ = negate_class? META_CLASS_EMPTY_NOT : META_CLASS_EMPTY;
3581
0
      break;  /* End of class processing */
3582
0
      }
3583
3584
    /* Process a non-empty class. */
3585
3586
7.28k
    *parsed_pattern++ = negate_class? META_CLASS_NOT : META_CLASS;
3587
7.28k
    class_range_state = RANGE_NO;
3588
3589
    /* In an EBCDIC environment, Perl treats alphabetic ranges specially
3590
    because there are holes in the encoding, and simply using the range A-Z
3591
    (for example) would include the characters in the holes. This applies only
3592
    to ranges where both values are literal; [\xC1-\xE9] is different to [A-Z]
3593
    in this respect. In order to accommodate this, we keep track of whether
3594
    character values are literal or not, and a state variable for handling
3595
    ranges. */
3596
3597
    /* Loop for the contents of the class */
3598
3599
7.28k
    for (;;)
3600
151k
      {
3601
151k
      BOOL char_is_literal = TRUE;
3602
3603
      /* Inside \Q...\E everything is literal except \E */
3604
3605
151k
      if (inescq)
3606
1.56k
        {
3607
1.56k
        if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
3608
0
          {
3609
0
          inescq = FALSE;                   /* Reset literal state */
3610
0
          ptr++;                            /* Skip the 'E' */
3611
0
          goto CLASS_CONTINUE;
3612
0
          }
3613
1.56k
        goto CLASS_LITERAL;
3614
1.56k
        }
3615
3616
      /* Skip over space and tab (only) in extended-more mode. */
3617
3618
149k
      if ((options & PCRE2_EXTENDED_MORE) != 0 &&
3619
0
          (c == CHAR_SPACE || c == CHAR_HT))
3620
0
        goto CLASS_CONTINUE;
3621
3622
      /* Handle POSIX class names. Perl allows a negation extension of the
3623
      form [:^name:]. A square bracket that doesn't match the syntax is
3624
      treated as a literal. We also recognize the POSIX constructions
3625
      [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3626
      5.6 and 5.8 do. */
3627
3628
149k
      if (c == CHAR_LEFT_SQUARE_BRACKET &&
3629
3.96k
          ptrend - ptr >= 3 &&
3630
3.95k
          (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
3631
3.92k
           *ptr == CHAR_EQUALS_SIGN) &&
3632
74
          check_posix_syntax(ptr, ptrend, &tempptr))
3633
0
        {
3634
0
        BOOL posix_negate = FALSE;
3635
0
        int posix_class;
3636
3637
        /* Perl treats a hyphen before a POSIX class as a literal, not the
3638
        start of a range. However, it gives a warning in its warning mode. PCRE
3639
        does not have a warning mode, so we give an error, because this is
3640
        likely an error on the user's part. */
3641
3642
0
        if (class_range_state == RANGE_STARTED)
3643
0
          {
3644
0
          errorcode = ERR50;
3645
0
          goto FAILED;
3646
0
          }
3647
3648
0
        if (*ptr != CHAR_COLON)
3649
0
          {
3650
0
          errorcode = ERR13;
3651
0
          goto FAILED_BACK;
3652
0
          }
3653
3654
0
        if (*(++ptr) == CHAR_CIRCUMFLEX_ACCENT)
3655
0
          {
3656
0
          posix_negate = TRUE;
3657
0
          ptr++;
3658
0
          }
3659
3660
0
        posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
3661
0
        if (posix_class < 0)
3662
0
          {
3663
0
          errorcode = ERR30;
3664
0
          goto FAILED;
3665
0
          }
3666
0
        ptr = tempptr + 2;
3667
3668
        /* Perl treats a hyphen after a POSIX class as a literal, not the
3669
        start of a range. However, it gives a warning in its warning mode
3670
        unless the hyphen is the last character in the class. PCRE does not
3671
        have a warning mode, so we give an error, because this is likely an
3672
        error on the user's part. */
3673
3674
0
        if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
3675
0
            ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
3676
0
          {
3677
0
          errorcode = ERR50;
3678
0
          goto FAILED;
3679
0
          }
3680
3681
        /* Set "a hyphen is not the start of a range" for the -] case, and also
3682
        in case the POSIX class is followed by \E or \Q\E (possibly repeated -
3683
        fuzzers do that kind of thing) and *then* a hyphen. This causes that
3684
        hyphen to be treated as a literal. I don't think it's worth setting up
3685
        special apparatus to do otherwise. */
3686
3687
0
        class_range_state = RANGE_NO;
3688
3689
        /* When PCRE2_UCP is set, unless PCRE2_EXTRA_ASCII_POSIX is set, some
3690
        of the POSIX classes are converted to use Unicode properties \p or \P
3691
        or, in one case, \h or \H. The substitutes table has two values per
3692
        class, containing the type and value of a \p or \P item. The special
3693
        cases are specified with a negative type: a non-zero value causes \h or
3694
        \H to be used, and a zero value falls through to behave like a non-UCP
3695
        POSIX class. There are now also some extra options that force ASCII for
3696
        some classes. */
3697
3698
0
#ifdef SUPPORT_UNICODE
3699
0
        if ((options & PCRE2_UCP) != 0 &&
3700
0
            (xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0 &&
3701
0
            !((xoptions & PCRE2_EXTRA_ASCII_DIGIT) != 0 &&
3702
0
              (posix_class == PC_DIGIT || posix_class == PC_XDIGIT)))
3703
0
          {
3704
0
          int ptype = posix_substitutes[2*posix_class];
3705
0
          int pvalue = posix_substitutes[2*posix_class + 1];
3706
3707
0
          if (ptype >= 0)
3708
0
            {
3709
0
            *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_P : ESC_p);
3710
0
            *parsed_pattern++ = (ptype << 16) | pvalue;
3711
0
            goto CLASS_CONTINUE;
3712
0
            }
3713
3714
0
          if (pvalue != 0)
3715
0
            {
3716
0
            *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_H : ESC_h);
3717
0
            goto CLASS_CONTINUE;
3718
0
            }
3719
3720
          /* Fall through */
3721
0
          }
3722
0
#endif  /* SUPPORT_UNICODE */
3723
3724
        /* Non-UCP POSIX class */
3725
3726
0
        *parsed_pattern++ = posix_negate? META_POSIX_NEG : META_POSIX;
3727
0
        *parsed_pattern++ = posix_class;
3728
0
        }
3729
3730
      /* Handle potential start of range */
3731
3732
149k
      else if (c == CHAR_MINUS && class_range_state >= RANGE_OK_ESCAPED)
3733
1.06k
        {
3734
1.06k
        *parsed_pattern++ = (class_range_state == RANGE_OK_LITERAL)?
3735
1.04k
          META_RANGE_LITERAL : META_RANGE_ESCAPED;
3736
1.06k
        class_range_state = RANGE_STARTED;
3737
1.06k
        }
3738
3739
      /* Handle a literal character */
3740
3741
148k
      else if (c != CHAR_BACKSLASH)
3742
144k
        {
3743
148k
        CLASS_LITERAL:
3744
148k
        if (class_range_state == RANGE_STARTED)
3745
1.04k
          {
3746
1.04k
          if (c == parsed_pattern[-2])       /* Optimize one-char range */
3747
258
            parsed_pattern--;
3748
790
          else if (parsed_pattern[-2] > c)   /* Check range is in order */
3749
29
            {
3750
29
            errorcode = ERR8;
3751
29
            goto FAILED_BACK;
3752
29
            }
3753
761
          else
3754
761
            {
3755
761
            if (!char_is_literal && parsed_pattern[-1] == META_RANGE_LITERAL)
3756
71
              parsed_pattern[-1] = META_RANGE_ESCAPED;
3757
761
            PARSED_LITERAL(c, parsed_pattern);
3758
761
            }
3759
1.01k
          class_range_state = RANGE_NO;
3760
1.01k
          }
3761
147k
        else  /* Potential start of range */
3762
147k
          {
3763
147k
          class_range_state = char_is_literal?
3764
145k
            RANGE_OK_LITERAL : RANGE_OK_ESCAPED;
3765
147k
          PARSED_LITERAL(c, parsed_pattern);
3766
147k
          }
3767
148k
        }
3768
3769
      /* Handle escapes in a class */
3770
3771
4.05k
      else
3772
4.05k
        {
3773
4.05k
        tempptr = ptr;
3774
4.05k
        escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3775
4.05k
          xoptions, TRUE, cb);
3776
3777
4.05k
        if (errorcode != 0)
3778
6
          {
3779
6
          if ((xoptions & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
3780
6
            goto FAILED;
3781
0
          ptr = tempptr;
3782
0
          if (ptr >= ptrend) c = CHAR_BACKSLASH; else
3783
0
            {
3784
0
            GETCHARINCTEST(c, ptr);   /* Get character value, increment pointer */
3785
0
            }
3786
0
          escape = 0;                 /* Treat as literal character */
3787
0
          }
3788
3789
4.04k
        switch(escape)
3790
4.04k
          {
3791
2.42k
          case 0:  /* Escaped character code point is in c */
3792
2.42k
          char_is_literal = FALSE;
3793
2.42k
          goto CLASS_LITERAL;      /* (a few lines above) */
3794
3795
1
          case ESC_b:
3796
1
          c = CHAR_BS;    /* \b is backspace in a class */
3797
1
          char_is_literal = FALSE;
3798
1
          goto CLASS_LITERAL;
3799
3800
3
          case ESC_Q:
3801
3
          inescq = TRUE;  /* Enter literal mode */
3802
3
          goto CLASS_CONTINUE;
3803
3804
67
          case ESC_E:     /* Ignore orphan \E */
3805
67
          goto CLASS_CONTINUE;
3806
3807
1
          case ESC_B:     /* Always an error in a class */
3808
12
          case ESC_R:
3809
12
          case ESC_X:
3810
12
          errorcode = ERR7;
3811
12
          ptr--;
3812
12
          goto FAILED;
3813
4.04k
          }
3814
3815
        /* The second part of a range can be a single-character escape
3816
        sequence (detected above), but not any of the other escapes. Perl
3817
        treats a hyphen as a literal in such circumstances. However, in Perl's
3818
        warning mode, a warning is given, so PCRE now faults it, as it is
3819
        almost certainly a mistake on the user's part. */
3820
3821
1.54k
        if (class_range_state == RANGE_STARTED)
3822
0
          {
3823
0
          errorcode = ERR50;
3824
0
          goto FAILED;  /* Not CLASS_ESCAPE_FAILED; always an error */
3825
0
          }
3826
3827
        /* Of the remaining escapes, only those that define characters are
3828
        allowed in a class. None may start a range. */
3829
3830
1.54k
        class_range_state = RANGE_NO;
3831
1.54k
        switch(escape)
3832
1.54k
          {
3833
0
          case ESC_N:
3834
0
          errorcode = ERR71;
3835
0
          goto FAILED;
3836
3837
4
          case ESC_H:
3838
9
          case ESC_h:
3839
65
          case ESC_V:
3840
65
          case ESC_v:
3841
65
          *parsed_pattern++ = META_ESCAPE + escape;
3842
65
          break;
3843
3844
          /* These escapes may be converted to Unicode property tests when
3845
          PCRE2_UCP is set. */
3846
3847
454
          case ESC_d:
3848
478
          case ESC_D:
3849
501
          case ESC_s:
3850
509
          case ESC_S:
3851
759
          case ESC_w:
3852
1.34k
          case ESC_W:
3853
1.34k
          parsed_pattern = handle_escdsw(escape, parsed_pattern, options,
3854
1.34k
            xoptions);
3855
1.34k
          break;
3856
3857
          /* Explicit Unicode property matching */
3858
3859
112
          case ESC_P:
3860
129
          case ESC_p:
3861
129
#ifdef SUPPORT_UNICODE
3862
129
            {
3863
129
            BOOL negated;
3864
129
            uint16_t ptype = 0, pdata = 0;
3865
129
            if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
3866
1
              goto FAILED;
3867
128
            if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
3868
128
            *parsed_pattern++ = META_ESCAPE + escape;
3869
128
            *parsed_pattern++ = (ptype << 16) | pdata;
3870
128
            }
3871
#else
3872
          errorcode = ERR45;
3873
          goto FAILED;
3874
#endif
3875
0
          break;  /* End \P and \p */
3876
3877
5
          default:    /* All others are not allowed in a class */
3878
5
          errorcode = ERR7;
3879
5
          ptr--;
3880
5
          goto FAILED;
3881
1.54k
          }
3882
3883
        /* Perl gives a warning unless a following hyphen is the last character
3884
        in the class. PCRE throws an error. */
3885
3886
1.53k
        if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
3887
0
            ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
3888
0
          {
3889
0
          errorcode = ERR50;
3890
0
          goto FAILED;
3891
0
          }
3892
1.53k
        }
3893
3894
      /* Proceed to next thing in the class. */
3895
3896
151k
      CLASS_CONTINUE:
3897
151k
      if (ptr >= ptrend)
3898
195
        {
3899
195
        errorcode = ERR6;  /* Missing terminating ']' */
3900
195
        goto FAILED;
3901
195
        }
3902
150k
      GETCHARINCTEST(c, ptr);
3903
150k
      if (c == CHAR_RIGHT_SQUARE_BRACKET && !inescq) break;
3904
150k
      }     /* End of class-processing loop */
3905
3906
    /* -] at the end of a class is a literal '-' */
3907
3908
7.03k
    if (class_range_state == RANGE_STARTED)
3909
19
      {
3910
19
      parsed_pattern[-1] = CHAR_MINUS;
3911
19
      class_range_state = RANGE_NO;
3912
19
      }
3913
3914
7.03k
    *parsed_pattern++ = META_CLASS_END;
3915
7.03k
    break;  /* End of character class */
3916
3917
3918
    /* ---- Opening parenthesis ---- */
3919
3920
4.16k
    case CHAR_LEFT_PARENTHESIS:
3921
4.16k
    if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
3922
3923
    /* If ( is not followed by ? it is either a capture or a special verb or an
3924
    alpha assertion or a positive non-atomic lookahead. */
3925
3926
4.16k
    if (*ptr != CHAR_QUESTION_MARK)
3927
4.08k
      {
3928
4.08k
      const char *vn;
3929
3930
      /* Handle capturing brackets (or non-capturing if auto-capture is turned
3931
      off). */
3932
3933
4.08k
      if (*ptr != CHAR_ASTERISK)
3934
4.08k
        {
3935
4.08k
        nest_depth++;
3936
4.08k
        if ((options & PCRE2_NO_AUTO_CAPTURE) == 0)
3937
4.08k
          {
3938
4.08k
          if (cb->bracount >= MAX_GROUP_NUMBER)
3939
0
            {
3940
0
            errorcode = ERR97;
3941
0
            goto FAILED;
3942
0
            }
3943
4.08k
          cb->bracount++;
3944
4.08k
          *parsed_pattern++ = META_CAPTURE | cb->bracount;
3945
4.08k
          }
3946
0
        else *parsed_pattern++ = META_NOCAPTURE;
3947
4.08k
        }
3948
3949
      /* Do nothing for (* followed by end of pattern or ) so it gives a "bad
3950
      quantifier" error rather than "(*MARK) must have an argument". */
3951
3952
6
      else if (ptrend - ptr <= 1 || (c = ptr[1]) == CHAR_RIGHT_PARENTHESIS)
3953
0
        break;
3954
3955
      /* Handle "alpha assertions" such as (*pla:...). Most of these are
3956
      synonyms for the historical symbolic assertions, but the script run and
3957
      non-atomic lookaround ones are new. They are distinguished by starting
3958
      with a lower case letter. Checking both ends of the alphabet makes this
3959
      work in all character codes. */
3960
3961
6
      else if (CHMAX_255(c) && (cb->ctypes[c] & ctype_lcletter) != 0)
3962
1
        {
3963
1
        uint32_t meta;
3964
3965
1
        vn = alasnames;
3966
1
        if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
3967
1
          &errorcode, cb)) goto FAILED;
3968
1
        if (ptr >= ptrend || *ptr != CHAR_COLON)
3969
1
          {
3970
1
          errorcode = ERR95;  /* Malformed */
3971
1
          goto FAILED;
3972
1
          }
3973
3974
        /* Scan the table of alpha assertion names */
3975
3976
0
        for (i = 0; i < alascount; i++)
3977
0
          {
3978
0
          if (namelen == alasmeta[i].len &&
3979
0
              PRIV(strncmp_c8)(name, vn, namelen) == 0)
3980
0
            break;
3981
0
          vn += alasmeta[i].len + 1;
3982
0
          }
3983
3984
0
        if (i >= alascount)
3985
0
          {
3986
0
          errorcode = ERR95;  /* Alpha assertion not recognized */
3987
0
          goto FAILED;
3988
0
          }
3989
3990
        /* Check for expecting an assertion condition. If so, only atomic
3991
        lookaround assertions are valid. */
3992
3993
0
        meta = alasmeta[i].meta;
3994
0
        if (prev_expect_cond_assert > 0 &&
3995
0
            (meta < META_LOOKAHEAD || meta > META_LOOKBEHINDNOT))
3996
0
          {
3997
0
          errorcode = (meta == META_LOOKAHEAD_NA || meta == META_LOOKBEHIND_NA)?
3998
0
            ERR98 : ERR28;  /* (Atomic) assertion expected */
3999
0
          goto FAILED;
4000
0
          }
4001
4002
        /* The lookaround alphabetic synonyms can mostly be handled by jumping
4003
        to the code that handles the traditional symbolic forms. */
4004
4005
0
        switch(meta)
4006
0
          {
4007
0
          default:
4008
0
          errorcode = ERR89;  /* Unknown code; should never occur because */
4009
0
          goto FAILED;        /* the meta values come from a table above. */
4010
4011
0
          case META_ATOMIC:
4012
0
          goto ATOMIC_GROUP;
4013
4014
0
          case META_LOOKAHEAD:
4015
0
          goto POSITIVE_LOOK_AHEAD;
4016
4017
0
          case META_LOOKAHEAD_NA:
4018
0
          goto POSITIVE_NONATOMIC_LOOK_AHEAD;
4019
4020
0
          case META_LOOKAHEADNOT:
4021
0
          goto NEGATIVE_LOOK_AHEAD;
4022
4023
0
          case META_LOOKBEHIND:
4024
0
          case META_LOOKBEHINDNOT:
4025
0
          case META_LOOKBEHIND_NA:
4026
0
          *parsed_pattern++ = meta;
4027
0
          ptr--;
4028
0
          goto POST_LOOKBEHIND;
4029
4030
          /* The script run facilities are handled here. Unicode support is
4031
          required (give an error if not, as this is a security issue). Always
4032
          record a META_SCRIPT_RUN item. Then, for the atomic version, insert
4033
          META_ATOMIC and remember that we need two META_KETs at the end. */
4034
4035
0
          case META_SCRIPT_RUN:
4036
0
          case META_ATOMIC_SCRIPT_RUN:
4037
0
#ifdef SUPPORT_UNICODE
4038
0
          *parsed_pattern++ = META_SCRIPT_RUN;
4039
0
          nest_depth++;
4040
0
          ptr++;
4041
0
          if (meta == META_ATOMIC_SCRIPT_RUN)
4042
0
            {
4043
0
            *parsed_pattern++ = META_ATOMIC;
4044
0
            if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4045
0
            else if (++top_nest >= end_nests)
4046
0
              {
4047
0
              errorcode = ERR84;
4048
0
              goto FAILED;
4049
0
              }
4050
0
            top_nest->nest_depth = nest_depth;
4051
0
            top_nest->flags = NSF_ATOMICSR;
4052
0
            top_nest->options = options & PARSE_TRACKED_OPTIONS;
4053
0
            top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;
4054
0
            }
4055
0
          break;
4056
#else  /* SUPPORT_UNICODE */
4057
          errorcode = ERR96;
4058
          goto FAILED;
4059
#endif
4060
0
          }
4061
0
        }
4062
4063
4064
      /* ---- Handle (*VERB) and (*VERB:NAME) ---- */
4065
4066
5
      else
4067
5
        {
4068
5
        vn = verbnames;
4069
5
        if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
4070
5
          &errorcode, cb)) goto FAILED;
4071
5
        if (ptr >= ptrend || (*ptr != CHAR_COLON &&
4072
5
                              *ptr != CHAR_RIGHT_PARENTHESIS))
4073
5
          {
4074
5
          errorcode = ERR60;  /* Malformed */
4075
5
          goto FAILED;
4076
5
          }
4077
4078
        /* Scan the table of verb names */
4079
4080
0
        for (i = 0; i < verbcount; i++)
4081
0
          {
4082
0
          if (namelen == verbs[i].len &&
4083
0
              PRIV(strncmp_c8)(name, vn, namelen) == 0)
4084
0
            break;
4085
0
          vn += verbs[i].len + 1;
4086
0
          }
4087
4088
0
        if (i >= verbcount)
4089
0
          {
4090
0
          errorcode = ERR60;  /* Verb not recognized */
4091
0
          goto FAILED;
4092
0
          }
4093
4094
        /* An empty argument is treated as no argument. */
4095
4096
0
        if (*ptr == CHAR_COLON && ptr + 1 < ptrend &&
4097
0
             ptr[1] == CHAR_RIGHT_PARENTHESIS)
4098
0
          ptr++;    /* Advance to the closing parens */
4099
4100
        /* Check for mandatory non-empty argument; this is (*MARK) */
4101
4102
0
        if (verbs[i].has_arg > 0 && *ptr != CHAR_COLON)
4103
0
          {
4104
0
          errorcode = ERR66;
4105
0
          goto FAILED;
4106
0
          }
4107
4108
        /* Remember where this verb, possibly with a preceding (*MARK), starts,
4109
        for handling quantified (*ACCEPT). */
4110
4111
0
        verbstartptr = parsed_pattern;
4112
0
        okquantifier = (verbs[i].meta == META_ACCEPT);
4113
4114
        /* It appears that Perl allows any characters whatsoever, other than a
4115
        closing parenthesis, to appear in arguments ("names"), so we no longer
4116
        insist on letters, digits, and underscores. Perl does not, however, do
4117
        any interpretation within arguments, and has no means of including a
4118
        closing parenthesis. PCRE supports escape processing but only when it
4119
        is requested by an option. We set inverbname TRUE here, and let the
4120
        main loop take care of this so that escape and \x processing is done by
4121
        the main code above. */
4122
4123
0
        if (*ptr++ == CHAR_COLON)   /* Skip past : or ) */
4124
0
          {
4125
          /* Some optional arguments can be treated as a preceding (*MARK) */
4126
4127
0
          if (verbs[i].has_arg < 0)
4128
0
            {
4129
0
            add_after_mark = verbs[i].meta;
4130
0
            *parsed_pattern++ = META_MARK;
4131
0
            }
4132
4133
          /* The remaining verbs with arguments (except *MARK) need a different
4134
          opcode. */
4135
4136
0
          else
4137
0
            {
4138
0
            *parsed_pattern++ = verbs[i].meta +
4139
0
              ((verbs[i].meta != META_MARK)? 0x00010000u:0);
4140
0
            }
4141
4142
          /* Set up for reading the name in the main loop. */
4143
4144
0
          verblengthptr = parsed_pattern++;
4145
0
          verbnamestart = ptr;
4146
0
          inverbname = TRUE;
4147
0
          }
4148
0
        else  /* No verb "name" argument */
4149
0
          {
4150
0
          *parsed_pattern++ = verbs[i].meta;
4151
0
          }
4152
0
        }     /* End of (*VERB) handling */
4153
4.08k
      break;  /* Done with this parenthesis */
4154
4.08k
      }       /* End of groups that don't start with (? */
4155
4156
4157
    /* ---- Items starting (? ---- */
4158
4159
    /* The type of item is determined by what follows (?. Handle (?| and option
4160
    changes under "default" because both need a new block on the nest stack.
4161
    Comments starting with (?# are handled above. Note that there is some
4162
    ambiguity about the sequence (?- because if a digit follows it's a relative
4163
    recursion or subroutine call whereas otherwise it's an option unsetting. */
4164
4165
78
    if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4166
4167
78
    switch(*ptr)
4168
78
      {
4169
12
      default:
4170
12
      if (*ptr == CHAR_MINUS && ptrend - ptr > 1 && IS_DIGIT(ptr[1]))
4171
0
        goto RECURSION_BYNUMBER;  /* The + case is handled by CHAR_PLUS */
4172
4173
      /* We now have either (?| or a (possibly empty) option setting,
4174
      optionally followed by a non-capturing group. */
4175
4176
12
      nest_depth++;
4177
12
      if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4178
0
      else if (++top_nest >= end_nests)
4179
0
        {
4180
0
        errorcode = ERR84;
4181
0
        goto FAILED;
4182
0
        }
4183
12
      top_nest->nest_depth = nest_depth;
4184
12
      top_nest->flags = 0;
4185
12
      top_nest->options = options & PARSE_TRACKED_OPTIONS;
4186
12
      top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;
4187
4188
      /* Start of non-capturing group that resets the capture count for each
4189
      branch. */
4190
4191
12
      if (*ptr == CHAR_VERTICAL_LINE)
4192
0
        {
4193
0
        top_nest->reset_group = (uint16_t)cb->bracount;
4194
0
        top_nest->max_group = (uint16_t)cb->bracount;
4195
0
        top_nest->flags |= NSF_RESET;
4196
0
        cb->external_flags |= PCRE2_DUPCAPUSED;
4197
0
        *parsed_pattern++ = META_NOCAPTURE;
4198
0
        ptr++;
4199
0
        }
4200
4201
      /* Scan for options imnrsxJU and a, aD, aP, aS, aT, aW to set or unset. */
4202
4203
12
      else
4204
12
        {
4205
12
        BOOL hyphenok = TRUE;
4206
12
        uint32_t oldoptions = options;
4207
12
        uint32_t oldxoptions = xoptions;
4208
4209
12
        top_nest->reset_group = 0;
4210
12
        top_nest->max_group = 0;
4211
12
        set = unset = 0;
4212
12
        optset = &set;
4213
12
        xset = xunset = 0;
4214
12
        xoptset = &xset;
4215
4216
        /* ^ at the start unsets irmnsx and disables the subsequent use of - */
4217
4218
12
        if (ptr < ptrend && *ptr == CHAR_CIRCUMFLEX_ACCENT)
4219
0
          {
4220
0
          options &= ~(PCRE2_CASELESS|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE|
4221
0
                       PCRE2_DOTALL|PCRE2_EXTENDED|PCRE2_EXTENDED_MORE);
4222
0
          xoptions &= ~(PCRE2_EXTRA_CASELESS_RESTRICT);
4223
0
          hyphenok = FALSE;
4224
0
          ptr++;
4225
0
          }
4226
4227
12
        while (ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS &&
4228
12
                               *ptr != CHAR_COLON)
4229
3
          {
4230
3
          switch (*ptr++)
4231
3
            {
4232
0
            case CHAR_MINUS:
4233
0
            if (!hyphenok)
4234
0
              {
4235
0
              errorcode = ERR94;
4236
0
              ptr--;  /* Correct the offset */
4237
0
              goto FAILED;
4238
0
              }
4239
0
            optset = &unset;
4240
0
            xoptset = &xunset;
4241
0
            hyphenok = FALSE;
4242
0
            break;
4243
4244
            /* There are some two-character sequences that start with 'a'. */
4245
4246
0
            case CHAR_a:
4247
0
            if (ptr < ptrend)
4248
0
              {
4249
0
              if (*ptr == CHAR_D)
4250
0
                {
4251
0
                *xoptset |= PCRE2_EXTRA_ASCII_BSD;
4252
0
                ptr++;
4253
0
                break;
4254
0
                }
4255
0
              if (*ptr == CHAR_P)
4256
0
                {
4257
0
                *xoptset |= (PCRE2_EXTRA_ASCII_POSIX|PCRE2_EXTRA_ASCII_DIGIT);
4258
0
                ptr++;
4259
0
                break;
4260
0
                }
4261
0
              if (*ptr == CHAR_S)
4262
0
                {
4263
0
                *xoptset |= PCRE2_EXTRA_ASCII_BSS;
4264
0
                ptr++;
4265
0
                break;
4266
0
                }
4267
0
              if (*ptr == CHAR_T)
4268
0
                {
4269
0
                *xoptset |= PCRE2_EXTRA_ASCII_DIGIT;
4270
0
                ptr++;
4271
0
                break;
4272
0
                }
4273
0
              if (*ptr == CHAR_W)
4274
0
                {
4275
0
                *xoptset |= PCRE2_EXTRA_ASCII_BSW;
4276
0
                ptr++;
4277
0
                break;
4278
0
                }
4279
0
              }
4280
0
            *xoptset |= PCRE2_EXTRA_ASCII_BSD|PCRE2_EXTRA_ASCII_BSS|
4281
0
                        PCRE2_EXTRA_ASCII_BSW|
4282
0
                        PCRE2_EXTRA_ASCII_DIGIT|PCRE2_EXTRA_ASCII_POSIX;
4283
0
            break;
4284
4285
0
            case CHAR_J:  /* Record that it changed in the external options */
4286
0
            *optset |= PCRE2_DUPNAMES;
4287
0
            cb->external_flags |= PCRE2_JCHANGED;
4288
0
            break;
4289
4290
0
            case CHAR_i: *optset |= PCRE2_CASELESS; break;
4291
0
            case CHAR_m: *optset |= PCRE2_MULTILINE; break;
4292
0
            case CHAR_n: *optset |= PCRE2_NO_AUTO_CAPTURE; break;
4293
0
            case CHAR_r: *xoptset|= PCRE2_EXTRA_CASELESS_RESTRICT; break;
4294
0
            case CHAR_s: *optset |= PCRE2_DOTALL; break;
4295
0
            case CHAR_U: *optset |= PCRE2_UNGREEDY; break;
4296
4297
            /* If x appears twice it sets the extended extended option. */
4298
4299
0
            case CHAR_x:
4300
0
            *optset |= PCRE2_EXTENDED;
4301
0
            if (ptr < ptrend && *ptr == CHAR_x)
4302
0
              {
4303
0
              *optset |= PCRE2_EXTENDED_MORE;
4304
0
              ptr++;
4305
0
              }
4306
0
            break;
4307
4308
3
            default:
4309
3
            errorcode = ERR11;
4310
3
            ptr--;    /* Correct the offset */
4311
3
            goto FAILED;
4312
3
            }
4313
3
          }
4314
4315
        /* If we are setting extended without extended-more, ensure that any
4316
        existing extended-more gets unset. Also, unsetting extended must also
4317
        unset extended-more. */
4318
4319
9
        if ((set & (PCRE2_EXTENDED|PCRE2_EXTENDED_MORE)) == PCRE2_EXTENDED ||
4320
9
            (unset & PCRE2_EXTENDED) != 0)
4321
0
          unset |= PCRE2_EXTENDED_MORE;
4322
4323
9
        options = (options | set) & (~unset);
4324
9
        xoptions = (xoptions | xset) & (~xunset);
4325
4326
        /* If the options ended with ')' this is not the start of a nested
4327
        group with option changes, so the options change at this level.
4328
        In this case, if the previous level set up a nest block, discard the
4329
        one we have just created. Otherwise adjust it for the previous level.
4330
        If the options ended with ':' we are starting a non-capturing group,
4331
        possibly with an options setting. */
4332
4333
9
        if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4334
9
        if (*ptr++ == CHAR_RIGHT_PARENTHESIS)
4335
0
          {
4336
0
          nest_depth--;  /* This is not a nested group after all. */
4337
0
          if (top_nest > (nest_save *)(cb->start_workspace) &&
4338
0
              (top_nest-1)->nest_depth == nest_depth) top_nest--;
4339
0
          else top_nest->nest_depth = nest_depth;
4340
0
          }
4341
9
        else *parsed_pattern++ = META_NOCAPTURE;
4342
4343
        /* If nothing changed, no need to record. */
4344
4345
9
        if (options != oldoptions || xoptions != oldxoptions)
4346
0
          {
4347
0
          *parsed_pattern++ = META_OPTIONS;
4348
0
          *parsed_pattern++ = options;
4349
0
          *parsed_pattern++ = xoptions;
4350
0
          }
4351
9
        }     /* End options processing */
4352
9
      break;  /* End default case after (? */
4353
4354
4355
      /* ---- Python syntax support ---- */
4356
4357
9
      case CHAR_P:
4358
0
      if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4359
4360
      /* (?P<name> is the same as (?<name>, which defines a named group. */
4361
4362
0
      if (*ptr == CHAR_LESS_THAN_SIGN)
4363
0
        {
4364
0
        terminator = CHAR_GREATER_THAN_SIGN;
4365
0
        goto DEFINE_NAME;
4366
0
        }
4367
4368
      /* (?P>name) is the same as (?&name), which is a recursion or subroutine
4369
      call. */
4370
4371
0
      if (*ptr == CHAR_GREATER_THAN_SIGN) goto RECURSE_BY_NAME;
4372
4373
      /* (?P=name) is the same as \k<name>, a back reference by name. Anything
4374
      else after (?P is an error. */
4375
4376
0
      if (*ptr != CHAR_EQUALS_SIGN)
4377
0
        {
4378
0
        errorcode = ERR41;
4379
0
        goto FAILED;
4380
0
        }
4381
0
      if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
4382
0
          &namelen, &errorcode, cb)) goto FAILED;
4383
0
      *parsed_pattern++ = META_BACKREF_BYNAME;
4384
0
      *parsed_pattern++ = namelen;
4385
0
      PUTOFFSET(offset, parsed_pattern);
4386
0
      okquantifier = TRUE;
4387
0
      break;   /* End of (?P processing */
4388
4389
4390
      /* ---- Recursion/subroutine calls by number ---- */
4391
4392
0
      case CHAR_R:
4393
0
      i = 0;         /* (?R) == (?R0) */
4394
0
      ptr++;
4395
0
      if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4396
0
        {
4397
0
        errorcode = ERR58;
4398
0
        goto FAILED;
4399
0
        }
4400
0
      goto SET_RECURSION;
4401
4402
      /* An item starting (?- followed by a digit comes here via the "default"
4403
      case because (?- followed by a non-digit is an options setting. */
4404
4405
3
      case CHAR_PLUS:
4406
3
      if (ptrend - ptr < 2 || !IS_DIGIT(ptr[1]))
4407
3
        {
4408
3
        errorcode = ERR29;   /* Missing number */
4409
3
        goto FAILED;
4410
3
        }
4411
      /* Fall through */
4412
4413
18
      case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
4414
21
      case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
4415
21
      RECURSION_BYNUMBER:
4416
21
      if (!read_number(&ptr, ptrend,
4417
21
          (IS_DIGIT(*ptr))? -1:(int)(cb->bracount), /* + and - are relative */
4418
21
          MAX_GROUP_NUMBER, ERR61,
4419
21
          &i, &errorcode)) goto FAILED;
4420
21
      if (i < 0)  /* NB (?0) is permitted */
4421
0
        {
4422
0
        errorcode = ERR15;   /* Unknown group */
4423
0
        goto FAILED_BACK;
4424
0
        }
4425
21
      if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4426
0
        goto UNCLOSED_PARENTHESIS;
4427
4428
21
      SET_RECURSION:
4429
21
      *parsed_pattern++ = META_RECURSE | (uint32_t)i;
4430
21
      offset = (PCRE2_SIZE)(ptr - cb->start_pattern);
4431
21
      ptr++;
4432
21
      PUTOFFSET(offset, parsed_pattern);
4433
21
      okquantifier = TRUE;
4434
21
      break;  /* End of recursive call by number handling */
4435
4436
4437
      /* ---- Recursion/subroutine calls by name ---- */
4438
4439
2
      case CHAR_AMPERSAND:
4440
2
      RECURSE_BY_NAME:
4441
2
      if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
4442
2
          &namelen, &errorcode, cb)) goto FAILED;
4443
0
      *parsed_pattern++ = META_RECURSE_BYNAME;
4444
0
      *parsed_pattern++ = namelen;
4445
0
      PUTOFFSET(offset, parsed_pattern);
4446
0
      okquantifier = TRUE;
4447
0
      break;
4448
4449
      /* ---- Callout with numerical or string argument ---- */
4450
4451
0
      case CHAR_C:
4452
0
      if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4453
4454
      /* If the previous item was a condition starting (?(? an assertion,
4455
      optionally preceded by a callout, is expected. This is checked later on,
4456
      during actual compilation. However we need to identify this kind of
4457
      assertion in this pass because it must not be quantified. The value of
4458
      expect_cond_assert is set to 2 after (?(? is processed. We decrement it
4459
      for a callout - still leaving a positive value that identifies the
4460
      assertion. Multiple callouts or any other items will make it zero or
4461
      less, which doesn't matter because they will cause an error later. */
4462
4463
0
      expect_cond_assert = prev_expect_cond_assert - 1;
4464
4465
      /* If previous_callout is not NULL, it means this follows a previous
4466
      callout. If it was a manual callout, do nothing; this means its "length
4467
      of next pattern item" field will remain zero. If it was an automatic
4468
      callout, abolish it. */
4469
4470
0
      if (previous_callout != NULL && (options & PCRE2_AUTO_CALLOUT) != 0 &&
4471
0
          previous_callout == parsed_pattern - 4 &&
4472
0
          parsed_pattern[-1] == 255)
4473
0
        parsed_pattern = previous_callout;
4474
4475
      /* Save for updating next pattern item length, and skip one item before
4476
      completing. */
4477
4478
0
      previous_callout = parsed_pattern;
4479
0
      after_manual_callout = 1;
4480
4481
      /* Handle a string argument; specific delimiter is required. */
4482
4483
0
      if (*ptr != CHAR_RIGHT_PARENTHESIS && !IS_DIGIT(*ptr))
4484
0
        {
4485
0
        PCRE2_SIZE calloutlength;
4486
0
        PCRE2_SPTR startptr = ptr;
4487
4488
0
        delimiter = 0;
4489
0
        for (i = 0; PRIV(callout_start_delims)[i] != 0; i++)
4490
0
          {
4491
0
          if (*ptr == PRIV(callout_start_delims)[i])
4492
0
            {
4493
0
            delimiter = PRIV(callout_end_delims)[i];
4494
0
            break;
4495
0
            }
4496
0
          }
4497
0
        if (delimiter == 0)
4498
0
          {
4499
0
          errorcode = ERR82;
4500
0
          goto FAILED;
4501
0
          }
4502
4503
0
        *parsed_pattern = META_CALLOUT_STRING;
4504
0
        parsed_pattern += 3;   /* Skip pattern info */
4505
4506
0
        for (;;)
4507
0
          {
4508
0
          if (++ptr >= ptrend)
4509
0
            {
4510
0
            errorcode = ERR81;
4511
0
            ptr = startptr;   /* To give a more useful message */
4512
0
            goto FAILED;
4513
0
            }
4514
0
          if (*ptr == delimiter && (++ptr >= ptrend || *ptr != delimiter))
4515
0
            break;
4516
0
          }
4517
4518
0
        calloutlength = (PCRE2_SIZE)(ptr - startptr);
4519
0
        if (calloutlength > UINT32_MAX)
4520
0
          {
4521
0
          errorcode = ERR72;
4522
0
          goto FAILED;
4523
0
          }
4524
0
        *parsed_pattern++ = (uint32_t)calloutlength;
4525
0
        offset = (PCRE2_SIZE)(startptr - cb->start_pattern);
4526
0
        PUTOFFSET(offset, parsed_pattern);
4527
0
        }
4528
4529
      /* Handle a callout with an optional numerical argument, which must be
4530
      less than or equal to 255. A missing argument gives 0. */
4531
4532
0
      else
4533
0
        {
4534
0
        int n = 0;
4535
0
        *parsed_pattern = META_CALLOUT_NUMBER;     /* Numerical callout */
4536
0
        parsed_pattern += 3;                       /* Skip pattern info */
4537
0
        while (ptr < ptrend && IS_DIGIT(*ptr))
4538
0
          {
4539
0
          n = n * 10 + *ptr++ - CHAR_0;
4540
0
          if (n > 255)
4541
0
            {
4542
0
            errorcode = ERR38;
4543
0
            goto FAILED;
4544
0
            }
4545
0
          }
4546
0
        *parsed_pattern++ = n;
4547
0
        }
4548
4549
      /* Both formats must have a closing parenthesis */
4550
4551
0
      if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4552
0
        {
4553
0
        errorcode = ERR39;
4554
0
        goto FAILED;
4555
0
        }
4556
0
      ptr++;
4557
4558
      /* Remember the offset to the next item in the pattern, and set a default
4559
      length. This should get updated after the next item is read. */
4560
4561
0
      previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
4562
0
      previous_callout[2] = 0;
4563
0
      break;                  /* End callout */
4564
4565
4566
      /* ---- Conditional group ---- */
4567
4568
      /* A condition can be an assertion, a number (referring to a numbered
4569
      group's having been set), a name (referring to a named group), or 'R',
4570
      referring to overall recursion. R<digits> and R&name are also permitted
4571
      for recursion state tests. Numbers may be preceded by + or - to specify a
4572
      relative group number.
4573
4574
      There are several syntaxes for testing a named group: (?(name)) is used
4575
      by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4576
4577
      There are two unfortunate ambiguities. 'R' can be the recursive thing or
4578
      the name 'R' (and similarly for 'R' followed by digits). 'DEFINE' can be
4579
      the Perl DEFINE feature or the Python named test. We look for a name
4580
      first; if not found, we try the other case.
4581
4582
      For compatibility with auto-callouts, we allow a callout to be specified
4583
      before a condition that is an assertion. */
4584
4585
3
      case CHAR_LEFT_PARENTHESIS:
4586
3
      if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4587
3
      nest_depth++;
4588
4589
      /* If the next character is ? or * there must be an assertion next
4590
      (optionally preceded by a callout). We do not check this here, but
4591
      instead we set expect_cond_assert to 2. If this is still greater than
4592
      zero (callouts decrement it) when the next assertion is read, it will be
4593
      marked as a condition that must not be repeated. A value greater than
4594
      zero also causes checking that an assertion (possibly with callout)
4595
      follows. */
4596
4597
3
      if (*ptr == CHAR_QUESTION_MARK || *ptr == CHAR_ASTERISK)
4598
0
        {
4599
0
        *parsed_pattern++ = META_COND_ASSERT;
4600
0
        ptr--;   /* Pull pointer back to the opening parenthesis. */
4601
0
        expect_cond_assert = 2;
4602
0
        break;  /* End of conditional */
4603
0
        }
4604
4605
      /* Handle (?([+-]number)... */
4606
4607
3
      if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
4608
3
          &errorcode))
4609
0
        {
4610
0
        if (i <= 0)
4611
0
          {
4612
0
          errorcode = ERR15;
4613
0
          goto FAILED;
4614
0
          }
4615
0
        *parsed_pattern++ = META_COND_NUMBER;
4616
0
        offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
4617
0
        PUTOFFSET(offset, parsed_pattern);
4618
0
        *parsed_pattern++ = i;
4619
0
        }
4620
3
      else if (errorcode != 0) goto FAILED;   /* Number too big */
4621
4622
      /* No number found. Handle the special case (?(VERSION[>]=n.m)... */
4623
4624
3
      else if (ptrend - ptr >= 10 &&
4625
3
               PRIV(strncmp_c8)(ptr, STRING_VERSION, 7) == 0 &&
4626
0
               ptr[7] != CHAR_RIGHT_PARENTHESIS)
4627
0
        {
4628
0
        uint32_t ge = 0;
4629
0
        int major = 0;
4630
0
        int minor = 0;
4631
4632
0
        ptr += 7;
4633
0
        if (*ptr == CHAR_GREATER_THAN_SIGN)
4634
0
          {
4635
0
          ge = 1;
4636
0
          ptr++;
4637
0
          }
4638
4639
        /* NOTE: cannot write IS_DIGIT(*(++ptr)) here because IS_DIGIT
4640
        references its argument twice. */
4641
4642
0
        if (*ptr != CHAR_EQUALS_SIGN || (ptr++, !IS_DIGIT(*ptr)))
4643
0
          goto BAD_VERSION_CONDITION;
4644
4645
0
        if (!read_number(&ptr, ptrend, -1, 1000, ERR79, &major, &errorcode))
4646
0
          goto FAILED;
4647
4648
0
        if (ptr >= ptrend) goto BAD_VERSION_CONDITION;
4649
0
        if (*ptr == CHAR_DOT)
4650
0
          {
4651
0
          if (++ptr >= ptrend || !IS_DIGIT(*ptr)) goto BAD_VERSION_CONDITION;
4652
0
          minor = (*ptr++ - CHAR_0) * 10;
4653
0
          if (ptr >= ptrend) goto BAD_VERSION_CONDITION;
4654
0
          if (IS_DIGIT(*ptr)) minor += *ptr++ - CHAR_0;
4655
0
          if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4656
0
            goto BAD_VERSION_CONDITION;
4657
0
          }
4658
4659
0
        *parsed_pattern++ = META_COND_VERSION;
4660
0
        *parsed_pattern++ = ge;
4661
0
        *parsed_pattern++ = major;
4662
0
        *parsed_pattern++ = minor;
4663
0
        }
4664
4665
      /* All the remaining cases now require us to read a name. We cannot at
4666
      this stage distinguish ambiguous cases such as (?(R12) which might be a
4667
      recursion test by number or a name, because the named groups have not yet
4668
      all been identified. Those cases are treated as names, but given a
4669
      different META code. */
4670
4671
3
      else
4672
3
        {
4673
3
        BOOL was_r_ampersand = FALSE;
4674
4675
3
        if (*ptr == CHAR_R && ptrend - ptr > 1 && ptr[1] == CHAR_AMPERSAND)
4676
0
          {
4677
0
          terminator = CHAR_RIGHT_PARENTHESIS;
4678
0
          was_r_ampersand = TRUE;
4679
0
          ptr++;
4680
0
          }
4681
3
        else if (*ptr == CHAR_LESS_THAN_SIGN)
4682
0
          terminator = CHAR_GREATER_THAN_SIGN;
4683
3
        else if (*ptr == CHAR_APOSTROPHE)
4684
0
          terminator = CHAR_APOSTROPHE;
4685
3
        else
4686
3
          {
4687
3
          terminator = CHAR_RIGHT_PARENTHESIS;
4688
3
          ptr--;   /* Point to char before name */
4689
3
          }
4690
3
        if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
4691
3
            &errorcode, cb)) goto FAILED;
4692
4693
        /* Handle (?(R&name) */
4694
4695
0
        if (was_r_ampersand)
4696
0
          {
4697
0
          *parsed_pattern = META_COND_RNAME;
4698
0
          ptr--;   /* Back to closing parens */
4699
0
          }
4700
4701
        /* Handle (?(name). If the name is "DEFINE" we identify it with a
4702
        special code. Likewise if the name consists of R followed only by
4703
        digits. Otherwise, handle it like a quoted name. */
4704
4705
0
        else if (terminator == CHAR_RIGHT_PARENTHESIS)
4706
0
          {
4707
0
          if (namelen == 6 && PRIV(strncmp_c8)(name, STRING_DEFINE, 6) == 0)
4708
0
            *parsed_pattern = META_COND_DEFINE;
4709
0
          else
4710
0
            {
4711
0
            for (i = 1; i < (int)namelen; i++)
4712
0
              if (!IS_DIGIT(name[i])) break;
4713
0
            *parsed_pattern = (*name == CHAR_R && i >= (int)namelen)?
4714
0
              META_COND_RNUMBER : META_COND_NAME;
4715
0
            }
4716
0
          ptr--;   /* Back to closing parens */
4717
0
          }
4718
4719
        /* Handle (?('name') or (?(<name>) */
4720
4721
0
        else *parsed_pattern = META_COND_NAME;
4722
4723
        /* All these cases except DEFINE end with the name length and offset;
4724
        DEFINE just has an offset (for the "too many branches" error). */
4725
4726
0
        if (*parsed_pattern++ != META_COND_DEFINE) *parsed_pattern++ = namelen;
4727
0
        PUTOFFSET(offset, parsed_pattern);
4728
0
        }  /* End cases that read a name */
4729
4730
      /* Check the closing parenthesis of the condition */
4731
4732
0
      if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4733
0
        {
4734
0
        errorcode = ERR24;
4735
0
        goto FAILED;
4736
0
        }
4737
0
      ptr++;
4738
0
      break;  /* End of condition processing */
4739
4740
4741
      /* ---- Atomic group ---- */
4742
4743
21
      case CHAR_GREATER_THAN_SIGN:
4744
21
      ATOMIC_GROUP:                          /* Come from (*atomic: */
4745
21
      *parsed_pattern++ = META_ATOMIC;
4746
21
      nest_depth++;
4747
21
      ptr++;
4748
21
      break;
4749
4750
4751
      /* ---- Lookahead assertions ---- */
4752
4753
5
      case CHAR_EQUALS_SIGN:
4754
5
      POSITIVE_LOOK_AHEAD:                   /* Come from (*pla: */
4755
5
      *parsed_pattern++ = META_LOOKAHEAD;
4756
5
      ptr++;
4757
5
      goto POST_ASSERTION;
4758
4759
3
      case CHAR_ASTERISK:
4760
3
      POSITIVE_NONATOMIC_LOOK_AHEAD:         /* Come from (*napla: */
4761
3
      *parsed_pattern++ = META_LOOKAHEAD_NA;
4762
3
      ptr++;
4763
3
      goto POST_ASSERTION;
4764
4765
3
      case CHAR_EXCLAMATION_MARK:
4766
3
      NEGATIVE_LOOK_AHEAD:                   /* Come from (*nla: */
4767
3
      *parsed_pattern++ = META_LOOKAHEADNOT;
4768
3
      ptr++;
4769
3
      goto POST_ASSERTION;
4770
4771
4772
      /* ---- Lookbehind assertions ---- */
4773
4774
      /* (?< followed by = or ! or * is a lookbehind assertion. Otherwise (?<
4775
      is the start of the name of a capturing group. */
4776
4777
2
      case CHAR_LESS_THAN_SIGN:
4778
2
      if (ptrend - ptr <= 1 ||
4779
2
         (ptr[1] != CHAR_EQUALS_SIGN &&
4780
0
          ptr[1] != CHAR_EXCLAMATION_MARK &&
4781
0
          ptr[1] != CHAR_ASTERISK))
4782
0
        {
4783
0
        terminator = CHAR_GREATER_THAN_SIGN;
4784
0
        goto DEFINE_NAME;
4785
0
        }
4786
2
      *parsed_pattern++ = (ptr[1] == CHAR_EQUALS_SIGN)?
4787
2
        META_LOOKBEHIND : (ptr[1] == CHAR_EXCLAMATION_MARK)?
4788
0
        META_LOOKBEHINDNOT : META_LOOKBEHIND_NA;
4789
4790
2
      POST_LOOKBEHIND:           /* Come from (*plb: (*naplb: and (*nlb: */
4791
2
      *has_lookbehind = TRUE;
4792
2
      offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
4793
2
      PUTOFFSET(offset, parsed_pattern);
4794
2
      ptr += 2;
4795
      /* Fall through */
4796
4797
      /* If the previous item was a condition starting (?(? an assertion,
4798
      optionally preceded by a callout, is expected. This is checked later on,
4799
      during actual compilation. However we need to identify this kind of
4800
      assertion in this pass because it must not be quantified. The value of
4801
      expect_cond_assert is set to 2 after (?(? is processed. We decrement it
4802
      for a callout - still leaving a positive value that identifies the
4803
      assertion. Multiple callouts or any other items will make it zero or
4804
      less, which doesn't matter because they will cause an error later. */
4805
4806
13
      POST_ASSERTION:
4807
13
      nest_depth++;
4808
13
      if (prev_expect_cond_assert > 0)
4809
0
        {
4810
0
        if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4811
0
        else if (++top_nest >= end_nests)
4812
0
          {
4813
0
          errorcode = ERR84;
4814
0
          goto FAILED;
4815
0
          }
4816
0
        top_nest->nest_depth = nest_depth;
4817
0
        top_nest->flags = NSF_CONDASSERT;
4818
0
        top_nest->options = options & PARSE_TRACKED_OPTIONS;
4819
0
        top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;
4820
0
        }
4821
13
      break;
4822
4823
4824
      /* ---- Define a named group ---- */
4825
4826
      /* A named group may be defined as (?'name') or (?<name>). In the latter
4827
      case we jump to DEFINE_NAME from the disambiguation of (?< above with the
4828
      terminator set to '>'. */
4829
4830
13
      case CHAR_APOSTROPHE:
4831
3
      terminator = CHAR_APOSTROPHE;    /* Terminator */
4832
4833
3
      DEFINE_NAME:
4834
3
      if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
4835
3
          &errorcode, cb)) goto FAILED;
4836
4837
      /* We have a name for this capturing group. It is also assigned a number,
4838
      which is its primary means of identification. */
4839
4840
0
      if (cb->bracount >= MAX_GROUP_NUMBER)
4841
0
        {
4842
0
        errorcode = ERR97;
4843
0
        goto FAILED;
4844
0
        }
4845
0
      cb->bracount++;
4846
0
      *parsed_pattern++ = META_CAPTURE | cb->bracount;
4847
0
      nest_depth++;
4848
4849
      /* Check not too many names */
4850
4851
0
      if (cb->names_found >= MAX_NAME_COUNT)
4852
0
        {
4853
0
        errorcode = ERR49;
4854
0
        goto FAILED;
4855
0
        }
4856
4857
      /* Adjust the entry size to accommodate the longest name found. */
4858
4859
0
      if (namelen + IMM2_SIZE + 1 > cb->name_entry_size)
4860
0
        cb->name_entry_size = (uint16_t)(namelen + IMM2_SIZE + 1);
4861
4862
      /* Scan the list to check for duplicates. For duplicate names, if the
4863
      number is the same, break the loop, which causes the name to be
4864
      discarded; otherwise, if DUPNAMES is not set, give an error.
4865
      If it is set, allow the name with a different number, but continue
4866
      scanning in case this is a duplicate with the same number. For
4867
      non-duplicate names, give an error if the number is duplicated. */
4868
4869
0
      isdupname = FALSE;
4870
0
      ng = cb->named_groups;
4871
0
      for (i = 0; i < cb->names_found; i++, ng++)
4872
0
        {
4873
0
        if (namelen == ng->length &&
4874
0
            PRIV(strncmp)(name, ng->name, (PCRE2_SIZE)namelen) == 0)
4875
0
          {
4876
0
          if (ng->number == cb->bracount) break;
4877
0
          if ((options & PCRE2_DUPNAMES) == 0)
4878
0
            {
4879
0
            errorcode = ERR43;
4880
0
            goto FAILED;
4881
0
            }
4882
0
          isdupname = ng->isdup = TRUE;     /* Mark as a duplicate */
4883
0
          cb->dupnames = TRUE;              /* Duplicate names exist */
4884
0
          }
4885
0
        else if (ng->number == cb->bracount)
4886
0
          {
4887
0
          errorcode = ERR65;
4888
0
          goto FAILED;
4889
0
          }
4890
0
        }
4891
4892
0
      if (i < cb->names_found) break;   /* Ignore duplicate with same number */
4893
4894
      /* Increase the list size if necessary */
4895
4896
0
      if (cb->names_found >= cb->named_group_list_size)
4897
0
        {
4898
0
        uint32_t newsize = cb->named_group_list_size * 2;
4899
0
        named_group *newspace =
4900
0
          cb->cx->memctl.malloc(newsize * sizeof(named_group),
4901
0
          cb->cx->memctl.memory_data);
4902
0
        if (newspace == NULL)
4903
0
          {
4904
0
          errorcode = ERR21;
4905
0
          goto FAILED;
4906
0
          }
4907
4908
0
        memcpy(newspace, cb->named_groups,
4909
0
          cb->named_group_list_size * sizeof(named_group));
4910
0
        if (cb->named_group_list_size > NAMED_GROUP_LIST_SIZE)
4911
0
          cb->cx->memctl.free((void *)cb->named_groups,
4912
0
          cb->cx->memctl.memory_data);
4913
0
        cb->named_groups = newspace;
4914
0
        cb->named_group_list_size = newsize;
4915
0
        }
4916
4917
      /* Add this name to the list */
4918
4919
0
      cb->named_groups[cb->names_found].name = name;
4920
0
      cb->named_groups[cb->names_found].length = (uint16_t)namelen;
4921
0
      cb->named_groups[cb->names_found].number = cb->bracount;
4922
0
      cb->named_groups[cb->names_found].isdup = (uint16_t)isdupname;
4923
0
      cb->names_found++;
4924
0
      break;
4925
78
      }        /* End of (? switch */
4926
64
    break;     /* End of ( handling */
4927
4928
4929
    /* ---- Branch terminators ---- */
4930
4931
    /* Alternation: reset the capture count if we are in a (?| group. */
4932
4933
12.1k
    case CHAR_VERTICAL_LINE:
4934
12.1k
    if (top_nest != NULL && top_nest->nest_depth == nest_depth &&
4935
2
        (top_nest->flags & NSF_RESET) != 0)
4936
0
      {
4937
0
      if (cb->bracount > top_nest->max_group)
4938
0
        top_nest->max_group = (uint16_t)cb->bracount;
4939
0
      cb->bracount = top_nest->reset_group;
4940
0
      }
4941
12.1k
    *parsed_pattern++ = META_ALT;
4942
12.1k
    break;
4943
4944
    /* End of group; reset the capture count to the maximum if we are in a (?|
4945
    group and/or reset the options that are tracked during parsing. Disallow
4946
    quantifier for a condition that is an assertion. */
4947
4948
2.95k
    case CHAR_RIGHT_PARENTHESIS:
4949
2.95k
    okquantifier = TRUE;
4950
2.95k
    if (top_nest != NULL && top_nest->nest_depth == nest_depth)
4951
9
      {
4952
9
      options = (options & ~PARSE_TRACKED_OPTIONS) | top_nest->options;
4953
9
      xoptions = (xoptions & ~PARSE_TRACKED_EXTRA_OPTIONS) | top_nest->xoptions;
4954
9
      if ((top_nest->flags & NSF_RESET) != 0 &&
4955
0
          top_nest->max_group > cb->bracount)
4956
0
        cb->bracount = top_nest->max_group;
4957
9
      if ((top_nest->flags & NSF_CONDASSERT) != 0)
4958
0
        okquantifier = FALSE;
4959
4960
9
      if ((top_nest->flags & NSF_ATOMICSR) != 0)
4961
0
        {
4962
0
        *parsed_pattern++ = META_KET;
4963
0
        }
4964
4965
9
      if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL;
4966
0
        else top_nest--;
4967
9
      }
4968
2.95k
    if (nest_depth == 0)    /* Unmatched closing parenthesis */
4969
47
      {
4970
47
      errorcode = ERR22;
4971
47
      goto FAILED_BACK;
4972
47
      }
4973
2.91k
    nest_depth--;
4974
2.91k
    *parsed_pattern++ = META_KET;
4975
2.91k
    break;
4976
378k
    }  /* End of switch on pattern character */
4977
378k
  }    /* End of main character scan loop */
4978
4979
/* End of pattern reached. Check for missing ) at the end of a verb name. */
4980
4981
1.10k
if (inverbname && ptr >= ptrend)
4982
0
  {
4983
0
  errorcode = ERR60;
4984
0
  goto FAILED;
4985
0
  }
4986
4987
/* Manage callout for the final item */
4988
4989
1.10k
PARSED_END:
4990
1.10k
parsed_pattern = manage_callouts(ptr, &previous_callout, auto_callout,
4991
1.10k
  parsed_pattern, cb);
4992
4993
/* Insert trailing items for word and line matching (features provided for the
4994
benefit of pcre2grep). */
4995
4996
1.10k
if ((xoptions & PCRE2_EXTRA_MATCH_LINE) != 0)
4997
0
  {
4998
0
  *parsed_pattern++ = META_KET;
4999
0
  *parsed_pattern++ = META_DOLLAR;
5000
0
  }
5001
1.10k
else if ((xoptions & PCRE2_EXTRA_MATCH_WORD) != 0)
5002
0
  {
5003
0
  *parsed_pattern++ = META_KET;
5004
0
  *parsed_pattern++ = META_ESCAPE + ESC_b;
5005
0
  }
5006
5007
/* Terminate the parsed pattern, then return success if all groups are closed.
5008
Otherwise we have unclosed parentheses. */
5009
5010
1.10k
if (parsed_pattern >= parsed_pattern_end)
5011
0
  {
5012
0
  errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
5013
0
  goto FAILED;
5014
0
  }
5015
5016
1.10k
*parsed_pattern = META_END;
5017
1.10k
if (nest_depth == 0) return 0;
5018
5019
71
UNCLOSED_PARENTHESIS:
5020
71
errorcode = ERR14;
5021
5022
/* Come here for all failures. */
5023
5024
464
FAILED:
5025
464
cb->erroroffset = (PCRE2_SIZE)(ptr - cb->start_pattern);
5026
464
return errorcode;
5027
5028
/* Some errors need to indicate the previous character. */
5029
5030
92
FAILED_BACK:
5031
92
ptr--;
5032
92
goto FAILED;
5033
5034
/* This failure happens several times. */
5035
5036
0
BAD_VERSION_CONDITION:
5037
0
errorcode = ERR79;
5038
0
goto FAILED;
5039
71
}
5040
5041
5042
5043
/*************************************************
5044
*       Find first significant opcode            *
5045
*************************************************/
5046
5047
/* This is called by several functions that scan a compiled expression looking
5048
for a fixed first character, or an anchoring opcode etc. It skips over things
5049
that do not influence this. For some calls, it makes sense to skip negative
5050
forward and all backward assertions, and also the \b assertion; for others it
5051
does not.
5052
5053
Arguments:
5054
  code         pointer to the start of the group
5055
  skipassert   TRUE if certain assertions are to be skipped
5056
5057
Returns:       pointer to the first significant opcode
5058
*/
5059
5060
static const PCRE2_UCHAR*
5061
first_significant_code(PCRE2_SPTR code, BOOL skipassert)
5062
2.72k
{
5063
2.72k
for (;;)
5064
2.83k
  {
5065
2.83k
  switch ((int)*code)
5066
2.83k
    {
5067
0
    case OP_ASSERT_NOT:
5068
0
    case OP_ASSERTBACK:
5069
0
    case OP_ASSERTBACK_NOT:
5070
0
    case OP_ASSERTBACK_NA:
5071
0
    if (!skipassert) return code;
5072
0
    do code += GET(code, 1); while (*code == OP_ALT);
5073
0
    code += PRIV(OP_lengths)[*code];
5074
0
    break;
5075
5076
6
    case OP_WORD_BOUNDARY:
5077
158
    case OP_NOT_WORD_BOUNDARY:
5078
176
    case OP_UCP_WORD_BOUNDARY:
5079
338
    case OP_NOT_UCP_WORD_BOUNDARY:
5080
338
    if (!skipassert) return code;
5081
    /* Fall through */
5082
5083
111
    case OP_CALLOUT:
5084
111
    case OP_CREF:
5085
111
    case OP_DNCREF:
5086
111
    case OP_RREF:
5087
111
    case OP_DNRREF:
5088
111
    case OP_FALSE:
5089
111
    case OP_TRUE:
5090
111
    code += PRIV(OP_lengths)[*code];
5091
111
    break;
5092
5093
0
    case OP_CALLOUT_STR:
5094
0
    code += GET(code, 1 + 2*LINK_SIZE);
5095
0
    break;
5096
5097
0
    case OP_SKIPZERO:
5098
0
    code += 2 + GET(code, 2) + LINK_SIZE;
5099
0
    break;
5100
5101
0
    case OP_COND:
5102
0
    case OP_SCOND:
5103
0
    if (code[1+LINK_SIZE] != OP_FALSE ||   /* Not DEFINE */
5104
0
        code[GET(code, 1)] != OP_KET)      /* More than one branch */
5105
0
      return code;
5106
0
    code += GET(code, 1) + 1 + LINK_SIZE;
5107
0
    break;
5108
5109
0
    case OP_MARK:
5110
0
    case OP_COMMIT_ARG:
5111
0
    case OP_PRUNE_ARG:
5112
0
    case OP_SKIP_ARG:
5113
0
    case OP_THEN_ARG:
5114
0
    code += code[1] + PRIV(OP_lengths)[*code];
5115
0
    break;
5116
5117
2.49k
    default:
5118
2.49k
    return code;
5119
2.83k
    }
5120
2.83k
  }
5121
/* Control never reaches here */
5122
2.72k
}
5123
5124
5125
5126
#ifdef SUPPORT_UNICODE
5127
/*************************************************
5128
*           Get othercase range                  *
5129
*************************************************/
5130
5131
/* This function is passed the start and end of a class range in UCP mode. For
5132
single characters the range may be just one character long. The function
5133
searches up the characters, looking for ranges of characters in the "other"
5134
case. Each call returns the next one, updating the start address. A character
5135
with multiple other cases is returned on its own with a special return value.
5136
5137
Arguments:
5138
  cptr        points to starting character value; updated
5139
  d           end value
5140
  ocptr       where to put start of othercase range
5141
  odptr       where to put end of othercase range
5142
  restricted  TRUE if caseless restriction applies
5143
5144
Yield:        -1 when no more
5145
               0 when a range is returned
5146
              >0 the CASESET offset for char with multiple other cases;
5147
                 for this return, *ocptr contains the original
5148
*/
5149
5150
static int
5151
get_othercase_range(uint32_t *cptr, uint32_t d, uint32_t *ocptr,
5152
  uint32_t *odptr, BOOL restricted)
5153
37.6k
{
5154
37.6k
uint32_t c, othercase, next;
5155
37.6k
unsigned int co;
5156
5157
/* Find the first character that has an other case. If it has multiple other
5158
cases, return its case offset value. When CASELESS_RESTRICT is set, ignore the
5159
multi-case entries that begin with ASCII values. In 32-bit mode, a value
5160
greater than the Unicode maximum ends the range. */
5161
5162
80.5k
for (c = *cptr; c <= d; c++)
5163
53.5k
  {
5164
#if PCRE2_CODE_UNIT_WIDTH == 32
5165
  if (c > MAX_UTF_CODE_POINT) return -1;
5166
#endif
5167
53.5k
  if ((co = UCD_CASESET(c)) != 0 &&
5168
1.68k
      (!restricted || PRIV(ucd_caseless_sets)[co] > 127))
5169
1.68k
    {
5170
1.68k
    *ocptr = c++;   /* Character that has the set */
5171
1.68k
    *cptr = c;      /* Rest of input range */
5172
1.68k
    return (int)co;
5173
1.68k
    }
5174
5175
   /* This is not a valid multiple-case character. Check that the single other
5176
   case is different to the original. We don't need to check "restricted" here
5177
   because the non-ASCII characters with multiple cases that include an ASCII
5178
   character don't have a different "othercase". */
5179
5180
51.8k
  if ((othercase = UCD_OTHERCASE(c)) != c) break;
5181
51.8k
  }
5182
5183
35.9k
if (c > d) return -1;  /* Reached end of range */
5184
5185
/* Found a character that has a single other case. Search for the end of the
5186
range, which is either the end of the input range, or a character that has zero
5187
or more than one other cases. */
5188
5189
9.05k
*ocptr = othercase;
5190
9.05k
next = othercase + 1;
5191
5192
19.2k
for (++c; c <= d; c++)
5193
12.2k
  {
5194
12.2k
  if ((co = UCD_CASESET(c)) != 0 || UCD_OTHERCASE(c) != next) break;
5195
10.1k
  next++;
5196
10.1k
  }
5197
5198
9.05k
*odptr = next - 1;     /* End of othercase range */
5199
9.05k
*cptr = c;             /* Rest of input range */
5200
9.05k
return 0;
5201
35.9k
}
5202
#endif  /* SUPPORT_UNICODE */
5203
5204
5205
5206
/*************************************************
5207
* Add a character or range to a class (internal) *
5208
*************************************************/
5209
5210
/* This function packages up the logic of adding a character or range of
5211
characters to a class. The character values in the arguments will be within the
5212
valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
5213
called only from within the "add to class" group of functions, some of which
5214
are recursive and mutually recursive. The external entry point is
5215
add_to_class().
5216
5217
Arguments:
5218
  classbits     the bit map for characters < 256
5219
  uchardptr     points to the pointer for extra data
5220
  options       the options bits
5221
  xoptions      the extra options bits
5222
  cb            compile data
5223
  start         start of range character
5224
  end           end of range character
5225
5226
Returns:        the number of < 256 characters added
5227
                the pointer to extra data is updated
5228
*/
5229
5230
static unsigned int
5231
add_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
5232
  uint32_t options, uint32_t xoptions, compile_block *cb, uint32_t start,
5233
  uint32_t end)
5234
173k
{
5235
173k
uint32_t c;
5236
173k
uint32_t classbits_end = (end <= 0xff ? end : 0xff);
5237
173k
unsigned int n8 = 0;
5238
5239
/* If caseless matching is required, scan the range and process alternate
5240
cases. In Unicode, there are 8-bit characters that have alternate cases that
5241
are greater than 255 and vice-versa (though these may be ignored if caseless
5242
restriction is in force). Sometimes we can just extend the original range. */
5243
5244
173k
if ((options & PCRE2_CASELESS) != 0)
5245
84.0k
  {
5246
84.0k
#ifdef SUPPORT_UNICODE
5247
84.0k
  if ((options & (PCRE2_UTF|PCRE2_UCP)) != 0)
5248
26.9k
    {
5249
26.9k
    int rc;
5250
26.9k
    uint32_t oc, od;
5251
5252
26.9k
    options &= ~PCRE2_CASELESS;   /* Remove for recursive calls */
5253
26.9k
    c = start;
5254
5255
37.6k
    while ((rc = get_othercase_range(&c, end, &oc, &od,
5256
37.6k
             (xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)) >= 0)
5257
10.7k
      {
5258
      /* Handle a single character that has more than one other case. */
5259
5260
10.7k
      if (rc > 0) n8 += add_list_to_class_internal(classbits, uchardptr,
5261
1.68k
        options, xoptions, cb, PRIV(ucd_caseless_sets) + rc, oc);
5262
5263
      /* Do nothing if the other case range is within the original range. */
5264
5265
9.05k
      else if (oc >= cb->class_range_start && od <= cb->class_range_end)
5266
716
        continue;
5267
5268
      /* Extend the original range if there is overlap, noting that if oc < c,
5269
      we can't have od > end because a subrange is always shorter than the
5270
      basic range. Otherwise, use a recursive call to add the additional range.
5271
      */
5272
5273
8.33k
      else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
5274
8.33k
      else if (od > end && oc <= end + 1)
5275
28
        {
5276
28
        end = od;       /* Extend upwards */
5277
28
        if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff);
5278
28
        }
5279
8.30k
      else n8 += add_to_class_internal(classbits, uchardptr, options, xoptions,
5280
8.30k
        cb, oc, od);
5281
10.7k
      }
5282
26.9k
    }
5283
57.0k
  else
5284
#else
5285
  (void)xoptions;   /* Avoid compiler warning */
5286
#endif  /* SUPPORT_UNICODE */
5287
5288
  /* Not UTF mode */
5289
5290
123k
  for (c = start; c <= classbits_end; c++)
5291
66.8k
    {
5292
66.8k
    SETBIT(classbits, cb->fcc[c]);
5293
66.8k
    n8++;
5294
66.8k
    }
5295
84.0k
  }
5296
5297
/* Now handle the originally supplied range. Adjust the final value according
5298
to the bit length - this means that the same lists of (e.g.) horizontal spaces
5299
can be used in all cases. */
5300
5301
173k
if ((options & PCRE2_UTF) == 0 && end > MAX_NON_UTF_CHAR)
5302
272
  end = MAX_NON_UTF_CHAR;
5303
5304
173k
if (start > cb->class_range_start && end < cb->class_range_end) return n8;
5305
5306
/* Use the bitmap for characters < 256. Otherwise use extra data.*/
5307
5308
495k
for (c = start; c <= classbits_end; c++)
5309
322k
  {
5310
  /* Regardless of start, c will always be <= 255. */
5311
322k
  SETBIT(classbits, c);
5312
322k
  n8++;
5313
322k
  }
5314
5315
173k
#ifdef SUPPORT_WIDE_CHARS
5316
173k
if (start <= 0xff) start = 0xff + 1;
5317
5318
173k
if (end >= start)
5319
1.82k
  {
5320
1.82k
  PCRE2_UCHAR *uchardata = *uchardptr;
5321
5322
1.82k
#ifdef SUPPORT_UNICODE
5323
1.82k
  if ((options & PCRE2_UTF) != 0)
5324
1.82k
    {
5325
1.82k
    if (start < end)
5326
8
      {
5327
8
      *uchardata++ = XCL_RANGE;
5328
8
      uchardata += PRIV(ord2utf)(start, uchardata);
5329
8
      uchardata += PRIV(ord2utf)(end, uchardata);
5330
8
      }
5331
1.81k
    else if (start == end)
5332
1.81k
      {
5333
1.81k
      *uchardata++ = XCL_SINGLE;
5334
1.81k
      uchardata += PRIV(ord2utf)(start, uchardata);
5335
1.81k
      }
5336
1.82k
    }
5337
0
  else
5338
0
#endif  /* SUPPORT_UNICODE */
5339
5340
  /* Without UTF support, character values are constrained by the bit length,
5341
  and can only be > 256 for 16-bit and 32-bit libraries. */
5342
5343
0
#if PCRE2_CODE_UNIT_WIDTH == 8
5344
0
    {}
5345
#else
5346
  if (start < end)
5347
    {
5348
    *uchardata++ = XCL_RANGE;
5349
    *uchardata++ = start;
5350
    *uchardata++ = end;
5351
    }
5352
  else if (start == end)
5353
    {
5354
    *uchardata++ = XCL_SINGLE;
5355
    *uchardata++ = start;
5356
    }
5357
#endif  /* PCRE2_CODE_UNIT_WIDTH == 8 */
5358
1.82k
  *uchardptr = uchardata;   /* Updata extra data pointer */
5359
1.82k
  }
5360
#else  /* SUPPORT_WIDE_CHARS */
5361
  (void)uchardptr;          /* Avoid compiler warning */
5362
#endif /* SUPPORT_WIDE_CHARS */
5363
5364
173k
return n8;    /* Number of 8-bit characters */
5365
173k
}
5366
5367
5368
5369
#ifdef SUPPORT_UNICODE
5370
/*************************************************
5371
* Add a list of characters to a class (internal) *
5372
*************************************************/
5373
5374
/* This function is used for adding a list of case-equivalent characters to a
5375
class when in UTF mode. This function is called only from within
5376
add_to_class_internal(), with which it is mutually recursive.
5377
5378
Arguments:
5379
  classbits     the bit map for characters < 256
5380
  uchardptr     points to the pointer for extra data
5381
  options       the options bits
5382
  xoptions      the extra options bits
5383
  cb            contains pointers to tables etc.
5384
  p             points to row of 32-bit values, terminated by NOTACHAR
5385
  except        character to omit; this is used when adding lists of
5386
                  case-equivalent characters to avoid including the one we
5387
                  already know about
5388
5389
Returns:        the number of < 256 characters added
5390
                the pointer to extra data is updated
5391
*/
5392
5393
static unsigned int
5394
add_list_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
5395
  uint32_t options, uint32_t xoptions, compile_block *cb, const uint32_t *p,
5396
  unsigned int except)
5397
1.68k
{
5398
1.68k
unsigned int n8 = 0;
5399
6.68k
while (p[0] < NOTACHAR)
5400
5.00k
  {
5401
5.00k
  unsigned int n = 0;
5402
5.00k
  if (p[0] != except)
5403
3.33k
    {
5404
3.37k
    while(p[n+1] == p[0] + n + 1) n++;
5405
3.33k
    n8 += add_to_class_internal(classbits, uchardptr, options, xoptions, cb,
5406
3.33k
      p[0], p[n]);
5407
3.33k
    }
5408
5.00k
  p += n + 1;
5409
5.00k
  }
5410
1.68k
return n8;
5411
1.68k
}
5412
#endif
5413
5414
5415
5416
/*************************************************
5417
*   External entry point for add range to class  *
5418
*************************************************/
5419
5420
/* This function sets the overall range so that the internal functions can try
5421
to avoid duplication when handling case-independence.
5422
5423
Arguments:
5424
  classbits     the bit map for characters < 256
5425
  uchardptr     points to the pointer for extra data
5426
  options       the options bits
5427
  xoptions      the extra options bits
5428
  cb            compile data
5429
  start         start of range character
5430
  end           end of range character
5431
5432
Returns:        the number of < 256 characters added
5433
                the pointer to extra data is updated
5434
*/
5435
5436
static unsigned int
5437
add_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
5438
  uint32_t xoptions, compile_block *cb, uint32_t start, uint32_t end)
5439
161k
{
5440
161k
cb->class_range_start = start;
5441
161k
cb->class_range_end = end;
5442
161k
return add_to_class_internal(classbits, uchardptr, options, xoptions, cb,
5443
161k
  start, end);
5444
161k
}
5445
5446
5447
/*************************************************
5448
*   External entry point for add list to class   *
5449
*************************************************/
5450
5451
/* This function is used for adding a list of horizontal or vertical whitespace
5452
characters to a class. The list must be in order so that ranges of characters
5453
can be detected and handled appropriately. This function sets the overall range
5454
so that the internal functions can try to avoid duplication when handling
5455
case-independence.
5456
5457
Arguments:
5458
  classbits     the bit map for characters < 256
5459
  uchardptr     points to the pointer for extra data
5460
  options       the options bits
5461
  xoptions      the extra options bits
5462
  cb            contains pointers to tables etc.
5463
  p             points to row of 32-bit values, terminated by NOTACHAR
5464
  except        character to omit; this is used when adding lists of
5465
                  case-equivalent characters to avoid including the one we
5466
                  already know about
5467
5468
Returns:        the number of < 256 characters added
5469
                the pointer to extra data is updated
5470
*/
5471
5472
static unsigned int
5473
add_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
5474
  uint32_t xoptions, compile_block *cb, const uint32_t *p, unsigned int except)
5475
4
{
5476
4
unsigned int n8 = 0;
5477
40
while (p[0] < NOTACHAR)
5478
36
  {
5479
36
  unsigned int n = 0;
5480
36
  if (p[0] != except)
5481
36
    {
5482
76
    while(p[n+1] == p[0] + n + 1) n++;
5483
36
    cb->class_range_start = p[0];
5484
36
    cb->class_range_end = p[n];
5485
36
    n8 += add_to_class_internal(classbits, uchardptr, options, xoptions, cb,
5486
36
      p[0], p[n]);
5487
36
    }
5488
36
  p += n + 1;
5489
36
  }
5490
4
return n8;
5491
4
}
5492
5493
5494
5495
/*************************************************
5496
*    Add characters not in a list to a class     *
5497
*************************************************/
5498
5499
/* This function is used for adding the complement of a list of horizontal or
5500
vertical whitespace to a class. The list must be in order.
5501
5502
Arguments:
5503
  classbits     the bit map for characters < 256
5504
  uchardptr     points to the pointer for extra data
5505
  options       the options bits
5506
  xoptions      the extra options bits
5507
  cb            contains pointers to tables etc.
5508
  p             points to row of 32-bit values, terminated by NOTACHAR
5509
5510
Returns:        the number of < 256 characters added
5511
                the pointer to extra data is updated
5512
*/
5513
5514
static unsigned int
5515
add_not_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
5516
  uint32_t options, uint32_t xoptions, compile_block *cb, const uint32_t *p)
5517
116
{
5518
116
BOOL utf = (options & PCRE2_UTF) != 0;
5519
116
unsigned int n8 = 0;
5520
116
if (p[0] > 0)
5521
116
  n8 += add_to_class(classbits, uchardptr, options, xoptions, cb, 0, p[0] - 1);
5522
512
while (p[0] < NOTACHAR)
5523
396
  {
5524
908
  while (p[1] == p[0] + 1) p++;
5525
396
  n8 += add_to_class(classbits, uchardptr, options, xoptions, cb, p[0] + 1,
5526
396
    (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
5527
396
  p++;
5528
396
  }
5529
116
return n8;
5530
116
}
5531
5532
5533
5534
/*************************************************
5535
*    Find details of duplicate group names       *
5536
*************************************************/
5537
5538
/* This is called from compile_branch() when it needs to know the index and
5539
count of duplicates in the names table when processing named backreferences,
5540
either directly, or as conditions.
5541
5542
Arguments:
5543
  name          points to the name
5544
  length        the length of the name
5545
  indexptr      where to put the index
5546
  countptr      where to put the count of duplicates
5547
  errorcodeptr  where to put an error code
5548
  cb            the compile block
5549
5550
Returns:        TRUE if OK, FALSE if not, error code set
5551
*/
5552
5553
static BOOL
5554
find_dupname_details(PCRE2_SPTR name, uint32_t length, int *indexptr,
5555
  int *countptr, int *errorcodeptr, compile_block *cb)
5556
0
{
5557
0
uint32_t i, groupnumber;
5558
0
int count;
5559
0
PCRE2_UCHAR *slot = cb->name_table;
5560
5561
/* Find the first entry in the table */
5562
5563
0
for (i = 0; i < cb->names_found; i++)
5564
0
  {
5565
0
  if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) == 0 &&
5566
0
      slot[IMM2_SIZE+length] == 0) break;
5567
0
  slot += cb->name_entry_size;
5568
0
  }
5569
5570
/* This should not occur, because this function is called only when we know we
5571
have duplicate names. Give an internal error. */
5572
5573
0
if (i >= cb->names_found)
5574
0
  {
5575
0
  *errorcodeptr = ERR53;
5576
0
  cb->erroroffset = name - cb->start_pattern;
5577
0
  return FALSE;
5578
0
  }
5579
5580
/* Record the index and then see how many duplicates there are, updating the
5581
backref map and maximum back reference as we do. */
5582
5583
0
*indexptr = i;
5584
0
count = 0;
5585
5586
0
for (;;)
5587
0
  {
5588
0
  count++;
5589
0
  groupnumber = GET2(slot,0);
5590
0
  cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
5591
0
  if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
5592
0
  if (++i >= cb->names_found) break;
5593
0
  slot += cb->name_entry_size;
5594
0
  if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) != 0 ||
5595
0
    (slot+IMM2_SIZE)[length] != 0) break;
5596
0
  }
5597
5598
0
*countptr = count;
5599
0
return TRUE;
5600
0
}
5601
5602
5603
5604
/*************************************************
5605
*           Compile one branch                   *
5606
*************************************************/
5607
5608
/* Scan the parsed pattern, compiling it into a vector of PCRE2_UCHAR. If
5609
the options are changed during the branch, the pointer is used to change the
5610
external options bits. This function is used during the pre-compile phase when
5611
we are trying to find out the amount of memory needed, as well as during the
5612
real compile phase. The value of lengthptr distinguishes the two phases.
5613
5614
Arguments:
5615
  optionsptr        pointer to the option bits
5616
  xoptionsptr       pointer to the extra option bits
5617
  codeptr           points to the pointer to the current code point
5618
  pptrptr           points to the current parsed pattern pointer
5619
  errorcodeptr      points to error code variable
5620
  firstcuptr        place to put the first required code unit
5621
  firstcuflagsptr   place to put the first code unit flags
5622
  reqcuptr          place to put the last required code unit
5623
  reqcuflagsptr     place to put the last required code unit flags
5624
  bcptr             points to current branch chain
5625
  open_caps         points to current capitem
5626
  cb                contains pointers to tables etc.
5627
  lengthptr         NULL during the real compile phase
5628
                    points to length accumulator during pre-compile phase
5629
5630
Returns:            0 There's been an error, *errorcodeptr is non-zero
5631
                   +1 Success, this branch must match at least one character
5632
                   -1 Success, this branch may match an empty string
5633
*/
5634
5635
static int
5636
compile_branch(uint32_t *optionsptr, uint32_t *xoptionsptr,
5637
  PCRE2_UCHAR **codeptr, uint32_t **pptrptr, int *errorcodeptr,
5638
  uint32_t *firstcuptr, uint32_t *firstcuflagsptr, uint32_t *reqcuptr,
5639
  uint32_t *reqcuflagsptr, branch_chain *bcptr, open_capitem *open_caps,
5640
  compile_block *cb, PCRE2_SIZE *lengthptr)
5641
18.5k
{
5642
18.5k
int bravalue = 0;
5643
18.5k
int okreturn = -1;
5644
18.5k
int group_return = 0;
5645
18.5k
uint32_t repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
5646
18.5k
uint32_t greedy_default, greedy_non_default;
5647
18.5k
uint32_t repeat_type, op_type;
5648
18.5k
uint32_t options = *optionsptr;               /* May change dynamically */
5649
18.5k
uint32_t xoptions = *xoptionsptr;             /* May change dynamically */
5650
18.5k
uint32_t firstcu, reqcu;
5651
18.5k
uint32_t zeroreqcu, zerofirstcu;
5652
18.5k
uint32_t escape;
5653
18.5k
uint32_t *pptr = *pptrptr;
5654
18.5k
uint32_t meta, meta_arg;
5655
18.5k
uint32_t firstcuflags, reqcuflags;
5656
18.5k
uint32_t zeroreqcuflags, zerofirstcuflags;
5657
18.5k
uint32_t req_caseopt, reqvary, tempreqvary;
5658
18.5k
PCRE2_SIZE offset = 0;
5659
18.5k
PCRE2_SIZE length_prevgroup = 0;
5660
18.5k
PCRE2_UCHAR *code = *codeptr;
5661
18.5k
PCRE2_UCHAR *last_code = code;
5662
18.5k
PCRE2_UCHAR *orig_code = code;
5663
18.5k
PCRE2_UCHAR *tempcode;
5664
18.5k
PCRE2_UCHAR *previous = NULL;
5665
18.5k
PCRE2_UCHAR op_previous;
5666
18.5k
BOOL groupsetfirstcu = FALSE;
5667
18.5k
BOOL had_accept = FALSE;
5668
18.5k
BOOL matched_char = FALSE;
5669
18.5k
BOOL previous_matched_char = FALSE;
5670
18.5k
BOOL reset_caseful = FALSE;
5671
18.5k
const uint8_t *cbits = cb->cbits;
5672
18.5k
uint8_t classbits[32];
5673
5674
/* We can fish out the UTF setting once and for all into a BOOL, but we must
5675
not do this for other options (e.g. PCRE2_EXTENDED) that may change dynamically
5676
as we process the pattern. */
5677
5678
18.5k
#ifdef SUPPORT_UNICODE
5679
18.5k
BOOL utf = (options & PCRE2_UTF) != 0;
5680
18.5k
BOOL ucp = (options & PCRE2_UCP) != 0;
5681
#else  /* No Unicode support */
5682
BOOL utf = FALSE;
5683
#endif
5684
5685
/* Helper variables for OP_XCLASS opcode (for characters > 255). We define
5686
class_uchardata always so that it can be passed to add_to_class() always,
5687
though it will not be used in non-UTF 8-bit cases. This avoids having to supply
5688
alternative calls for the different cases. */
5689
5690
18.5k
PCRE2_UCHAR *class_uchardata;
5691
18.5k
#ifdef SUPPORT_WIDE_CHARS
5692
18.5k
BOOL xclass;
5693
18.5k
PCRE2_UCHAR *class_uchardata_base;
5694
18.5k
#endif
5695
5696
/* Set up the default and non-default settings for greediness */
5697
5698
18.5k
greedy_default = ((options & PCRE2_UNGREEDY) != 0);
5699
18.5k
greedy_non_default = greedy_default ^ 1;
5700
5701
/* Initialize no first unit, no required unit. REQ_UNSET means "no char
5702
matching encountered yet". It gets changed to REQ_NONE if we hit something that
5703
matches a non-fixed first unit; reqcu just remains unset if we never find one.
5704
5705
When we hit a repeat whose minimum is zero, we may have to adjust these values
5706
to take the zero repeat into account. This is implemented by setting them to
5707
zerofirstcu and zeroreqcu when such a repeat is encountered. The individual
5708
item types that can be repeated set these backoff variables appropriately. */
5709
5710
18.5k
firstcu = reqcu = zerofirstcu = zeroreqcu = 0;
5711
18.5k
firstcuflags = reqcuflags = zerofirstcuflags = zeroreqcuflags = REQ_UNSET;
5712
5713
/* The variable req_caseopt contains either the REQ_CASELESS bit or zero,
5714
according to the current setting of the caseless flag. The REQ_CASELESS value
5715
leaves the lower 28 bits empty. It is added into the firstcu or reqcu variables
5716
to record the case status of the value. This is used only for ASCII characters.
5717
*/
5718
5719
18.5k
req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
5720
5721
/* Switch on next META item until the end of the branch */
5722
5723
417k
for (;; pptr++)
5724
436k
  {
5725
436k
#ifdef SUPPORT_WIDE_CHARS
5726
436k
  BOOL xclass_has_prop;
5727
436k
#endif
5728
436k
  BOOL negate_class;
5729
436k
  BOOL should_flip_negation;
5730
436k
  BOOL match_all_or_no_wide_chars;
5731
436k
  BOOL possessive_quantifier;
5732
436k
  BOOL note_group_empty;
5733
436k
  int class_has_8bitchar;
5734
436k
  uint32_t mclength;
5735
436k
  uint32_t skipunits;
5736
436k
  uint32_t subreqcu, subfirstcu;
5737
436k
  uint32_t groupnumber;
5738
436k
  uint32_t verbarglen, verbculen;
5739
436k
  uint32_t subreqcuflags, subfirstcuflags;
5740
436k
  open_capitem *oc;
5741
436k
  PCRE2_UCHAR mcbuffer[8];
5742
5743
  /* Get next META item in the pattern and its potential argument. */
5744
5745
436k
  meta = META_CODE(*pptr);
5746
436k
  meta_arg = META_DATA(*pptr);
5747
5748
  /* If we are in the pre-compile phase, accumulate the length used for the
5749
  previous cycle of this loop, unless the next item is a quantifier. */
5750
5751
436k
  if (lengthptr != NULL)
5752
227k
    {
5753
227k
    if (code > cb->start_workspace + cb->workspace_size -
5754
227k
        WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
5755
0
      {
5756
0
      *errorcodeptr = (code >= cb->start_workspace + cb->workspace_size)?
5757
0
        ERR52 : ERR86;
5758
0
      return 0;
5759
0
      }
5760
5761
    /* There is at least one situation where code goes backwards: this is the
5762
    case of a zero quantifier after a class (e.g. [ab]{0}). When the quantifier
5763
    is processed, the whole class is eliminated. However, it is created first,
5764
    so we have to allow memory for it. Therefore, don't ever reduce the length
5765
    at this point. */
5766
5767
227k
    if (code < last_code) code = last_code;
5768
5769
    /* If the next thing is not a quantifier, we add the length of the previous
5770
    item into the total, and reset the code pointer to the start of the
5771
    workspace. Otherwise leave the previous item available to be quantified. */
5772
5773
227k
    if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
5774
207k
      {
5775
207k
      if (OFLOW_MAX - *lengthptr < (PCRE2_SIZE)(code - orig_code))
5776
0
        {
5777
0
        *errorcodeptr = ERR20;   /* Integer overflow */
5778
0
        return 0;
5779
0
        }
5780
207k
      *lengthptr += (PCRE2_SIZE)(code - orig_code);
5781
207k
      if (*lengthptr > MAX_PATTERN_SIZE)
5782
0
        {
5783
0
        *errorcodeptr = ERR20;   /* Pattern is too large */
5784
0
        return 0;
5785
0
        }
5786
207k
      code = orig_code;
5787
207k
      }
5788
5789
    /* Remember where this code item starts so we can catch the "backwards"
5790
    case above next time round. */
5791
5792
227k
    last_code = code;
5793
227k
    }
5794
5795
  /* Process the next parsed pattern item. If it is not a quantifier, remember
5796
  where it starts so that it can be quantified when a quantifier follows.
5797
  Checking for the legality of quantifiers happens in parse_regex(), except for
5798
  a quantifier after an assertion that is a condition. */
5799
5800
436k
  if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
5801
396k
    {
5802
396k
    previous = code;
5803
396k
    if (matched_char && !had_accept) okreturn = 1;
5804
396k
    }
5805
5806
436k
  previous_matched_char = matched_char;
5807
436k
  matched_char = FALSE;
5808
436k
  note_group_empty = FALSE;
5809
436k
  skipunits = 0;         /* Default value for most subgroups */
5810
5811
436k
  switch(meta)
5812
436k
    {
5813
    /* ===================================================================*/
5814
    /* The branch terminates at pattern end or | or ) */
5815
5816
1.92k
    case META_END:
5817
16.0k
    case META_ALT:
5818
18.4k
    case META_KET:
5819
18.4k
    *firstcuptr = firstcu;
5820
18.4k
    *firstcuflagsptr = firstcuflags;
5821
18.4k
    *reqcuptr = reqcu;
5822
18.4k
    *reqcuflagsptr = reqcuflags;
5823
18.4k
    *codeptr = code;
5824
18.4k
    *pptrptr = pptr;
5825
18.4k
    return okreturn;
5826
5827
5828
    /* ===================================================================*/
5829
    /* Handle single-character metacharacters. In multiline mode, ^ disables
5830
    the setting of any following char as a first character. */
5831
5832
4.15k
    case META_CIRCUMFLEX:
5833
4.15k
    if ((options & PCRE2_MULTILINE) != 0)
5834
390
      {
5835
390
      if (firstcuflags == REQ_UNSET)
5836
12
        zerofirstcuflags = firstcuflags = REQ_NONE;
5837
390
      *code++ = OP_CIRCM;
5838
390
      }
5839
3.76k
    else *code++ = OP_CIRC;
5840
4.15k
    break;
5841
5842
789
    case META_DOLLAR:
5843
789
    *code++ = ((options & PCRE2_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
5844
789
    break;
5845
5846
    /* There can never be a first char if '.' is first, whatever happens about
5847
    repeats. The value of reqcu doesn't change either. */
5848
5849
6.85k
    case META_DOT:
5850
6.85k
    matched_char = TRUE;
5851
6.85k
    if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5852
6.85k
    zerofirstcu = firstcu;
5853
6.85k
    zerofirstcuflags = firstcuflags;
5854
6.85k
    zeroreqcu = reqcu;
5855
6.85k
    zeroreqcuflags = reqcuflags;
5856
6.85k
    *code++ = ((options & PCRE2_DOTALL) != 0)? OP_ALLANY: OP_ANY;
5857
6.85k
    break;
5858
5859
5860
    /* ===================================================================*/
5861
    /* Empty character classes are allowed if PCRE2_ALLOW_EMPTY_CLASS is set.
5862
    Otherwise, an initial ']' is taken as a data character. When empty classes
5863
    are allowed, [] must always fail, so generate OP_FAIL, whereas [^] must
5864
    match any character, so generate OP_ALLANY. */
5865
5866
0
    case META_CLASS_EMPTY:
5867
0
    case META_CLASS_EMPTY_NOT:
5868
0
    matched_char = TRUE;
5869
0
    *code++ = (meta == META_CLASS_EMPTY_NOT)? OP_ALLANY : OP_FAIL;
5870
0
    if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5871
0
    zerofirstcu = firstcu;
5872
0
    zerofirstcuflags = firstcuflags;
5873
0
    break;
5874
5875
5876
    /* ===================================================================*/
5877
    /* Non-empty character class. If the included characters are all < 256, we
5878
    build a 32-byte bitmap of the permitted characters, except in the special
5879
    case where there is only one such character. For negated classes, we build
5880
    the map as usual, then invert it at the end. However, we use a different
5881
    opcode so that data characters > 255 can be handled correctly.
5882
5883
    If the class contains characters outside the 0-255 range, a different
5884
    opcode is compiled. It may optionally have a bit map for characters < 256,
5885
    but those above are explicitly listed afterwards. A flag code unit tells
5886
    whether the bitmap is present, and whether this is a negated class or
5887
    not. */
5888
5889
3.07k
    case META_CLASS_NOT:
5890
8.53k
    case META_CLASS:
5891
8.53k
    matched_char = TRUE;
5892
8.53k
    negate_class = meta == META_CLASS_NOT;
5893
5894
    /* We can optimize the case of a single character in a class by generating
5895
    OP_CHAR or OP_CHARI if it's positive, or OP_NOT or OP_NOTI if it's
5896
    negative. In the negative case there can be no first char if this item is
5897
    first, whatever repeat count may follow. In the case of reqcu, save the
5898
    previous value for reinstating. */
5899
5900
    /* NOTE: at present this optimization is not effective if the only
5901
    character in a class in 32-bit, non-UCP mode has its top bit set. */
5902
5903
8.53k
    if (pptr[1] < META_END && pptr[2] == META_CLASS_END)
5904
1.09k
      {
5905
1.09k
#ifdef SUPPORT_UNICODE
5906
1.09k
      uint32_t d;
5907
1.09k
#endif
5908
1.09k
      uint32_t c = pptr[1];
5909
5910
1.09k
      pptr += 2;                 /* Move on to class end */
5911
1.09k
      if (meta == META_CLASS)    /* A positive one-char class can be */
5912
176
        {                        /* handled as a normal literal character. */
5913
176
        meta = c;                /* Set up the character */
5914
176
        goto NORMAL_CHAR_SET;
5915
176
        }
5916
5917
      /* Handle a negative one-character class */
5918
5919
922
      zeroreqcu = reqcu;
5920
922
      zeroreqcuflags = reqcuflags;
5921
922
      if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5922
922
      zerofirstcu = firstcu;
5923
922
      zerofirstcuflags = firstcuflags;
5924
5925
      /* For caseless UTF or UCP mode, check whether this character has more
5926
      than one other case. If so, generate a special OP_NOTPROP item instead of
5927
      OP_NOTI. When restricted by PCRE2_EXTRA_CASELESS_RESTRICT, ignore any
5928
      caseless set that starts with an ASCII character. */
5929
5930
922
#ifdef SUPPORT_UNICODE
5931
922
      if ((utf||ucp) && (options & PCRE2_CASELESS) != 0 &&
5932
216
          (d = UCD_CASESET(c)) != 0 &&
5933
12
          ((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) == 0 ||
5934
0
          PRIV(ucd_caseless_sets)[d] > 127))
5935
12
        {
5936
12
        *code++ = OP_NOTPROP;
5937
12
        *code++ = PT_CLIST;
5938
12
        *code++ = d;
5939
12
        break;   /* We are finished with this class */
5940
12
        }
5941
910
#endif
5942
      /* Char has only one other (usable) case, or UCP not available */
5943
5944
910
      *code++ = ((options & PCRE2_CASELESS) != 0)? OP_NOTI: OP_NOT;
5945
910
      code += PUTCHAR(c, code);
5946
910
      break;   /* We are finished with this class */
5947
922
      }        /* End of 1-char optimization */
5948
5949
    /* Handle character classes that contain more than just one literal
5950
    character. If there are exactly two characters in a positive class, see if
5951
    they are case partners. This can be optimized to generate a caseless single
5952
    character match (which also sets first/required code units if relevant).
5953
    When casing restrictions apply, ignore a caseless set if both characters
5954
    are ASCII. */
5955
5956
7.44k
    if (meta == META_CLASS && pptr[1] < META_END && pptr[2] < META_END &&
5957
5.01k
        pptr[3] == META_CLASS_END)
5958
278
      {
5959
278
      uint32_t c = pptr[1];
5960
5961
278
#ifdef SUPPORT_UNICODE
5962
278
      if (UCD_CASESET(c) == 0 ||
5963
8
         ((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0 &&
5964
0
         c < 128 && pptr[2] < 128))
5965
270
#endif
5966
270
        {
5967
270
        uint32_t d;
5968
5969
270
#ifdef SUPPORT_UNICODE
5970
270
        if ((utf || ucp) && c > 127) d = UCD_OTHERCASE(c); else
5971
270
#endif
5972
270
          {
5973
#if PCRE2_CODE_UNIT_WIDTH != 8
5974
          if (c > 255) d = c; else
5975
#endif
5976
270
          d = TABLE_GET(c, cb->fcc, c);
5977
270
          }
5978
5979
270
        if (c != d && pptr[2] == d)
5980
0
          {
5981
0
          pptr += 3;                 /* Move on to class end */
5982
0
          meta = c;
5983
0
          if ((options & PCRE2_CASELESS) == 0)
5984
0
            {
5985
0
            reset_caseful = TRUE;
5986
0
            options |= PCRE2_CASELESS;
5987
0
            req_caseopt = REQ_CASELESS;
5988
0
            }
5989
0
          goto CLASS_CASELESS_CHAR;
5990
0
          }
5991
270
        }
5992
278
      }
5993
5994
    /* If a non-extended class contains a negative special such as \S, we need
5995
    to flip the negation flag at the end, so that support for characters > 255
5996
    works correctly (they are all included in the class). An extended class may
5997
    need to insert specific matching or non-matching code for wide characters.
5998
    */
5999
6000
7.44k
    should_flip_negation = match_all_or_no_wide_chars = FALSE;
6001
6002
    /* Extended class (xclass) will be used when characters > 255
6003
    might match. */
6004
6005
7.44k
#ifdef SUPPORT_WIDE_CHARS
6006
7.44k
    xclass = FALSE;
6007
7.44k
    class_uchardata = code + LINK_SIZE + 2;   /* For XCLASS items */
6008
7.44k
    class_uchardata_base = class_uchardata;   /* Save the start */
6009
7.44k
#endif
6010
6011
    /* For optimization purposes, we track some properties of the class:
6012
    class_has_8bitchar will be non-zero if the class contains at least one
6013
    character with a code point less than 256; xclass_has_prop will be TRUE if
6014
    Unicode property checks are present in the class. */
6015
6016
7.44k
    class_has_8bitchar = 0;
6017
7.44k
#ifdef SUPPORT_WIDE_CHARS
6018
7.44k
    xclass_has_prop = FALSE;
6019
7.44k
#endif
6020
6021
    /* Initialize the 256-bit (32-byte) bit map to all zeros. We build the map
6022
    in a temporary bit of memory, in case the class contains fewer than two
6023
    8-bit characters because in that case the compiled code doesn't use the bit
6024
    map. */
6025
6026
7.44k
    memset(classbits, 0, 32 * sizeof(uint8_t));
6027
6028
    /* Process items until META_CLASS_END is reached. */
6029
6030
170k
    while ((meta = *(++pptr)) != META_CLASS_END)
6031
162k
      {
6032
      /* Handle POSIX classes such as [:alpha:] etc. */
6033
6034
162k
      if (meta == META_POSIX || meta == META_POSIX_NEG)
6035
0
        {
6036
0
        BOOL local_negate = (meta == META_POSIX_NEG);
6037
0
        int posix_class = *(++pptr);
6038
0
        int taboffset, tabopt;
6039
0
        uint8_t pbits[32];
6040
6041
0
        should_flip_negation = local_negate;  /* Note negative special */
6042
6043
        /* If matching is caseless, upper and lower are converted to alpha.
6044
        This relies on the fact that the class table starts with alpha,
6045
        lower, upper as the first 3 entries. */
6046
6047
0
        if ((options & PCRE2_CASELESS) != 0 && posix_class <= 2)
6048
0
          posix_class = 0;
6049
6050
        /* When PCRE2_UCP is set, some of the POSIX classes are converted to
6051
        different escape sequences that use Unicode properties \p or \P.
6052
        Others that are not available via \p or \P have to generate
6053
        XCL_PROP/XCL_NOTPROP directly, which is done here. */
6054
6055
0
#ifdef SUPPORT_UNICODE
6056
0
        if ((options & PCRE2_UCP) != 0 &&
6057
0
            (xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0)
6058
0
          {
6059
0
          switch(posix_class)
6060
0
            {
6061
0
            case PC_GRAPH:
6062
0
            case PC_PRINT:
6063
0
            case PC_PUNCT:
6064
0
            *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
6065
0
            *class_uchardata++ = (PCRE2_UCHAR)
6066
0
              ((posix_class == PC_GRAPH)? PT_PXGRAPH :
6067
0
               (posix_class == PC_PRINT)? PT_PXPRINT : PT_PXPUNCT);
6068
0
            *class_uchardata++ = 0;
6069
0
            xclass_has_prop = TRUE;
6070
0
            goto CONTINUE_CLASS;
6071
6072
            /* For the other POSIX classes (ex: ascii) we are going to
6073
            fall through to the non-UCP case and build a bit map for
6074
            characters with code points less than 256. However, if we are in
6075
            a negated POSIX class, characters with code points greater than
6076
            255 must either all match or all not match, depending on whether
6077
            the whole class is not or is negated. For example, for
6078
            [[:^ascii:]... they must all match, whereas for [^[:^ascii:]...
6079
            they must not.
6080
6081
            In the special case where there are no xclass items, this is
6082
            automatically handled by the use of OP_CLASS or OP_NCLASS, but an
6083
            explicit range is needed for OP_XCLASS. Setting a flag here
6084
            causes the range to be generated later when it is known that
6085
            OP_XCLASS is required. In the 8-bit library this is relevant only in
6086
            utf mode, since no wide characters can exist otherwise. */
6087
6088
0
            default:
6089
0
#if PCRE2_CODE_UNIT_WIDTH == 8
6090
0
            if (utf)
6091
0
#endif
6092
0
            match_all_or_no_wide_chars |= local_negate;
6093
0
            break;
6094
0
            }
6095
0
          }
6096
0
#endif  /* SUPPORT_UNICODE */
6097
6098
        /* In the non-UCP case, or when UCP makes no difference, we build the
6099
        bit map for the POSIX class in a chunk of local store because we may
6100
        be adding and subtracting from it, and we don't want to subtract bits
6101
        that may be in the main map already. At the end we or the result into
6102
        the bit map that is being built. */
6103
6104
0
        posix_class *= 3;
6105
6106
        /* Copy in the first table (always present) */
6107
6108
0
        memcpy(pbits, cbits + posix_class_maps[posix_class],
6109
0
          32 * sizeof(uint8_t));
6110
6111
        /* If there is a second table, add or remove it as required. */
6112
6113
0
        taboffset = posix_class_maps[posix_class + 1];
6114
0
        tabopt = posix_class_maps[posix_class + 2];
6115
6116
0
        if (taboffset >= 0)
6117
0
          {
6118
0
          if (tabopt >= 0)
6119
0
            for (int i = 0; i < 32; i++) pbits[i] |= cbits[(int)i + taboffset];
6120
0
          else
6121
0
            for (int i = 0; i < 32; i++) pbits[i] &= ~cbits[(int)i + taboffset];
6122
0
          }
6123
6124
        /* Now see if we need to remove any special characters. An option
6125
        value of 1 removes vertical space and 2 removes underscore. */
6126
6127
0
        if (tabopt < 0) tabopt = -tabopt;
6128
0
        if (tabopt == 1) pbits[1] &= ~0x3c;
6129
0
          else if (tabopt == 2) pbits[11] &= 0x7f;
6130
6131
        /* Add the POSIX table or its complement into the main table that is
6132
        being built and we are done. */
6133
6134
0
        if (local_negate)
6135
0
          for (int i = 0; i < 32; i++) classbits[i] |= (uint8_t)(~pbits[i]);
6136
0
        else
6137
0
          for (int i = 0; i < 32; i++) classbits[i] |= pbits[i];
6138
6139
        /* Every class contains at least one < 256 character. */
6140
6141
0
        class_has_8bitchar = 1;
6142
0
        goto CONTINUE_CLASS;    /* End of POSIX handling */
6143
0
        }
6144
6145
      /* Other than POSIX classes, the only items we should encounter are
6146
      \d-type escapes and literal characters (possibly as ranges). */
6147
6148
162k
      if (meta == META_BIGVALUE)
6149
0
        {
6150
0
        meta = *(++pptr);
6151
0
        goto CLASS_LITERAL;
6152
0
        }
6153
6154
      /* Any other non-literal must be an escape */
6155
6156
162k
      if (meta >= META_END)
6157
1.63k
        {
6158
1.63k
        if (META_CODE(meta) != META_ESCAPE)
6159
0
          {
6160
#ifdef DEBUG_SHOW_PARSED
6161
          fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x "
6162
                          "in character class\n", meta);
6163
#endif
6164
0
          *errorcodeptr = ERR89;  /* Internal error - unrecognized. */
6165
0
          return 0;
6166
0
          }
6167
1.63k
        escape = META_DATA(meta);
6168
6169
        /* Every class contains at least one < 256 character. */
6170
6171
1.63k
        class_has_8bitchar++;
6172
6173
1.63k
        switch(escape)
6174
1.63k
          {
6175
452
          case ESC_d:
6176
14.9k
          for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_digit];
6177
452
          break;
6178
6179
34
          case ESC_D:
6180
34
          should_flip_negation = TRUE;
6181
1.12k
          for (int i = 0; i < 32; i++)
6182
1.08k
            classbits[i] |= (uint8_t)(~cbits[i+cbit_digit]);
6183
34
          break;
6184
6185
162
          case ESC_w:
6186
5.34k
          for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_word];
6187
162
          break;
6188
6189
567
          case ESC_W:
6190
567
          should_flip_negation = TRUE;
6191
18.7k
          for (int i = 0; i < 32; i++)
6192
18.1k
            classbits[i] |= (uint8_t)(~cbits[i+cbit_word]);
6193
567
          break;
6194
6195
          /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
6196
          5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
6197
          previously set by something earlier in the character class.
6198
          Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
6199
          we could just adjust the appropriate bit. From PCRE 8.34 we no
6200
          longer treat \s and \S specially. */
6201
6202
4
          case ESC_s:
6203
132
          for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_space];
6204
4
          break;
6205
6206
12
          case ESC_S:
6207
12
          should_flip_negation = TRUE;
6208
396
          for (int i = 0; i < 32; i++)
6209
384
            classbits[i] |= (uint8_t)(~cbits[i+cbit_space]);
6210
12
          break;
6211
6212
          /* When adding the horizontal or vertical space lists to a class, or
6213
          their complements, disable PCRE2_CASELESS, because it just wastes
6214
          time, and in the "not-x" UTF cases can create unwanted duplicates in
6215
          the XCLASS list (provoked by characters that have more than one other
6216
          case and by both cases being in the same "not-x" sublist). */
6217
6218
4
          case ESC_h:
6219
4
          (void)add_list_to_class(classbits, &class_uchardata,
6220
4
            options & ~PCRE2_CASELESS, xoptions, cb, PRIV(hspace_list),
6221
4
              NOTACHAR);
6222
4
          break;
6223
6224
8
          case ESC_H:
6225
8
          (void)add_not_list_to_class(classbits, &class_uchardata,
6226
8
            options & ~PCRE2_CASELESS, xoptions, cb, PRIV(hspace_list));
6227
8
          break;
6228
6229
0
          case ESC_v:
6230
0
          (void)add_list_to_class(classbits, &class_uchardata,
6231
0
            options & ~PCRE2_CASELESS, xoptions, cb, PRIV(vspace_list),
6232
0
              NOTACHAR);
6233
0
          break;
6234
6235
108
          case ESC_V:
6236
108
          (void)add_not_list_to_class(classbits, &class_uchardata,
6237
108
            options & ~PCRE2_CASELESS, xoptions, cb, PRIV(vspace_list));
6238
108
          break;
6239
6240
          /* If Unicode is not supported, \P and \p are not allowed and are
6241
          faulted at parse time, so will never appear here. */
6242
6243
0
#ifdef SUPPORT_UNICODE
6244
170
          case ESC_p:
6245
284
          case ESC_P:
6246
284
            {
6247
284
            uint32_t ptype = *(++pptr) >> 16;
6248
284
            uint32_t pdata = *pptr & 0xffff;
6249
284
            *class_uchardata++ = (escape == ESC_p)? XCL_PROP : XCL_NOTPROP;
6250
284
            *class_uchardata++ = ptype;
6251
284
            *class_uchardata++ = pdata;
6252
284
            xclass_has_prop = TRUE;
6253
284
            class_has_8bitchar--;                /* Undo! */
6254
284
            }
6255
284
          break;
6256
1.63k
#endif
6257
1.63k
          }
6258
6259
1.63k
        goto CONTINUE_CLASS;
6260
1.63k
        }  /* End handling \d-type escapes */
6261
6262
      /* A literal character may be followed by a range meta. At parse time
6263
      there are checks for out-of-order characters, for ranges where the two
6264
      characters are equal, and for hyphens that cannot indicate a range. At
6265
      this point, therefore, no checking is needed. */
6266
6267
161k
      else
6268
161k
        {
6269
161k
        uint32_t c, d;
6270
6271
161k
        CLASS_LITERAL:
6272
161k
        c = d = meta;
6273
6274
        /* Remember if \r or \n were explicitly used */
6275
6276
161k
        if (c == CHAR_CR || c == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
6277
6278
        /* Process a character range */
6279
6280
161k
        if (pptr[1] == META_RANGE_LITERAL || pptr[1] == META_RANGE_ESCAPED)
6281
964
          {
6282
#ifdef EBCDIC
6283
          BOOL range_is_literal = (pptr[1] == META_RANGE_LITERAL);
6284
#endif
6285
964
          pptr += 2;
6286
964
          d = *pptr;
6287
964
          if (d == META_BIGVALUE) d = *(++pptr);
6288
6289
          /* Remember an explicit \r or \n, and add the range to the class. */
6290
6291
964
          if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
6292
6293
          /* In an EBCDIC environment, Perl treats alphabetic ranges specially
6294
          because there are holes in the encoding, and simply using the range
6295
          A-Z (for example) would include the characters in the holes. This
6296
          applies only to literal ranges; [\xC1-\xE9] is different to [A-Z]. */
6297
6298
#ifdef EBCDIC
6299
          if (range_is_literal &&
6300
               (cb->ctypes[c] & ctype_letter) != 0 &&
6301
               (cb->ctypes[d] & ctype_letter) != 0 &&
6302
               (c <= CHAR_z) == (d <= CHAR_z))
6303
            {
6304
            uint32_t uc = (d <= CHAR_z)? 0 : 64;
6305
            uint32_t C = c - uc;
6306
            uint32_t D = d - uc;
6307
6308
            if (C <= CHAR_i)
6309
              {
6310
              class_has_8bitchar +=
6311
                add_to_class(classbits, &class_uchardata, options, xoptions,
6312
                  cb, C + uc, ((D < CHAR_i)? D : CHAR_i) + uc);
6313
              C = CHAR_j;
6314
              }
6315
6316
            if (C <= D && C <= CHAR_r)
6317
              {
6318
              class_has_8bitchar +=
6319
                add_to_class(classbits, &class_uchardata, options, xoptions,
6320
                  cb, C + uc, ((D < CHAR_r)? D : CHAR_r) + uc);
6321
              C = CHAR_s;
6322
              }
6323
6324
            if (C <= D)
6325
              {
6326
              class_has_8bitchar +=
6327
                add_to_class(classbits, &class_uchardata, options, xoptions,
6328
                  cb, C + uc, D + uc);
6329
              }
6330
            }
6331
          else
6332
#endif
6333
          /* Not an EBCDIC special range */
6334
6335
964
          class_has_8bitchar += add_to_class(classbits, &class_uchardata,
6336
964
            options, xoptions, cb, c, d);
6337
964
          goto CONTINUE_CLASS;   /* Go get the next char in the class */
6338
964
          }  /* End of range handling */
6339
6340
6341
        /* Handle a single character. */
6342
6343
160k
        class_has_8bitchar +=
6344
160k
          add_to_class(classbits, &class_uchardata, options, xoptions, cb,
6345
160k
            meta, meta);
6346
160k
        }
6347
6348
      /* Continue to the next item in the class. */
6349
6350
162k
      CONTINUE_CLASS:
6351
6352
162k
#ifdef SUPPORT_WIDE_CHARS
6353
      /* If any wide characters or Unicode properties have been encountered,
6354
      set xclass = TRUE. Then, in the pre-compile phase, accumulate the length
6355
      of the extra data and reset the pointer. This is so that very large
6356
      classes that contain a zillion wide characters or Unicode property tests
6357
      do not overwrite the workspace (which is on the stack). */
6358
6359
162k
      if (class_uchardata > class_uchardata_base)
6360
7.63k
        {
6361
7.63k
        xclass = TRUE;
6362
7.63k
        if (lengthptr != NULL)
6363
787
          {
6364
787
          *lengthptr += class_uchardata - class_uchardata_base;
6365
787
          class_uchardata = class_uchardata_base;
6366
787
          }
6367
7.63k
        }
6368
162k
#endif
6369
6370
162k
      continue;  /* Needed to avoid error when not supporting wide chars */
6371
162k
      }   /* End of main class-processing loop */
6372
6373
    /* If this class is the first thing in the branch, there can be no first
6374
    char setting, whatever the repeat count. Any reqcu setting must remain
6375
    unchanged after any kind of repeat. */
6376
6377
7.44k
    if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6378
7.44k
    zerofirstcu = firstcu;
6379
7.44k
    zerofirstcuflags = firstcuflags;
6380
7.44k
    zeroreqcu = reqcu;
6381
7.44k
    zeroreqcuflags = reqcuflags;
6382
6383
    /* If there are characters with values > 255, or Unicode property settings
6384
    (\p or \P), we have to compile an extended class, with its own opcode,
6385
    unless there were no property settings and there was a negated special such
6386
    as \S in the class, and PCRE2_UCP is not set, because in that case all
6387
    characters > 255 are in or not in the class, so any that were explicitly
6388
    given as well can be ignored.
6389
6390
    In the UCP case, if certain negated POSIX classes (ex: [:^ascii:]) were
6391
    present in a class, we either have to match or not match all wide
6392
    characters (depending on whether the whole class is or is not negated).
6393
    This requirement is indicated by match_all_or_no_wide_chars being true.
6394
    We do this by including an explicit range, which works in both cases.
6395
    This applies only in UTF and 16-bit and 32-bit non-UTF modes, since there
6396
    cannot be any wide characters in 8-bit non-UTF mode.
6397
6398
    When there *are* properties in a positive UTF-8 or any 16-bit or 32-bit
6399
    class where \S etc is present without PCRE2_UCP, causing an extended class
6400
    to be compiled, we make sure that all characters > 255 are included by
6401
    forcing match_all_or_no_wide_chars to be true.
6402
6403
    If, when generating an xclass, there are no characters < 256, we can omit
6404
    the bitmap in the actual compiled code. */
6405
6406
7.44k
#ifdef SUPPORT_WIDE_CHARS  /* Defined for 16/32 bits, or 8-bit with Unicode */
6407
7.44k
    if (xclass && (
6408
1.01k
#ifdef SUPPORT_UNICODE
6409
1.01k
        (options & PCRE2_UCP) != 0 ||
6410
124
#endif
6411
124
        xclass_has_prop || !should_flip_negation))
6412
1.01k
      {
6413
1.01k
      if (match_all_or_no_wide_chars || (
6414
1.01k
#if PCRE2_CODE_UNIT_WIDTH == 8
6415
1.01k
           utf &&
6416
886
#endif
6417
886
           should_flip_negation && !negate_class && (options & PCRE2_UCP) == 0))
6418
0
        {
6419
0
        *class_uchardata++ = XCL_RANGE;
6420
0
        if (utf)   /* Will always be utf in the 8-bit library */
6421
0
          {
6422
0
          class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
6423
0
          class_uchardata += PRIV(ord2utf)(MAX_UTF_CODE_POINT, class_uchardata);
6424
0
          }
6425
0
        else       /* Can only happen for the 16-bit & 32-bit libraries */
6426
0
          {
6427
#if PCRE2_CODE_UNIT_WIDTH == 16
6428
          *class_uchardata++ = 0x100;
6429
          *class_uchardata++ = 0xffffu;
6430
#elif PCRE2_CODE_UNIT_WIDTH == 32
6431
          *class_uchardata++ = 0x100;
6432
          *class_uchardata++ = 0xffffffffu;
6433
#endif
6434
0
          }
6435
0
        }
6436
1.01k
      *class_uchardata++ = XCL_END;    /* Marks the end of extra data */
6437
1.01k
      *code++ = OP_XCLASS;
6438
1.01k
      code += LINK_SIZE;
6439
1.01k
      *code = negate_class? XCL_NOT:0;
6440
1.01k
      if (xclass_has_prop) *code |= XCL_HASPROP;
6441
6442
      /* If the map is required, move up the extra data to make room for it;
6443
      otherwise just move the code pointer to the end of the extra data. */
6444
6445
1.01k
      if (class_has_8bitchar > 0)
6446
1.01k
        {
6447
1.01k
        *code++ |= XCL_MAP;
6448
1.01k
        (void)memmove(code + (32 / sizeof(PCRE2_UCHAR)), code,
6449
1.01k
          CU2BYTES(class_uchardata - code));
6450
1.01k
        if (negate_class && !xclass_has_prop)
6451
498
          {
6452
          /* Using 255 ^ instead of ~ avoids clang sanitize warning. */
6453
16.4k
          for (int i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i];
6454
498
          }
6455
1.01k
        memcpy(code, classbits, 32);
6456
1.01k
        code = class_uchardata + (32 / sizeof(PCRE2_UCHAR));
6457
1.01k
        }
6458
0
      else code = class_uchardata;
6459
6460
      /* Now fill in the complete length of the item */
6461
6462
1.01k
      PUT(previous, 1, (int)(code - previous));
6463
1.01k
      break;   /* End of class handling */
6464
1.01k
      }
6465
6.43k
#endif  /* SUPPORT_WIDE_CHARS */
6466
6467
    /* If there are no characters > 255, or they are all to be included or
6468
    excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
6469
    whole class was negated and whether there were negative specials such as \S
6470
    (non-UCP) in the class. Then copy the 32-byte map into the code vector,
6471
    negating it if necessary. */
6472
6473
6.43k
    *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
6474
6.43k
    if (lengthptr == NULL)    /* Save time in the pre-compile phase */
6475
3.21k
      {
6476
3.21k
      if (negate_class)
6477
777
        {
6478
       /* Using 255 ^ instead of ~ avoids clang sanitize warning. */
6479
25.6k
       for (int i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i];
6480
777
       }
6481
3.21k
      memcpy(code, classbits, 32);
6482
3.21k
      }
6483
6.43k
    code += 32 / sizeof(PCRE2_UCHAR);
6484
6.43k
    break;  /* End of class processing */
6485
6486
6487
    /* ===================================================================*/
6488
    /* Deal with (*VERB)s. */
6489
6490
    /* Check for open captures before ACCEPT and close those that are within
6491
    the same assertion level, also converting ACCEPT to ASSERT_ACCEPT in an
6492
    assertion. In the first pass, just accumulate the length required;
6493
    otherwise hitting (*ACCEPT) inside many nested parentheses can cause
6494
    workspace overflow. Do not set firstcu after *ACCEPT. */
6495
6496
0
    case META_ACCEPT:
6497
0
    cb->had_accept = had_accept = TRUE;
6498
0
    for (oc = open_caps;
6499
0
         oc != NULL && oc->assert_depth >= cb->assert_depth;
6500
0
         oc = oc->next)
6501
0
      {
6502
0
      if (lengthptr != NULL)
6503
0
        {
6504
0
        *lengthptr += CU2BYTES(1) + IMM2_SIZE;
6505
0
        }
6506
0
      else
6507
0
        {
6508
0
        *code++ = OP_CLOSE;
6509
0
        PUT2INC(code, 0, oc->number);
6510
0
        }
6511
0
      }
6512
0
    *code++ = (cb->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
6513
0
    if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6514
0
    break;
6515
6516
0
    case META_PRUNE:
6517
0
    case META_SKIP:
6518
0
    cb->had_pruneorskip = TRUE;
6519
    /* Fall through */
6520
0
    case META_COMMIT:
6521
0
    case META_FAIL:
6522
0
    *code++ = verbops[(meta - META_MARK) >> 16];
6523
0
    break;
6524
6525
0
    case META_THEN:
6526
0
    cb->external_flags |= PCRE2_HASTHEN;
6527
0
    *code++ = OP_THEN;
6528
0
    break;
6529
6530
    /* Handle verbs with arguments. Arguments can be very long, especially in
6531
    16- and 32-bit modes, and can overflow the workspace in the first pass.
6532
    However, the argument length is constrained to be small enough to fit in
6533
    one code unit. This check happens in parse_regex(). In the first pass,
6534
    instead of putting the argument into memory, we just update the length
6535
    counter and set up an empty argument. */
6536
6537
0
    case META_THEN_ARG:
6538
0
    cb->external_flags |= PCRE2_HASTHEN;
6539
0
    goto VERB_ARG;
6540
6541
0
    case META_PRUNE_ARG:
6542
0
    case META_SKIP_ARG:
6543
0
    cb->had_pruneorskip = TRUE;
6544
    /* Fall through */
6545
0
    case META_MARK:
6546
0
    case META_COMMIT_ARG:
6547
0
    VERB_ARG:
6548
0
    *code++ = verbops[(meta - META_MARK) >> 16];
6549
    /* The length is in characters. */
6550
0
    verbarglen = *(++pptr);
6551
0
    verbculen = 0;
6552
0
    tempcode = code++;
6553
0
    for (int i = 0; i < (int)verbarglen; i++)
6554
0
      {
6555
0
      meta = *(++pptr);
6556
0
#ifdef SUPPORT_UNICODE
6557
0
      if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
6558
0
#endif
6559
0
        {
6560
0
        mclength = 1;
6561
0
        mcbuffer[0] = meta;
6562
0
        }
6563
0
      if (lengthptr != NULL) *lengthptr += mclength; else
6564
0
        {
6565
0
        memcpy(code, mcbuffer, CU2BYTES(mclength));
6566
0
        code += mclength;
6567
0
        verbculen += mclength;
6568
0
        }
6569
0
      }
6570
6571
0
    *tempcode = verbculen;   /* Fill in the code unit length */
6572
0
    *code++ = 0;             /* Terminating zero */
6573
0
    break;
6574
6575
6576
    /* ===================================================================*/
6577
    /* Handle options change. The new setting must be passed back for use in
6578
    subsequent branches. Reset the greedy defaults and the case value for
6579
    firstcu and reqcu. */
6580
6581
0
    case META_OPTIONS:
6582
0
    *optionsptr = options = *(++pptr);
6583
0
    *xoptionsptr = xoptions = *(++pptr);
6584
0
    greedy_default = ((options & PCRE2_UNGREEDY) != 0);
6585
0
    greedy_non_default = greedy_default ^ 1;
6586
0
    req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
6587
0
    break;
6588
6589
6590
    /* ===================================================================*/
6591
    /* Handle conditional subpatterns. The case of (?(Rdigits) is ambiguous
6592
    because it could be a numerical check on recursion, or a name check on a
6593
    group's being set. The pre-pass sets up META_COND_RNUMBER as a name so that
6594
    we can handle it either way. We first try for a name; if not found, process
6595
    the number. */
6596
6597
0
    case META_COND_RNUMBER:   /* (?(Rdigits) */
6598
0
    case META_COND_NAME:      /* (?(name) or (?'name') or ?(<name>) */
6599
0
    case META_COND_RNAME:     /* (?(R&name) - test for recursion */
6600
0
    bravalue = OP_COND;
6601
0
      {
6602
0
      int count, index;
6603
0
      unsigned int i;
6604
0
      PCRE2_SPTR name;
6605
0
      named_group *ng = cb->named_groups;
6606
0
      uint32_t length = *(++pptr);
6607
6608
0
      GETPLUSOFFSET(offset, pptr);
6609
0
      name = cb->start_pattern + offset;
6610
6611
      /* In the first pass, the names generated in the pre-pass are available,
6612
      but the main name table has not yet been created. Scan the list of names
6613
      generated in the pre-pass in order to get a number and whether or not
6614
      this name is duplicated. If it is not duplicated, we can handle it as a
6615
      numerical group. */
6616
6617
0
      for (i = 0; i < cb->names_found; i++, ng++)
6618
0
        {
6619
0
        if (length == ng->length &&
6620
0
            PRIV(strncmp)(name, ng->name, length) == 0)
6621
0
          {
6622
0
          if (!ng->isdup)
6623
0
            {
6624
0
            code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
6625
0
            PUT2(code, 2+LINK_SIZE, ng->number);
6626
0
            if (ng->number > cb->top_backref) cb->top_backref = ng->number;
6627
0
            skipunits = 1+IMM2_SIZE;
6628
0
            goto GROUP_PROCESS_NOTE_EMPTY;
6629
0
            }
6630
0
          break;  /* Found a duplicated name */
6631
0
          }
6632
0
        }
6633
6634
      /* If the name was not found we have a bad reference, unless we are
6635
      dealing with R<digits>, which is treated as a recursion test by number.
6636
      */
6637
6638
0
      if (i >= cb->names_found)
6639
0
        {
6640
0
        groupnumber = 0;
6641
0
        if (meta == META_COND_RNUMBER)
6642
0
          {
6643
0
          for (i = 1; i < length; i++)
6644
0
            {
6645
0
            groupnumber = groupnumber * 10 + name[i] - CHAR_0;
6646
0
            if (groupnumber > MAX_GROUP_NUMBER)
6647
0
              {
6648
0
              *errorcodeptr = ERR61;
6649
0
              cb->erroroffset = offset + i;
6650
0
              return 0;
6651
0
              }
6652
0
            }
6653
0
          }
6654
6655
0
        if (meta != META_COND_RNUMBER || groupnumber > cb->bracount)
6656
0
          {
6657
0
          *errorcodeptr = ERR15;
6658
0
          cb->erroroffset = offset;
6659
0
          return 0;
6660
0
          }
6661
6662
        /* (?(Rdigits) treated as a recursion test by number. A value of
6663
        zero (which is the result of both (?(R) and (?(R0)) means "any", and is
6664
        translated into RREF_ANY (which is 0xffff). */
6665
6666
0
        if (groupnumber == 0) groupnumber = RREF_ANY;
6667
0
        code[1+LINK_SIZE] = OP_RREF;
6668
0
        PUT2(code, 2+LINK_SIZE, groupnumber);
6669
0
        skipunits = 1+IMM2_SIZE;
6670
0
        goto GROUP_PROCESS_NOTE_EMPTY;
6671
0
        }
6672
6673
      /* A duplicated name was found. Note that if an R<digits> name is found
6674
      (META_COND_RNUMBER), it is a reference test, not a recursion test. */
6675
6676
0
      code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
6677
6678
      /* We have a duplicated name. In the compile pass we have to search the
6679
      main table in order to get the index and count values. */
6680
6681
0
      count = 0;  /* Values for first pass (avoids compiler warning) */
6682
0
      index = 0;
6683
0
      if (lengthptr == NULL && !find_dupname_details(name, length, &index,
6684
0
            &count, errorcodeptr, cb)) return 0;
6685
6686
      /* Add one to the opcode to change CREF/RREF into DNCREF/DNRREF and
6687
      insert appropriate data values. */
6688
6689
0
      code[1+LINK_SIZE]++;
6690
0
      skipunits = 1+2*IMM2_SIZE;
6691
0
      PUT2(code, 2+LINK_SIZE, index);
6692
0
      PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6693
0
      }
6694
0
    goto GROUP_PROCESS_NOTE_EMPTY;
6695
6696
    /* The DEFINE condition is always false. Its internal groups may never
6697
    be called, so matched_char must remain false, hence the jump to
6698
    GROUP_PROCESS rather than GROUP_PROCESS_NOTE_EMPTY. */
6699
6700
0
    case META_COND_DEFINE:
6701
0
    bravalue = OP_COND;
6702
0
    GETPLUSOFFSET(offset, pptr);
6703
0
    code[1+LINK_SIZE] = OP_DEFINE;
6704
0
    skipunits = 1;
6705
0
    goto GROUP_PROCESS;
6706
6707
    /* Conditional test of a group's being set. */
6708
6709
0
    case META_COND_NUMBER:
6710
0
    bravalue = OP_COND;
6711
0
    GETPLUSOFFSET(offset, pptr);
6712
0
    groupnumber = *(++pptr);
6713
0
    if (groupnumber > cb->bracount)
6714
0
      {
6715
0
      *errorcodeptr = ERR15;
6716
0
      cb->erroroffset = offset;
6717
0
      return 0;
6718
0
      }
6719
0
    if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
6720
0
    offset -= 2;   /* Point at initial ( for too many branches error */
6721
0
    code[1+LINK_SIZE] = OP_CREF;
6722
0
    skipunits = 1+IMM2_SIZE;
6723
0
    PUT2(code, 2+LINK_SIZE, groupnumber);
6724
0
    goto GROUP_PROCESS_NOTE_EMPTY;
6725
6726
    /* Test for the PCRE2 version. */
6727
6728
0
    case META_COND_VERSION:
6729
0
    bravalue = OP_COND;
6730
0
    if (pptr[1] > 0)
6731
0
      code[1+LINK_SIZE] = ((PCRE2_MAJOR > pptr[2]) ||
6732
0
        (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR >= pptr[3]))?
6733
0
          OP_TRUE : OP_FALSE;
6734
0
    else
6735
0
      code[1+LINK_SIZE] = (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR == pptr[3])?
6736
0
        OP_TRUE : OP_FALSE;
6737
0
    skipunits = 1;
6738
0
    pptr += 3;
6739
0
    goto GROUP_PROCESS_NOTE_EMPTY;
6740
6741
    /* The condition is an assertion, possibly preceded by a callout. */
6742
6743
0
    case META_COND_ASSERT:
6744
0
    bravalue = OP_COND;
6745
0
    goto GROUP_PROCESS_NOTE_EMPTY;
6746
6747
6748
    /* ===================================================================*/
6749
    /* Handle all kinds of nested bracketed groups. The non-capturing,
6750
    non-conditional cases are here; others come to GROUP_PROCESS via goto. */
6751
6752
4
    case META_LOOKAHEAD:
6753
4
    bravalue = OP_ASSERT;
6754
4
    cb->assert_depth += 1;
6755
4
    goto GROUP_PROCESS;
6756
6757
6
    case META_LOOKAHEAD_NA:
6758
6
    bravalue = OP_ASSERT_NA;
6759
6
    cb->assert_depth += 1;
6760
6
    goto GROUP_PROCESS;
6761
6762
    /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
6763
    thing to do, but Perl allows all assertions to be quantified, and when
6764
    they contain capturing parentheses there may be a potential use for
6765
    this feature. Not that that applies to a quantified (?!) but we allow
6766
    it for uniformity. */
6767
6768
0
    case META_LOOKAHEADNOT:
6769
0
    if (pptr[1] == META_KET &&
6770
0
         (pptr[2] < META_ASTERISK || pptr[2] > META_MINMAX_QUERY))
6771
0
      {
6772
0
      *code++ = OP_FAIL;
6773
0
      pptr++;
6774
0
      }
6775
0
    else
6776
0
      {
6777
0
      bravalue = OP_ASSERT_NOT;
6778
0
      cb->assert_depth += 1;
6779
0
      goto GROUP_PROCESS;
6780
0
      }
6781
0
    break;
6782
6783
0
    case META_LOOKBEHIND:
6784
0
    bravalue = OP_ASSERTBACK;
6785
0
    cb->assert_depth += 1;
6786
0
    goto GROUP_PROCESS;
6787
6788
0
    case META_LOOKBEHINDNOT:
6789
0
    bravalue = OP_ASSERTBACK_NOT;
6790
0
    cb->assert_depth += 1;
6791
0
    goto GROUP_PROCESS;
6792
6793
0
    case META_LOOKBEHIND_NA:
6794
0
    bravalue = OP_ASSERTBACK_NA;
6795
0
    cb->assert_depth += 1;
6796
0
    goto GROUP_PROCESS;
6797
6798
3
    case META_ATOMIC:
6799
3
    bravalue = OP_ONCE;
6800
3
    goto GROUP_PROCESS_NOTE_EMPTY;
6801
6802
0
    case META_SCRIPT_RUN:
6803
0
    bravalue = OP_SCRIPT_RUN;
6804
0
    goto GROUP_PROCESS_NOTE_EMPTY;
6805
6806
18
    case META_NOCAPTURE:
6807
18
    bravalue = OP_BRA;
6808
    /* Fall through */
6809
6810
    /* Process nested bracketed regex. The nesting depth is maintained for the
6811
    benefit of the stackguard function. The test for too deep nesting is now
6812
    done in parse_regex(). Assertion and DEFINE groups come to GROUP_PROCESS;
6813
    others come to GROUP_PROCESS_NOTE_EMPTY, to indicate that we need to take
6814
    note of whether or not they may match an empty string. */
6815
6816
2.47k
    GROUP_PROCESS_NOTE_EMPTY:
6817
2.47k
    note_group_empty = TRUE;
6818
6819
2.48k
    GROUP_PROCESS:
6820
2.48k
    cb->parens_depth += 1;
6821
2.48k
    *code = bravalue;
6822
2.48k
    pptr++;
6823
2.48k
    tempcode = code;
6824
2.48k
    tempreqvary = cb->req_varyopt;        /* Save value before group */
6825
2.48k
    length_prevgroup = 0;                 /* Initialize for pre-compile phase */
6826
6827
2.48k
    if ((group_return =
6828
2.48k
         compile_regex(
6829
2.48k
         options,                         /* The options state */
6830
2.48k
         xoptions,                        /* The extra options state */
6831
2.48k
         &tempcode,                       /* Where to put code (updated) */
6832
2.48k
         &pptr,                           /* Input pointer (updated) */
6833
2.48k
         errorcodeptr,                    /* Where to put an error message */
6834
2.48k
         skipunits,                       /* Skip over bracket number */
6835
2.48k
         &subfirstcu,                     /* For possible first char */
6836
2.48k
         &subfirstcuflags,
6837
2.48k
         &subreqcu,                       /* For possible last char */
6838
2.48k
         &subreqcuflags,
6839
2.48k
         bcptr,                           /* Current branch chain */
6840
2.48k
         open_caps,                       /* Pointer to capture stack */
6841
2.48k
         cb,                              /* Compile data block */
6842
2.48k
         (lengthptr == NULL)? NULL :      /* Actual compile phase */
6843
2.48k
           &length_prevgroup              /* Pre-compile phase */
6844
2.48k
         )) == 0)
6845
6
      return 0;  /* Error */
6846
6847
2.47k
    cb->parens_depth -= 1;
6848
6849
    /* If that was a non-conditional significant group (not an assertion, not a
6850
    DEFINE) that matches at least one character, then the current item matches
6851
    a character. Conditionals are handled below. */
6852
6853
2.47k
    if (note_group_empty && bravalue != OP_COND && group_return > 0)
6854
621
      matched_char = TRUE;
6855
6856
    /* If we've just compiled an assertion, pop the assert depth. */
6857
6858
2.47k
    if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NA)
6859
10
      cb->assert_depth -= 1;
6860
6861
    /* At the end of compiling, code is still pointing to the start of the
6862
    group, while tempcode has been updated to point past the end of the group.
6863
    The parsed pattern pointer (pptr) is on the closing META_KET.
6864
6865
    If this is a conditional bracket, check that there are no more than
6866
    two branches in the group, or just one if it's a DEFINE group. We do this
6867
    in the real compile phase, not in the pre-pass, where the whole group may
6868
    not be available. */
6869
6870
2.47k
    if (bravalue == OP_COND && lengthptr == NULL)
6871
0
      {
6872
0
      PCRE2_UCHAR *tc = code;
6873
0
      int condcount = 0;
6874
6875
0
      do {
6876
0
         condcount++;
6877
0
         tc += GET(tc,1);
6878
0
         }
6879
0
      while (*tc != OP_KET);
6880
6881
      /* A DEFINE group is never obeyed inline (the "condition" is always
6882
      false). It must have only one branch. Having checked this, change the
6883
      opcode to OP_FALSE. */
6884
6885
0
      if (code[LINK_SIZE+1] == OP_DEFINE)
6886
0
        {
6887
0
        if (condcount > 1)
6888
0
          {
6889
0
          cb->erroroffset = offset;
6890
0
          *errorcodeptr = ERR54;
6891
0
          return 0;
6892
0
          }
6893
0
        code[LINK_SIZE+1] = OP_FALSE;
6894
0
        bravalue = OP_DEFINE;   /* A flag to suppress char handling below */
6895
0
        }
6896
6897
      /* A "normal" conditional group. If there is just one branch, we must not
6898
      make use of its firstcu or reqcu, because this is equivalent to an
6899
      empty second branch. Also, it may match an empty string. If there are two
6900
      branches, this item must match a character if the group must. */
6901
6902
0
      else
6903
0
        {
6904
0
        if (condcount > 2)
6905
0
          {
6906
0
          cb->erroroffset = offset;
6907
0
          *errorcodeptr = ERR27;
6908
0
          return 0;
6909
0
          }
6910
0
        if (condcount == 1) subfirstcuflags = subreqcuflags = REQ_NONE;
6911
0
          else if (group_return > 0) matched_char = TRUE;
6912
0
        }
6913
0
      }
6914
6915
    /* In the pre-compile phase, update the length by the length of the group,
6916
    less the brackets at either end. Then reduce the compiled code to just a
6917
    set of non-capturing brackets so that it doesn't use much memory if it is
6918
    duplicated by a quantifier. */
6919
6920
2.47k
    if (lengthptr != NULL)
6921
1.35k
      {
6922
1.35k
      if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
6923
0
        {
6924
0
        *errorcodeptr = ERR20;
6925
0
        return 0;
6926
0
        }
6927
1.35k
      *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
6928
1.35k
      code++;   /* This already contains bravalue */
6929
1.35k
      PUTINC(code, 0, 1 + LINK_SIZE);
6930
1.35k
      *code++ = OP_KET;
6931
1.35k
      PUTINC(code, 0, 1 + LINK_SIZE);
6932
1.35k
      break;    /* No need to waste time with special character handling */
6933
1.35k
      }
6934
6935
    /* Otherwise update the main code pointer to the end of the group. */
6936
6937
1.11k
    code = tempcode;
6938
6939
    /* For a DEFINE group, required and first character settings are not
6940
    relevant. */
6941
6942
1.11k
    if (bravalue == OP_DEFINE) break;
6943
6944
    /* Handle updating of the required and first code units for other types of
6945
    group. Update for normal brackets of all kinds, and conditions with two
6946
    branches (see code above). If the bracket is followed by a quantifier with
6947
    zero repeat, we have to back off. Hence the definition of zeroreqcu and
6948
    zerofirstcu outside the main loop so that they can be accessed for the back
6949
    off. */
6950
6951
1.11k
    zeroreqcu = reqcu;
6952
1.11k
    zeroreqcuflags = reqcuflags;
6953
1.11k
    zerofirstcu = firstcu;
6954
1.11k
    zerofirstcuflags = firstcuflags;
6955
1.11k
    groupsetfirstcu = FALSE;
6956
6957
1.11k
    if (bravalue >= OP_ONCE)  /* Not an assertion */
6958
1.11k
      {
6959
      /* If we have not yet set a firstcu in this branch, take it from the
6960
      subpattern, remembering that it was set here so that a repeat of more
6961
      than one can replicate it as reqcu if necessary. If the subpattern has
6962
      no firstcu, set "none" for the whole branch. In both cases, a zero
6963
      repeat forces firstcu to "none". */
6964
6965
1.11k
      if (firstcuflags == REQ_UNSET && subfirstcuflags != REQ_UNSET)
6966
126
        {
6967
126
        if (subfirstcuflags < REQ_NONE)
6968
75
          {
6969
75
          firstcu = subfirstcu;
6970
75
          firstcuflags = subfirstcuflags;
6971
75
          groupsetfirstcu = TRUE;
6972
75
          }
6973
51
        else firstcuflags = REQ_NONE;
6974
126
        zerofirstcuflags = REQ_NONE;
6975
126
        }
6976
6977
      /* If firstcu was previously set, convert the subpattern's firstcu
6978
      into reqcu if there wasn't one, using the vary flag that was in
6979
      existence beforehand. */
6980
6981
988
      else if (subfirstcuflags < REQ_NONE && subreqcuflags >= REQ_NONE)
6982
31
        {
6983
31
        subreqcu = subfirstcu;
6984
31
        subreqcuflags = subfirstcuflags | tempreqvary;
6985
31
        }
6986
6987
      /* If the subpattern set a required code unit (or set a first code unit
6988
      that isn't really the first code unit - see above), set it. */
6989
6990
1.11k
      if (subreqcuflags < REQ_NONE)
6991
174
        {
6992
174
        reqcu = subreqcu;
6993
174
        reqcuflags = subreqcuflags;
6994
174
        }
6995
1.11k
      }
6996
6997
    /* For a forward assertion, we take the reqcu, if set, provided that the
6998
    group has also set a firstcu. This can be helpful if the pattern that
6999
    follows the assertion doesn't set a different char. For example, it's
7000
    useful for /(?=abcde).+/. We can't set firstcu for an assertion, however
7001
    because it leads to incorrect effect for patterns such as /(?=a)a.+/ when
7002
    the "real" "a" would then become a reqcu instead of a firstcu. This is
7003
    overcome by a scan at the end if there's no firstcu, looking for an
7004
    asserted first char. A similar effect for patterns like /(?=.*X)X$/ means
7005
    we must only take the reqcu when the group also set a firstcu. Otherwise,
7006
    in that example, 'X' ends up set for both. */
7007
7008
5
    else if ((bravalue == OP_ASSERT || bravalue == OP_ASSERT_NA) &&
7009
5
             subreqcuflags < REQ_NONE && subfirstcuflags < REQ_NONE)
7010
5
      {
7011
5
      reqcu = subreqcu;
7012
5
      reqcuflags = subreqcuflags;
7013
5
      }
7014
7015
1.11k
    break;  /* End of nested group handling */
7016
7017
7018
    /* ===================================================================*/
7019
    /* Handle named backreferences and recursions. */
7020
7021
0
    case META_BACKREF_BYNAME:
7022
0
    case META_RECURSE_BYNAME:
7023
0
      {
7024
0
      int count, index;
7025
0
      PCRE2_SPTR name;
7026
0
      BOOL is_dupname = FALSE;
7027
0
      named_group *ng = cb->named_groups;
7028
0
      uint32_t length = *(++pptr);
7029
7030
0
      GETPLUSOFFSET(offset, pptr);
7031
0
      name = cb->start_pattern + offset;
7032
7033
      /* In the first pass, the names generated in the pre-pass are available,
7034
      but the main name table has not yet been created. Scan the list of names
7035
      generated in the pre-pass in order to get a number and whether or not
7036
      this name is duplicated. */
7037
7038
0
      groupnumber = 0;
7039
0
      for (unsigned int i = 0; i < cb->names_found; i++, ng++)
7040
0
        {
7041
0
        if (length == ng->length &&
7042
0
            PRIV(strncmp)(name, ng->name, length) == 0)
7043
0
          {
7044
0
          is_dupname = ng->isdup;
7045
0
          groupnumber = ng->number;
7046
7047
          /* For a recursion, that's all that is needed. We can now go to
7048
          the code that handles numerical recursion, applying it to the first
7049
          group with the given name. */
7050
7051
0
          if (meta == META_RECURSE_BYNAME)
7052
0
            {
7053
0
            meta_arg = groupnumber;
7054
0
            goto HANDLE_NUMERICAL_RECURSION;
7055
0
            }
7056
7057
          /* For a back reference, update the back reference map and the
7058
          maximum back reference. */
7059
7060
0
          cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
7061
0
          if (groupnumber > cb->top_backref)
7062
0
            cb->top_backref = groupnumber;
7063
0
          }
7064
0
        }
7065
7066
      /* If the name was not found we have a bad reference. */
7067
7068
0
      if (groupnumber == 0)
7069
0
        {
7070
0
        *errorcodeptr = ERR15;
7071
0
        cb->erroroffset = offset;
7072
0
        return 0;
7073
0
        }
7074
7075
      /* If a back reference name is not duplicated, we can handle it as
7076
      a numerical reference. */
7077
7078
0
      if (!is_dupname)
7079
0
        {
7080
0
        meta_arg = groupnumber;
7081
0
        goto HANDLE_SINGLE_REFERENCE;
7082
0
        }
7083
7084
      /* If a back reference name is duplicated, we generate a different
7085
      opcode to a numerical back reference. In the second pass we must
7086
      search for the index and count in the final name table. */
7087
7088
0
      count = 0;  /* Values for first pass (avoids compiler warning) */
7089
0
      index = 0;
7090
0
      if (lengthptr == NULL && !find_dupname_details(name, length, &index,
7091
0
            &count, errorcodeptr, cb)) return 0;
7092
7093
0
      if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
7094
0
      *code++ = ((options & PCRE2_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
7095
0
      PUT2INC(code, 0, index);
7096
0
      PUT2INC(code, 0, count);
7097
0
      }
7098
0
    break;
7099
7100
7101
    /* ===================================================================*/
7102
    /* Handle a numerical callout. */
7103
7104
0
    case META_CALLOUT_NUMBER:
7105
0
    code[0] = OP_CALLOUT;
7106
0
    PUT(code, 1, pptr[1]);               /* Offset to next pattern item */
7107
0
    PUT(code, 1 + LINK_SIZE, pptr[2]);   /* Length of next pattern item */
7108
0
    code[1 + 2*LINK_SIZE] = pptr[3];
7109
0
    pptr += 3;
7110
0
    code += PRIV(OP_lengths)[OP_CALLOUT];
7111
0
    break;
7112
7113
7114
    /* ===================================================================*/
7115
    /* Handle a callout with a string argument. In the pre-pass we just compute
7116
    the length without generating anything. The length in pptr[3] includes both
7117
    delimiters; in the actual compile only the first one is copied, but a
7118
    terminating zero is added. Any doubled delimiters within the string make
7119
    this an overestimate, but it is not worth bothering about. */
7120
7121
0
    case META_CALLOUT_STRING:
7122
0
    if (lengthptr != NULL)
7123
0
      {
7124
0
      *lengthptr += pptr[3] + (1 + 4*LINK_SIZE);
7125
0
      pptr += 3;
7126
0
      SKIPOFFSET(pptr);
7127
0
      }
7128
7129
    /* In the real compile we can copy the string. The starting delimiter is
7130
     included so that the client can discover it if they want. We also pass the
7131
     start offset to help a script language give better error messages. */
7132
7133
0
    else
7134
0
      {
7135
0
      PCRE2_SPTR pp;
7136
0
      uint32_t delimiter;
7137
0
      uint32_t length = pptr[3];
7138
0
      PCRE2_UCHAR *callout_string = code + (1 + 4*LINK_SIZE);
7139
7140
0
      code[0] = OP_CALLOUT_STR;
7141
0
      PUT(code, 1, pptr[1]);               /* Offset to next pattern item */
7142
0
      PUT(code, 1 + LINK_SIZE, pptr[2]);   /* Length of next pattern item */
7143
7144
0
      pptr += 3;
7145
0
      GETPLUSOFFSET(offset, pptr);         /* Offset to string in pattern */
7146
0
      pp = cb->start_pattern + offset;
7147
0
      delimiter = *callout_string++ = *pp++;
7148
0
      if (delimiter == CHAR_LEFT_CURLY_BRACKET)
7149
0
        delimiter = CHAR_RIGHT_CURLY_BRACKET;
7150
0
      PUT(code, 1 + 3*LINK_SIZE, (int)(offset + 1));  /* One after delimiter */
7151
7152
      /* The syntax of the pattern was checked in the parsing scan. The length
7153
      includes both delimiters, but we have passed the opening one just above,
7154
      so we reduce length before testing it. The test is for > 1 because we do
7155
      not want to copy the final delimiter. This also ensures that pp[1] is
7156
      accessible. */
7157
7158
0
      while (--length > 1)
7159
0
        {
7160
0
        if (*pp == delimiter && pp[1] == delimiter)
7161
0
          {
7162
0
          *callout_string++ = delimiter;
7163
0
          pp += 2;
7164
0
          length--;
7165
0
          }
7166
0
        else *callout_string++ = *pp++;
7167
0
        }
7168
0
      *callout_string++ = CHAR_NUL;
7169
7170
      /* Set the length of the entire item, then advance to its end. */
7171
7172
0
      PUT(code, 1 + 2*LINK_SIZE, (int)(callout_string - code));
7173
0
      code = callout_string;
7174
0
      }
7175
0
    break;
7176
7177
7178
    /* ===================================================================*/
7179
    /* Handle repetition. The different types are all sorted out in the parsing
7180
    pass. */
7181
7182
0
    case META_MINMAX_PLUS:
7183
0
    case META_MINMAX_QUERY:
7184
0
    case META_MINMAX:
7185
0
    repeat_min = *(++pptr);
7186
0
    repeat_max = *(++pptr);
7187
0
    goto REPEAT;
7188
7189
4.67k
    case META_ASTERISK:
7190
4.84k
    case META_ASTERISK_PLUS:
7191
6.44k
    case META_ASTERISK_QUERY:
7192
6.44k
    repeat_min = 0;
7193
6.44k
    repeat_max = REPEAT_UNLIMITED;
7194
6.44k
    goto REPEAT;
7195
7196
8.33k
    case META_PLUS:
7197
9.05k
    case META_PLUS_PLUS:
7198
11.3k
    case META_PLUS_QUERY:
7199
11.3k
    repeat_min = 1;
7200
11.3k
    repeat_max = REPEAT_UNLIMITED;
7201
11.3k
    goto REPEAT;
7202
7203
19.1k
    case META_QUERY:
7204
20.0k
    case META_QUERY_PLUS:
7205
22.7k
    case META_QUERY_QUERY:
7206
22.7k
    repeat_min = 0;
7207
22.7k
    repeat_max = 1;
7208
7209
40.4k
    REPEAT:
7210
40.4k
    if (previous_matched_char && repeat_min > 0) matched_char = TRUE;
7211
7212
    /* Remember whether this is a variable length repeat, and default to
7213
    single-char opcodes. */
7214
7215
40.4k
    reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
7216
40.4k
    op_type = 0;
7217
7218
    /* Adjust first and required code units for a zero repeat. */
7219
7220
40.4k
    if (repeat_min == 0)
7221
29.1k
      {
7222
29.1k
      firstcu = zerofirstcu;
7223
29.1k
      firstcuflags = zerofirstcuflags;
7224
29.1k
      reqcu = zeroreqcu;
7225
29.1k
      reqcuflags = zeroreqcuflags;
7226
29.1k
      }
7227
7228
    /* Note the greediness and possessiveness. */
7229
7230
40.4k
    switch (meta)
7231
40.4k
      {
7232
0
      case META_MINMAX_PLUS:
7233
166
      case META_ASTERISK_PLUS:
7234
889
      case META_PLUS_PLUS:
7235
1.79k
      case META_QUERY_PLUS:
7236
1.79k
      repeat_type = 0;                  /* Force greedy */
7237
1.79k
      possessive_quantifier = TRUE;
7238
1.79k
      break;
7239
7240
0
      case META_MINMAX_QUERY:
7241
1.59k
      case META_ASTERISK_QUERY:
7242
3.88k
      case META_PLUS_QUERY:
7243
6.53k
      case META_QUERY_QUERY:
7244
6.53k
      repeat_type = greedy_non_default;
7245
6.53k
      possessive_quantifier = FALSE;
7246
6.53k
      break;
7247
7248
32.1k
      default:
7249
32.1k
      repeat_type = greedy_default;
7250
32.1k
      possessive_quantifier = FALSE;
7251
32.1k
      break;
7252
40.4k
      }
7253
7254
    /* Save start of previous item, in case we have to move it up in order to
7255
    insert something before it, and remember what it was. */
7256
7257
40.4k
    tempcode = previous;
7258
40.4k
    op_previous = *previous;
7259
7260
    /* Now handle repetition for the different types of item. If the repeat
7261
    minimum and the repeat maximum are both 1, we can ignore the quantifier for
7262
    non-parenthesized items, as they have only one alternative. For anything in
7263
    parentheses, we must not ignore if {1} is possessive. */
7264
7265
40.4k
    switch (op_previous)
7266
40.4k
      {
7267
      /* If previous was a character or negated character match, abolish the
7268
      item and generate a repeat item instead. If a char item has a minimum of
7269
      more than one, ensure that it is set in reqcu - it might not be if a
7270
      sequence such as x{3} is the first thing in a branch because the x will
7271
      have gone into firstcu instead.  */
7272
7273
15.4k
      case OP_CHAR:
7274
22.0k
      case OP_CHARI:
7275
22.2k
      case OP_NOT:
7276
22.7k
      case OP_NOTI:
7277
22.7k
      if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7278
22.7k
      op_type = chartypeoffset[op_previous - OP_CHAR];
7279
7280
      /* Deal with UTF characters that take up more than one code unit. */
7281
7282
22.7k
#ifdef MAYBE_UTF_MULTI
7283
22.7k
      if (utf && NOT_FIRSTCU(code[-1]))
7284
22
        {
7285
22
        PCRE2_UCHAR *lastchar = code - 1;
7286
22
        BACKCHAR(lastchar);
7287
22
        mclength = (uint32_t)(code - lastchar);   /* Length of UTF character */
7288
22
        memcpy(mcbuffer, lastchar, CU2BYTES(mclength));  /* Save the char */
7289
22
        }
7290
22.7k
      else
7291
22.7k
#endif  /* MAYBE_UTF_MULTI */
7292
7293
      /* Handle the case of a single code unit - either with no UTF support, or
7294
      with UTF disabled, or for a single-code-unit UTF character. In the latter
7295
      case, for a repeated positive match, get the caseless flag for the
7296
      required code unit from the previous character, because a class like [Aa]
7297
      sets a caseless A but by now the req_caseopt flag has been reset. */
7298
7299
22.7k
        {
7300
22.7k
        mcbuffer[0] = code[-1];
7301
22.7k
        mclength = 1;
7302
22.7k
        if (op_previous <= OP_CHARI && repeat_min > 1)
7303
0
          {
7304
0
          reqcu = mcbuffer[0];
7305
0
          reqcuflags = cb->req_varyopt;
7306
0
          if (op_previous == OP_CHARI) reqcuflags |= REQ_CASELESS;
7307
0
          }
7308
22.7k
        }
7309
22.7k
      goto OUTPUT_SINGLE_REPEAT;  /* Code shared with single character types */
7310
7311
      /* If previous was a character class or a back reference, we put the
7312
      repeat stuff after it, but just skip the item if the repeat was {0,0}. */
7313
7314
0
#ifdef SUPPORT_WIDE_CHARS
7315
528
      case OP_XCLASS:
7316
528
#endif
7317
4.18k
      case OP_CLASS:
7318
5.04k
      case OP_NCLASS:
7319
5.05k
      case OP_REF:
7320
5.05k
      case OP_REFI:
7321
5.05k
      case OP_DNREF:
7322
5.05k
      case OP_DNREFI:
7323
7324
5.05k
      if (repeat_max == 0)
7325
0
        {
7326
0
        code = previous;
7327
0
        goto END_REPEAT;
7328
0
        }
7329
5.05k
      if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7330
7331
5.05k
      if (repeat_min == 0 && repeat_max == REPEAT_UNLIMITED)
7332
2.70k
        *code++ = OP_CRSTAR + repeat_type;
7333
2.35k
      else if (repeat_min == 1 && repeat_max == REPEAT_UNLIMITED)
7334
1.15k
        *code++ = OP_CRPLUS + repeat_type;
7335
1.19k
      else if (repeat_min == 0 && repeat_max == 1)
7336
1.19k
        *code++ = OP_CRQUERY + repeat_type;
7337
0
      else
7338
0
        {
7339
0
        *code++ = OP_CRRANGE + repeat_type;
7340
0
        PUT2INC(code, 0, repeat_min);
7341
0
        if (repeat_max == REPEAT_UNLIMITED) repeat_max = 0;  /* 2-byte encoding for max */
7342
0
        PUT2INC(code, 0, repeat_max);
7343
0
        }
7344
5.05k
      break;
7345
7346
      /* If previous is OP_FAIL, it was generated by an empty class []
7347
      (PCRE2_ALLOW_EMPTY_CLASS is set). The other ways in which OP_FAIL can be
7348
      generated, that is by (*FAIL) or (?!), disallow a quantifier at parse
7349
      time. We can just ignore this repeat. */
7350
7351
0
      case OP_FAIL:
7352
0
      goto END_REPEAT;
7353
7354
      /* Prior to 10.30, repeated recursions were wrapped in OP_ONCE brackets
7355
      because pcre2_match() could not handle backtracking into recursively
7356
      called groups. Now that this backtracking is available, we no longer need
7357
      to do this. However, we still need to replicate recursions as we do for
7358
      groups so as to have independent backtracking points. We can replicate
7359
      for the minimum number of repeats directly. For optional repeats we now
7360
      wrap the recursion in OP_BRA brackets and make use of the bracket
7361
      repetition. */
7362
7363
0
      case OP_RECURSE:
7364
0
      if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
7365
0
        goto END_REPEAT;
7366
7367
      /* Generate unwrapped repeats for a non-zero minimum, except when the
7368
      minimum is 1 and the maximum unlimited, because that can be handled with
7369
      OP_BRA terminated by OP_KETRMAX/MIN. When the maximum is equal to the
7370
      minimum, we just need to generate the appropriate additional copies.
7371
      Otherwise we need to generate one more, to simulate the situation when
7372
      the minimum is zero. */
7373
7374
0
      if (repeat_min > 0 && (repeat_min != 1 || repeat_max != REPEAT_UNLIMITED))
7375
0
        {
7376
0
        int replicate = repeat_min;
7377
0
        if (repeat_min == repeat_max) replicate--;
7378
7379
        /* In the pre-compile phase, we don't actually do the replication. We
7380
        just adjust the length as if we had. Do some paranoid checks for
7381
        potential integer overflow. */
7382
7383
0
        if (lengthptr != NULL)
7384
0
          {
7385
0
          PCRE2_SIZE delta;
7386
0
          if (PRIV(ckd_smul)(&delta, replicate, 1 + LINK_SIZE) ||
7387
0
              OFLOW_MAX - *lengthptr < delta)
7388
0
            {
7389
0
            *errorcodeptr = ERR20;
7390
0
            return 0;
7391
0
            }
7392
0
          *lengthptr += delta;
7393
0
          }
7394
7395
0
        else for (int i = 0; i < replicate; i++)
7396
0
          {
7397
0
          memcpy(code, previous, CU2BYTES(1 + LINK_SIZE));
7398
0
          previous = code;
7399
0
          code += 1 + LINK_SIZE;
7400
0
          }
7401
7402
        /* If the number of repeats is fixed, we are done. Otherwise, adjust
7403
        the counts and fall through. */
7404
7405
0
        if (repeat_min == repeat_max) break;
7406
0
        if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7407
0
        repeat_min = 0;
7408
0
        }
7409
7410
      /* Wrap the recursion call in OP_BRA brackets. */
7411
7412
0
      (void)memmove(previous + 1 + LINK_SIZE, previous, CU2BYTES(1 + LINK_SIZE));
7413
0
      op_previous = *previous = OP_BRA;
7414
0
      PUT(previous, 1, 2 + 2*LINK_SIZE);
7415
0
      previous[2 + 2*LINK_SIZE] = OP_KET;
7416
0
      PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
7417
0
      code += 2 + 2 * LINK_SIZE;
7418
0
      length_prevgroup = 3 + 3*LINK_SIZE;
7419
0
      group_return = -1;  /* Set "may match empty string" */
7420
7421
      /* Now treat as a repeated OP_BRA. */
7422
      /* Fall through */
7423
7424
      /* If previous was a bracket group, we may have to replicate it in
7425
      certain cases. Note that at this point we can encounter only the "basic"
7426
      bracket opcodes such as BRA and CBRA, as this is the place where they get
7427
      converted into the more special varieties such as BRAPOS and SBRA.
7428
      Originally, PCRE did not allow repetition of assertions, but now it does,
7429
      for Perl compatibility. */
7430
7431
0
      case OP_ASSERT:
7432
0
      case OP_ASSERT_NOT:
7433
4
      case OP_ASSERT_NA:
7434
4
      case OP_ASSERTBACK:
7435
4
      case OP_ASSERTBACK_NOT:
7436
4
      case OP_ASSERTBACK_NA:
7437
4
      case OP_ONCE:
7438
4
      case OP_SCRIPT_RUN:
7439
8
      case OP_BRA:
7440
1.01k
      case OP_CBRA:
7441
1.01k
      case OP_COND:
7442
1.01k
        {
7443
1.01k
        int len = (int)(code - previous);
7444
1.01k
        PCRE2_UCHAR *bralink = NULL;
7445
1.01k
        PCRE2_UCHAR *brazeroptr = NULL;
7446
7447
1.01k
        if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
7448
0
          goto END_REPEAT;
7449
7450
        /* Repeating a DEFINE group (or any group where the condition is always
7451
        FALSE and there is only one branch) is pointless, but Perl allows the
7452
        syntax, so we just ignore the repeat. */
7453
7454
1.01k
        if (op_previous == OP_COND && previous[LINK_SIZE+1] == OP_FALSE &&
7455
0
            previous[GET(previous, 1)] != OP_ALT)
7456
0
          goto END_REPEAT;
7457
7458
        /* Perl allows all assertions to be quantified, and when they contain
7459
        capturing parentheses and/or are optional there are potential uses for
7460
        this feature. PCRE2 used to force the maximum quantifier to 1 on the
7461
        invalid grounds that further repetition was never useful. This was
7462
        always a bit pointless, since an assertion could be wrapped with a
7463
        repeated group to achieve the effect. General repetition is now
7464
        permitted, but if the maximum is unlimited it is set to one more than
7465
        the minimum. */
7466
7467
1.01k
        if (op_previous < OP_ONCE)    /* Assertion */
7468
4
          {
7469
4
          if (repeat_max == REPEAT_UNLIMITED) repeat_max = repeat_min + 1;
7470
4
          }
7471
7472
        /* The case of a zero minimum is special because of the need to stick
7473
        OP_BRAZERO in front of it, and because the group appears once in the
7474
        data, whereas in other cases it appears the minimum number of times. For
7475
        this reason, it is simplest to treat this case separately, as otherwise
7476
        the code gets far too messy. There are several special subcases when the
7477
        minimum is zero. */
7478
7479
1.01k
        if (repeat_min == 0)
7480
4
          {
7481
          /* If the maximum is also zero, we used to just omit the group from
7482
          the output altogether, like this:
7483
7484
          ** if (repeat_max == 0)
7485
          **   {
7486
          **   code = previous;
7487
          **   goto END_REPEAT;
7488
          **   }
7489
7490
          However, that fails when a group or a subgroup within it is
7491
          referenced as a subroutine from elsewhere in the pattern, so now we
7492
          stick in OP_SKIPZERO in front of it so that it is skipped on
7493
          execution. As we don't have a list of which groups are referenced, we
7494
          cannot do this selectively.
7495
7496
          If the maximum is 1 or unlimited, we just have to stick in the
7497
          BRAZERO and do no more at this point. */
7498
7499
4
          if (repeat_max <= 1 || repeat_max == REPEAT_UNLIMITED)
7500
4
            {
7501
4
            (void)memmove(previous + 1, previous, CU2BYTES(len));
7502
4
            code++;
7503
4
            if (repeat_max == 0)
7504
0
              {
7505
0
              *previous++ = OP_SKIPZERO;
7506
0
              goto END_REPEAT;
7507
0
              }
7508
4
            brazeroptr = previous;    /* Save for possessive optimizing */
7509
4
            *previous++ = OP_BRAZERO + repeat_type;
7510
4
            }
7511
7512
          /* If the maximum is greater than 1 and limited, we have to replicate
7513
          in a nested fashion, sticking OP_BRAZERO before each set of brackets.
7514
          The first one has to be handled carefully because it's the original
7515
          copy, which has to be moved up. The remainder can be handled by code
7516
          that is common with the non-zero minimum case below. We have to
7517
          adjust the value of repeat_max, since one less copy is required. */
7518
7519
0
          else
7520
0
            {
7521
0
            int linkoffset;
7522
0
            (void)memmove(previous + 2 + LINK_SIZE, previous, CU2BYTES(len));
7523
0
            code += 2 + LINK_SIZE;
7524
0
            *previous++ = OP_BRAZERO + repeat_type;
7525
0
            *previous++ = OP_BRA;
7526
7527
            /* We chain together the bracket link offset fields that have to be
7528
            filled in later when the ends of the brackets are reached. */
7529
7530
0
            linkoffset = (bralink == NULL)? 0 : (int)(previous - bralink);
7531
0
            bralink = previous;
7532
0
            PUTINC(previous, 0, linkoffset);
7533
0
            }
7534
7535
4
          if (repeat_max != REPEAT_UNLIMITED) repeat_max--;
7536
4
          }
7537
7538
        /* If the minimum is greater than zero, replicate the group as many
7539
        times as necessary, and adjust the maximum to the number of subsequent
7540
        copies that we need. */
7541
7542
1.01k
        else
7543
1.01k
          {
7544
1.01k
          if (repeat_min > 1)
7545
0
            {
7546
            /* In the pre-compile phase, we don't actually do the replication.
7547
            We just adjust the length as if we had. Do some paranoid checks for
7548
            potential integer overflow. */
7549
7550
0
            if (lengthptr != NULL)
7551
0
              {
7552
0
              PCRE2_SIZE delta;
7553
0
              if (PRIV(ckd_smul)(&delta, repeat_min - 1,
7554
0
                                 (int)length_prevgroup) ||
7555
0
                  OFLOW_MAX - *lengthptr < delta)
7556
0
                {
7557
0
                *errorcodeptr = ERR20;
7558
0
                return 0;
7559
0
                }
7560
0
              *lengthptr += delta;
7561
0
              }
7562
7563
            /* This is compiling for real. If there is a set first code unit
7564
            for the group, and we have not yet set a "required code unit", set
7565
            it. */
7566
7567
0
            else
7568
0
              {
7569
0
              if (groupsetfirstcu && reqcuflags >= REQ_NONE)
7570
0
                {
7571
0
                reqcu = firstcu;
7572
0
                reqcuflags = firstcuflags;
7573
0
                }
7574
0
              for (uint32_t i = 1; i < repeat_min; i++)
7575
0
                {
7576
0
                memcpy(code, previous, CU2BYTES(len));
7577
0
                code += len;
7578
0
                }
7579
0
              }
7580
0
            }
7581
7582
1.01k
          if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7583
1.01k
          }
7584
7585
        /* This code is common to both the zero and non-zero minimum cases. If
7586
        the maximum is limited, it replicates the group in a nested fashion,
7587
        remembering the bracket starts on a stack. In the case of a zero
7588
        minimum, the first one was set up above. In all cases the repeat_max
7589
        now specifies the number of additional copies needed. Again, we must
7590
        remember to replicate entries on the forward reference list. */
7591
7592
1.01k
        if (repeat_max != REPEAT_UNLIMITED)
7593
4
          {
7594
          /* In the pre-compile phase, we don't actually do the replication. We
7595
          just adjust the length as if we had. For each repetition we must add
7596
          1 to the length for BRAZERO and for all but the last repetition we
7597
          must add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
7598
          paranoid checks to avoid integer overflow. */
7599
7600
4
          if (lengthptr != NULL && repeat_max > 0)
7601
2
            {
7602
2
            PCRE2_SIZE delta;
7603
2
            if (PRIV(ckd_smul)(&delta, repeat_max,
7604
2
                               (int)length_prevgroup + 1 + 2 + 2*LINK_SIZE) ||
7605
2
                OFLOW_MAX + (2 + 2*LINK_SIZE) - *lengthptr < delta)
7606
0
              {
7607
0
              *errorcodeptr = ERR20;
7608
0
              return 0;
7609
0
              }
7610
2
            delta -= (2 + 2*LINK_SIZE);   /* Last one doesn't nest */
7611
2
            *lengthptr += delta;
7612
2
            }
7613
7614
          /* This is compiling for real */
7615
7616
4
          else for (uint32_t i = repeat_max; i >= 1; i--)
7617
2
            {
7618
2
            *code++ = OP_BRAZERO + repeat_type;
7619
7620
            /* All but the final copy start a new nesting, maintaining the
7621
            chain of brackets outstanding. */
7622
7623
2
            if (i != 1)
7624
0
              {
7625
0
              int linkoffset;
7626
0
              *code++ = OP_BRA;
7627
0
              linkoffset = (bralink == NULL)? 0 : (int)(code - bralink);
7628
0
              bralink = code;
7629
0
              PUTINC(code, 0, linkoffset);
7630
0
              }
7631
7632
2
            memcpy(code, previous, CU2BYTES(len));
7633
2
            code += len;
7634
2
            }
7635
7636
          /* Now chain through the pending brackets, and fill in their length
7637
          fields (which are holding the chain links pro tem). */
7638
7639
4
          while (bralink != NULL)
7640
0
            {
7641
0
            int oldlinkoffset;
7642
0
            int linkoffset = (int)(code - bralink + 1);
7643
0
            PCRE2_UCHAR *bra = code - linkoffset;
7644
0
            oldlinkoffset = GET(bra, 1);
7645
0
            bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
7646
0
            *code++ = OP_KET;
7647
0
            PUTINC(code, 0, linkoffset);
7648
0
            PUT(bra, 1, linkoffset);
7649
0
            }
7650
4
          }
7651
7652
        /* If the maximum is unlimited, set a repeater in the final copy. For
7653
        SCRIPT_RUN and ONCE brackets, that's all we need to do. However,
7654
        possessively repeated ONCE brackets can be converted into non-capturing
7655
        brackets, as the behaviour of (?:xx)++ is the same as (?>xx)++ and this
7656
        saves having to deal with possessive ONCEs specially.
7657
7658
        Otherwise, when we are doing the actual compile phase, check to see
7659
        whether this group is one that could match an empty string. If so,
7660
        convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
7661
        that runtime checking can be done. [This check is also applied to ONCE
7662
        and SCRIPT_RUN groups at runtime, but in a different way.]
7663
7664
        Then, if the quantifier was possessive and the bracket is not a
7665
        conditional, we convert the BRA code to the POS form, and the KET code
7666
        to KETRPOS. (It turns out to be convenient at runtime to detect this
7667
        kind of subpattern at both the start and at the end.) The use of
7668
        special opcodes makes it possible to reduce greatly the stack usage in
7669
        pcre2_match(). If the group is preceded by OP_BRAZERO, convert this to
7670
        OP_BRAPOSZERO.
7671
7672
        Then, if the minimum number of matches is 1 or 0, cancel the possessive
7673
        flag so that the default action below, of wrapping everything inside
7674
        atomic brackets, does not happen. When the minimum is greater than 1,
7675
        there will be earlier copies of the group, and so we still have to wrap
7676
        the whole thing. */
7677
7678
1.01k
        else
7679
1.01k
          {
7680
1.01k
          PCRE2_UCHAR *ketcode = code - 1 - LINK_SIZE;
7681
1.01k
          PCRE2_UCHAR *bracode = ketcode - GET(ketcode, 1);
7682
7683
          /* Convert possessive ONCE brackets to non-capturing */
7684
7685
1.01k
          if (*bracode == OP_ONCE && possessive_quantifier) *bracode = OP_BRA;
7686
7687
          /* For non-possessive ONCE and for SCRIPT_RUN brackets, all we need
7688
          to do is to set the KET. */
7689
7690
1.01k
          if (*bracode == OP_ONCE || *bracode == OP_SCRIPT_RUN)
7691
0
            *ketcode = OP_KETRMAX + repeat_type;
7692
7693
          /* Handle non-SCRIPT_RUN and non-ONCE brackets and possessive ONCEs
7694
          (which have been converted to non-capturing above). */
7695
7696
1.01k
          else
7697
1.01k
            {
7698
            /* In the compile phase, adjust the opcode if the group can match
7699
            an empty string. For a conditional group with only one branch, the
7700
            value of group_return will not show "could be empty", so we must
7701
            check that separately. */
7702
7703
1.01k
            if (lengthptr == NULL)
7704
446
              {
7705
446
              if (group_return < 0) *bracode += OP_SBRA - OP_BRA;
7706
446
              if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
7707
0
                *bracode = OP_SCOND;
7708
446
              }
7709
7710
            /* Handle possessive quantifiers. */
7711
7712
1.01k
            if (possessive_quantifier)
7713
634
              {
7714
              /* For COND brackets, we wrap the whole thing in a possessively
7715
              repeated non-capturing bracket, because we have not invented POS
7716
              versions of the COND opcodes. */
7717
7718
634
              if (*bracode == OP_COND || *bracode == OP_SCOND)
7719
0
                {
7720
0
                int nlen = (int)(code - bracode);
7721
0
                (void)memmove(bracode + 1 + LINK_SIZE, bracode, CU2BYTES(nlen));
7722
0
                code += 1 + LINK_SIZE;
7723
0
                nlen += 1 + LINK_SIZE;
7724
0
                *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
7725
0
                *code++ = OP_KETRPOS;
7726
0
                PUTINC(code, 0, nlen);
7727
0
                PUT(bracode, 1, nlen);
7728
0
                }
7729
7730
              /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
7731
7732
634
              else
7733
634
                {
7734
634
                *bracode += 1;              /* Switch to xxxPOS opcodes */
7735
634
                *ketcode = OP_KETRPOS;
7736
634
                }
7737
7738
              /* If the minimum is zero, mark it as possessive, then unset the
7739
              possessive flag when the minimum is 0 or 1. */
7740
7741
634
              if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
7742
634
              if (repeat_min < 2) possessive_quantifier = FALSE;
7743
634
              }
7744
7745
            /* Non-possessive quantifier */
7746
7747
377
            else *ketcode = OP_KETRMAX + repeat_type;
7748
1.01k
            }
7749
1.01k
          }
7750
1.01k
        }
7751
1.01k
      break;
7752
7753
      /* If previous was a character type match (\d or similar), abolish it and
7754
      create a suitable repeat item. The code is shared with single-character
7755
      repeats by setting op_type to add a suitable offset into repeat_type.
7756
      Note that the Unicode property types will be present only when
7757
      SUPPORT_UNICODE is defined, but we don't wrap the little bits of code
7758
      here because it just makes it horribly messy. */
7759
7760
11.6k
      default:
7761
11.6k
      if (op_previous >= OP_EODN)   /* Not a character type - internal error */
7762
0
        {
7763
0
        *errorcodeptr = ERR10;
7764
0
        return 0;
7765
0
        }
7766
11.6k
      else
7767
11.6k
        {
7768
11.6k
        int prop_type, prop_value;
7769
11.6k
        PCRE2_UCHAR *oldcode;
7770
7771
11.6k
        if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7772
7773
11.6k
        op_type = OP_TYPESTAR - OP_STAR;      /* Use type opcodes */
7774
11.6k
        mclength = 0;                         /* Not a character */
7775
7776
11.6k
        if (op_previous == OP_PROP || op_previous == OP_NOTPROP)
7777
652
          {
7778
652
          prop_type = previous[1];
7779
652
          prop_value = previous[2];
7780
652
          }
7781
10.9k
        else
7782
10.9k
          {
7783
          /* Come here from just above with a character in mcbuffer/mclength. */
7784
33.7k
          OUTPUT_SINGLE_REPEAT:
7785
33.7k
          prop_type = prop_value = -1;
7786
33.7k
          }
7787
7788
        /* At this point, if prop_type == prop_value == -1 we either have a
7789
        character in mcbuffer when mclength is greater than zero, or we have
7790
        mclength zero, in which case there is a non-property character type in
7791
        op_previous. If prop_type/value are not negative, we have a property
7792
        character type in op_previous. */
7793
7794
34.4k
        oldcode = code;                   /* Save where we were */
7795
34.4k
        code = previous;                  /* Usually overwrite previous item */
7796
7797
        /* If the maximum is zero then the minimum must also be zero; Perl allows
7798
        this case, so we do too - by simply omitting the item altogether. */
7799
7800
34.4k
        if (repeat_max == 0) goto END_REPEAT;
7801
7802
        /* Combine the op_type with the repeat_type */
7803
7804
34.4k
        repeat_type += op_type;
7805
7806
        /* A minimum of zero is handled either as the special case * or ?, or as
7807
        an UPTO, with the maximum given. */
7808
7809
34.4k
        if (repeat_min == 0)
7810
25.2k
          {
7811
25.2k
          if (repeat_max == REPEAT_UNLIMITED) *code++ = OP_STAR + repeat_type;
7812
21.5k
            else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
7813
0
          else
7814
0
            {
7815
0
            *code++ = OP_UPTO + repeat_type;
7816
0
            PUT2INC(code, 0, repeat_max);
7817
0
            }
7818
25.2k
          }
7819
7820
        /* A repeat minimum of 1 is optimized into some special cases. If the
7821
        maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
7822
        left in place and, if the maximum is greater than 1, we use OP_UPTO with
7823
        one less than the maximum. */
7824
7825
9.17k
        else if (repeat_min == 1)
7826
9.17k
          {
7827
9.17k
          if (repeat_max == REPEAT_UNLIMITED)
7828
9.17k
            *code++ = OP_PLUS + repeat_type;
7829
0
          else
7830
0
            {
7831
0
            code = oldcode;  /* Leave previous item in place */
7832
0
            if (repeat_max == 1) goto END_REPEAT;
7833
0
            *code++ = OP_UPTO + repeat_type;
7834
0
            PUT2INC(code, 0, repeat_max - 1);
7835
0
            }
7836
9.17k
          }
7837
7838
        /* The case {n,n} is just an EXACT, while the general case {n,m} is
7839
        handled as an EXACT followed by an UPTO or STAR or QUERY. */
7840
7841
0
        else
7842
0
          {
7843
0
          *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
7844
0
          PUT2INC(code, 0, repeat_min);
7845
7846
          /* Unless repeat_max equals repeat_min, fill in the data for EXACT,
7847
          and then generate the second opcode. For a repeated Unicode property
7848
          match, there are two extra values that define the required property,
7849
          and mclength is set zero to indicate this. */
7850
7851
0
          if (repeat_max != repeat_min)
7852
0
            {
7853
0
            if (mclength > 0)
7854
0
              {
7855
0
              memcpy(code, mcbuffer, CU2BYTES(mclength));
7856
0
              code += mclength;
7857
0
              }
7858
0
            else
7859
0
              {
7860
0
              *code++ = op_previous;
7861
0
              if (prop_type >= 0)
7862
0
                {
7863
0
                *code++ = prop_type;
7864
0
                *code++ = prop_value;
7865
0
                }
7866
0
              }
7867
7868
            /* Now set up the following opcode */
7869
7870
0
            if (repeat_max == REPEAT_UNLIMITED)
7871
0
              *code++ = OP_STAR + repeat_type;
7872
0
            else
7873
0
              {
7874
0
              repeat_max -= repeat_min;
7875
0
              if (repeat_max == 1)
7876
0
                {
7877
0
                *code++ = OP_QUERY + repeat_type;
7878
0
                }
7879
0
              else
7880
0
                {
7881
0
                *code++ = OP_UPTO + repeat_type;
7882
0
                PUT2INC(code, 0, repeat_max);
7883
0
                }
7884
0
              }
7885
0
            }
7886
0
          }
7887
7888
        /* Fill in the character or character type for the final opcode. */
7889
7890
34.4k
        if (mclength > 0)
7891
22.7k
          {
7892
22.7k
          memcpy(code, mcbuffer, CU2BYTES(mclength));
7893
22.7k
          code += mclength;
7894
22.7k
          }
7895
11.6k
        else
7896
11.6k
          {
7897
11.6k
          *code++ = op_previous;
7898
11.6k
          if (prop_type >= 0)
7899
652
            {
7900
652
            *code++ = prop_type;
7901
652
            *code++ = prop_value;
7902
652
            }
7903
11.6k
          }
7904
34.4k
        }
7905
34.4k
      break;
7906
40.4k
      }  /* End of switch on different op_previous values */
7907
7908
7909
    /* If the character following a repeat is '+', possessive_quantifier is
7910
    TRUE. For some opcodes, there are special alternative opcodes for this
7911
    case. For anything else, we wrap the entire repeated item inside OP_ONCE
7912
    brackets. Logically, the '+' notation is just syntactic sugar, taken from
7913
    Sun's Java package, but the special opcodes can optimize it.
7914
7915
    Some (but not all) possessively repeated subpatterns have already been
7916
    completely handled in the code just above. For them, possessive_quantifier
7917
    is always FALSE at this stage. Note that the repeated item starts at
7918
    tempcode, not at previous, which might be the first part of a string whose
7919
    (former) last char we repeated. */
7920
7921
40.4k
    if (possessive_quantifier)
7922
1.16k
      {
7923
1.16k
      int len;
7924
7925
      /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
7926
      However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
7927
      {5,}, or {5,10}). We skip over an EXACT item; if the length of what
7928
      remains is greater than zero, there's a further opcode that can be
7929
      handled. If not, do nothing, leaving the EXACT alone. */
7930
7931
1.16k
      switch(*tempcode)
7932
1.16k
        {
7933
0
        case OP_TYPEEXACT:
7934
0
        tempcode += PRIV(OP_lengths)[*tempcode] +
7935
0
          ((tempcode[1 + IMM2_SIZE] == OP_PROP
7936
0
          || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
7937
0
        break;
7938
7939
        /* CHAR opcodes are used for exacts whose count is 1. */
7940
7941
0
        case OP_CHAR:
7942
0
        case OP_CHARI:
7943
0
        case OP_NOT:
7944
0
        case OP_NOTI:
7945
0
        case OP_EXACT:
7946
0
        case OP_EXACTI:
7947
0
        case OP_NOTEXACT:
7948
0
        case OP_NOTEXACTI:
7949
0
        tempcode += PRIV(OP_lengths)[*tempcode];
7950
0
#ifdef SUPPORT_UNICODE
7951
0
        if (utf && HAS_EXTRALEN(tempcode[-1]))
7952
0
          tempcode += GET_EXTRALEN(tempcode[-1]);
7953
0
#endif
7954
0
        break;
7955
7956
        /* For the class opcodes, the repeat operator appears at the end;
7957
        adjust tempcode to point to it. */
7958
7959
112
        case OP_CLASS:
7960
114
        case OP_NCLASS:
7961
114
        tempcode += 1 + 32/sizeof(PCRE2_UCHAR);
7962
114
        break;
7963
7964
0
#ifdef SUPPORT_WIDE_CHARS
7965
0
        case OP_XCLASS:
7966
0
        tempcode += GET(tempcode, 1);
7967
0
        break;
7968
1.16k
#endif
7969
1.16k
        }
7970
7971
      /* If tempcode is equal to code (which points to the end of the repeated
7972
      item), it means we have skipped an EXACT item but there is no following
7973
      QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
7974
      all other cases, tempcode will be pointing to the repeat opcode, and will
7975
      be less than code, so the value of len will be greater than 0. */
7976
7977
1.16k
      len = (int)(code - tempcode);
7978
1.16k
      if (len > 0)
7979
1.16k
        {
7980
1.16k
        unsigned int repcode = *tempcode;
7981
7982
        /* There is a table for possessifying opcodes, all of which are less
7983
        than OP_CALLOUT. A zero entry means there is no possessified version.
7984
        */
7985
7986
1.16k
        if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
7987
1.15k
          *tempcode = opcode_possessify[repcode];
7988
7989
        /* For opcode without a special possessified version, wrap the item in
7990
        ONCE brackets. */
7991
7992
4
        else
7993
4
          {
7994
4
          (void)memmove(tempcode + 1 + LINK_SIZE, tempcode, CU2BYTES(len));
7995
4
          code += 1 + LINK_SIZE;
7996
4
          len += 1 + LINK_SIZE;
7997
4
          tempcode[0] = OP_ONCE;
7998
4
          *code++ = OP_KET;
7999
4
          PUTINC(code, 0, len);
8000
4
          PUT(tempcode, 1, len);
8001
4
          }
8002
1.16k
        }
8003
1.16k
      }
8004
8005
    /* We set the "follows varying string" flag for subsequently encountered
8006
    reqcus if it isn't already set and we have just passed a varying length
8007
    item. */
8008
8009
40.4k
    END_REPEAT:
8010
40.4k
    cb->req_varyopt |= reqvary;
8011
40.4k
    break;
8012
8013
8014
    /* ===================================================================*/
8015
    /* Handle a 32-bit data character with a value greater than META_END. */
8016
8017
0
    case META_BIGVALUE:
8018
0
    pptr++;
8019
0
    goto NORMAL_CHAR;
8020
8021
8022
    /* ===============================================================*/
8023
    /* Handle a back reference by number, which is the meta argument. The
8024
    pattern offsets for back references to group numbers less than 10 are held
8025
    in a special vector, to avoid using more than two parsed pattern elements
8026
    in 64-bit environments. We only need the offset to the first occurrence,
8027
    because if that doesn't fail, subsequent ones will also be OK. */
8028
8029
845
    case META_BACKREF:
8030
845
    if (meta_arg < 10) offset = cb->small_ref_offset[meta_arg];
8031
104
      else GETPLUSOFFSET(offset, pptr);
8032
8033
845
    if (meta_arg > cb->bracount)
8034
71
      {
8035
71
      cb->erroroffset = offset;
8036
71
      *errorcodeptr = ERR15;  /* Non-existent subpattern */
8037
71
      return 0;
8038
71
      }
8039
8040
    /* Come here from named backref handling when the reference is to a
8041
    single group (that is, not to a duplicated name). The back reference
8042
    data will have already been updated. We must disable firstcu if not
8043
    set, to cope with cases like (?=(\w+))\1: which would otherwise set ':'
8044
    later. */
8045
8046
774
    HANDLE_SINGLE_REFERENCE:
8047
774
    if (firstcuflags == REQ_UNSET) zerofirstcuflags = firstcuflags = REQ_NONE;
8048
774
    *code++ = ((options & PCRE2_CASELESS) != 0)? OP_REFI : OP_REF;
8049
774
    PUT2INC(code, 0, meta_arg);
8050
8051
    /* Update the map of back references, and keep the highest one. We
8052
    could do this in parse_regex() for numerical back references, but not
8053
    for named back references, because we don't know the numbers to which
8054
    named back references refer. So we do it all in this function. */
8055
8056
774
    cb->backref_map |= (meta_arg < 32)? (1u << meta_arg) : 1;
8057
774
    if (meta_arg > cb->top_backref) cb->top_backref = meta_arg;
8058
774
    break;
8059
8060
8061
    /* ===============================================================*/
8062
    /* Handle recursion by inserting the number of the called group (which is
8063
    the meta argument) after OP_RECURSE. At the end of compiling the pattern is
8064
    scanned and these numbers are replaced by offsets within the pattern. It is
8065
    done like this to avoid problems with forward references and adjusting
8066
    offsets when groups are duplicated and moved (as discovered in previous
8067
    implementations). Note that a recursion does not have a set first
8068
    character. */
8069
8070
3
    case META_RECURSE:
8071
3
    GETPLUSOFFSET(offset, pptr);
8072
3
    if (meta_arg > cb->bracount)
8073
3
      {
8074
3
      cb->erroroffset = offset;
8075
3
      *errorcodeptr = ERR15;  /* Non-existent subpattern */
8076
3
      return 0;
8077
3
      }
8078
0
    HANDLE_NUMERICAL_RECURSION:
8079
0
    *code = OP_RECURSE;
8080
0
    PUT(code, 1, meta_arg);
8081
0
    code += 1 + LINK_SIZE;
8082
0
    groupsetfirstcu = FALSE;
8083
0
    cb->had_recurse = TRUE;
8084
0
    if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
8085
0
    zerofirstcu = firstcu;
8086
0
    zerofirstcuflags = firstcuflags;
8087
0
    break;
8088
8089
8090
    /* ===============================================================*/
8091
    /* Handle capturing parentheses; the number is the meta argument. */
8092
8093
2.45k
    case META_CAPTURE:
8094
2.45k
    bravalue = OP_CBRA;
8095
2.45k
    skipunits = IMM2_SIZE;
8096
2.45k
    PUT2(code, 1+LINK_SIZE, meta_arg);
8097
2.45k
    cb->lastcapture = meta_arg;
8098
2.45k
    goto GROUP_PROCESS_NOTE_EMPTY;
8099
8100
8101
    /* ===============================================================*/
8102
    /* Handle escape sequence items. For ones like \d, the ESC_values are
8103
    arranged to be the same as the corresponding OP_values in the default case
8104
    when PCRE2_UCP is not set (which is the only case in which they will appear
8105
    here).
8106
8107
    Note: \Q and \E are never seen here, as they were dealt with in
8108
    parse_pattern(). Neither are numerical back references or recursions, which
8109
    were turned into META_BACKREF or META_RECURSE items, respectively. \k and
8110
    \g, when followed by names, are turned into META_BACKREF_BYNAME or
8111
    META_RECURSE_BYNAME. */
8112
8113
23.7k
    case META_ESCAPE:
8114
8115
    /* We can test for escape sequences that consume a character because their
8116
    values lie between ESC_b and ESC_Z; this may have to change if any new ones
8117
    are ever created. For these sequences, we disable the setting of a first
8118
    character if it hasn't already been set. */
8119
8120
23.7k
    if (meta_arg > ESC_b && meta_arg < ESC_Z)
8121
22.8k
      {
8122
22.8k
      matched_char = TRUE;
8123
22.8k
      if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
8124
22.8k
      }
8125
8126
    /* Set values to reset to if this is followed by a zero repeat. */
8127
8128
23.7k
    zerofirstcu = firstcu;
8129
23.7k
    zerofirstcuflags = firstcuflags;
8130
23.7k
    zeroreqcu = reqcu;
8131
23.7k
    zeroreqcuflags = reqcuflags;
8132
8133
    /* If Unicode is not supported, \P and \p are not allowed and are
8134
    faulted at parse time, so will never appear here. */
8135
8136
23.7k
#ifdef SUPPORT_UNICODE
8137
23.7k
    if (meta_arg == ESC_P || meta_arg == ESC_p)
8138
1.00k
      {
8139
1.00k
      uint32_t ptype = *(++pptr) >> 16;
8140
1.00k
      uint32_t pdata = *pptr & 0xffff;
8141
8142
      /* The special case of \p{Any} is compiled to OP_ALLANY so as to benefit
8143
      from the auto-anchoring code. */
8144
8145
1.00k
      if (meta_arg == ESC_p && ptype == PT_ANY)
8146
0
        {
8147
0
        *code++ = OP_ALLANY;
8148
0
        }
8149
1.00k
      else
8150
1.00k
        {
8151
1.00k
        *code++ = (meta_arg == ESC_p)? OP_PROP : OP_NOTPROP;
8152
1.00k
        *code++ = ptype;
8153
1.00k
        *code++ = pdata;
8154
1.00k
        }
8155
1.00k
      break;  /* End META_ESCAPE */
8156
1.00k
      }
8157
22.7k
#endif
8158
8159
    /* \K is forbidden in lookarounds since 10.38 because that's what Perl has
8160
    done. However, there's an option, in case anyone was relying on it. */
8161
8162
22.7k
    if (cb->assert_depth > 0 && meta_arg == ESC_K &&
8163
0
        (xoptions & PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK) == 0)
8164
0
      {
8165
0
      *errorcodeptr = ERR99;
8166
0
      return 0;
8167
0
      }
8168
8169
    /* For the rest (including \X when Unicode is supported - if not it's
8170
    faulted at parse time), the OP value is the escape value when PCRE2_UCP is
8171
    not set; if it is set, most of them do not show up here because they are
8172
    converted into Unicode property tests in parse_regex().
8173
8174
    In non-UTF mode, and for both 32-bit modes, we turn \C into OP_ALLANY
8175
    instead of OP_ANYBYTE so that it works in DFA mode and in lookbehinds.
8176
    There are special UCP codes for \B and \b which are used in UCP mode unless
8177
    "word" matching is being forced to ASCII.
8178
8179
    Note that \b and \B do a one-character lookbehind, and \A also behaves as
8180
    if it does. */
8181
8182
22.7k
    switch(meta_arg)
8183
22.7k
      {
8184
213
      case ESC_C:
8185
213
      cb->external_flags |= PCRE2_HASBKC;  /* Record */
8186
#if PCRE2_CODE_UNIT_WIDTH == 32
8187
      meta_arg = OP_ALLANY;
8188
#else
8189
213
      if (!utf) meta_arg = OP_ALLANY;
8190
213
#endif
8191
213
      break;
8192
8193
250
      case ESC_B:
8194
814
      case ESC_b:
8195
814
      if ((options & PCRE2_UCP) != 0 && (xoptions & PCRE2_EXTRA_ASCII_BSW) == 0)
8196
154
        meta_arg = (meta_arg == ESC_B)? OP_NOT_UCP_WORD_BOUNDARY :
8197
154
          OP_UCP_WORD_BOUNDARY;
8198
      /* Fall through */
8199
8200
832
      case ESC_A:
8201
832
      if (cb->max_lookbehind == 0) cb->max_lookbehind = 1;
8202
832
      break;
8203
22.7k
      }
8204
8205
22.7k
    *code++ = meta_arg;
8206
22.7k
    break;  /* End META_ESCAPE */
8207
8208
8209
    /* ===================================================================*/
8210
    /* Handle an unrecognized meta value. A parsed pattern value less than
8211
    META_END is a literal. Otherwise we have a problem. */
8212
8213
330k
    default:
8214
330k
    if (meta >= META_END)
8215
0
      {
8216
#ifdef DEBUG_SHOW_PARSED
8217
      fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x\n", *pptr);
8218
#endif
8219
0
      *errorcodeptr = ERR89;  /* Internal error - unrecognized. */
8220
0
      return 0;
8221
0
      }
8222
8223
    /* Handle a literal character. We come here by goto in the case of a
8224
    32-bit, non-UTF character whose value is greater than META_END. */
8225
8226
330k
    NORMAL_CHAR:
8227
330k
    meta = *pptr;     /* Get the full 32 bits */
8228
330k
    NORMAL_CHAR_SET:  /* Character is already in meta */
8229
330k
    matched_char = TRUE;
8230
8231
    /* For caseless UTF or UCP mode, check whether this character has more than
8232
    one other case. If so, generate a special OP_PROP item instead of OP_CHARI.
8233
    When casing restrictions apply, ignore caseless sets that start with an
8234
    ASCII character. */
8235
8236
330k
#ifdef SUPPORT_UNICODE
8237
330k
    if ((utf||ucp) && (options & PCRE2_CASELESS) != 0)
8238
22.2k
      {
8239
22.2k
      uint32_t caseset = UCD_CASESET(meta);
8240
22.2k
      if (caseset != 0 &&
8241
1.06k
           ((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) == 0 ||
8242
0
           PRIV(ucd_caseless_sets)[caseset] > 127))
8243
1.06k
        {
8244
1.06k
        *code++ = OP_PROP;
8245
1.06k
        *code++ = PT_CLIST;
8246
1.06k
        *code++ = caseset;
8247
1.06k
        if (firstcuflags == REQ_UNSET)
8248
20
          firstcuflags = zerofirstcuflags = REQ_NONE;
8249
1.06k
        break;  /* End handling this meta item */
8250
1.06k
        }
8251
22.2k
      }
8252
329k
#endif
8253
8254
    /* Caseful matches, or caseless and not one of the multicase characters. We
8255
    come here by goto in the case of a positive class that contains only
8256
    case-partners of a character with just two cases; matched_char has already
8257
    been set TRUE and options fudged if necessary. */
8258
8259
329k
    CLASS_CASELESS_CHAR:
8260
8261
    /* Get the character's code units into mcbuffer, with the length in
8262
    mclength. When not in UTF mode, the length is always 1. */
8263
8264
329k
#ifdef SUPPORT_UNICODE
8265
329k
    if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
8266
300k
#endif
8267
300k
      {
8268
300k
      mclength = 1;
8269
300k
      mcbuffer[0] = meta;
8270
300k
      }
8271
8272
    /* Generate the appropriate code */
8273
8274
329k
    *code++ = ((options & PCRE2_CASELESS) != 0)? OP_CHARI : OP_CHAR;
8275
329k
    memcpy(code, mcbuffer, CU2BYTES(mclength));
8276
329k
    code += mclength;
8277
8278
    /* Remember if \r or \n were seen */
8279
8280
329k
    if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
8281
7.01k
      cb->external_flags |= PCRE2_HASCRORLF;
8282
8283
    /* Set the first and required code units appropriately. If no previous
8284
    first code unit, set it from this character, but revert to none on a zero
8285
    repeat. Otherwise, leave the firstcu value alone, and don't change it on
8286
    a zero repeat. */
8287
8288
329k
    if (firstcuflags == REQ_UNSET)
8289
8.08k
      {
8290
8.08k
      zerofirstcuflags = REQ_NONE;
8291
8.08k
      zeroreqcu = reqcu;
8292
8.08k
      zeroreqcuflags = reqcuflags;
8293
8294
      /* If the character is more than one code unit long, we can set a single
8295
      firstcu only if it is not to be matched caselessly. Multiple possible
8296
      starting code units may be picked up later in the studying code. */
8297
8298
8.08k
      if (mclength == 1 || req_caseopt == 0)
8299
8.04k
        {
8300
8.04k
        firstcu = mcbuffer[0];
8301
8.04k
        firstcuflags = req_caseopt;
8302
8.04k
        if (mclength != 1)
8303
0
          {
8304
0
          reqcu = code[-1];
8305
0
          reqcuflags = cb->req_varyopt;
8306
0
          }
8307
8.04k
        }
8308
36
      else firstcuflags = reqcuflags = REQ_NONE;
8309
8.08k
      }
8310
8311
    /* firstcu was previously set; we can set reqcu only if the length is
8312
    1 or the matching is caseful. */
8313
8314
321k
    else
8315
321k
      {
8316
321k
      zerofirstcu = firstcu;
8317
321k
      zerofirstcuflags = firstcuflags;
8318
321k
      zeroreqcu = reqcu;
8319
321k
      zeroreqcuflags = reqcuflags;
8320
321k
      if (mclength == 1 || req_caseopt == 0)
8321
320k
        {
8322
320k
        reqcu = code[-1];
8323
320k
        reqcuflags = req_caseopt | cb->req_varyopt;
8324
320k
        }
8325
321k
      }
8326
8327
    /* If caselessness was temporarily instated, reset it. */
8328
8329
329k
    if (reset_caseful)
8330
0
      {
8331
0
      options &= ~PCRE2_CASELESS;
8332
0
      req_caseopt = 0;
8333
0
      reset_caseful = FALSE;
8334
0
      }
8335
8336
329k
    break;    /* End literal character handling */
8337
436k
    }         /* End of big switch */
8338
436k
  }           /* End of big loop */
8339
8340
/* Control never reaches here. */
8341
18.5k
}
8342
8343
8344
8345
/*************************************************
8346
*   Compile regex: a sequence of alternatives    *
8347
*************************************************/
8348
8349
/* On entry, pptr is pointing past the bracket meta, but on return it points to
8350
the closing bracket or META_END. The code variable is pointing at the code unit
8351
into which the BRA operator has been stored. This function is used during the
8352
pre-compile phase when we are trying to find out the amount of memory needed,
8353
as well as during the real compile phase. The value of lengthptr distinguishes
8354
the two phases.
8355
8356
Arguments:
8357
  options           option bits, including any changes for this subpattern
8358
  xoptions          extra option bits, ditto
8359
  codeptr           -> the address of the current code pointer
8360
  pptrptr           -> the address of the current parsed pattern pointer
8361
  errorcodeptr      -> pointer to error code variable
8362
  skipunits         skip this many code units at start (for brackets and OP_COND)
8363
  firstcuptr        place to put the first required code unit
8364
  firstcuflagsptr   place to put the first code unit flags
8365
  reqcuptr          place to put the last required code unit
8366
  reqcuflagsptr     place to put the last required code unit flags
8367
  bcptr             pointer to the chain of currently open branches
  open_caps         pointer to the chain of currently open capturing items
8368
  cb                points to the data block with tables pointers etc.
8369
  lengthptr         NULL during the real compile phase
8370
                    points to length accumulator during pre-compile phase
8371
8372
Returns:            0 There has been an error
8373
                   +1 Success, this group must match at least one character
8374
                   -1 Success, this group may match an empty string
8375
*/
8376
8377
static int
8378
compile_regex(uint32_t options, uint32_t xoptions, PCRE2_UCHAR **codeptr,
8379
  uint32_t **pptrptr, int *errorcodeptr, uint32_t skipunits,
8380
  uint32_t *firstcuptr, uint32_t *firstcuflagsptr, uint32_t *reqcuptr,
8381
  uint32_t *reqcuflagsptr, branch_chain *bcptr, open_capitem *open_caps,
8382
  compile_block *cb, PCRE2_SIZE *lengthptr)
8383
4.47k
{
/* open_caps is the head of the chain of open capturing items. It is extended
with a local item below when this group is OP_CBRA, and is handed on to
compile_branch() for every alternative. */
8384
4.47k
PCRE2_UCHAR *code = *codeptr;
8385
4.47k
/* Start of the most recently begun branch: the opening bracket at first, and
then each OP_ALT that is inserted during the real compile phase. */
PCRE2_UCHAR *last_branch = code;
8386
4.47k
PCRE2_UCHAR *start_bracket = code;
8387
4.47k
BOOL lookbehind;
8388
4.47k
open_capitem capitem;
8389
4.47k
int capnumber = 0;
8390
4.47k
/* Value returned on success; becomes -1 as soon as any branch reports that it
can match an empty string. */
int okreturn = 1;
8391
4.47k
uint32_t *pptr = *pptrptr;
8392
4.47k
uint32_t firstcu, reqcu;
8393
4.47k
uint32_t lookbehindlength;
8394
4.47k
uint32_t lookbehindminlength;
8395
4.47k
uint32_t firstcuflags, reqcuflags;
8396
4.47k
uint32_t branchfirstcu, branchreqcu;
8397
4.47k
uint32_t branchfirstcuflags, branchreqcuflags;
8398
4.47k
PCRE2_SIZE length;
8399
4.47k
branch_chain bc;
8400
8401
/* If set, call the external function that checks for stack availability. */
8402
8403
4.47k
if (cb->cx->stack_guard != NULL &&
8404
0
    cb->cx->stack_guard(cb->parens_depth, cb->cx->stack_guard_data))
8405
0
  {
8406
0
  *errorcodeptr= ERR33;
8407
0
  return 0;
8408
0
  }
8409
8410
/* Miscellaneous initialization */
8411
8412
4.47k
bc.outer = bcptr;
8413
4.47k
bc.current_branch = code;
8414
8415
4.47k
firstcu = reqcu = 0;
8416
4.47k
firstcuflags = reqcuflags = REQ_UNSET;
8417
8418
/* Accumulate the length for use in the pre-compile phase. Start with the
8419
length of the BRA and KET and any extra code units that are required at the
8420
beginning. We accumulate in a local variable to save frequent testing of
8421
lengthptr for NULL. We cannot do this by looking at the value of 'code' at the
8422
start and end of each alternative, because compiled items are discarded during
8423
the pre-compile phase so that the workspace is not exceeded. */
8424
8425
4.47k
length = 2 + 2*LINK_SIZE + skipunits;
8426
8427
/* Remember if this is a lookbehind assertion, and if it is, save its length
8428
and skip over the pattern offset. */
8429
8430
4.47k
lookbehind = *code == OP_ASSERTBACK ||
8431
4.47k
             *code == OP_ASSERTBACK_NOT ||
8432
4.47k
             *code == OP_ASSERTBACK_NA;
8433
8434
4.47k
if (lookbehind)
8435
0
  {
8436
0
  lookbehindlength = META_DATA(pptr[-1]);
8437
0
  lookbehindminlength = *pptr;
8438
0
  pptr += SIZEOFFSET;
8439
0
  }
8440
4.47k
else lookbehindlength = lookbehindminlength = 0;
8441
8442
/* If this is a capturing subpattern, add to the chain of open capturing items
8443
so that we can detect them if (*ACCEPT) is encountered. Note that only OP_CBRA
8444
need be tested here; changing this opcode to one of its variants, e.g.
8445
OP_SCBRAPOS, happens later, after the group has been compiled. */
8446
8447
4.47k
if (*code == OP_CBRA)
8448
2.45k
  {
8449
2.45k
  capnumber = GET2(code, 1 + LINK_SIZE);
8450
2.45k
  capitem.number = capnumber;
8451
2.45k
  capitem.next = open_caps;
8452
2.45k
  capitem.assert_depth = cb->assert_depth;
8453
2.45k
  open_caps = &capitem;
8454
2.45k
  }
8455
8456
/* Offset is set zero to mark that this bracket is still open */
8457
8458
4.47k
PUT(code, 1, 0);
8459
4.47k
code += 1 + LINK_SIZE + skipunits;
8460
8461
/* Loop for each alternative branch */
8462
8463
4.47k
for (;;)
8464
18.5k
  {
8465
18.5k
  int branch_return;
8466
8467
  /* Insert OP_REVERSE or OP_VREVERSE if this is a lookbehind assertion. There
8468
  is only a single minimum length for the whole assertion. When the minimum
8469
  length is LOOKBEHIND_MAX it means that all branches are of fixed length,
8470
  though not necessarily the same length. In this case, the original OP_REVERSE
8471
  can be used. It can also be used if a branch in a variable length lookbehind
8472
  has the same maximum and minimum. Otherwise, use OP_VREVERSE, which has both
8473
  maximum and minimum values. */
8474
8475
18.5k
  if (lookbehind && lookbehindlength > 0)
8476
0
    {
8477
0
    if (lookbehindminlength == LOOKBEHIND_MAX ||
8478
0
        lookbehindminlength == lookbehindlength)
8479
0
      {
8480
0
      *code++ = OP_REVERSE;
8481
0
      PUT2INC(code, 0, lookbehindlength);
8482
0
      length += 1 + IMM2_SIZE;
8483
0
      }
8484
0
    else
8485
0
      {
8486
0
      *code++ = OP_VREVERSE;
8487
0
      PUT2INC(code, 0, lookbehindminlength);
8488
0
      PUT2INC(code, 0, lookbehindlength);
8489
0
      length += 1 + 2*IMM2_SIZE;
8490
0
      }
8491
0
    }
8492
8493
  /* Now compile the branch; in the pre-compile phase its length gets added
8494
  into the length. */
8495
8496
18.5k
  if ((branch_return =
8497
18.5k
        compile_branch(&options, &xoptions, &code, &pptr, errorcodeptr,
8498
18.5k
          &branchfirstcu, &branchfirstcuflags, &branchreqcu, &branchreqcuflags,
8499
18.5k
          &bc, open_caps, cb, (lengthptr == NULL)? NULL : &length)) == 0)
8500
80
    return 0;
8501
8502
  /* If a branch can match an empty string, so can the whole group. */
8503
8504
18.4k
  if (branch_return < 0) okreturn = -1;
8505
8506
  /* In the real compile phase, there is some post-processing to be done. */
8507
8508
18.4k
  if (lengthptr == NULL)
8509
8.88k
    {
8510
    /* If this is the first branch, the firstcu and reqcu values for the
8511
    branch become the values for the regex. */
8512
8513
8.88k
    if (*last_branch != OP_ALT)
8514
2.08k
      {
8515
2.08k
      firstcu = branchfirstcu;
8516
2.08k
      firstcuflags = branchfirstcuflags;
8517
2.08k
      reqcu = branchreqcu;
8518
2.08k
      reqcuflags = branchreqcuflags;
8519
2.08k
      }
8520
8521
    /* If this is not the first branch, the first char and reqcu have to
8522
    match the values from all the previous branches, except that if the
8523
    previous value for reqcu didn't have REQ_VARY set, it can still match,
8524
    and we set REQ_VARY for the group from this branch's value. */
8525
8526
6.80k
    else
8527
6.80k
      {
8528
      /* If we previously had a firstcu, but it doesn't match the new branch,
8529
      we have to abandon the firstcu for the regex, but if there was
8530
      previously no reqcu, it takes on the value of the old firstcu. */
8531
8532
6.80k
      if (firstcuflags != branchfirstcuflags || firstcu != branchfirstcu)
8533
4.62k
        {
8534
4.62k
        if (firstcuflags < REQ_NONE)
8535
578
          {
8536
578
          if (reqcuflags >= REQ_NONE)
8537
33
            {
8538
33
            reqcu = firstcu;
8539
33
            reqcuflags = firstcuflags;
8540
33
            }
8541
578
          }
8542
4.62k
        firstcuflags = REQ_NONE;
8543
4.62k
        }
8544
8545
      /* If we (now or from before) have no firstcu, a firstcu from the
8546
      branch becomes a reqcu if there isn't a branch reqcu. */
8547
8548
6.80k
      if (firstcuflags >= REQ_NONE && branchfirstcuflags < REQ_NONE &&
8549
1.94k
          branchreqcuflags >= REQ_NONE)
8550
59
        {
8551
59
        branchreqcu = branchfirstcu;
8552
59
        branchreqcuflags = branchfirstcuflags;
8553
59
        }
8554
8555
      /* Now ensure that the reqcus match */
8556
8557
6.80k
      if (((reqcuflags & ~REQ_VARY) != (branchreqcuflags & ~REQ_VARY)) ||
8558
1.79k
          reqcu != branchreqcu)
8559
5.79k
        reqcuflags = REQ_NONE;
8560
1.00k
      else
8561
1.00k
        {
8562
1.00k
        reqcu = branchreqcu;
8563
1.00k
        reqcuflags |= branchreqcuflags; /* To "or" REQ_VARY if present */
8564
1.00k
        }
8565
6.80k
      }
8566
8.88k
    }
8567
8568
  /* Handle reaching the end of the expression, either ')' or end of pattern.
8569
  In the real compile phase, go back through the alternative branches and
8570
  reverse the chain of offsets, with the field in the BRA item now becoming an
8571
  offset to the first alternative. If there are no alternatives, it points to
8572
  the end of the group. The length in the terminating ket is always the length
8573
  of the whole bracketed item. Return leaving the pointer at the terminating
8574
  char. */
8575
8576
18.4k
  if (META_CODE(*pptr) != META_ALT)
8577
4.39k
    {
8578
4.39k
    if (lengthptr == NULL)
8579
2.08k
      {
8580
2.08k
      PCRE2_SIZE branch_length = code - last_branch;
8581
2.08k
      /* Each link currently holds the distance back to the previous branch.
      Overwrite it with the forward distance and step back. The link of the
      opening bracket was set to zero above, so reading a zero ends the walk. */
      do
8582
8.88k
        {
8583
8.88k
        PCRE2_SIZE prev_length = GET(last_branch, 1);
8584
8.88k
        PUT(last_branch, 1, branch_length);
8585
8.88k
        branch_length = prev_length;
8586
8.88k
        last_branch -= branch_length;
8587
8.88k
        }
8588
8.88k
      while (branch_length > 0);
8589
2.08k
      }
8590
8591
    /* Fill in the ket */
8592
8593
4.39k
    *code = OP_KET;
8594
4.39k
    PUT(code, 1, (int)(code - start_bracket));
8595
4.39k
    code += 1 + LINK_SIZE;
8596
8597
    /* Set values to pass back */
8598
8599
4.39k
    *codeptr = code;
8600
4.39k
    *pptrptr = pptr;
8601
4.39k
    *firstcuptr = firstcu;
8602
4.39k
    *firstcuflagsptr = firstcuflags;
8603
4.39k
    *reqcuptr = reqcu;
8604
4.39k
    *reqcuflagsptr = reqcuflags;
8605
4.39k
    if (lengthptr != NULL)
8606
2.31k
      {
8607
2.31k
      /* Give ERR20 instead of letting the accumulated total pass OFLOW_MAX. */
      if (OFLOW_MAX - *lengthptr < length)
8608
0
        {
8609
0
        *errorcodeptr = ERR20;
8610
0
        return 0;
8611
0
        }
8612
2.31k
      *lengthptr += length;
8613
2.31k
      }
8614
4.39k
    return okreturn;
8615
4.39k
    }
8616
8617
  /* Another branch follows. In the pre-compile phase, we can move the code
8618
  pointer back to where it was for the start of the first branch. (That is,
8619
  pretend that each branch is the only one.)
8620
8621
  In the real compile phase, insert an ALT node. Its length field points back
8622
  to the previous branch while the bracket remains open. At the end the chain
8623
  is reversed. It's done like this so that the start of the bracket has a
8624
  zero offset until it is closed, making it possible to detect recursion. */
8625
8626
14.0k
  if (lengthptr != NULL)
8627
7.27k
    {
8628
7.27k
    code = *codeptr + 1 + LINK_SIZE + skipunits;
8629
7.27k
    length += 1 + LINK_SIZE;
8630
7.27k
    }
8631
6.80k
  else
8632
6.80k
    {
8633
6.80k
    *code = OP_ALT;
8634
6.80k
    PUT(code, 1, (int)(code - last_branch));
8635
6.80k
    bc.current_branch = last_branch = code;
8636
6.80k
    code += 1 + LINK_SIZE;
8637
6.80k
    }
8638
8639
  /* Set the maximum lookbehind length for the next branch (if not in a
8640
  lookbehind the value will be zero) and then advance past the vertical bar. */
8641
8642
14.0k
  lookbehindlength = META_DATA(*pptr);
8643
14.0k
  pptr++;
8644
14.0k
  }
8645
/* Control never reaches here */
8646
4.47k
}
8647
8648
8649
8650
/*************************************************
8651
*          Check for anchored pattern            *
8652
*************************************************/
8653
8654
/* Try to find out if this is an anchored regular expression. Consider each
8655
alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
8656
all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
8657
it's anchored. However, if this is a multiline pattern, then only OP_SOD will
8658
be found, because ^ generates OP_CIRCM in that mode.
8659
8660
We can also consider a regex to be anchored if OP_SOM starts all its branches.
8661
This is the code for \G, which means "match at start of match position, taking
8662
into account the match offset".
8663
8664
A branch is also implicitly anchored if it starts with .* and DOTALL is set,
8665
because that will try the rest of the pattern at all possible matching points,
8666
so there is no point trying again.... er ....
8667
8668
.... except when the .* appears inside capturing parentheses, and there is a
8669
subsequent back reference to those parentheses. We haven't enough information
8670
to catch that case precisely.
8671
8672
At first, the best we could do was to detect when .* was in capturing brackets
8673
and the highest back reference was greater than or equal to that level.
8674
However, by keeping a bitmap of the first 31 back references, we can catch some
8675
of the more common cases more precisely.
8676
8677
... A second exception is when the .* appears inside an atomic group, because
8678
this prevents the number of characters it matches from being adjusted.
8679
8680
Arguments:
8681
  code           points to start of the compiled pattern
8682
  bracket_map    a bitmap of which brackets we are inside while testing; this
8683
                   handles up to substring 31; after that we just have to take
8684
                   the less precise approach
8685
  cb             points to the compile data block
8686
  atomcount      atomic group level
8687
  inassert       TRUE if in an assertion
8688
8689
Returns:     TRUE or FALSE
8690
*/
8691
8692
static BOOL
8693
is_anchored(PCRE2_SPTR code, uint32_t bracket_map, compile_block *cb,
8694
  int atomcount, BOOL inassert)
8695
946
{
8696
976
do {
8697
976
   PCRE2_SPTR scode = first_significant_code(
8698
976
     code + PRIV(OP_lengths)[*code], FALSE);
8699
976
   int op = *scode;
8700
8701
   /* Non-capturing brackets */
8702
8703
976
   if (op == OP_BRA  || op == OP_BRAPOS ||
8704
976
       op == OP_SBRA || op == OP_SBRAPOS)
8705
0
     {
8706
0
     if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
8707
0
       return FALSE;
8708
0
     }
8709
8710
   /* Capturing brackets */
8711
8712
976
   else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8713
969
            op == OP_SCBRA || op == OP_SCBRAPOS)
8714
7
     {
8715
7
     int n = GET2(scode, 1+LINK_SIZE);
8716
7
     uint32_t new_map = bracket_map | ((n < 32)? (1u << n) : 1);
8717
7
     if (!is_anchored(scode, new_map, cb, atomcount, inassert)) return FALSE;
8718
7
     }
8719
8720
   /* Positive forward assertion */
8721
8722
969
   else if (op == OP_ASSERT || op == OP_ASSERT_NA)
8723
0
     {
8724
0
     if (!is_anchored(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
8725
0
     }
8726
8727
   /* Condition. If there is no second branch, it can't be anchored. */
8728
8729
969
   else if (op == OP_COND || op == OP_SCOND)
8730
0
     {
8731
0
     if (scode[GET(scode,1)] != OP_ALT) return FALSE;
8732
0
     if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
8733
0
       return FALSE;
8734
0
     }
8735
8736
   /* Atomic groups */
8737
8738
969
   else if (op == OP_ONCE)
8739
0
     {
8740
0
     if (!is_anchored(scode, bracket_map, cb, atomcount + 1, inassert))
8741
0
       return FALSE;
8742
0
     }
8743
8744
   /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
8745
   it isn't in brackets that are or may be referenced or inside an atomic
8746
   group or an assertion. Also the pattern must not contain *PRUNE or *SKIP,
8747
   because these break the feature. Consider, for example, /(?s).*?(*PRUNE)b/
8748
   with the subject "aab", which matches "b", i.e. not at the start of a line.
8749
   There is also an option that disables auto-anchoring. */
8750
8751
969
   else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
8752
965
             op == OP_TYPEPOSSTAR))
8753
6
     {
8754
6
     if (scode[1] != OP_ALLANY || (bracket_map & cb->backref_map) != 0 ||
8755
0
         atomcount > 0 || cb->had_pruneorskip || inassert ||
8756
0
         (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
8757
6
       return FALSE;
8758
6
     }
8759
8760
   /* Check for explicit anchoring */
8761
8762
963
   else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
8763
8764
39
   code += GET(code, 1);
8765
39
   }
8766
946
while (*code == OP_ALT);   /* Loop for each alternative */
8767
9
return TRUE;
8768
946
}
8769
8770
8771
8772
/*************************************************
8773
*         Check for starting with ^ or .*        *
8774
*************************************************/
8775
8776
/* This is called to find out if every branch starts with ^ or .* so that
8777
"first char" processing can be done to speed things up in multiline
8778
matching and for non-DOTALL patterns that start with .* (which must start at
8779
the beginning or after \n). As in the case of is_anchored() (see above), we
8780
have to take account of back references to capturing brackets that contain .*
8781
because in that case we can't make the assumption. Also, the appearance of .*
8782
inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE
8783
or *SKIP does not count, because once again the assumption no longer holds.
8784
8785
Arguments:
8786
  code           points to start of the compiled pattern or a group
8787
  bracket_map    a bitmap of which brackets we are inside while testing; this
8788
                   handles up to substring 31; after that we just have to take
8789
                   the less precise approach
8790
  cb             points to the compile data
8791
  atomcount      atomic group level
8792
  inassert       TRUE if in an assertion
8793
8794
Returns:         TRUE or FALSE
8795
*/
8796
8797
static BOOL
8798
is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
8799
  int atomcount, BOOL inassert)
8800
848
{
8801
876
do {
8802
876
   PCRE2_SPTR scode = first_significant_code(
8803
876
     code + PRIV(OP_lengths)[*code], FALSE);
8804
876
   int op = *scode;
8805
8806
   /* If we are at the start of a conditional assertion group, *both* the
8807
   conditional assertion *and* what follows the condition must satisfy the test
8808
   for start of line. Other kinds of condition fail. Note that there may be an
8809
   auto-callout at the start of a condition. */
8810
8811
876
   if (op == OP_COND)
8812
0
     {
8813
0
     scode += 1 + LINK_SIZE;
8814
8815
0
     if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
8816
0
       else if (*scode == OP_CALLOUT_STR) scode += GET(scode, 1 + 2*LINK_SIZE);
8817
8818
0
     switch (*scode)
8819
0
       {
8820
0
       case OP_CREF:
8821
0
       case OP_DNCREF:
8822
0
       case OP_RREF:
8823
0
       case OP_DNRREF:
8824
0
       case OP_FAIL:
8825
0
       case OP_FALSE:
8826
0
       case OP_TRUE:
8827
0
       return FALSE;
8828
8829
0
       default:     /* Assertion */
8830
0
       if (!is_startline(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
8831
0
       do scode += GET(scode, 1); while (*scode == OP_ALT);
8832
0
       scode += 1 + LINK_SIZE;
8833
0
       break;
8834
0
       }
8835
0
     scode = first_significant_code(scode, FALSE);
8836
0
     op = *scode;
8837
0
     }
8838
8839
   /* Non-capturing brackets */
8840
8841
876
   if (op == OP_BRA  || op == OP_BRAPOS ||
8842
876
       op == OP_SBRA || op == OP_SBRAPOS)
8843
0
     {
8844
0
     if (!is_startline(scode, bracket_map, cb, atomcount, inassert))
8845
0
       return FALSE;
8846
0
     }
8847
8848
   /* Capturing brackets */
8849
8850
876
   else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8851
875
            op == OP_SCBRA || op == OP_SCBRAPOS)
8852
1
     {
8853
1
     int n = GET2(scode, 1+LINK_SIZE);
8854
1
     unsigned int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
8855
1
     if (!is_startline(scode, new_map, cb, atomcount, inassert)) return FALSE;
8856
1
     }
8857
8858
   /* Positive forward assertions */
8859
8860
875
   else if (op == OP_ASSERT || op == OP_ASSERT_NA)
8861
0
     {
8862
0
     if (!is_startline(scode, bracket_map, cb, atomcount, TRUE))
8863
0
       return FALSE;
8864
0
     }
8865
8866
   /* Atomic brackets */
8867
8868
875
   else if (op == OP_ONCE)
8869
0
     {
8870
0
     if (!is_startline(scode, bracket_map, cb, atomcount + 1, inassert))
8871
0
       return FALSE;
8872
0
     }
8873
8874
   /* .* means "start at start or after \n" if it isn't in atomic brackets or
8875
   brackets that may be referenced or an assertion, and as long as the pattern
8876
   does not contain *PRUNE or *SKIP, because these break the feature. Consider,
8877
   for example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab",
8878
   i.e. not at the start of a line. There is also an option that disables this
8879
   optimization. */
8880
8881
875
   else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
8882
6
     {
8883
6
     if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 ||
8884
0
         atomcount > 0 || cb->had_pruneorskip || inassert ||
8885
0
         (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
8886
6
       return FALSE;
8887
6
     }
8888
8889
   /* Check for explicit circumflex; anything else gives a FALSE result. Note
8890
   in particular that this includes atomic brackets OP_ONCE because the number
8891
   of characters matched by .* cannot be adjusted inside them. */
8892
8893
869
   else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
8894
8895
   /* Move on to the next alternative */
8896
8897
29
   code += GET(code, 1);
8898
29
   }
8899
848
while (*code == OP_ALT);  /* Loop for each alternative */
8900
1
return TRUE;
8901
848
}
8902
8903
8904
8905
/*************************************************
8906
*   Scan compiled regex for recursion reference  *
8907
*************************************************/
8908
8909
/* This function scans through a compiled pattern until it finds an instance of
8910
OP_RECURSE.
8911
8912
Arguments:
8913
  code        points to start of expression
8914
  utf         TRUE in UTF mode
8915
8916
Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
8917
*/
8918
8919
static PCRE2_SPTR
8920
find_recurse(PCRE2_SPTR code, BOOL utf)
8921
0
{
8922
0
for (;;)
8923
0
  {
8924
0
  PCRE2_UCHAR c = *code;
8925
0
  if (c == OP_END) return NULL;
8926
0
  if (c == OP_RECURSE) return code;
8927
8928
  /* XCLASS is used for classes that cannot be represented just by a bit map.
8929
  This includes negated single high-valued characters. CALLOUT_STR is used for
8930
  callouts with string arguments. In both cases the length in the table is
8931
  zero; the actual length is stored in the compiled code. */
8932
8933
0
  if (c == OP_XCLASS) code += GET(code, 1);
8934
0
    else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE);
8935
8936
  /* Otherwise, we can get the item's length from the table, except that for
8937
  repeated character types, we have to test for \p and \P, which have an extra
8938
  two code units of parameters, and for MARK/PRUNE/SKIP/THEN with an argument,
8939
  we must add in its length. */
8940
8941
0
  else
8942
0
    {
8943
0
    switch(c)
8944
0
      {
8945
0
      case OP_TYPESTAR:
8946
0
      case OP_TYPEMINSTAR:
8947
0
      case OP_TYPEPLUS:
8948
0
      case OP_TYPEMINPLUS:
8949
0
      case OP_TYPEQUERY:
8950
0
      case OP_TYPEMINQUERY:
8951
0
      case OP_TYPEPOSSTAR:
8952
0
      case OP_TYPEPOSPLUS:
8953
0
      case OP_TYPEPOSQUERY:
8954
0
      if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
8955
0
      break;
8956
8957
0
      case OP_TYPEPOSUPTO:
8958
0
      case OP_TYPEUPTO:
8959
0
      case OP_TYPEMINUPTO:
8960
0
      case OP_TYPEEXACT:
8961
0
      if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
8962
0
        code += 2;
8963
0
      break;
8964
8965
0
      case OP_MARK:
8966
0
      case OP_COMMIT_ARG:
8967
0
      case OP_PRUNE_ARG:
8968
0
      case OP_SKIP_ARG:
8969
0
      case OP_THEN_ARG:
8970
0
      code += code[1];
8971
0
      break;
8972
0
      }
8973
8974
    /* Add in the fixed length from the table */
8975
8976
0
    code += PRIV(OP_lengths)[c];
8977
8978
    /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may
8979
    be followed by a multi-unit character. The length in the table is a
8980
    minimum, so we have to arrange to skip the extra units. */
8981
8982
0
#ifdef MAYBE_UTF_MULTI
8983
0
    if (utf) switch(c)
8984
0
      {
8985
0
      case OP_CHAR:
8986
0
      case OP_CHARI:
8987
0
      case OP_NOT:
8988
0
      case OP_NOTI:
8989
0
      case OP_EXACT:
8990
0
      case OP_EXACTI:
8991
0
      case OP_NOTEXACT:
8992
0
      case OP_NOTEXACTI:
8993
0
      case OP_UPTO:
8994
0
      case OP_UPTOI:
8995
0
      case OP_NOTUPTO:
8996
0
      case OP_NOTUPTOI:
8997
0
      case OP_MINUPTO:
8998
0
      case OP_MINUPTOI:
8999
0
      case OP_NOTMINUPTO:
9000
0
      case OP_NOTMINUPTOI:
9001
0
      case OP_POSUPTO:
9002
0
      case OP_POSUPTOI:
9003
0
      case OP_NOTPOSUPTO:
9004
0
      case OP_NOTPOSUPTOI:
9005
0
      case OP_STAR:
9006
0
      case OP_STARI:
9007
0
      case OP_NOTSTAR:
9008
0
      case OP_NOTSTARI:
9009
0
      case OP_MINSTAR:
9010
0
      case OP_MINSTARI:
9011
0
      case OP_NOTMINSTAR:
9012
0
      case OP_NOTMINSTARI:
9013
0
      case OP_POSSTAR:
9014
0
      case OP_POSSTARI:
9015
0
      case OP_NOTPOSSTAR:
9016
0
      case OP_NOTPOSSTARI:
9017
0
      case OP_PLUS:
9018
0
      case OP_PLUSI:
9019
0
      case OP_NOTPLUS:
9020
0
      case OP_NOTPLUSI:
9021
0
      case OP_MINPLUS:
9022
0
      case OP_MINPLUSI:
9023
0
      case OP_NOTMINPLUS:
9024
0
      case OP_NOTMINPLUSI:
9025
0
      case OP_POSPLUS:
9026
0
      case OP_POSPLUSI:
9027
0
      case OP_NOTPOSPLUS:
9028
0
      case OP_NOTPOSPLUSI:
9029
0
      case OP_QUERY:
9030
0
      case OP_QUERYI:
9031
0
      case OP_NOTQUERY:
9032
0
      case OP_NOTQUERYI:
9033
0
      case OP_MINQUERY:
9034
0
      case OP_MINQUERYI:
9035
0
      case OP_NOTMINQUERY:
9036
0
      case OP_NOTMINQUERYI:
9037
0
      case OP_POSQUERY:
9038
0
      case OP_POSQUERYI:
9039
0
      case OP_NOTPOSQUERY:
9040
0
      case OP_NOTPOSQUERYI:
9041
0
      if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
9042
0
      break;
9043
0
      }
9044
#else
9045
    (void)(utf);  /* Keep compiler happy by referencing function argument */
9046
#endif  /* MAYBE_UTF_MULTI */
9047
0
    }
9048
0
  }
9049
0
}
9050
9051
9052
9053
/*************************************************
9054
*    Check for asserted fixed first code unit    *
9055
*************************************************/
9056
9057
/* During compilation, the "first code unit" settings from forward assertions
9058
are discarded, because they can cause conflicts with actual literals that
9059
follow. However, if we end up without a first code unit setting for an
9060
unanchored pattern, it is worth scanning the regex to see if there is an
9061
initial asserted first code unit. If all branches start with the same asserted
9062
code unit, or with a non-conditional bracket all of whose alternatives start
9063
with the same asserted code unit (recurse ad lib), then we return that code
9064
unit, with the flags set to zero or REQ_CASELESS; otherwise return zero with
9065
REQ_NONE in the flags.
9066
9067
Arguments:
9068
  code       points to start of compiled pattern
9069
  flags      points to the first code unit flags
9070
  inassert   non-zero if in an assertion
9071
9072
Returns:     the fixed first code unit, or 0 with REQ_NONE in flags
9073
*/
9074
9075
static uint32_t
9076
find_firstassertedcu(PCRE2_SPTR code, uint32_t *flags, uint32_t inassert)
9077
873
{
9078
873
uint32_t c = 0;
9079
873
uint32_t cflags = REQ_NONE;
9080
9081
873
*flags = REQ_NONE;
9082
873
do {
9083
873
   uint32_t d;
9084
873
   uint32_t dflags;
9085
873
   int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
9086
872
             *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
9087
873
   PCRE2_SPTR scode = first_significant_code(code + 1+LINK_SIZE + xl, TRUE);
9088
873
   PCRE2_UCHAR op = *scode;
9089
9090
873
   switch(op)
9091
873
     {
9092
412
     default:
9093
412
     return 0;
9094
9095
0
     case OP_BRA:
9096
0
     case OP_BRAPOS:
9097
1
     case OP_CBRA:
9098
1
     case OP_SCBRA:
9099
1
     case OP_CBRAPOS:
9100
1
     case OP_SCBRAPOS:
9101
1
     case OP_ASSERT:
9102
1
     case OP_ASSERT_NA:
9103
1
     case OP_ONCE:
9104
1
     case OP_SCRIPT_RUN:
9105
1
     d = find_firstassertedcu(scode, &dflags, inassert +
9106
1
       ((op == OP_ASSERT || op == OP_ASSERT_NA)?1:0));
9107
1
     if (dflags >= REQ_NONE) return 0;
9108
0
     if (cflags >= REQ_NONE) { c = d; cflags = dflags; }
9109
0
       else if (c != d || cflags != dflags) return 0;
9110
0
     break;
9111
9112
0
     case OP_EXACT:
9113
0
     scode += IMM2_SIZE;
9114
     /* Fall through */
9115
9116
206
     case OP_CHAR:
9117
206
     case OP_PLUS:
9118
206
     case OP_MINPLUS:
9119
207
     case OP_POSPLUS:
9120
207
     if (inassert == 0) return 0;
9121
0
     if (cflags >= REQ_NONE) { c = scode[1]; cflags = 0; }
9122
0
       else if (c != scode[1]) return 0;
9123
0
     break;
9124
9125
0
     case OP_EXACTI:
9126
0
     scode += IMM2_SIZE;
9127
     /* Fall through */
9128
9129
253
     case OP_CHARI:
9130
253
     case OP_PLUSI:
9131
253
     case OP_MINPLUSI:
9132
253
     case OP_POSPLUSI:
9133
253
     if (inassert == 0) return 0;
9134
9135
     /* If the character is more than one code unit long, we cannot set its
9136
     first code unit when matching caselessly. Later scanning may pick up
9137
     multiple code units. */
9138
9139
0
#ifdef SUPPORT_UNICODE
9140
0
#if PCRE2_CODE_UNIT_WIDTH == 8
9141
0
     if (scode[1] >= 0x80) return 0;
9142
#elif PCRE2_CODE_UNIT_WIDTH == 16
9143
     if (scode[1] >= 0xd800 && scode[1] <= 0xdfff) return 0;
9144
#endif
9145
0
#endif
9146
9147
0
     if (cflags >= REQ_NONE) { c = scode[1]; cflags = REQ_CASELESS; }
9148
0
       else if (c != scode[1]) return 0;
9149
0
     break;
9150
873
     }
9151
9152
0
   code += GET(code, 1);
9153
0
   }
9154
873
while (*code == OP_ALT);
9155
9156
0
*flags = cflags;
9157
0
return c;
9158
873
}
9159
9160
9161
9162
/*************************************************
9163
*     Add an entry to the name/number table      *
9164
*************************************************/
9165
9166
/* This function is called between compiling passes to add an entry to the
9167
name/number table, maintaining alphabetical order. Checking for permitted
9168
and forbidden duplicates has already been done.
9169
9170
Arguments:
9171
  cb           the compile data block
9172
  name         the name to add
9173
  length       the length of the name
9174
  groupno      the group number
9175
  tablecount   the count of names in the table so far
9176
9177
Returns:       nothing
9178
*/
9179
9180
static void
9181
add_name_to_table(compile_block *cb, PCRE2_SPTR name, int length,
9182
  unsigned int groupno, uint32_t tablecount)
9183
0
{
9184
0
uint32_t i;
9185
0
PCRE2_UCHAR *slot = cb->name_table;
9186
9187
0
for (i = 0; i < tablecount; i++)
9188
0
  {
9189
0
  int crc = memcmp(name, slot+IMM2_SIZE, CU2BYTES(length));
9190
0
  if (crc == 0 && slot[IMM2_SIZE+length] != 0)
9191
0
    crc = -1; /* Current name is a substring */
9192
9193
  /* Make space in the table and break the loop for an earlier name. For a
9194
  duplicate or later name, carry on. We do this for duplicates so that in the
9195
  simple case (when ?(| is not used) they are in order of their numbers. In all
9196
  cases they are in the order in which they appear in the pattern. */
9197
9198
0
  if (crc < 0)
9199
0
    {
9200
0
    (void)memmove(slot + cb->name_entry_size, slot,
9201
0
      CU2BYTES((tablecount - i) * cb->name_entry_size));
9202
0
    break;
9203
0
    }
9204
9205
  /* Continue the loop for a later or duplicate name */
9206
9207
0
  slot += cb->name_entry_size;
9208
0
  }
9209
9210
0
PUT2(slot, 0, groupno);
9211
0
memcpy(slot + IMM2_SIZE, name, CU2BYTES(length));
9212
9213
/* Add a terminating zero and fill the rest of the slot with zeroes so that
9214
the memory is all initialized. Otherwise valgrind moans about uninitialized
9215
memory when saving serialized compiled patterns. */
9216
9217
0
memset(slot + IMM2_SIZE + length, 0,
9218
0
  CU2BYTES(cb->name_entry_size - length - IMM2_SIZE));
9219
0
}
9220
9221
9222
9223
/*************************************************
9224
*             Skip in parsed pattern             *
9225
*************************************************/
9226
9227
/* This function is called to skip parts of the parsed pattern when finding the
9228
length of a lookbehind branch. It is called after (*ACCEPT) and (*FAIL) to find
9229
the end of the branch, it is called to skip over an internal lookaround or
9230
(DEFINE) group, and it is also called to skip to the end of a class, during
9231
which it will never encounter nested groups (but there's no need to have
9232
special code for that).
9233
9234
When called to find the end of a branch or group, pptr must point to the first
9235
meta code inside the branch, not the branch-starting code. In other cases it
9236
can point to the item that causes the function to be called.
9237
9238
Arguments:
9239
  pptr       current pointer to skip from
9240
  skiptype   PSKIP_CLASS when skipping to end of class
9241
             PSKIP_ALT when META_ALT ends the skip
9242
             PSKIP_KET when only META_KET ends the skip
9243
9244
Returns:     new value of pptr
9245
             NULL if META_END is reached - should never occur
9246
               or for an unknown meta value - likewise
9247
*/
9248
9249
static uint32_t *
9250
parsed_skip(uint32_t *pptr, uint32_t skiptype)
9251
0
{
9252
0
uint32_t nestlevel = 0;
9253
9254
0
for (;; pptr++)
9255
0
  {
9256
0
  uint32_t meta = META_CODE(*pptr);
9257
9258
0
  switch(meta)
9259
0
    {
9260
0
    default:  /* Just skip over most items */
9261
0
    if (meta < META_END) continue;  /* Literal */
9262
0
    break;
9263
9264
    /* This should never occur. */
9265
9266
0
    case META_END:
9267
0
    return NULL;
9268
9269
    /* The data for these items is variable in length. */
9270
9271
0
    case META_BACKREF:  /* Offset is present only if group >= 10 */
9272
0
    if (META_DATA(*pptr) >= 10) pptr += SIZEOFFSET;
9273
0
    break;
9274
9275
0
    case META_ESCAPE:   /* A few escapes are followed by data items. */
9276
0
    switch (META_DATA(*pptr))
9277
0
      {
9278
0
      case ESC_P:
9279
0
      case ESC_p:
9280
0
      pptr += 1;
9281
0
      break;
9282
9283
0
      case ESC_g:
9284
0
      case ESC_k:
9285
0
      pptr += 1 + SIZEOFFSET;
9286
0
      break;
9287
0
      }
9288
0
    break;
9289
9290
0
    case META_MARK:     /* Add the length of the name. */
9291
0
    case META_COMMIT_ARG:
9292
0
    case META_PRUNE_ARG:
9293
0
    case META_SKIP_ARG:
9294
0
    case META_THEN_ARG:
9295
0
    pptr += pptr[1];
9296
0
    break;
9297
9298
    /* These are the "active" items in this loop. */
9299
9300
0
    case META_CLASS_END:
9301
0
    if (skiptype == PSKIP_CLASS) return pptr;
9302
0
    break;
9303
9304
0
    case META_ATOMIC:
9305
0
    case META_CAPTURE:
9306
0
    case META_COND_ASSERT:
9307
0
    case META_COND_DEFINE:
9308
0
    case META_COND_NAME:
9309
0
    case META_COND_NUMBER:
9310
0
    case META_COND_RNAME:
9311
0
    case META_COND_RNUMBER:
9312
0
    case META_COND_VERSION:
9313
0
    case META_LOOKAHEAD:
9314
0
    case META_LOOKAHEADNOT:
9315
0
    case META_LOOKAHEAD_NA:
9316
0
    case META_LOOKBEHIND:
9317
0
    case META_LOOKBEHINDNOT:
9318
0
    case META_LOOKBEHIND_NA:
9319
0
    case META_NOCAPTURE:
9320
0
    case META_SCRIPT_RUN:
9321
0
    nestlevel++;
9322
0
    break;
9323
9324
0
    case META_ALT:
9325
0
    if (nestlevel == 0 && skiptype == PSKIP_ALT) return pptr;
9326
0
    break;
9327
9328
0
    case META_KET:
9329
0
    if (nestlevel == 0) return pptr;
9330
0
    nestlevel--;
9331
0
    break;
9332
0
    }
9333
9334
  /* The extra data item length for each meta is in a table. */
9335
9336
0
  meta = (meta >> 16) & 0x7fff;
9337
0
  if (meta >= sizeof(meta_extra_lengths)) return NULL;
9338
0
  pptr += meta_extra_lengths[meta];
9339
0
  }
9340
/* Control never reaches here */
9341
0
return pptr;
9342
0
}
9343
9344
9345
9346
/*************************************************
9347
*       Find length of a parsed group            *
9348
*************************************************/
9349
9350
/* This is called for nested groups within a branch of a lookbehind whose
9351
length is being computed. On entry, the pointer must be at the first element
9352
after the group initializing code. On exit it points to OP_KET. Caching is used
9353
to improve processing speed when the same capturing group occurs many times.
9354
9355
Arguments:
9356
  pptrptr     pointer to pointer in the parsed pattern
9357
  minptr      where to return the minimum length
9358
  isinline    FALSE if a reference or recursion; TRUE for inline group
9359
  errcodeptr  pointer to the errorcode
9360
  lcptr       pointer to the loop counter
9361
  group       number of captured group or -1 for a non-capturing group
9362
  recurses    chain of recurse_check to catch mutual recursion
9363
  cb          pointer to the compile data
9364
9365
Returns:      the maximum group length or a negative number
9366
*/
9367
9368
static int
9369
get_grouplength(uint32_t **pptrptr, int *minptr, BOOL isinline, int *errcodeptr,
9370
  int *lcptr, int group, parsed_recurse_check *recurses, compile_block *cb)
9371
0
{
9372
0
uint32_t *gi = cb->groupinfo + 2 * group;
9373
0
int branchlength, branchminlength;
9374
0
int grouplength = -1;
9375
0
int groupminlength = INT_MAX;
9376
9377
/* The cache can be used only if there is no possibility of there being two
9378
groups with the same number. We do not need to set the end pointer for a group
9379
that is being processed as a back reference or recursion, but we must do so for
9380
an inline group. */
9381
9382
0
if (group > 0 && (cb->external_flags & PCRE2_DUPCAPUSED) == 0)
9383
0
  {
9384
0
  uint32_t groupinfo = gi[0];
9385
0
  if ((groupinfo & GI_NOT_FIXED_LENGTH) != 0) return -1;
9386
0
  if ((groupinfo & GI_SET_FIXED_LENGTH) != 0)
9387
0
    {
9388
0
    if (isinline) *pptrptr = parsed_skip(*pptrptr, PSKIP_KET);
9389
0
    *minptr = gi[1];
9390
0
    return groupinfo & GI_FIXED_LENGTH_MASK;
9391
0
    }
9392
0
  }
9393
9394
/* Scan the group. In this case we find the end pointer of necessity. */
9395
9396
0
for(;;)
9397
0
  {
9398
0
  branchlength = get_branchlength(pptrptr, &branchminlength, errcodeptr, lcptr,
9399
0
    recurses, cb);
9400
0
  if (branchlength < 0) goto ISNOTFIXED;
9401
0
  if (branchlength > grouplength) grouplength = branchlength;
9402
0
  if (branchminlength < groupminlength) groupminlength = branchminlength;
9403
0
  if (**pptrptr == META_KET) break;
9404
0
  *pptrptr += 1;   /* Skip META_ALT */
9405
0
  }
9406
9407
0
if (group > 0)
9408
0
  {
9409
0
  gi[0] |= (uint32_t)(GI_SET_FIXED_LENGTH | grouplength);
9410
0
  gi[1] = groupminlength;
9411
0
  }
9412
9413
0
*minptr = groupminlength;
9414
0
return grouplength;
9415
9416
0
ISNOTFIXED:
9417
0
if (group > 0) gi[0] |= GI_NOT_FIXED_LENGTH;
9418
0
return -1;
9419
0
}
9420
9421
9422
9423
/*************************************************
9424
*        Find length of a parsed branch          *
9425
*************************************************/
9426
9427
/* Return fixed maximum and minimum lengths for a branch in a lookbehind,
9428
giving an error if the length is not limited. On entry, *pptrptr points to the
9429
first element inside the branch. On exit it is set to point to the ALT or KET.
9430
9431
Arguments:
9432
  pptrptr     pointer to pointer in the parsed pattern
9433
  minptr      where to return the minimum length
9434
  errcodeptr  pointer to error code
9435
  lcptr       pointer to loop counter
9436
  recurses    chain of recurse_check to catch mutual recursion
9437
  cb          pointer to compile block
9438
9439
Returns:      the maximum length, or a negative value on error
9440
*/
9441
9442
static int
9443
get_branchlength(uint32_t **pptrptr, int *minptr, int *errcodeptr, int *lcptr,
9444
  parsed_recurse_check *recurses, compile_block *cb)
9445
4
{
9446
4
int branchlength = 0;
9447
4
int branchminlength = 0;
9448
4
int grouplength, groupminlength;
9449
4
uint32_t lastitemlength = 0;
9450
4
uint32_t lastitemminlength = 0;
9451
4
uint32_t *pptr = *pptrptr;
9452
4
PCRE2_SIZE offset;
9453
4
parsed_recurse_check this_recurse;
9454
9455
/* A large and/or complex regex can take too long to process. This can happen
9456
more often when (?| groups are present in the pattern because their length
9457
cannot be cached. */
9458
9459
4
if ((*lcptr)++ > 2000)
9460
0
  {
9461
0
  *errcodeptr = ERR35;  /* Lookbehind is too complicated */
9462
0
  return -1;
9463
0
  }
9464
9465
/* Scan the branch, accumulating the length. */
9466
9467
74
for (;; pptr++)
9468
78
  {
9469
78
  parsed_recurse_check *r;
9470
78
  uint32_t *gptr, *gptrend;
9471
78
  uint32_t escape;
9472
78
  uint32_t group = 0;
9473
78
  uint32_t itemlength = 0;
9474
78
  uint32_t itemminlength = 0;
9475
78
  uint32_t min, max;
9476
9477
78
  if (*pptr < META_END)
9478
74
    {
9479
74
    itemlength = itemminlength = 1;
9480
74
    }
9481
9482
4
  else switch (META_CODE(*pptr))
9483
4
    {
9484
0
    case META_KET:
9485
2
    case META_ALT:
9486
2
    goto EXIT;
9487
9488
    /* (*ACCEPT) and (*FAIL) terminate the branch, but we must skip to the
9489
    actual termination. */
9490
9491
0
    case META_ACCEPT:
9492
0
    case META_FAIL:
9493
0
    pptr = parsed_skip(pptr, PSKIP_ALT);
9494
0
    if (pptr == NULL) goto PARSED_SKIP_FAILED;
9495
0
    goto EXIT;
9496
9497
0
    case META_MARK:
9498
0
    case META_COMMIT_ARG:
9499
0
    case META_PRUNE_ARG:
9500
0
    case META_SKIP_ARG:
9501
0
    case META_THEN_ARG:
9502
0
    pptr += pptr[1] + 1;
9503
0
    break;
9504
9505
0
    case META_CIRCUMFLEX:
9506
0
    case META_COMMIT:
9507
0
    case META_DOLLAR:
9508
0
    case META_PRUNE:
9509
0
    case META_SKIP:
9510
0
    case META_THEN:
9511
0
    break;
9512
9513
0
    case META_OPTIONS:
9514
0
    pptr += 2;
9515
0
    break;
9516
9517
0
    case META_BIGVALUE:
9518
0
    itemlength = itemminlength = 1;
9519
0
    pptr += 1;
9520
0
    break;
9521
9522
0
    case META_CLASS:
9523
0
    case META_CLASS_NOT:
9524
0
    itemlength = itemminlength = 1;
9525
0
    pptr = parsed_skip(pptr, PSKIP_CLASS);
9526
0
    if (pptr == NULL) goto PARSED_SKIP_FAILED;
9527
0
    break;
9528
9529
0
    case META_CLASS_EMPTY_NOT:
9530
0
    case META_DOT:
9531
0
    itemlength = itemminlength = 1;
9532
0
    break;
9533
9534
0
    case META_CALLOUT_NUMBER:
9535
0
    pptr += 3;
9536
0
    break;
9537
9538
0
    case META_CALLOUT_STRING:
9539
0
    pptr += 3 + SIZEOFFSET;
9540
0
    break;
9541
9542
    /* Only some escapes consume a character. Of those, \R can match one or two
9543
    characters, but \X is never allowed because it matches an unknown number of
9544
    characters. \C is allowed only in 32-bit and non-UTF 8/16-bit modes. */
9545
9546
2
    case META_ESCAPE:
9547
2
    escape = META_DATA(*pptr);
9548
2
    if (escape == ESC_X) return -1;
9549
0
    if (escape == ESC_R)
9550
0
      {
9551
0
      itemminlength = 1;
9552
0
      itemlength = 2;
9553
0
      }
9554
0
    else if (escape > ESC_b && escape < ESC_Z)
9555
0
      {
9556
0
#if PCRE2_CODE_UNIT_WIDTH != 32
9557
0
      if ((cb->external_options & PCRE2_UTF) != 0 && escape == ESC_C)
9558
0
        {
9559
0
        *errcodeptr = ERR36;
9560
0
        return -1;
9561
0
        }
9562
0
#endif
9563
0
      itemlength = itemminlength = 1;
9564
0
      if (escape == ESC_p || escape == ESC_P) pptr++;  /* Skip prop data */
9565
0
      }
9566
0
    break;
9567
9568
    /* Lookaheads do not contribute to the length of this branch, but they may
9569
    contain lookbehinds within them whose lengths need to be set. */
9570
9571
0
    case META_LOOKAHEAD:
9572
0
    case META_LOOKAHEADNOT:
9573
0
    case META_LOOKAHEAD_NA:
9574
0
    *errcodeptr = check_lookbehinds(pptr + 1, &pptr, recurses, cb, lcptr);
9575
0
    if (*errcodeptr != 0) return -1;
9576
9577
    /* Ignore any qualifiers that follow a lookahead assertion. */
9578
9579
0
    switch (pptr[1])
9580
0
      {
9581
0
      case META_ASTERISK:
9582
0
      case META_ASTERISK_PLUS:
9583
0
      case META_ASTERISK_QUERY:
9584
0
      case META_PLUS:
9585
0
      case META_PLUS_PLUS:
9586
0
      case META_PLUS_QUERY:
9587
0
      case META_QUERY:
9588
0
      case META_QUERY_PLUS:
9589
0
      case META_QUERY_QUERY:
9590
0
      pptr++;
9591
0
      break;
9592
9593
0
      case META_MINMAX:
9594
0
      case META_MINMAX_PLUS:
9595
0
      case META_MINMAX_QUERY:
9596
0
      pptr += 3;
9597
0
      break;
9598
9599
0
      default:
9600
0
      break;
9601
0
      }
9602
0
    break;
9603
9604
    /* A nested lookbehind does not contribute any length to this lookbehind,
9605
    but must itself be checked and have its lengths set. */
9606
9607
0
    case META_LOOKBEHIND:
9608
0
    case META_LOOKBEHINDNOT:
9609
0
    case META_LOOKBEHIND_NA:
9610
0
    if (!set_lookbehind_lengths(&pptr, errcodeptr, lcptr, recurses, cb))
9611
0
      return -1;
9612
0
    break;
9613
9614
    /* Back references and recursions are handled by very similar code. At this
9615
    stage, the names generated in the parsing pass are available, but the main
9616
    name table has not yet been created. So for the named varieties, scan the
9617
    list of names in order to get the number of the first one in the pattern,
9618
    and whether or not this name is duplicated. */
9619
9620
0
    case META_BACKREF_BYNAME:
9621
0
    if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0)
9622
0
      goto ISNOTFIXED;
9623
    /* Fall through */
9624
9625
0
    case META_RECURSE_BYNAME:
9626
0
      {
9627
0
      int i;
9628
0
      PCRE2_SPTR name;
9629
0
      BOOL is_dupname = FALSE;
9630
0
      named_group *ng = cb->named_groups;
9631
0
      uint32_t meta_code = META_CODE(*pptr);
9632
0
      uint32_t length = *(++pptr);
9633
9634
0
      GETPLUSOFFSET(offset, pptr);
9635
0
      name = cb->start_pattern + offset;
9636
0
      for (i = 0; i < cb->names_found; i++, ng++)
9637
0
        {
9638
0
        if (length == ng->length && PRIV(strncmp)(name, ng->name, length) == 0)
9639
0
          {
9640
0
          group = ng->number;
9641
0
          is_dupname = ng->isdup;
9642
0
          break;
9643
0
          }
9644
0
        }
9645
9646
0
      if (group == 0)
9647
0
        {
9648
0
        *errcodeptr = ERR15;  /* Non-existent subpattern */
9649
0
        cb->erroroffset = offset;
9650
0
        return -1;
9651
0
        }
9652
9653
      /* A numerical back reference can be fixed length if duplicate capturing
9654
      groups are not being used. A non-duplicate named back reference can also
9655
      be handled. */
9656
9657
0
      if (meta_code == META_RECURSE_BYNAME ||
9658
0
          (!is_dupname && (cb->external_flags & PCRE2_DUPCAPUSED) == 0))
9659
0
        goto RECURSE_OR_BACKREF_LENGTH;  /* Handle as a numbered version. */
9660
0
      }
9661
0
    goto ISNOTFIXED;                     /* Duplicate name or number */
9662
9663
    /* The offset values for back references < 10 are in a separate vector
9664
    because otherwise they would use more than two parsed pattern elements on
9665
    64-bit systems. */
9666
9667
0
    case META_BACKREF:
9668
0
    if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0 ||
9669
0
        (cb->external_flags & PCRE2_DUPCAPUSED) != 0)
9670
0
      goto ISNOTFIXED;
9671
0
    group = META_DATA(*pptr);
9672
0
    if (group < 10)
9673
0
      {
9674
0
      offset = cb->small_ref_offset[group];
9675
0
      goto RECURSE_OR_BACKREF_LENGTH;
9676
0
      }
9677
9678
    /* Fall through */
9679
    /* For groups >= 10 - picking up group twice does no harm. */
9680
9681
    /* A true recursion implies not fixed length, but a subroutine call may
9682
    be OK. Back reference "recursions" are also failed. */
9683
9684
0
    case META_RECURSE:
9685
0
    group = META_DATA(*pptr);
9686
0
    GETPLUSOFFSET(offset, pptr);
9687
9688
0
    RECURSE_OR_BACKREF_LENGTH:
9689
0
    if (group > cb->bracount)
9690
0
      {
9691
0
      cb->erroroffset = offset;
9692
0
      *errcodeptr = ERR15;  /* Non-existent subpattern */
9693
0
      return -1;
9694
0
      }
9695
0
    if (group == 0) goto ISNOTFIXED;  /* Local recursion */
9696
0
    for (gptr = cb->parsed_pattern; *gptr != META_END; gptr++)
9697
0
      {
9698
0
      if (META_CODE(*gptr) == META_BIGVALUE) gptr++;
9699
0
        else if (*gptr == (META_CAPTURE | group)) break;
9700
0
      }
9701
9702
    /* We must start the search for the end of the group at the first meta code
9703
    inside the group. Otherwise it will be treated as an enclosed group. */
9704
9705
0
    gptrend = parsed_skip(gptr + 1, PSKIP_KET);
9706
0
    if (gptrend == NULL) goto PARSED_SKIP_FAILED;
9707
0
    if (pptr > gptr && pptr < gptrend) goto ISNOTFIXED;  /* Local recursion */
9708
0
    for (r = recurses; r != NULL; r = r->prev) if (r->groupptr == gptr) break;
9709
0
    if (r != NULL) goto ISNOTFIXED;   /* Mutual recursion */
9710
0
    this_recurse.prev = recurses;
9711
0
    this_recurse.groupptr = gptr;
9712
9713
    /* We do not need to know the position of the end of the group, that is,
9714
    gptr is not used after the call to get_grouplength(). Setting the second
9715
    argument FALSE stops it scanning for the end when the length can be found
9716
    in the cache. */
9717
9718
0
    gptr++;
9719
0
    grouplength = get_grouplength(&gptr, &groupminlength, FALSE, errcodeptr,
9720
0
      lcptr, group, &this_recurse, cb);
9721
0
    if (grouplength < 0)
9722
0
      {
9723
0
      if (*errcodeptr == 0) goto ISNOTFIXED;
9724
0
      return -1;  /* Error already set */
9725
0
      }
9726
0
    itemlength = grouplength;
9727
0
    itemminlength = groupminlength;
9728
0
    break;
9729
9730
    /* A (DEFINE) group is never obeyed inline and so it does not contribute to
9731
    the length of this branch. Skip from the following item to the next
9732
    unpaired ket. */
9733
9734
0
    case META_COND_DEFINE:
9735
0
    pptr = parsed_skip(pptr + 1, PSKIP_KET);
9736
0
    break;
9737
9738
    /* Check other nested groups - advance past the initial data for each type
9739
    and then seek a fixed length with get_grouplength(). */
9740
9741
0
    case META_COND_NAME:
9742
0
    case META_COND_NUMBER:
9743
0
    case META_COND_RNAME:
9744
0
    case META_COND_RNUMBER:
9745
0
    pptr += 2 + SIZEOFFSET;
9746
0
    goto CHECK_GROUP;
9747
9748
0
    case META_COND_ASSERT:
9749
0
    pptr += 1;
9750
0
    goto CHECK_GROUP;
9751
9752
0
    case META_COND_VERSION:
9753
0
    pptr += 4;
9754
0
    goto CHECK_GROUP;
9755
9756
0
    case META_CAPTURE:
9757
0
    group = META_DATA(*pptr);
9758
    /* Fall through */
9759
9760
0
    case META_ATOMIC:
9761
0
    case META_NOCAPTURE:
9762
0
    case META_SCRIPT_RUN:
9763
0
    pptr++;
9764
0
    CHECK_GROUP:
9765
0
    grouplength = get_grouplength(&pptr, &groupminlength, TRUE, errcodeptr,
9766
0
      lcptr, group, recurses, cb);
9767
0
    if (grouplength < 0) return -1;
9768
0
    itemlength = grouplength;
9769
0
    itemminlength = groupminlength;
9770
0
    break;
9771
9772
0
    case META_QUERY:
9773
0
    case META_QUERY_PLUS:
9774
0
    case META_QUERY_QUERY:
9775
0
    min = 0;
9776
0
    max = 1;
9777
0
    goto REPETITION;
9778
9779
    /* Exact repetition is OK; variable repetition is not. A repetition of zero
9780
    must subtract the length that has already been added. */
9781
9782
0
    case META_MINMAX:
9783
0
    case META_MINMAX_PLUS:
9784
0
    case META_MINMAX_QUERY:
9785
0
    min = pptr[1];
9786
0
    max = pptr[2];
9787
0
    pptr += 2;
9788
9789
0
    REPETITION:
9790
0
    if (max != REPEAT_UNLIMITED)
9791
0
      {
9792
0
      if (lastitemlength != 0 &&  /* Should not occur, but just in case */
9793
0
          max != 0 &&
9794
0
          (INT_MAX - branchlength)/lastitemlength < max - 1)
9795
0
        {
9796
0
        *errcodeptr = ERR87;  /* Integer overflow; lookbehind too big */
9797
0
        return -1;
9798
0
        }
9799
0
      if (min == 0) branchminlength -= lastitemminlength;
9800
0
        else itemminlength = (min - 1) * lastitemminlength;
9801
0
      if (max == 0) branchlength -= lastitemlength;
9802
0
        else itemlength = (max - 1) * lastitemlength;
9803
0
      break;
9804
0
      }
9805
    /* Fall through */
9806
9807
    /* Any other item means this branch does not have a fixed length. */
9808
9809
0
    default:
9810
0
    ISNOTFIXED:
9811
0
    *errcodeptr = ERR25;   /* Not fixed length */
9812
0
    return -1;
9813
4
    }
9814
9815
  /* Add the item length to the branchlength, checking for integer overflow and
9816
  for the branch length exceeding the overall limit. Later, if there is at
9817
  least one variable-length branch in the group, there is a test for the
9818
  (smaller) variable-length branch length limit. */
9819
9820
74
  if (INT_MAX - branchlength < (int)itemlength ||
9821
74
      (branchlength += itemlength) > LOOKBEHIND_MAX)
9822
0
    {
9823
0
    *errcodeptr = ERR87;
9824
0
    return -1;
9825
0
    }
9826
9827
74
  branchminlength += itemminlength;
9828
9829
  /* Save this item length for use if the next item is a quantifier. */
9830
9831
74
  lastitemlength = itemlength;
9832
74
  lastitemminlength = itemminlength;
9833
74
  }
9834
9835
2
EXIT:
9836
2
*pptrptr = pptr;
9837
2
*minptr = branchminlength;
9838
2
return branchlength;
9839
9840
0
PARSED_SKIP_FAILED:
9841
0
*errcodeptr = ERR90;
9842
0
return -1;
9843
4
}
9844
9845
9846
9847
/*************************************************
9848
*        Set lengths in a lookbehind             *
9849
*************************************************/
9850
9851
/* This function is called for each lookbehind, to set the lengths in its
9852
branches. An error occurs if any branch does not have a limited maximum length
9853
that is less than the limit (65535). On exit, the pointer must be left on the
9854
final ket.
9855
9856
The function also maintains the max_lookbehind value. Any lookbehind branch
9857
that contains a nested lookbehind may actually look further back than the
9858
length of the branch. The additional amount is passed back from
9859
get_branchlength() as an "extra" value.
9860
9861
Arguments:
9862
  pptrptr     pointer to pointer in the parsed pattern
9863
  errcodeptr  pointer to error code
9864
  lcptr       pointer to loop counter
9865
  recurses    chain of recurse_check to catch mutual recursion
9866
  cb          pointer to compile block
9867
9868
Returns:      TRUE if all is well
9869
              FALSE otherwise, with error code and offset set
9870
*/
9871
9872
static BOOL
9873
set_lookbehind_lengths(uint32_t **pptrptr, int *errcodeptr, int *lcptr,
9874
  parsed_recurse_check *recurses, compile_block *cb)
9875
2
{
9876
2
PCRE2_SIZE offset;
9877
2
uint32_t *bptr = *pptrptr;
9878
2
uint32_t *gbptr = bptr;
9879
2
int maxlength = 0;
9880
2
int minlength = INT_MAX;
9881
2
BOOL variable = FALSE;
9882
9883
2
READPLUSOFFSET(offset, bptr);  /* Offset for error messages */
9884
2
*pptrptr += SIZEOFFSET;
9885
9886
/* Each branch can have a different maximum length, but we can keep only a
9887
single minimum for the whole group, because there's nowhere to save individual
9888
values in the META_ALT item. */
9889
9890
2
do
9891
4
  {
9892
4
  int branchlength, branchminlength;
9893
9894
4
  *pptrptr += 1;
9895
4
  branchlength = get_branchlength(pptrptr, &branchminlength, errcodeptr, lcptr,
9896
4
    recurses, cb);
9897
9898
4
  if (branchlength < 0)
9899
2
    {
9900
    /* The errorcode and offset may already be set from a nested lookbehind. */
9901
2
    if (*errcodeptr == 0) *errcodeptr = ERR25;
9902
2
    if (cb->erroroffset == PCRE2_UNSET) cb->erroroffset = offset;
9903
2
    return FALSE;
9904
2
    }
9905
9906
2
  if (branchlength != branchminlength) variable = TRUE;
9907
2
  if (branchminlength < minlength) minlength = branchminlength;
9908
2
  if (branchlength > maxlength) maxlength = branchlength;
9909
2
  if (branchlength > cb->max_lookbehind) cb->max_lookbehind = branchlength;
9910
2
  *bptr |= branchlength;  /* branchlength never more than 65535 */
9911
2
  bptr = *pptrptr;
9912
2
  }
9913
2
while (META_CODE(*bptr) == META_ALT);
9914
9915
/* If any branch is of variable length, the whole lookbehind is of variable
9916
length. If the maximum length of any branch exceeds the maximum for variable
9917
lookbehinds, give an error. Otherwise, the minimum length is set in the word
9918
that follows the original group META value. For a fixed-length lookbehind, this
9919
is set to LOOKBEHIND_MAX, to indicate that each branch is of a fixed (but
9920
possibly different) length. */
9921
9922
0
if (variable)
9923
0
  {
9924
0
  gbptr[1] = minlength;
9925
0
  if ((uint32_t)maxlength > cb->max_varlookbehind)
9926
0
    {
9927
0
    *errcodeptr = ERR100;
9928
0
    cb->erroroffset = offset;
9929
0
    return FALSE;
9930
0
    }
9931
0
  }
9932
0
else gbptr[1] = LOOKBEHIND_MAX;
9933
9934
9935
0
gbptr[1] = variable? minlength : LOOKBEHIND_MAX;
9936
0
return TRUE;
9937
0
}
9938
9939
9940
9941
/*************************************************
9942
*         Check parsed pattern lookbehinds       *
9943
*************************************************/
9944
9945
/* This function is called at the end of parsing a pattern if any lookbehinds
9946
were encountered. It scans the parsed pattern for them, calling
9947
set_lookbehind_lengths() for each one. At the start, the errorcode is zero and
9948
the error offset is marked unset. This enables the functions above not to
9949
override settings from deeper nestings.
9950
9951
This function is called recursively from get_branchlength() for lookaheads in
9952
order to process any lookbehinds that they may contain. It stops when it hits a
9953
non-nested closing parenthesis in this case, returning a pointer to it.
9954
9955
Arguments
9956
  pptr      points to where to start (start of pattern or start of lookahead)
9957
  retptr    if not NULL, return the ket pointer here
9958
  recurses  chain of recurse_check to catch mutual recursion
9959
  cb        points to the compile block
9960
  lcptr     points to loop counter
9961
9962
Returns:    0 on success, or an errorcode (cb->erroroffset will be set)
9963
*/
9964
9965
static int
9966
check_lookbehinds(uint32_t *pptr, uint32_t **retptr,
9967
  parsed_recurse_check *recurses, compile_block *cb, int *lcptr)
9968
2
{
9969
2
int errorcode = 0;
9970
2
int nestlevel = 0;
9971
9972
2
cb->erroroffset = PCRE2_UNSET;
9973
9974
448
for (; *pptr != META_END; pptr++)
9975
448
  {
9976
448
  if (*pptr < META_END) continue;  /* Literal */
9977
9978
36
  switch (META_CODE(*pptr))
9979
36
    {
9980
0
    default:
9981
0
    return ERR70;  /* Unrecognized meta code */
9982
9983
0
    case META_ESCAPE:
9984
0
    if (*pptr - META_ESCAPE == ESC_P || *pptr - META_ESCAPE == ESC_p)
9985
0
      pptr += 1;
9986
0
    break;
9987
9988
4
    case META_KET:
9989
4
    if (--nestlevel < 0)
9990
0
      {
9991
0
      if (retptr != NULL) *retptr = pptr;
9992
0
      return 0;
9993
0
      }
9994
4
    break;
9995
9996
4
    case META_ATOMIC:
9997
4
    case META_CAPTURE:
9998
4
    case META_COND_ASSERT:
9999
4
    case META_LOOKAHEAD:
10000
4
    case META_LOOKAHEADNOT:
10001
4
    case META_LOOKAHEAD_NA:
10002
4
    case META_NOCAPTURE:
10003
4
    case META_SCRIPT_RUN:
10004
4
    nestlevel++;
10005
4
    break;
10006
10007
0
    case META_ACCEPT:
10008
2
    case META_ALT:
10009
20
    case META_ASTERISK:
10010
20
    case META_ASTERISK_PLUS:
10011
20
    case META_ASTERISK_QUERY:
10012
20
    case META_BACKREF:
10013
20
    case META_CIRCUMFLEX:
10014
20
    case META_CLASS:
10015
20
    case META_CLASS_EMPTY:
10016
20
    case META_CLASS_EMPTY_NOT:
10017
20
    case META_CLASS_END:
10018
20
    case META_CLASS_NOT:
10019
20
    case META_COMMIT:
10020
20
    case META_DOLLAR:
10021
22
    case META_DOT:
10022
22
    case META_FAIL:
10023
26
    case META_PLUS:
10024
26
    case META_PLUS_PLUS:
10025
26
    case META_PLUS_QUERY:
10026
26
    case META_PRUNE:
10027
26
    case META_QUERY:
10028
26
    case META_QUERY_PLUS:
10029
26
    case META_QUERY_QUERY:
10030
26
    case META_RANGE_ESCAPED:
10031
26
    case META_RANGE_LITERAL:
10032
26
    case META_SKIP:
10033
26
    case META_THEN:
10034
26
    break;
10035
10036
0
    case META_RECURSE:
10037
0
    pptr += SIZEOFFSET;
10038
0
    break;
10039
10040
0
    case META_BACKREF_BYNAME:
10041
0
    case META_RECURSE_BYNAME:
10042
0
    pptr += 1 + SIZEOFFSET;
10043
0
    break;
10044
10045
0
    case META_COND_DEFINE:
10046
0
    pptr += SIZEOFFSET;
10047
0
    nestlevel++;
10048
0
    break;
10049
10050
0
    case META_COND_NAME:
10051
0
    case META_COND_NUMBER:
10052
0
    case META_COND_RNAME:
10053
0
    case META_COND_RNUMBER:
10054
0
    pptr += 1 + SIZEOFFSET;
10055
0
    nestlevel++;
10056
0
    break;
10057
10058
0
    case META_COND_VERSION:
10059
0
    pptr += 3;
10060
0
    nestlevel++;
10061
0
    break;
10062
10063
0
    case META_CALLOUT_STRING:
10064
0
    pptr += 3 + SIZEOFFSET;
10065
0
    break;
10066
10067
0
    case META_BIGVALUE:
10068
0
    case META_POSIX:
10069
0
    case META_POSIX_NEG:
10070
0
    pptr += 1;
10071
0
    break;
10072
10073
0
    case META_MINMAX:
10074
0
    case META_MINMAX_QUERY:
10075
0
    case META_MINMAX_PLUS:
10076
0
    case META_OPTIONS:
10077
0
    pptr += 2;
10078
0
    break;
10079
10080
0
    case META_CALLOUT_NUMBER:
10081
0
    pptr += 3;
10082
0
    break;
10083
10084
0
    case META_MARK:
10085
0
    case META_COMMIT_ARG:
10086
0
    case META_PRUNE_ARG:
10087
0
    case META_SKIP_ARG:
10088
0
    case META_THEN_ARG:
10089
0
    pptr += 1 + pptr[1];
10090
0
    break;
10091
10092
2
    case META_LOOKBEHIND:
10093
2
    case META_LOOKBEHINDNOT:
10094
2
    case META_LOOKBEHIND_NA:
10095
2
    if (!set_lookbehind_lengths(&pptr, &errorcode, lcptr, recurses, cb))
10096
2
      return errorcode;
10097
0
    break;
10098
36
    }
10099
36
  }
10100
10101
0
return 0;
10102
2
}
10103
10104
10105
10106
/*************************************************
10107
*     External function to compile a pattern     *
10108
*************************************************/
10109
10110
/* This function reads a regular expression in the form of a string and returns
10111
a pointer to a block of store holding a compiled version of the expression.
10112
10113
Arguments:
10114
  pattern       the regular expression
10115
  patlen        the length of the pattern, or PCRE2_ZERO_TERMINATED
10116
  options       option bits
10117
  errorptr      pointer to errorcode
10118
  erroroffset   pointer to error offset
10119
  ccontext      points to a compile context or is NULL
10120
10121
Returns:        pointer to compiled data block, or NULL on error,
10122
                with errorcode and erroroffset set
10123
*/
10124
10125
PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
10126
pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options,
10127
   int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext)
10128
1.52k
{
10129
1.52k
BOOL utf;                             /* Set TRUE for UTF mode */
10130
1.52k
BOOL ucp;                             /* Set TRUE for UCP mode */
10131
1.52k
BOOL has_lookbehind = FALSE;          /* Set TRUE if a lookbehind is found */
10132
1.52k
BOOL zero_terminated;                 /* Set TRUE for zero-terminated pattern */
10133
1.52k
pcre2_real_code *re = NULL;           /* What we will return */
10134
1.52k
compile_block cb;                     /* "Static" compile-time data */
10135
1.52k
const uint8_t *tables;                /* Char tables base pointer */
10136
10137
1.52k
PCRE2_UCHAR *code;                    /* Current pointer in compiled code */
10138
1.52k
PCRE2_SPTR codestart;                 /* Start of compiled code */
10139
1.52k
PCRE2_SPTR ptr;                       /* Current pointer in pattern */
10140
1.52k
uint32_t *pptr;                       /* Current pointer in parsed pattern */
10141
10142
1.52k
PCRE2_SIZE length = 1;                /* Allow for final END opcode */
10143
1.52k
PCRE2_SIZE usedlength;                /* Actual length used */
10144
1.52k
PCRE2_SIZE re_blocksize;              /* Size of memory block */
10145
1.52k
PCRE2_SIZE big32count = 0;            /* 32-bit literals >= 0x80000000 */
10146
1.52k
PCRE2_SIZE parsed_size_needed;        /* Needed for parsed pattern */
10147
10148
1.52k
uint32_t firstcuflags, reqcuflags;    /* Type of first/req code unit */
10149
1.52k
uint32_t firstcu, reqcu;              /* Value of first/req code unit */
10150
1.52k
uint32_t setflags = 0;                /* NL and BSR set flags */
10151
10152
1.52k
uint32_t skipatstart;                 /* When checking (*UTF) etc */
10153
1.52k
uint32_t limit_heap  = UINT32_MAX;
10154
1.52k
uint32_t limit_match = UINT32_MAX;    /* Unset match limits */
10155
1.52k
uint32_t limit_depth = UINT32_MAX;
10156
10157
1.52k
int newline = 0;                      /* Unset; can be set by the pattern */
10158
1.52k
int bsr = 0;                          /* Unset; can be set by the pattern */
10159
1.52k
int errorcode = 0;                    /* Initialize to avoid compiler warn */
10160
1.52k
int regexrc;                          /* Return from compile */
10161
10162
1.52k
uint32_t i;                           /* Local loop counter */
10163
10164
/* Comments at the head of this file explain about these variables. */
10165
10166
1.52k
uint32_t stack_groupinfo[GROUPINFO_DEFAULT_SIZE];
10167
1.52k
uint32_t stack_parsed_pattern[PARSED_PATTERN_DEFAULT_SIZE];
10168
1.52k
named_group named_groups[NAMED_GROUP_LIST_SIZE];
10169
10170
/* The workspace is used in different ways in the different compiling phases.
10171
It needs to be 16-bit aligned for the preliminary parsing scan. */
10172
10173
1.52k
uint32_t c16workspace[C16_WORK_SIZE];
10174
1.52k
PCRE2_UCHAR *cworkspace = (PCRE2_UCHAR *)c16workspace;
10175
10176
10177
/* -------------- Check arguments and set up the pattern ----------------- */
10178
10179
/* There must be error code and offset pointers. */
10180
10181
1.52k
if (errorptr == NULL || erroroffset == NULL) return NULL;
10182
1.52k
*errorptr = ERR0;
10183
1.52k
*erroroffset = 0;
10184
10185
/* There must be a pattern, but NULL is allowed with zero length. */
10186
10187
1.52k
if (pattern == NULL)
10188
0
  {
10189
0
  if (patlen == 0) pattern = (PCRE2_SPTR)""; else
10190
0
    {
10191
0
    *errorptr = ERR16;
10192
0
    return NULL;
10193
0
    }
10194
0
  }
10195
10196
/* A NULL compile context means "use a default context" */
10197
10198
1.52k
if (ccontext == NULL)
10199
0
  ccontext = (pcre2_compile_context *)(&PRIV(default_compile_context));
10200
10201
/* PCRE2_MATCH_INVALID_UTF implies UTF */
10202
10203
1.52k
if ((options & PCRE2_MATCH_INVALID_UTF) != 0) options |= PCRE2_UTF;
10204
10205
/* Check that all undefined public option bits are zero. */
10206
10207
1.52k
if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0 ||
10208
1.52k
    (ccontext->extra_options & ~PUBLIC_COMPILE_EXTRA_OPTIONS) != 0)
10209
0
  {
10210
0
  *errorptr = ERR17;
10211
0
  return NULL;
10212
0
  }
10213
10214
1.52k
if ((options & PCRE2_LITERAL) != 0 &&
10215
0
    ((options & ~PUBLIC_LITERAL_COMPILE_OPTIONS) != 0 ||
10216
0
     (ccontext->extra_options & ~PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS) != 0))
10217
0
  {
10218
0
  *errorptr = ERR92;
10219
0
  return NULL;
10220
0
  }
10221
10222
/* A zero-terminated pattern is indicated by the special length value
10223
PCRE2_ZERO_TERMINATED. Check for an overlong pattern. */
10224
10225
1.52k
if ((zero_terminated = (patlen == PCRE2_ZERO_TERMINATED)))
10226
0
  patlen = PRIV(strlen)(pattern);
10227
10228
1.52k
if (patlen > ccontext->max_pattern_length)
10229
0
  {
10230
0
  *errorptr = ERR88;
10231
0
  return NULL;
10232
0
  }
10233
10234
/* From here on, all returns from this function should end up going via the
10235
EXIT label. */
10236
10237
10238
/* ------------ Initialize the "static" compile data -------------- */
10239
10240
1.52k
tables = (ccontext->tables != NULL)? ccontext->tables : PRIV(default_tables);
10241
10242
1.52k
cb.lcc = tables + lcc_offset;          /* Individual */
10243
1.52k
cb.fcc = tables + fcc_offset;          /*   character */
10244
1.52k
cb.cbits = tables + cbits_offset;      /*      tables */
10245
1.52k
cb.ctypes = tables + ctypes_offset;
10246
10247
1.52k
cb.assert_depth = 0;
10248
1.52k
cb.bracount = 0;
10249
1.52k
cb.cx = ccontext;
10250
1.52k
cb.dupnames = FALSE;
10251
1.52k
cb.end_pattern = pattern + patlen;
10252
1.52k
cb.erroroffset = 0;
10253
1.52k
cb.external_flags = 0;
10254
1.52k
cb.external_options = options;
10255
1.52k
cb.groupinfo = stack_groupinfo;
10256
1.52k
cb.had_recurse = FALSE;
10257
1.52k
cb.lastcapture = 0;
10258
1.52k
cb.max_lookbehind = 0;                               /* Max encountered */
10259
1.52k
cb.max_varlookbehind = ccontext->max_varlookbehind;  /* Limit */
10260
1.52k
cb.name_entry_size = 0;
10261
1.52k
cb.name_table = NULL;
10262
1.52k
cb.named_groups = named_groups;
10263
1.52k
cb.named_group_list_size = NAMED_GROUP_LIST_SIZE;
10264
1.52k
cb.names_found = 0;
10265
1.52k
cb.parens_depth = 0;
10266
1.52k
cb.parsed_pattern = stack_parsed_pattern;
10267
1.52k
cb.req_varyopt = 0;
10268
1.52k
cb.start_code = cworkspace;
10269
1.52k
cb.start_pattern = pattern;
10270
1.52k
cb.start_workspace = cworkspace;
10271
1.52k
cb.workspace_size = COMPILE_WORK_SIZE;
10272
10273
/* Maximum back reference and backref bitmap. The bitmap records up to 31 back
10274
references to help in deciding whether (.*) can be treated as anchored or not.
10275
*/
10276
10277
1.52k
cb.top_backref = 0;
10278
1.52k
cb.backref_map = 0;
10279
10280
/* Escape sequences \1 to \9 are always back references, but as they are only
10281
two characters long, only two elements can be used in the parsed_pattern
10282
vector. The first contains the reference, and we'd like to use the second to
10283
record the offset in the pattern, so that forward references to non-existent
10284
groups can be diagnosed later with an offset. However, on 64-bit systems,
10285
PCRE2_SIZE won't fit. Instead, we have a vector of offsets for the first
10286
occurrence of \1 to \9, indexed by the second parsed_pattern value. All other
10287
references have enough space for the offset to be put into the parsed pattern.
10288
*/
10289
10290
16.8k
for (i = 0; i < 10; i++) cb.small_ref_offset[i] = PCRE2_UNSET;
10291
10292
10293
/* --------------- Start looking at the pattern --------------- */
10294
10295
/* Unless PCRE2_LITERAL is set, check for global one-time option settings at
10296
the start of the pattern, and remember the offset to the actual regex. With
10297
valgrind support, make the terminator of a zero-terminated pattern
10298
inaccessible. This catches bugs that would otherwise only show up for
10299
non-zero-terminated patterns. */
10300
10301
#ifdef SUPPORT_VALGRIND
10302
if (zero_terminated) VALGRIND_MAKE_MEM_NOACCESS(pattern + patlen, CU2BYTES(1));
10303
#endif
10304
10305
1.52k
ptr = pattern;
10306
1.52k
skipatstart = 0;
10307
10308
1.52k
if ((options & PCRE2_LITERAL) == 0)
10309
1.52k
  {
10310
1.52k
  while (patlen - skipatstart >= 2 &&
10311
1.52k
         ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
10312
15
         ptr[skipatstart+1] == CHAR_ASTERISK)
10313
0
    {
10314
0
    for (i = 0; i < sizeof(pso_list)/sizeof(pso); i++)
10315
0
      {
10316
0
      uint32_t c, pp;
10317
0
      const pso *p = pso_list + i;
10318
10319
0
      if (patlen - skipatstart - 2 >= p->length &&
10320
0
          PRIV(strncmp_c8)(ptr + skipatstart + 2, (char *)(p->name),
10321
0
            p->length) == 0)
10322
0
        {
10323
0
        skipatstart += p->length + 2;
10324
0
        switch(p->type)
10325
0
          {
10326
0
          case PSO_OPT:
10327
0
          cb.external_options |= p->value;
10328
0
          break;
10329
10330
0
          case PSO_FLG:
10331
0
          setflags |= p->value;
10332
0
          break;
10333
10334
0
          case PSO_NL:
10335
0
          newline = p->value;
10336
0
          setflags |= PCRE2_NL_SET;
10337
0
          break;
10338
10339
0
          case PSO_BSR:
10340
0
          bsr = p->value;
10341
0
          setflags |= PCRE2_BSR_SET;
10342
0
          break;
10343
10344
0
          case PSO_LIMM:
10345
0
          case PSO_LIMD:
10346
0
          case PSO_LIMH:
10347
0
          c = 0;
10348
0
          pp = skipatstart;
10349
0
          if (!IS_DIGIT(ptr[pp]))
10350
0
            {
10351
0
            errorcode = ERR60;
10352
0
            ptr += pp;
10353
0
            goto HAD_EARLY_ERROR;
10354
0
            }
10355
0
          while (IS_DIGIT(ptr[pp]))
10356
0
            {
10357
0
            if (c > UINT32_MAX / 10 - 1) break;   /* Integer overflow */
10358
0
            c = c*10 + (ptr[pp++] - CHAR_0);
10359
0
            }
10360
0
          if (ptr[pp++] != CHAR_RIGHT_PARENTHESIS)
10361
0
            {
10362
0
            errorcode = ERR60;
10363
0
            ptr += pp;
10364
0
            goto HAD_EARLY_ERROR;
10365
0
            }
10366
0
          if (p->type == PSO_LIMH) limit_heap = c;
10367
0
            else if (p->type == PSO_LIMM) limit_match = c;
10368
0
            else limit_depth = c;
10369
0
          skipatstart += pp - skipatstart;
10370
0
          break;
10371
0
          }
10372
0
        break;   /* Out of the table scan loop */
10373
0
        }
10374
0
      }
10375
0
    if (i >= sizeof(pso_list)/sizeof(pso)) break;   /* Out of pso loop */
10376
0
    }
10377
1.52k
  }
10378
10379
/* End of pattern-start options; advance to start of real regex. */
10380
10381
1.52k
ptr += skipatstart;
10382
10383
/* Can't support UTF or UCP if PCRE2 was built without Unicode support. */
10384
10385
#ifndef SUPPORT_UNICODE
10386
if ((cb.external_options & (PCRE2_UTF|PCRE2_UCP)) != 0)
10387
  {
10388
  errorcode = ERR32;
10389
  goto HAD_EARLY_ERROR;
10390
  }
10391
#endif
10392
10393
/* Check UTF. We have the original options in 'options', with that value as
10394
modified by (*UTF) etc in cb->external_options. The extra option
10395
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not permitted in UTF-16 mode because the
10396
surrogate code points cannot be represented in UTF-16. */
10397
10398
1.52k
utf = (cb.external_options & PCRE2_UTF) != 0;
10399
1.52k
if (utf)
10400
383
  {
10401
383
  if ((options & PCRE2_NEVER_UTF) != 0)
10402
0
    {
10403
0
    errorcode = ERR74;
10404
0
    goto HAD_EARLY_ERROR;
10405
0
    }
10406
383
  if ((options & PCRE2_NO_UTF_CHECK) == 0 &&
10407
383
       (errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset)) != 0)
10408
26
    goto HAD_ERROR;  /* Offset was set by valid_utf() */
10409
10410
#if PCRE2_CODE_UNIT_WIDTH == 16
10411
  if ((ccontext->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0)
10412
    {
10413
    errorcode = ERR91;
10414
    goto HAD_EARLY_ERROR;
10415
    }
10416
#endif
10417
383
  }
10418
10419
/* Check UCP lockout. */
10420
10421
1.50k
ucp = (cb.external_options & PCRE2_UCP) != 0;
10422
1.50k
if (ucp && (cb.external_options & PCRE2_NEVER_UCP) != 0)
10423
0
  {
10424
0
  errorcode = ERR75;
10425
0
  goto HAD_EARLY_ERROR;
10426
0
  }
10427
10428
/* Process the BSR setting. */
10429
10430
1.50k
if (bsr == 0) bsr = ccontext->bsr_convention;
10431
10432
/* Process the newline setting. */
10433
10434
1.50k
if (newline == 0) newline = ccontext->newline_convention;
10435
1.50k
cb.nltype = NLTYPE_FIXED;
10436
1.50k
switch(newline)
10437
1.50k
  {
10438
0
  case PCRE2_NEWLINE_CR:
10439
0
  cb.nllen = 1;
10440
0
  cb.nl[0] = CHAR_CR;
10441
0
  break;
10442
10443
1.50k
  case PCRE2_NEWLINE_LF:
10444
1.50k
  cb.nllen = 1;
10445
1.50k
  cb.nl[0] = CHAR_NL;
10446
1.50k
  break;
10447
10448
0
  case PCRE2_NEWLINE_NUL:
10449
0
  cb.nllen = 1;
10450
0
  cb.nl[0] = CHAR_NUL;
10451
0
  break;
10452
10453
0
  case PCRE2_NEWLINE_CRLF:
10454
0
  cb.nllen = 2;
10455
0
  cb.nl[0] = CHAR_CR;
10456
0
  cb.nl[1] = CHAR_NL;
10457
0
  break;
10458
10459
0
  case PCRE2_NEWLINE_ANY:
10460
0
  cb.nltype = NLTYPE_ANY;
10461
0
  break;
10462
10463
0
  case PCRE2_NEWLINE_ANYCRLF:
10464
0
  cb.nltype = NLTYPE_ANYCRLF;
10465
0
  break;
10466
10467
0
  default:
10468
0
  errorcode = ERR56;
10469
0
  goto HAD_EARLY_ERROR;
10470
1.50k
  }
10471
10472
/* Pre-scan the pattern to do two things: (1) Discover the named groups and
10473
their numerical equivalents, so that this information is always available for
10474
the remaining processing. (2) At the same time, parse the pattern and put a
10475
processed version into the parsed_pattern vector. This has escapes interpreted
10476
and comments removed (amongst other things).
10477
10478
In all but one case, when PCRE2_AUTO_CALLOUT is not set, the number of unsigned
10479
32-bit ints in the parsed pattern is bounded by the length of the pattern plus
10480
one (for the terminator) plus four if PCRE2_EXTRA_WORD or PCRE2_EXTRA_LINE is
10481
set. The exceptional case is when running in 32-bit, non-UTF mode, when literal
10482
characters greater than META_END (0x80000000) have to be coded as two units. In
10483
this case, therefore, we scan the pattern to check for such values. */
10484
10485
#if PCRE2_CODE_UNIT_WIDTH == 32
10486
if (!utf)
10487
  {
10488
  PCRE2_SPTR p;
10489
  for (p = ptr; p < cb.end_pattern; p++) if (*p >= META_END) big32count++;
10490
  }
10491
#endif
10492
10493
/* Ensure that the parsed pattern buffer is big enough. When PCRE2_AUTO_CALLOUT
10494
is set we have to assume a numerical callout (4 elements) for each character
10495
plus one at the end. This is overkill, but memory is plentiful these days. For
10496
many smaller patterns the vector on the stack (which was set up above) can be
10497
used. */
10498
10499
1.50k
parsed_size_needed = patlen - skipatstart + big32count;
10500
10501
1.50k
if ((ccontext->extra_options &
10502
1.50k
     (PCRE2_EXTRA_MATCH_WORD|PCRE2_EXTRA_MATCH_LINE)) != 0)
10503
0
  parsed_size_needed += 4;
10504
10505
1.50k
if ((options & PCRE2_AUTO_CALLOUT) != 0)
10506
0
  parsed_size_needed = (parsed_size_needed + 1) * 5;
10507
10508
1.50k
if (parsed_size_needed >= PARSED_PATTERN_DEFAULT_SIZE)
10509
137
  {
10510
137
  uint32_t *heap_parsed_pattern = ccontext->memctl.malloc(
10511
137
    (parsed_size_needed + 1) * sizeof(uint32_t), ccontext->memctl.memory_data);
10512
137
  if (heap_parsed_pattern == NULL)
10513
0
    {
10514
0
    *errorptr = ERR21;
10515
0
    goto EXIT;
10516
0
    }
10517
137
  cb.parsed_pattern = heap_parsed_pattern;
10518
137
  }
10519
1.50k
cb.parsed_pattern_end = cb.parsed_pattern + parsed_size_needed + 1;
10520
10521
/* Do the parsing scan. */
10522
10523
1.50k
errorcode = parse_regex(ptr, cb.external_options, &has_lookbehind, &cb);
10524
1.50k
if (errorcode != 0) goto HAD_CB_ERROR;
10525
10526
/* If there are any lookbehinds, scan the parsed pattern to figure out their
10527
lengths. Workspace is needed to remember whether numbered groups are or are not
10528
of limited length, and if limited, what the minimum and maximum lengths are.
10529
This caching saves re-computing the length of any group that is referenced more
10530
than once, which is particularly relevant when recursion is involved.
10531
Unnumbered groups do not have this exposure because they cannot be referenced.
10532
If there are sufficiently few groups, the default index vector on the stack, as
10533
set up above, can be used. Otherwise we have to get/free some heap memory. The
10534
vector must be initialized to zero. */
10535
10536
1.03k
if (has_lookbehind)
10537
2
  {
10538
2
  int loopcount = 0;
10539
2
  if (cb.bracount >= GROUPINFO_DEFAULT_SIZE/2)
10540
0
    {
10541
0
    cb.groupinfo = ccontext->memctl.malloc(
10542
0
      (2 * (cb.bracount + 1))*sizeof(uint32_t), ccontext->memctl.memory_data);
10543
0
    if (cb.groupinfo == NULL)
10544
0
      {
10545
0
      errorcode = ERR21;
10546
0
      cb.erroroffset = 0;
10547
0
      goto HAD_CB_ERROR;
10548
0
      }
10549
0
    }
10550
2
  memset(cb.groupinfo, 0, (2 * cb.bracount + 1) * sizeof(uint32_t));
10551
2
  errorcode = check_lookbehinds(cb.parsed_pattern, NULL, NULL, &cb, &loopcount);
10552
2
  if (errorcode != 0) goto HAD_CB_ERROR;
10553
2
  }
10554
10555
/* For debugging, there is a function that shows the parsed pattern vector. */
10556
10557
#ifdef DEBUG_SHOW_PARSED
10558
fprintf(stderr, "+++ Pre-scan complete:\n");
10559
show_parsed(&cb);
10560
#endif
10561
10562
/* For debugging capturing information this code can be enabled. */
10563
10564
#ifdef DEBUG_SHOW_CAPTURES
10565
  {
10566
  named_group *ng = cb.named_groups;
10567
  fprintf(stderr, "+++Captures: %d\n", cb.bracount);
10568
  for (i = 0; i < cb.names_found; i++, ng++)
10569
    {
10570
    fprintf(stderr, "+++%3d %.*s\n", ng->number, ng->length, ng->name);
10571
    }
10572
  }
10573
#endif
10574
10575
/* Pretend to compile the pattern while actually just accumulating the amount
10576
of memory required in the 'length' variable. This behaviour is triggered by
10577
passing a non-NULL final argument to compile_regex(). We pass a block of
10578
workspace (cworkspace) for it to compile parts of the pattern into; the
10579
compiled code is discarded when it is no longer needed, so hopefully this
10580
workspace will never overflow, though there is a test for its doing so.
10581
10582
On error, errorcode will be set non-zero, so we don't need to look at the
10583
result of the function. The initial options have been put into the cb block,
10584
but we still have to pass a separate options variable (the first argument)
10585
because the options may change as the pattern is processed. */
10586
10587
1.03k
cb.erroroffset = patlen;   /* For any subsequent errors that do not set it */
10588
1.03k
pptr = cb.parsed_pattern;
10589
1.03k
code = cworkspace;
10590
1.03k
*code = OP_BRA;
10591
10592
1.03k
(void)compile_regex(cb.external_options, ccontext->extra_options, &code, &pptr,
10593
1.03k
   &errorcode, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, NULL,
10594
1.03k
   &cb, &length);
10595
10596
1.03k
if (errorcode != 0) goto HAD_CB_ERROR;  /* Offset is in cb.erroroffset */
10597
10598
/* This should be caught in compile_regex(), but just in case... */
10599
10600
962
if (length > MAX_PATTERN_SIZE)
10601
0
  {
10602
0
  errorcode = ERR20;
10603
0
  goto HAD_CB_ERROR;
10604
0
  }
10605
10606
/* Compute the size of, then, if not too large, get and initialize the data
10607
block for storing the compiled pattern and names table. Integer overflow should
10608
no longer be possible because nowadays we limit the maximum value of
10609
cb.names_found and cb.name_entry_size. */
10610
10611
962
re_blocksize = sizeof(pcre2_real_code) +
10612
962
  CU2BYTES(length +
10613
962
  (PCRE2_SIZE)cb.names_found * (PCRE2_SIZE)cb.name_entry_size);
10614
10615
962
if (re_blocksize > ccontext->max_pattern_compiled_length)
10616
0
  {
10617
0
  errorcode = ERR101;
10618
0
  goto HAD_CB_ERROR;
10619
0
  }
10620
10621
962
re = (pcre2_real_code *)
10622
962
  ccontext->memctl.malloc(re_blocksize, ccontext->memctl.memory_data);
10623
962
if (re == NULL)
10624
0
  {
10625
0
  errorcode = ERR21;
10626
0
  goto HAD_CB_ERROR;
10627
0
  }
10628
10629
/* The compiler may put padding at the end of the pcre2_real_code structure in
10630
order to round it up to a multiple of 4 or 8 bytes. This means that when a
10631
compiled pattern is copied (for example, when serialized) undefined bytes are
10632
read, and this annoys debuggers such as valgrind. To avoid this, we explicitly
10633
write to the last 8 bytes of the structure before setting the fields. */
10634
10635
962
memset((char *)re + sizeof(pcre2_real_code) - 8, 0, 8);
10636
962
re->memctl = ccontext->memctl;
10637
962
re->tables = tables;
10638
962
re->executable_jit = NULL;
10639
962
memset(re->start_bitmap, 0, 32 * sizeof(uint8_t));
10640
962
re->blocksize = re_blocksize;
10641
962
re->magic_number = MAGIC_NUMBER;
10642
962
re->compile_options = options;
10643
962
re->overall_options = cb.external_options;
10644
962
re->extra_options = ccontext->extra_options;
10645
962
re->flags = PCRE2_CODE_UNIT_WIDTH/8 | cb.external_flags | setflags;
10646
962
re->limit_heap = limit_heap;
10647
962
re->limit_match = limit_match;
10648
962
re->limit_depth = limit_depth;
10649
962
re->first_codeunit = 0;
10650
962
re->last_codeunit = 0;
10651
962
re->bsr_convention = bsr;
10652
962
re->newline_convention = newline;
10653
962
re->max_lookbehind = 0;
10654
962
re->minlength = 0;
10655
962
re->top_bracket = 0;
10656
962
re->top_backref = 0;
10657
962
re->name_entry_size = cb.name_entry_size;
10658
962
re->name_count = cb.names_found;
10659
10660
/* The basic block is immediately followed by the name table, and the compiled
10661
code follows after that. */
10662
10663
962
codestart = (PCRE2_SPTR)((uint8_t *)re + sizeof(pcre2_real_code)) +
10664
962
  re->name_entry_size * re->name_count;
10665
10666
/* Update the compile data block for the actual compile. The starting points of
10667
the name/number translation table and of the code are passed around in the
10668
compile data block. The start/end pattern and initial options are already set
10669
from the pre-compile phase, as is the name_entry_size field. */
10670
10671
962
cb.parens_depth = 0;
10672
962
cb.assert_depth = 0;
10673
962
cb.lastcapture = 0;
10674
962
cb.name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code));
10675
962
cb.start_code = codestart;
10676
962
cb.req_varyopt = 0;
10677
962
cb.had_accept = FALSE;
10678
962
cb.had_pruneorskip = FALSE;
10679
10680
/* If any named groups were found, create the name/number table from the list
10681
created in the pre-pass. */
10682
10683
962
if (cb.names_found > 0)
10684
0
  {
10685
0
  named_group *ng = cb.named_groups;
10686
0
  for (i = 0; i < cb.names_found; i++, ng++)
10687
0
    add_name_to_table(&cb, ng->name, ng->length, ng->number, i);
10688
0
  }
10689
10690
/* Set up a starting, non-extracting bracket, then compile the expression. On
10691
error, errorcode will be set non-zero, so we don't need to look at the result
10692
of the function here. */
10693
10694
962
pptr = cb.parsed_pattern;
10695
962
code = (PCRE2_UCHAR *)codestart;
10696
962
*code = OP_BRA;
10697
962
regexrc = compile_regex(re->overall_options, ccontext->extra_options, &code,
10698
962
  &pptr, &errorcode, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL,
10699
962
  NULL, &cb, NULL);
10700
962
if (regexrc < 0) re->flags |= PCRE2_MATCH_EMPTY;
10701
962
re->top_bracket = cb.bracount;
10702
962
re->top_backref = cb.top_backref;
10703
962
re->max_lookbehind = cb.max_lookbehind;
10704
10705
962
if (cb.had_accept)
10706
0
  {
10707
0
  reqcu = 0;                     /* Must disable after (*ACCEPT) */
10708
0
  reqcuflags = REQ_NONE;
10709
0
  re->flags |= PCRE2_HASACCEPT;  /* Disables minimum length */
10710
0
  }
10711
10712
/* Fill in the final opcode and check for disastrous overflow. If no overflow,
10713
but the estimated length exceeds the really used length, adjust the value of
10714
re->blocksize, and if valgrind support is configured, mark the extra allocated
10715
memory as unaddressable, so that any out-of-bound reads can be detected. */
10716
10717
962
*code++ = OP_END;
10718
962
usedlength = code - codestart;
10719
962
if (usedlength > length) errorcode = ERR23; else
10720
962
  {
10721
962
  re->blocksize -= CU2BYTES(length - usedlength);
10722
#ifdef SUPPORT_VALGRIND
10723
  VALGRIND_MAKE_MEM_NOACCESS(code, CU2BYTES(length - usedlength));
10724
#endif
10725
962
  }
10726
10727
/* Scan the pattern for recursion/subroutine calls and convert the group
10728
numbers into offsets. Maintain a small cache so that repeated groups containing
10729
recursions are efficiently handled. */
10730
10731
962
#define RSCAN_CACHE_SIZE 8
10732
10733
962
if (errorcode == 0 && cb.had_recurse)
10734
0
  {
10735
0
  PCRE2_UCHAR *rcode;
10736
0
  PCRE2_SPTR rgroup;
10737
0
  unsigned int ccount = 0;
10738
0
  int start = RSCAN_CACHE_SIZE;
10739
0
  recurse_cache rc[RSCAN_CACHE_SIZE];
10740
10741
0
  for (rcode = (PCRE2_UCHAR *)find_recurse(codestart, utf);
10742
0
       rcode != NULL;
10743
0
       rcode = (PCRE2_UCHAR *)find_recurse(rcode + 1 + LINK_SIZE, utf))
10744
0
    {
10745
0
    int p, groupnumber;
10746
10747
0
    groupnumber = (int)GET(rcode, 1);
10748
0
    if (groupnumber == 0) rgroup = codestart; else
10749
0
      {
10750
0
      PCRE2_SPTR search_from = codestart;
10751
0
      rgroup = NULL;
10752
0
      for (i = 0, p = start; i < ccount; i++, p = (p + 1) & 7)
10753
0
        {
10754
0
        if (groupnumber == rc[p].groupnumber)
10755
0
          {
10756
0
          rgroup = rc[p].group;
10757
0
          break;
10758
0
          }
10759
10760
        /* Group n+1 must always start to the right of group n, so we can save
10761
        search time below when the new group number is greater than any of the
10762
        previously found groups. */
10763
10764
0
        if (groupnumber > rc[p].groupnumber) search_from = rc[p].group;
10765
0
        }
10766
10767
0
      if (rgroup == NULL)
10768
0
        {
10769
0
        rgroup = PRIV(find_bracket)(search_from, utf, groupnumber);
10770
0
        if (rgroup == NULL)
10771
0
          {
10772
0
          errorcode = ERR53;
10773
0
          break;
10774
0
          }
10775
0
        if (--start < 0) start = RSCAN_CACHE_SIZE - 1;
10776
0
        rc[start].groupnumber = groupnumber;
10777
0
        rc[start].group = rgroup;
10778
0
        if (ccount < RSCAN_CACHE_SIZE) ccount++;
10779
0
        }
10780
0
      }
10781
10782
0
    PUT(rcode, 1, rgroup - codestart);
10783
0
    }
10784
0
  }
10785
10786
/* In rare debugging situations we sometimes need to look at the compiled code
10787
at this stage. */
10788
10789
#ifdef DEBUG_CALL_PRINTINT
10790
pcre2_printint(re, stderr, TRUE);
10791
fprintf(stderr, "Length=%lu Used=%lu\n", length, usedlength);
10792
#endif
10793
10794
/* Unless disabled, check whether any single character iterators can be
10795
auto-possessified. The function overwrites the appropriate opcode values, so
10796
the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
10797
used in this code because at least one compiler gives a warning about loss of
10798
"const" attribute if the cast (PCRE2_UCHAR *)codestart is used directly in the
10799
function call. */
10800
10801
962
if (errorcode == 0 && (re->overall_options & PCRE2_NO_AUTO_POSSESS) == 0)
10802
962
  {
10803
962
  PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;
10804
962
  if (PRIV(auto_possessify)(temp, &cb) != 0) errorcode = ERR80;
10805
962
  }
10806
10807
/* Failed to compile, or error while post-processing. */
10808
10809
962
if (errorcode != 0) goto HAD_CB_ERROR;
10810
10811
/* Successful compile. If the anchored option was not passed, set it if
10812
we can determine that the pattern is anchored by virtue of ^ characters or \A
10813
or anything else, such as starting with non-atomic .* when DOTALL is set and
10814
there are no occurrences of *PRUNE or *SKIP (though there is an option to
10815
disable this case). */
10816
10817
962
if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
10818
939
     is_anchored(codestart, 0, &cb, 0, FALSE))
10819
9
  re->overall_options |= PCRE2_ANCHORED;
10820
10821
/* Set up the first code unit or startline flag, the required code unit, and
10822
then study the pattern. This code need not be obeyed if PCRE2_NO_START_OPTIMIZE
10823
is set, as the data it would create will not be used. Note that a first code
10824
unit (but not the startline flag) is useful for anchored patterns because it
10825
can still give a quick "no match" and also avoid searching for a last code
10826
unit. */
10827
10828
962
if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
10829
962
  {
10830
962
  int minminlength = 0;  /* For minimal minlength from first/required CU */
10831
10832
  /* If we do not have a first code unit, see if there is one that is asserted
10833
  (these are not saved during the compile because they can cause conflicts with
10834
  actual literals that follow). */
10835
10836
962
  if (firstcuflags >= REQ_NONE)
10837
872
    firstcu = find_firstassertedcu(codestart, &firstcuflags, 0);
10838
10839
  /* Save the data for a first code unit. The existence of one means the
10840
  minimum length must be at least 1. */
10841
10842
962
  if (firstcuflags < REQ_NONE)
10843
90
    {
10844
90
    re->first_codeunit = firstcu;
10845
90
    re->flags |= PCRE2_FIRSTSET;
10846
90
    minminlength++;
10847
10848
    /* Handle caseless first code units. */
10849
10850
90
    if ((firstcuflags & REQ_CASELESS) != 0)
10851
44
      {
10852
44
      if (firstcu < 128 || (!utf && !ucp && firstcu < 255))
10853
43
        {
10854
43
        if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS;
10855
43
        }
10856
10857
      /* The first code unit is >= 128 in UTF or UCP mode, or >= 255 otherwise.
10858
      In 8-bit UTF mode, codepoints in the range 128-255 are introductory code
10859
      points and cannot have another case, but if UCP is set they may do. */
10860
10861
1
#ifdef SUPPORT_UNICODE
10862
1
#if PCRE2_CODE_UNIT_WIDTH == 8
10863
1
      else if (ucp && !utf && UCD_OTHERCASE(firstcu) != firstcu)
10864
0
        re->flags |= PCRE2_FIRSTCASELESS;
10865
#else
10866
      else if ((utf || ucp) && firstcu <= MAX_UTF_CODE_POINT &&
10867
               UCD_OTHERCASE(firstcu) != firstcu)
10868
        re->flags |= PCRE2_FIRSTCASELESS;
10869
#endif
10870
44
#endif  /* SUPPORT_UNICODE */
10871
44
      }
10872
90
    }
10873
10874
  /* When there is no first code unit, for non-anchored patterns, see if we can
10875
  set the PCRE2_STARTLINE flag. This is helpful for multiline matches when all
10876
  branches start with ^ and also when all branches start with non-atomic .* for
10877
  non-DOTALL matches when *PRUNE and *SKIP are not present. (There is an option
10878
  that disables this case.) */
10879
10880
872
  else if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
10881
847
           is_startline(codestart, 0, &cb, 0, FALSE))
10882
1
    re->flags |= PCRE2_STARTLINE;
10883
10884
  /* Handle the "required code unit", if one is set. In the UTF case we can
10885
  increment the minimum minimum length only if we are sure this really is a
10886
  different character and not a non-starting code unit of the first character,
10887
  because the minimum length count is in characters, not code units. */
10888
10889
962
  if (reqcuflags < REQ_NONE)
10890
234
    {
10891
#if PCRE2_CODE_UNIT_WIDTH == 16
10892
    if ((re->overall_options & PCRE2_UTF) == 0 ||   /* Not UTF */
10893
        firstcuflags >= REQ_NONE ||                 /* First not set */
10894
        (firstcu & 0xf800) != 0xd800 ||             /* First not surrogate */
10895
        (reqcu & 0xfc00) != 0xdc00)                 /* Req not low surrogate */
10896
#elif PCRE2_CODE_UNIT_WIDTH == 8
10897
234
    if ((re->overall_options & PCRE2_UTF) == 0 ||   /* Not UTF */
10898
127
        firstcuflags >= REQ_NONE ||                 /* First not set */
10899
23
        (firstcu & 0x80) == 0 ||                    /* First is ASCII */
10900
0
        (reqcu & 0x80) == 0)                        /* Req is ASCII */
10901
234
#endif
10902
234
      {
10903
234
      minminlength++;
10904
234
      }
10905
10906
    /* In the case of an anchored pattern, set up the value only if it follows
10907
    a variable length item in the pattern. */
10908
10909
234
    if ((re->overall_options & PCRE2_ANCHORED) == 0 ||
10910
7
        (reqcuflags & REQ_VARY) != 0)
10911
231
      {
10912
231
      re->last_codeunit = reqcu;
10913
231
      re->flags |= PCRE2_LASTSET;
10914
10915
      /* Handle caseless required code units as for first code units (above). */
10916
10917
231
      if ((reqcuflags & REQ_CASELESS) != 0)
10918
85
        {
10919
85
        if (reqcu < 128 || (!utf && !ucp && reqcu < 255))
10920
85
          {
10921
85
          if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
10922
85
          }
10923
0
#ifdef SUPPORT_UNICODE
10924
0
#if PCRE2_CODE_UNIT_WIDTH == 8
10925
0
      else if (ucp && !utf && UCD_OTHERCASE(reqcu) != reqcu)
10926
0
        re->flags |= PCRE2_LASTCASELESS;
10927
#else
10928
      else if ((utf || ucp) && reqcu <= MAX_UTF_CODE_POINT &&
10929
               UCD_OTHERCASE(reqcu) != reqcu)
10930
        re->flags |= PCRE2_LASTCASELESS;
10931
#endif
10932
85
#endif  /* SUPPORT_UNICODE */
10933
85
        }
10934
231
      }
10935
234
    }
10936
10937
  /* Study the compiled pattern to set up information such as a bitmap of
10938
  starting code units and a minimum matching length. */
10939
10940
962
  if (PRIV(study)(re) != 0)
10941
0
    {
10942
0
    errorcode = ERR31;
10943
0
    goto HAD_CB_ERROR;
10944
0
    }
10945
10946
  /* If study() set a bitmap of starting code units, it implies a minimum
10947
  length of at least one. */
10948
10949
962
  if ((re->flags & PCRE2_FIRSTMAPSET) != 0 && minminlength == 0)
10950
395
    minminlength = 1;
10951
10952
  /* If the minimum length set (or not set) by study() is less than the minimum
10953
  implied by required code units, override it. */
10954
10955
962
  if (re->minlength < minminlength) re->minlength = minminlength;
10956
962
  }   /* End of start-of-match optimizations. */
10957
10958
/* Control ends up here in all cases. When running under valgrind, make a
10959
pattern's terminating zero defined again. If memory was obtained for the parsed
10960
version of the pattern, free it before returning. Also free the list of named
10961
groups if a larger one had to be obtained, and likewise the group information
10962
vector. */
10963
10964
1.52k
EXIT:
10965
#ifdef SUPPORT_VALGRIND
10966
if (zero_terminated) VALGRIND_MAKE_MEM_DEFINED(pattern + patlen, CU2BYTES(1));
10967
#endif
10968
1.52k
if (cb.parsed_pattern != stack_parsed_pattern)
10969
137
  ccontext->memctl.free(cb.parsed_pattern, ccontext->memctl.memory_data);
10970
1.52k
if (cb.named_group_list_size > NAMED_GROUP_LIST_SIZE)
10971
0
  ccontext->memctl.free((void *)cb.named_groups, ccontext->memctl.memory_data);
10972
1.52k
if (cb.groupinfo != stack_groupinfo)
10973
0
  ccontext->memctl.free((void *)cb.groupinfo, ccontext->memctl.memory_data);
10974
1.52k
return re;    /* Will be NULL after an error */
10975
10976
/* Errors discovered in parse_regex() set the offset value in the compile
10977
block. Errors discovered before it is called must compute it from the ptr
10978
value. After parse_regex() is called, the offset in the compile block is set to
10979
the end of the pattern, but certain errors in compile_regex() may reset it if
10980
an offset is available in the parsed pattern. */
10981
10982
540
HAD_CB_ERROR:
10983
540
ptr = pattern + cb.erroroffset;
10984
10985
540
HAD_EARLY_ERROR:
10986
540
*erroroffset = ptr - pattern;
10987
10988
566
HAD_ERROR:
10989
566
*errorptr = errorcode;
10990
566
pcre2_code_free(re);
10991
re = NULL;
10992
566
goto EXIT;
10993
540
}
10994
10995
/* These #undefs are here to enable unity builds with CMake. */
10996
10997
#undef NLBLOCK /* Block containing newline information */
10998
#undef PSSTART /* Field containing processed string start */
10999
#undef PSEND   /* Field containing processed string end */
11000
11001
/* End of pcre2_compile.c */