Coverage Report

Created: 2025-10-13 06:31

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/glib/glib/pcre/pcre_compile.c
Line
Count
Source
1
/*************************************************
2
*      Perl-Compatible Regular Expressions       *
3
*************************************************/
4
5
/* PCRE is a library of functions to support regular expressions whose syntax
6
and semantics are as close as possible to those of the Perl 5 language.
7
8
                       Written by Philip Hazel
9
           Copyright (c) 1997-2012 University of Cambridge
10
11
-----------------------------------------------------------------------------
12
Redistribution and use in source and binary forms, with or without
13
modification, are permitted provided that the following conditions are met:
14
15
    * Redistributions of source code must retain the above copyright notice,
16
      this list of conditions and the following disclaimer.
17
18
    * Redistributions in binary form must reproduce the above copyright
19
      notice, this list of conditions and the following disclaimer in the
20
      documentation and/or other materials provided with the distribution.
21
22
    * Neither the name of the University of Cambridge nor the names of its
23
      contributors may be used to endorse or promote products derived from
24
      this software without specific prior written permission.
25
26
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36
POSSIBILITY OF SUCH DAMAGE.
37
-----------------------------------------------------------------------------
38
*/
39
40
41
/* This module contains the external function pcre_compile(), along with
42
supporting internal functions that are not used by other modules. */
43
44
45
#include "config.h"
46
47
0
#define NLBLOCK cd             /* Block containing newline information */
48
#define PSSTART start_pattern  /* Field containing processed string start */
49
0
#define PSEND   end_pattern    /* Field containing processed string end */
50
51
#include "pcre_internal.h"
52
53
#ifdef GLIB_COMPILATION
54
#include "gstrfuncs.h"
55
#else
56
#include <glib.h>
57
#endif
58
59
/* When PCRE_DEBUG is defined, we need the pcre(16)_printint() function, which
60
is also used by pcretest. PCRE_DEBUG is not defined when building a production
61
library. We do not need to select pcre16_printint.c specially, because the
62
COMPILE_PCREx macro will already be appropriately set. */
63
64
#ifdef PCRE_DEBUG
65
/* pcre_printint.c should not include any headers */
66
#define PCRE_INCLUDED
67
#include "pcre_printint.c"
68
#undef PCRE_INCLUDED
69
#endif
70
71
72
/* Macro for setting individual bits in class bitmaps. */
73
74
0
/* Set bit number b in the byte-array bitmap a. Both arguments and the whole
expansion are parenthesized so that expression arguments such as "base + 1"
index and shift correctly. Note that b is evaluated twice. */
#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
75
76
/* Maximum length value to check against when making sure that the integer that
77
holds the compiled pattern length does not overflow. We make it a bit less than
78
INT_MAX to allow for adding in group terminating bytes, so that we don't have
79
to check them every time. */
80
81
0
#define OFLOW_MAX (INT_MAX - 20)
82
83
84
/*************************************************
85
*      Code parameters and static tables         *
86
*************************************************/
87
88
/* This value specifies the size of stack workspace that is used during the
89
first pre-compile phase that determines how much memory is required. The regex
90
is partly compiled into this space, but the compiled parts are discarded as
91
soon as they can be, so that hopefully there will never be an overrun. The code
92
does, however, check for an overrun. The largest amount I've seen used is 218,
93
so this number is very generous.
94
95
The same workspace is used during the second, actual compile phase for
96
remembering forward references to groups so that they can be filled in at the
97
end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
98
is 4 there is plenty of room for most patterns. However, the memory can get
99
filled up by repetitions of forward references, for example patterns like
100
/(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
101
that the workspace is expanded using malloc() in this situation. The value
102
below is therefore a minimum, and we put a maximum on it for safety. The
103
minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
104
kicks in at the same number of forward references in all cases. */
105
106
0
#define COMPILE_WORK_SIZE (2048*LINK_SIZE)
107
0
#define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)
108
109
/* The overrun tests check for a slightly smaller size so that they detect the
110
overrun before it actually does run off the end of the data block. */
111
112
0
#define WORK_SIZE_SAFETY_MARGIN (100)
113
114
/* Private flags added to firstchar and reqchar. */
115
116
0
#define REQ_CASELESS   0x10000000l      /* Indicates caselessness */
117
0
#define REQ_VARY       0x20000000l      /* Reqchar followed non-literal item */
118
119
/* Repeated character flags. */
120
121
0
#define UTF_LENGTH     0x10000000l      /* The char contains its length. */
122
123
/* Table for handling escaped characters in the range '0'-'z'. Positive returns
124
are simple data values; negative values are for special things like \d and so
125
on. Zero means further processing is needed (for things like \x), or the escape
126
is invalid. */
127
128
#ifndef EBCDIC
129
130
/* This is the "normal" table for ASCII systems or for EBCDIC systems running
131
in UTF-8 mode. */
132
133
static const short int escapes[] = {
134
     0,                       0,
135
     0,                       0,
136
     0,                       0,
137
     0,                       0,
138
     0,                       0,
139
     CHAR_COLON,              CHAR_SEMICOLON,
140
     CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,
141
     CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,
142
     CHAR_COMMERCIAL_AT,      -ESC_A,
143
     -ESC_B,                  -ESC_C,
144
     -ESC_D,                  -ESC_E,
145
     0,                       -ESC_G,
146
     -ESC_H,                  0,
147
     0,                       -ESC_K,
148
     0,                       0,
149
     -ESC_N,                  0,
150
     -ESC_P,                  -ESC_Q,
151
     -ESC_R,                  -ESC_S,
152
     0,                       0,
153
     -ESC_V,                  -ESC_W,
154
     -ESC_X,                  0,
155
     -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,
156
     CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,
157
     CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,
158
     CHAR_GRAVE_ACCENT,       7,
159
     -ESC_b,                  0,
160
     -ESC_d,                  ESC_e,
161
     ESC_f,                   0,
162
     -ESC_h,                  0,
163
     0,                       -ESC_k,
164
     0,                       0,
165
     ESC_n,                   0,
166
     -ESC_p,                  0,
167
     ESC_r,                   -ESC_s,
168
     ESC_tee,                 0,
169
     -ESC_v,                  -ESC_w,
170
     0,                       0,
171
     -ESC_z
172
};
173
174
#else
175
176
/* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
177
178
static const short int escapes[] = {
179
/*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
180
/*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
181
/*  58 */     0,     0,    '!',     '$',    '*',   ')',    ';',    '~',
182
/*  60 */   '-',   '/',      0,       0,      0,     0,      0,      0,
183
/*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',
184
/*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
185
/*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
186
/*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
187
/*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
188
/*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
189
/*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
190
/*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
191
/*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
192
/*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
193
/*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
194
/*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
195
/*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
196
/*  D0 */   '}',     0, -ESC_K,       0,      0,-ESC_N,      0, -ESC_P,
197
/*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
198
/*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
199
/*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
200
/*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
201
/*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
202
};
203
#endif
204
205
206
/* Table of special "verbs" like (*PRUNE). This is a short table, so it is
207
searched linearly. Put all the names into a single string, in order to reduce
208
the number of relocations when a shared library is dynamically linked. The
209
string is built from string macros so that it works in UTF-8 mode on EBCDIC
210
platforms. */
211
212
typedef struct verbitem {
213
  int   len;                 /* Length of verb name */
214
  int   op;                  /* Op when no arg, or -1 if arg mandatory */
215
  int   op_arg;              /* Op when arg present, or -1 if not allowed */
216
} verbitem;
217
218
static const char verbnames[] =
219
  "\0"                       /* Empty name is a shorthand for MARK */
220
  STRING_MARK0
221
  STRING_ACCEPT0
222
  STRING_COMMIT0
223
  STRING_F0
224
  STRING_FAIL0
225
  STRING_PRUNE0
226
  STRING_SKIP0
227
  STRING_THEN;
228
229
static const verbitem verbs[] = {
230
  { 0, -1,        OP_MARK },
231
  { 4, -1,        OP_MARK },
232
  { 6, OP_ACCEPT, -1 },
233
  { 6, OP_COMMIT, -1 },
234
  { 1, OP_FAIL,   -1 },
235
  { 4, OP_FAIL,   -1 },
236
  { 5, OP_PRUNE,  OP_PRUNE_ARG },
237
  { 4, OP_SKIP,   OP_SKIP_ARG  },
238
  { 4, OP_THEN,   OP_THEN_ARG  }
239
};
240
241
/* Number of entries in verbs[], derived from the array's own element size. */
static const int verbcount = sizeof(verbs)/sizeof(verbs[0]);
242
243
244
/* Tables of names of POSIX character classes and their lengths. The names are
245
now all in a single string, to reduce the number of relocations when a shared
246
library is dynamically loaded. The list of lengths is terminated by a zero
247
length entry. The first three must be alpha, lower, upper, as this is assumed
248
for handling case independence. */
249
250
static const char posix_names[] =
251
  STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
252
  STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
253
  STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
254
  STRING_word0  STRING_xdigit;
255
256
static const pcre_uint8 posix_name_lengths[] = {
257
  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
258
259
/* Table of class bit maps for each POSIX class. Each class is formed from a
260
base map, with an optional addition or removal of another map. Then, for some
261
classes, there is some additional tweaking: for [:blank:] the vertical space
262
characters are removed, and for [:alpha:] and [:alnum:] the underscore
263
character is removed. The triples in the table consist of the base map offset,
264
second map offset or -1 if no second map, and a non-negative value for map
265
addition or a negative value for map subtraction (if there are two maps). The
266
absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
267
remove vertical space characters, 2 => remove underscore. */
268
269
static const int posix_class_maps[] = {
270
  cbit_word,  cbit_digit, -2,             /* alpha */
271
  cbit_lower, -1,          0,             /* lower */
272
  cbit_upper, -1,          0,             /* upper */
273
  cbit_word,  -1,          2,             /* alnum - word without underscore */
274
  cbit_print, cbit_cntrl,  0,             /* ascii */
275
  cbit_space, -1,          1,             /* blank - a GNU extension */
276
  cbit_cntrl, -1,          0,             /* cntrl */
277
  cbit_digit, -1,          0,             /* digit */
278
  cbit_graph, -1,          0,             /* graph */
279
  cbit_print, -1,          0,             /* print */
280
  cbit_punct, -1,          0,             /* punct */
281
  cbit_space, -1,          0,             /* space */
282
  cbit_word,  -1,          0,             /* word - a Perl extension */
283
  cbit_xdigit,-1,          0              /* xdigit */
284
};
285
286
/* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class
287
substitutes must be in the order of the names, defined above, and there are
288
both positive and negative cases. NULL means no substitute. */
289
290
#ifdef SUPPORT_UCP
291
static const pcre_uchar string_PNd[]  = {
292
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
293
  CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
294
static const pcre_uchar string_pNd[]  = {
295
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
296
  CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
297
static const pcre_uchar string_PXsp[] = {
298
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
299
  CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
300
static const pcre_uchar string_pXsp[] = {
301
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
302
  CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
303
static const pcre_uchar string_PXwd[] = {
304
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
305
  CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
306
static const pcre_uchar string_pXwd[] = {
307
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
308
  CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
309
310
static const pcre_uchar *substitutes[] = {
311
  string_PNd,           /* \D */
312
  string_pNd,           /* \d */
313
  string_PXsp,          /* \S */       /* NOTE: Xsp is Perl space */
314
  string_pXsp,          /* \s */
315
  string_PXwd,          /* \W */
316
  string_pXwd           /* \w */
317
};
318
319
static const pcre_uchar string_pL[] =   {
320
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
321
  CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
322
static const pcre_uchar string_pLl[] =  {
323
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
324
  CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
325
static const pcre_uchar string_pLu[] =  {
326
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
327
  CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
328
static const pcre_uchar string_pXan[] = {
329
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
330
  CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
331
static const pcre_uchar string_h[] =    {
332
  CHAR_BACKSLASH, CHAR_h, '\0' };
333
static const pcre_uchar string_pXps[] = {
334
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
335
  CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
336
static const pcre_uchar string_PL[] =   {
337
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
338
  CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
339
static const pcre_uchar string_PLl[] =  {
340
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
341
  CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
342
static const pcre_uchar string_PLu[] =  {
343
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
344
  CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
345
static const pcre_uchar string_PXan[] = {
346
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
347
  CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
348
static const pcre_uchar string_H[] =    {
349
  CHAR_BACKSLASH, CHAR_H, '\0' };
350
static const pcre_uchar string_PXps[] = {
351
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
352
  CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
353
354
static const pcre_uchar *posix_substitutes[] = {
355
  string_pL,            /* alpha */
356
  string_pLl,           /* lower */
357
  string_pLu,           /* upper */
358
  string_pXan,          /* alnum */
359
  NULL,                 /* ascii */
360
  string_h,             /* blank */
361
  NULL,                 /* cntrl */
362
  string_pNd,           /* digit */
363
  NULL,                 /* graph */
364
  NULL,                 /* print */
365
  NULL,                 /* punct */
366
  string_pXps,          /* space */    /* NOTE: Xps is POSIX space */
367
  string_pXwd,          /* word */
368
  NULL,                 /* xdigit */
369
  /* Negated cases */
370
  string_PL,            /* ^alpha */
371
  string_PLl,           /* ^lower */
372
  string_PLu,           /* ^upper */
373
  string_PXan,          /* ^alnum */
374
  NULL,                 /* ^ascii */
375
  string_H,             /* ^blank */
376
  NULL,                 /* ^cntrl */
377
  string_PNd,           /* ^digit */
378
  NULL,                 /* ^graph */
379
  NULL,                 /* ^print */
380
  NULL,                 /* ^punct */
381
  string_PXps,          /* ^space */   /* NOTE: Xps is POSIX space */
382
  string_PXwd,          /* ^word */
383
  NULL                  /* ^xdigit */
384
};
385
0
#define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))
386
#endif
387
388
#define STRING(a)  # a
389
#define XSTRING(s) STRING(s)
390
391
/* The texts of compile-time error messages. These are "char *" because they
392
are passed to the outside world. Do not ever re-use any error number, because
393
they are documented. Always add a new error instead. Messages marked DEAD below
394
are no longer used. This used to be a table of strings, but in order to reduce
395
the number of relocations needed when a shared library is loaded dynamically,
396
it is now one long string. We cannot use a table of offsets, because the
397
lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
398
simply count through to the one we want - this isn't a performance issue
399
because these strings are used only when there is a compilation error.
400
401
Each substring ends with \0 to insert a null character. This includes the final
402
substring, so that the whole string ends with \0\0, which can be detected when
403
counting through. */
404
405
static const char error_texts[] =
406
  "no error\0"
407
  "\\ at end of pattern\0"
408
  "\\c at end of pattern\0"
409
  "unrecognized character follows \\\0"
410
  "numbers out of order in {} quantifier\0"
411
  /* 5 */
412
  "number too big in {} quantifier\0"
413
  "missing terminating ] for character class\0"
414
  "invalid escape sequence in character class\0"
415
  "range out of order in character class\0"
416
  "nothing to repeat\0"
417
  /* 10 */
418
  "operand of unlimited repeat could match the empty string\0"  /** DEAD **/
419
  "internal error: unexpected repeat\0"
420
  "unrecognized character after (? or (?-\0"
421
  "POSIX named classes are supported only within a class\0"
422
  "missing )\0"
423
  /* 15 */
424
  "reference to non-existent subpattern\0"
425
  "erroffset passed as NULL\0"
426
  "unknown option bit(s) set\0"
427
  "missing ) after comment\0"
428
  "parentheses nested too deeply\0"  /** DEAD **/
429
  /* 20 */
430
  "regular expression is too large\0"
431
  "failed to get memory\0"
432
  "unmatched parentheses\0"
433
  "internal error: code overflow\0"
434
  "unrecognized character after (?<\0"
435
  /* 25 */
436
  "lookbehind assertion is not fixed length\0"
437
  "malformed number or name after (?(\0"
438
  "conditional group contains more than two branches\0"
439
  "assertion expected after (?(\0"
440
  "(?R or (?[+-]digits must be followed by )\0"
441
  /* 30 */
442
  "unknown POSIX class name\0"
443
  "POSIX collating elements are not supported\0"
444
  "this version of PCRE is compiled without UTF support\0"
445
  "spare error\0"  /** DEAD **/
446
  "character value in \\x{...} sequence is too large\0"
447
  /* 35 */
448
  "invalid condition (?(0)\0"
449
  "\\C not allowed in lookbehind assertion\0"
450
  "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
451
  "number after (?C is > 255\0"
452
  "closing ) for (?C expected\0"
453
  /* 40 */
454
  "recursive call could loop indefinitely\0"
455
  "unrecognized character after (?P\0"
456
  "syntax error in subpattern name (missing terminator)\0"
457
  "two named subpatterns have the same name\0"
458
  "invalid UTF-8 string\0"
459
  /* 45 */
460
  "support for \\P, \\p, and \\X has not been compiled\0"
461
  "malformed \\P or \\p sequence\0"
462
  "unknown property name after \\P or \\p\0"
463
  "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
464
  "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
465
  /* 50 */
466
  "repeated subpattern is too long\0"    /** DEAD **/
467
  "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0"
468
  "internal error: overran compiling workspace\0"
469
  "internal error: previously-checked referenced subpattern not found\0"
470
  "DEFINE group contains more than one branch\0"
471
  /* 55 */
472
  "repeating a DEFINE group is not allowed\0"  /** DEAD **/
473
  "inconsistent NEWLINE options\0"
474
  "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
475
  "a numbered reference must not be zero\0"
476
  "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
477
  /* 60 */
478
  "(*VERB) not recognized\0"
479
  "number is too big\0"
480
  "subpattern name expected\0"
481
  "digit expected after (?+\0"
482
  "] is an invalid data character in JavaScript compatibility mode\0"
483
  /* 65 */
484
  "different names for subpatterns of the same number are not allowed\0"
485
  "(*MARK) must have an argument\0"
486
  "this version of PCRE is not compiled with Unicode property support\0"
487
  "\\c must be followed by an ASCII character\0"
488
  "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
489
  /* 70 */
490
  "internal error: unknown opcode in find_fixedlength()\0"
491
  "\\N is not supported in a class\0"
492
  "too many forward references\0"
493
  "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
494
  "invalid UTF-16 string\0"
495
  /* 75 */
496
  "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
497
  "character value in \\u.... sequence is too large\0"
498
  ;
499
500
/* Table to identify digits and hex digits. This is used when compiling
501
patterns. Note that the tables in chartables are dependent on the locale, and
502
may mark arbitrary characters as digits - but the PCRE compiling code expects
503
to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
504
a private table here. It costs 256 bytes, but it is a lot faster than doing
505
character value tests (at least in some simple cases I timed), and in some
506
applications one wants PCRE to compile efficiently as well as match
507
efficiently.
508
509
For convenience, we use the same bit definitions as in chartables:
510
511
  0x04   decimal digit
512
  0x08   hexadecimal digit
513
514
Then we can use ctype_digit and ctype_xdigit in the code. */
515
516
/* Using a simple comparison for decimal numbers rather than a memory read
517
is much faster, and the resulting code is simpler (the compiler turns it
518
into a subtraction and unsigned comparison). */
519
520
0
#define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
521
522
#if 0
523
#ifndef EBCDIC
524
525
/* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
526
UTF-8 mode. */
527
528
static const pcre_uint8 digitab[] =
529
  {
530
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
531
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
532
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
533
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
534
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
535
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
536
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
537
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
538
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
539
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
540
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
541
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
542
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
543
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
544
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
545
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
546
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
547
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
548
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
549
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
550
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
551
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
552
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
553
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
554
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
555
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
556
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
557
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
558
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
559
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
560
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
561
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
562
563
#else
564
565
/* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
566
567
static const pcre_uint8 digitab[] =
568
  {
569
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
570
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
571
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
572
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
573
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
574
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
575
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
576
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
577
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
578
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
579
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
580
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
581
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
582
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
583
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
584
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
585
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
586
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
587
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
588
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
589
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
590
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
591
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
592
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
593
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
594
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
595
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
596
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
597
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
598
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
599
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
600
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
601
602
static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup */
603
  0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
604
  0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
605
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
606
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
607
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
608
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
609
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
610
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
611
  0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
612
  0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
613
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
614
  0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
615
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
616
  0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
617
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
618
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
619
  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
620
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
621
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
622
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
623
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
624
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
625
  0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
626
  0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
627
  0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
628
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
629
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
630
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
631
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
632
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
633
  0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
634
  0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
635
#endif
636
#endif /* 0 */
637
638
/* Definition to allow mutual recursion */
639
640
static BOOL
641
  compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
642
    int *, int *, branch_chain *, compile_data *, int *);
643
644
645
646
/*************************************************
647
*            Find an error text                  *
648
*************************************************/
649
650
/* The error texts are now all in one long string, to save on relocations. As
651
some of the text is of unknown length, we can't use a table of offsets.
652
Instead, just count through the strings. This is not a performance issue
653
because it happens only when there has been a compilation error.
654
655
Argument:   the error number
656
Returns:    pointer to the error string
657
*/
658
659
static const char *
660
find_error_text(int n)
661
0
{
662
0
const char *s = error_texts;
663
0
for (; n > 0; n--)
664
0
  {
665
0
  while (*s++ != 0) {};
666
0
  if (*s == 0) return "Error text not found (please report)";
667
0
  }
668
0
return s;
669
0
}
670
671
672
/*************************************************
673
*           Expand the workspace                 *
674
*************************************************/
675
676
/* This function is called during the second compiling phase, if the number of
677
forward references fills the existing workspace, which is originally a block on
678
the stack. A larger block is obtained from malloc() unless the ultimate limit
679
has been reached or the increase will be rather small.
680
681
Argument: pointer to the compile data block
682
Returns:  0 if all went well, else an error number
683
*/
684
685
static int
686
expand_workspace(compile_data *cd)
687
0
{
688
0
pcre_uchar *newspace;
689
0
int newsize = cd->workspace_size * 2;
690
691
0
if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
692
0
if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
693
0
    newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
694
0
 return ERR72;
695
696
0
newspace = (PUBL(malloc))(IN_UCHARS(newsize));
697
0
if (newspace == NULL) return ERR21;
698
0
memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar));
699
0
cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
700
0
if (cd->workspace_size > COMPILE_WORK_SIZE)
701
0
  (PUBL(free))((void *)cd->start_workspace);
702
0
cd->start_workspace = newspace;
703
0
cd->workspace_size = newsize;
704
0
return 0;
705
0
}
706
707
708
709
/*************************************************
710
*            Check for counted repeat            *
711
*************************************************/
712
713
/* This function is called when a '{' is encountered in a place where it might
714
start a quantifier. It looks ahead to see if it really is a quantifier or not.
715
It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
716
where the ddds are digits.
717
718
Arguments:
719
  p         pointer to the first char after '{'
720
721
Returns:    TRUE or FALSE
722
*/
723
724
static BOOL
725
is_counted_repeat(const pcre_uchar *p)
726
0
{
727
0
if (!IS_DIGIT(*p)) return FALSE;
728
0
p++;
729
0
while (IS_DIGIT(*p)) p++;
730
0
if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
731
732
0
if (*p++ != CHAR_COMMA) return FALSE;
733
0
if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
734
735
0
if (!IS_DIGIT(*p)) return FALSE;
736
0
p++;
737
0
while (IS_DIGIT(*p)) p++;
738
739
0
return (*p == CHAR_RIGHT_CURLY_BRACKET);
740
0
}
741
742
743
744
/*************************************************
745
*            Handle escapes                      *
746
*************************************************/
747
748
/* This function is called when a \ has been encountered. It either returns a
749
positive value for a simple escape such as \n, or a negative value which
750
encodes one of the more complicated things such as \d. A backreference to group
751
n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
752
UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
753
ptr is pointing at the \. On exit, it is on the final character of the escape
754
sequence.
755
756
Arguments:
757
  ptrptr         points to the pattern position pointer
758
  errorcodeptr   points to the errorcode variable
759
  bracount       number of previous extracting brackets
760
  options        the options bits
761
  isclass        TRUE if inside a character class
762
763
Returns:         zero or positive => a data character
764
                 negative => a special escape sequence
765
                 on error, errorcodeptr is set
766
*/
767
768
static int
769
check_escape(const pcre_uchar **ptrptr, int *errorcodeptr, int bracount,
770
  int options, BOOL isclass)
771
0
{
772
/* PCRE_UTF16 has the same value as PCRE_UTF8. */
773
0
BOOL utf = (options & PCRE_UTF8) != 0;
774
0
const pcre_uchar *ptr = *ptrptr + 1;
775
0
pcre_int32 c;
776
0
int i;
777
778
0
GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
779
0
ptr--;                            /* Set pointer back to the last byte */
780
781
/* If backslash is at the end of the pattern, it's an error. */
782
783
0
if (c == 0) *errorcodeptr = ERR1;
784
785
/* Non-alphanumerics are literals. For digits or letters, do an initial lookup
786
in a table. A non-zero result is something that can be returned immediately.
787
Otherwise further processing may be required. */
788
789
0
#ifndef EBCDIC  /* ASCII/UTF-8 coding */
790
/* Not alphanumeric */
791
0
else if (c < CHAR_0 || c > CHAR_z) {}
792
0
else if ((i = escapes[c - CHAR_0]) != 0) c = i;
793
794
#else           /* EBCDIC coding */
795
/* Not alphanumeric */
796
else if (c < 'a' || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
797
else if ((i = escapes[c - 0x48]) != 0)  c = i;
798
#endif
799
800
/* Escapes that need further processing, or are illegal. */
801
802
0
else
803
0
  {
804
0
  const pcre_uchar *oldptr;
805
0
  BOOL braced, negated;
806
807
0
  switch (c)
808
0
    {
809
    /* A number of Perl escapes are not handled by PCRE. We give an explicit
810
    error. */
811
812
0
    case CHAR_l:
813
0
    case CHAR_L:
814
0
    *errorcodeptr = ERR37;
815
0
    break;
816
817
0
    case CHAR_u:
818
0
    if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
819
0
      {
820
      /* In JavaScript, \u must be followed by four hexadecimal numbers.
821
      Otherwise it is a lowercase u letter. */
822
0
      if (MAX_255(ptr[1]) && g_ascii_isxdigit(ptr[1]) != 0
823
0
        && MAX_255(ptr[2]) && g_ascii_isxdigit(ptr[2]) != 0
824
0
        && MAX_255(ptr[3]) && g_ascii_isxdigit(ptr[3]) != 0
825
0
        && MAX_255(ptr[4]) && g_ascii_isxdigit(ptr[4]) != 0)
826
0
        {
827
0
        c = 0;
828
0
        for (i = 0; i < 4; ++i)
829
0
          {
830
0
          int cc = *(++ptr);
831
0
#ifndef EBCDIC  /* ASCII/UTF-8 coding */
832
0
          if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
833
0
          c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
834
#else           /* EBCDIC coding */
835
          if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
836
          c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
837
#endif
838
0
          }
839
840
0
#ifdef COMPILE_PCRE8
841
0
        if (c > (utf ? 0x10ffff : 0xff))
842
#else
843
#ifdef COMPILE_PCRE16
844
        if (c > (utf ? 0x10ffff : 0xffff))
845
#endif
846
#endif
847
0
          {
848
0
          *errorcodeptr = ERR76;
849
0
          }
850
0
        else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
851
0
        }
852
0
      }
853
0
    else
854
0
      *errorcodeptr = ERR37;
855
0
    break;
856
857
0
    case CHAR_U:
858
    /* In JavaScript, \U is an uppercase U letter. */
859
0
    if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
860
0
    break;
861
862
    /* In a character class, \g is just a literal "g". Outside a character
863
    class, \g must be followed by one of a number of specific things:
864
865
    (1) A number, either plain or braced. If positive, it is an absolute
866
    backreference. If negative, it is a relative backreference. This is a Perl
867
    5.10 feature.
868
869
    (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
870
    is part of Perl's movement towards a unified syntax for back references. As
871
    this is synonymous with \k{name}, we fudge it up by pretending it really
872
    was \k.
873
874
    (3) For Oniguruma compatibility we also support \g followed by a name or a
875
    number either in angle brackets or in single quotes. However, these are
876
    (possibly recursive) subroutine calls, _not_ backreferences. Just return
877
    the -ESC_g code (cf \k). */
878
879
0
    case CHAR_g:
880
0
    if (isclass) break;
881
0
    if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
882
0
      {
883
0
      c = -ESC_g;
884
0
      break;
885
0
      }
886
887
    /* Handle the Perl-compatible cases */
888
889
0
    if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
890
0
      {
891
0
      const pcre_uchar *p;
892
0
      for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
893
0
        if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
894
0
      if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
895
0
        {
896
0
        c = -ESC_k;
897
0
        break;
898
0
        }
899
0
      braced = TRUE;
900
0
      ptr++;
901
0
      }
902
0
    else braced = FALSE;
903
904
0
    if (ptr[1] == CHAR_MINUS)
905
0
      {
906
0
      negated = TRUE;
907
0
      ptr++;
908
0
      }
909
0
    else negated = FALSE;
910
911
    /* The integer range is limited by the machine's int representation. */
912
0
    c = 0;
913
0
    while (IS_DIGIT(ptr[1]))
914
0
      {
915
0
      if (((unsigned int)c) > INT_MAX / 10) /* Integer overflow */
916
0
        {
917
0
        c = -1;
918
0
        break;
919
0
        }
920
0
      c = c * 10 + *(++ptr) - CHAR_0;
921
0
      }
922
0
    if (((unsigned int)c) > INT_MAX) /* Integer overflow */
923
0
      {
924
0
      while (IS_DIGIT(ptr[1]))
925
0
        ptr++;
926
0
      *errorcodeptr = ERR61;
927
0
      break;
928
0
      }
929
930
0
    if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
931
0
      {
932
0
      *errorcodeptr = ERR57;
933
0
      break;
934
0
      }
935
936
0
    if (c == 0)
937
0
      {
938
0
      *errorcodeptr = ERR58;
939
0
      break;
940
0
      }
941
942
0
    if (negated)
943
0
      {
944
0
      if (c > bracount)
945
0
        {
946
0
        *errorcodeptr = ERR15;
947
0
        break;
948
0
        }
949
0
      c = bracount - (c - 1);
950
0
      }
951
952
0
    c = -(ESC_REF + c);
953
0
    break;
954
955
    /* The handling of escape sequences consisting of a string of digits
956
    starting with one that is not zero is not straightforward. By experiment,
957
    the way Perl works seems to be as follows:
958
959
    Outside a character class, the digits are read as a decimal number. If the
960
    number is less than 10, or if there are that many previous extracting
961
    left brackets, then it is a back reference. Otherwise, up to three octal
962
    digits are read to form an escaped byte. Thus \123 is likely to be octal
963
    123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
964
    value is greater than 377, the least significant 8 bits are taken. Inside a
965
    character class, \ followed by a digit is always an octal number. */
966
967
0
    case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
968
0
    case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
969
970
0
    if (!isclass)
971
0
      {
972
0
      oldptr = ptr;
973
      /* The integer range is limited by the machine's int representation. */
974
0
      c -= CHAR_0;
975
0
      while (IS_DIGIT(ptr[1]))
976
0
        {
977
0
        if (((unsigned int)c) > INT_MAX / 10) /* Integer overflow */
978
0
          {
979
0
          c = -1;
980
0
          break;
981
0
          }
982
0
        c = c * 10 + *(++ptr) - CHAR_0;
983
0
        }
984
0
      if (((unsigned int)c) > INT_MAX) /* Integer overflow */
985
0
        {
986
0
        while (IS_DIGIT(ptr[1]))
987
0
          ptr++;
988
0
        *errorcodeptr = ERR61;
989
0
        break;
990
0
        }
991
0
      if (c < 10 || c <= bracount)
992
0
        {
993
0
        c = -(ESC_REF + c);
994
0
        break;
995
0
        }
996
0
      ptr = oldptr;      /* Put the pointer back and fall through */
997
0
      }
998
999
    /* Handle an octal number following \. If the first digit is 8 or 9, Perl
1000
    generates a binary zero byte and treats the digit as a following literal.
1001
    Thus we have to pull back the pointer by one. */
1002
1003
0
    if ((c = *ptr) >= CHAR_8)
1004
0
      {
1005
0
      ptr--;
1006
0
      c = 0;
1007
0
      break;
1008
0
      }
1009
1010
    /* \0 always starts an octal number, but we may drop through to here with a
1011
    larger first octal digit. The original code used just to take the least
1012
    significant 8 bits of octal numbers (I think this is what early Perls used
1013
    to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1014
    but no more than 3 octal digits. */
1015
1016
0
    case CHAR_0:
1017
0
    c -= CHAR_0;
1018
0
    while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
1019
0
        c = c * 8 + *(++ptr) - CHAR_0;
1020
0
#ifdef COMPILE_PCRE8
1021
0
    if (!utf && c > 0xff) *errorcodeptr = ERR51;
1022
0
#endif
1023
0
    break;
1024
1025
    /* \x is complicated. \x{ddd} is a character number which can be greater
1026
    than 0xff in utf or non-8bit mode, but only if the ddd are hex digits.
1027
    If not, { is treated as a data character. */
1028
1029
0
    case CHAR_x:
1030
0
    if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1031
0
      {
1032
      /* In JavaScript, \x must be followed by two hexadecimal numbers.
1033
      Otherwise it is a lowercase x letter. */
1034
0
      if (MAX_255(ptr[1]) && g_ascii_isxdigit(ptr[1]) != 0
1035
0
        && MAX_255(ptr[2]) && g_ascii_isxdigit(ptr[2]) != 0)
1036
0
        {
1037
0
        c = 0;
1038
0
        for (i = 0; i < 2; ++i)
1039
0
          {
1040
0
          int cc = *(++ptr);
1041
0
#ifndef EBCDIC  /* ASCII/UTF-8 coding */
1042
0
          if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1043
0
          c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1044
#else           /* EBCDIC coding */
1045
          if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1046
          c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1047
#endif
1048
0
          }
1049
0
        }
1050
0
      break;
1051
0
      }
1052
1053
0
    if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1054
0
      {
1055
0
      const pcre_uchar *pt = ptr + 2;
1056
1057
0
      c = 0;
1058
0
      while (MAX_255(*pt) && g_ascii_isxdigit(*pt) != 0)
1059
0
        {
1060
0
        int cc = *pt++;
1061
0
        if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1062
1063
0
#ifndef EBCDIC  /* ASCII/UTF-8 coding */
1064
0
        if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1065
0
        c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1066
#else           /* EBCDIC coding */
1067
        if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1068
        c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1069
#endif
1070
1071
0
#ifdef COMPILE_PCRE8
1072
0
        if (c > (utf ? 0x10ffff : 0xff)) { c = -1; break; }
1073
#else
1074
#ifdef COMPILE_PCRE16
1075
        if (c > (utf ? 0x10ffff : 0xffff)) { c = -1; break; }
1076
#endif
1077
#endif
1078
0
        }
1079
1080
0
      if (c < 0)
1081
0
        {
1082
0
        while (MAX_255(*pt) && g_ascii_isxdigit(*pt) != 0) pt++;
1083
0
        *errorcodeptr = ERR34;
1084
0
        }
1085
1086
0
      if (*pt == CHAR_RIGHT_CURLY_BRACKET)
1087
0
        {
1088
0
        if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1089
0
        ptr = pt;
1090
0
        break;
1091
0
        }
1092
1093
      /* If the sequence of hex digits does not end with '}', then we don't
1094
      recognize this construct; fall through to the normal \x handling. */
1095
0
      }
1096
1097
    /* Read just a single-byte hex-defined char */
1098
1099
0
    c = 0;
1100
0
    while (i++ < 2 && MAX_255(ptr[1]) && g_ascii_isxdigit(ptr[1]) != 0)
1101
0
      {
1102
0
      int cc;                                  /* Some compilers don't like */
1103
0
      cc = *(++ptr);                           /* ++ in initializers */
1104
0
#ifndef EBCDIC  /* ASCII/UTF-8 coding */
1105
0
      if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */
1106
0
      c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1107
#else           /* EBCDIC coding */
1108
      if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */
1109
      c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1110
#endif
1111
0
      }
1112
0
    break;
1113
1114
    /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
1115
    An error is given if the byte following \c is not an ASCII character. This
1116
    coding is ASCII-specific, but then the whole concept of \cx is
1117
    ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
1118
1119
0
    case CHAR_c:
1120
0
    c = *(++ptr);
1121
0
    if (c == 0)
1122
0
      {
1123
0
      *errorcodeptr = ERR2;
1124
0
      break;
1125
0
      }
1126
0
#ifndef EBCDIC    /* ASCII/UTF-8 coding */
1127
0
    if (c > 127)  /* Excludes all non-ASCII in either mode */
1128
0
      {
1129
0
      *errorcodeptr = ERR68;
1130
0
      break;
1131
0
      }
1132
0
    if (c >= CHAR_a && c <= CHAR_z) c -= 32;
1133
0
    c ^= 0x40;
1134
#else             /* EBCDIC coding */
1135
    if (c >= CHAR_a && c <= CHAR_z) c += 64;
1136
    c ^= 0xC0;
1137
#endif
1138
0
    break;
1139
1140
    /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1141
    other alphanumeric following \ is an error if PCRE_EXTRA was set;
1142
    otherwise, for Perl compatibility, it is a literal. This code looks a bit
1143
    odd, but there used to be some cases other than the default, and there may
1144
    be again in future, so I haven't "optimized" it. */
1145
1146
0
    default:
1147
0
    if ((options & PCRE_EXTRA) != 0) switch(c)
1148
0
      {
1149
0
      default:
1150
0
      *errorcodeptr = ERR3;
1151
0
      break;
1152
0
      }
1153
0
    break;
1154
0
    }
1155
0
  }
1156
1157
/* Perl supports \N{name} for character names, as well as plain \N for "not
1158
newline". PCRE does not support \N{name}. However, it does support
1159
quantification such as \N{2,3}. */
1160
1161
0
if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
1162
0
     !is_counted_repeat(ptr+2))
1163
0
  *errorcodeptr = ERR37;
1164
1165
/* If PCRE_UCP is set, we change the values for \d etc. */
1166
1167
0
if ((options & PCRE_UCP) != 0 && c <= -ESC_D && c >= -ESC_w)
1168
0
  c -= (ESC_DU - ESC_D);
1169
1170
/* Set the pointer to the final character before returning. */
1171
1172
0
*ptrptr = ptr;
1173
0
return c;
1174
0
}
1175
1176
1177
1178
#ifdef SUPPORT_UCP
1179
/*************************************************
1180
*               Handle \P and \p                 *
1181
*************************************************/
1182
1183
/* This function is called after \P or \p has been encountered, provided that
1184
PCRE is compiled with support for Unicode properties. On entry, ptrptr is
1185
pointing at the P or p. On exit, it is pointing at the final character of the
1186
escape sequence.
1187
1188
Argument:
1189
  ptrptr         points to the pattern position pointer
1190
  negptr         points to a boolean that is set TRUE for negation else FALSE
1191
  dptr           points to an int that is set to the detailed property value
1192
  errorcodeptr   points to the error code variable
1193
1194
Returns:         type value from ucp_type_table, or -1 for an invalid type
1195
*/
1196
1197
static int
1198
get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
1199
0
{
1200
0
int c, i, bot, top;
1201
0
const pcre_uchar *ptr = *ptrptr;
1202
0
pcre_uchar name[32];
1203
1204
0
c = *(++ptr);
1205
0
if (c == 0) goto ERROR_RETURN;
1206
1207
0
*negptr = FALSE;
1208
1209
/* \P or \p can be followed by a name in {}, optionally preceded by ^ for
1210
negation. */
1211
1212
0
if (c == CHAR_LEFT_CURLY_BRACKET)
1213
0
  {
1214
0
  if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1215
0
    {
1216
0
    *negptr = TRUE;
1217
0
    ptr++;
1218
0
    }
1219
0
  for (i = 0; i < (int)(sizeof(name) / sizeof(pcre_uchar)) - 1; i++)
1220
0
    {
1221
0
    c = *(++ptr);
1222
0
    if (c == 0) goto ERROR_RETURN;
1223
0
    if (c == CHAR_RIGHT_CURLY_BRACKET) break;
1224
0
    name[i] = c;
1225
0
    }
1226
0
  if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
1227
0
  name[i] = 0;
1228
0
  }
1229
1230
/* Otherwise there is just one following character */
1231
1232
0
else
1233
0
  {
1234
0
  name[0] = c;
1235
0
  name[1] = 0;
1236
0
  }
1237
1238
0
*ptrptr = ptr;
1239
1240
/* Search for a recognized property name using binary chop */
1241
1242
0
bot = 0;
1243
0
top = PRIV(utt_size);
1244
1245
0
while (bot < top)
1246
0
  {
1247
0
  i = (bot + top) >> 1;
1248
0
  c = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
1249
0
  if (c == 0)
1250
0
    {
1251
0
    *dptr = PRIV(utt)[i].value;
1252
0
    return PRIV(utt)[i].type;
1253
0
    }
1254
0
  if (c > 0) bot = i + 1; else top = i;
1255
0
  }
1256
1257
0
*errorcodeptr = ERR47;
1258
0
*ptrptr = ptr;
1259
0
return -1;
1260
1261
0
ERROR_RETURN:
1262
0
*errorcodeptr = ERR46;
1263
0
*ptrptr = ptr;
1264
0
return -1;
1265
0
}
1266
#endif
1267
1268
1269
1270
1271
/*************************************************
1272
*         Read repeat counts                     *
1273
*************************************************/
1274
1275
/* Read an item of the form {n,m} and return the values. This is called only
1276
after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1277
so the syntax is guaranteed to be correct, but we need to check the values.
1278
1279
Arguments:
1280
  p              pointer to first char after '{'
1281
  minp           pointer to int for min
1282
  maxp           pointer to int for max
1283
                 returned as -1 if no max
1284
  errorcodeptr   points to error code variable
1285
1286
Returns:         pointer to '}' on success;
1287
                 current ptr on error, with errorcodeptr set non-zero
1288
*/
1289
1290
static const pcre_uchar *
1291
read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
1292
0
{
1293
0
int min = 0;
1294
0
int max = -1;
1295
1296
/* Read the minimum value and do a paranoid check: a negative value indicates
1297
an integer overflow. */
1298
1299
0
while (IS_DIGIT(*p)) min = min * 10 + *p++ - CHAR_0;
1300
0
if (min < 0 || min > 65535)
1301
0
  {
1302
0
  *errorcodeptr = ERR5;
1303
0
  return p;
1304
0
  }
1305
1306
/* Read the maximum value if there is one, and again do a paranoid on its size.
1307
Also, max must not be less than min. */
1308
1309
0
if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1310
0
  {
1311
0
  if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1312
0
    {
1313
0
    max = 0;
1314
0
    while(IS_DIGIT(*p)) max = max * 10 + *p++ - CHAR_0;
1315
0
    if (max < 0 || max > 65535)
1316
0
      {
1317
0
      *errorcodeptr = ERR5;
1318
0
      return p;
1319
0
      }
1320
0
    if (max < min)
1321
0
      {
1322
0
      *errorcodeptr = ERR4;
1323
0
      return p;
1324
0
      }
1325
0
    }
1326
0
  }
1327
1328
/* Fill in the required variables, and pass back the pointer to the terminating
1329
'}'. */
1330
1331
0
*minp = min;
1332
0
*maxp = max;
1333
0
return p;
1334
0
}
1335
1336
1337
1338
/*************************************************
1339
*  Subroutine for finding forward reference      *
1340
*************************************************/
1341
1342
/* This recursive function is called only from find_parens() below. The
1343
top-level call starts at the beginning of the pattern. All other calls must
1344
start at a parenthesis. It scans along a pattern's text looking for capturing
1345
subpatterns, and counting them. If it finds a named pattern that matches the
1346
name it is given, it returns its number. Alternatively, if the name is NULL, it
1347
returns when it reaches a given numbered subpattern. Recursion is used to keep
1348
track of subpatterns that reset the capturing group numbers - the (?| feature.
1349
1350
This function was originally called only from the second pass, in which we know
1351
that if (?< or (?' or (?P< is encountered, the name will be correctly
1352
terminated because that is checked in the first pass. There is now one call to
1353
this function in the first pass, to check for a recursive back reference by
1354
name (so that we can make the whole group atomic). In this case, we need check
1355
only up to the current position in the pattern, and that is still OK because
1356
any previous occurrences will have been checked. To make this work, the test
1357
for "end of pattern" is a check against cd->end_pattern in the main loop,
1358
instead of looking for a binary zero. This means that the special first-pass
1359
call can adjust cd->end_pattern temporarily. (Checks for binary zero while
1360
processing items within the loop are OK, because afterwards the main loop will
1361
terminate.)
1362
1363
Arguments:
1364
  ptrptr       address of the current character pointer (updated)
1365
  cd           compile background data
1366
  name         name to seek, or NULL if seeking a numbered subpattern
1367
  lorn         name length, or subpattern number if name is NULL
1368
  xmode        TRUE if we are in /x mode
1369
  utf          TRUE if we are in UTF-8 / UTF-16 mode
1370
  count        pointer to the current capturing subpattern number (updated)
1371
1372
Returns:       the number of the named subpattern, or -1 if not found
1373
*/
1374
1375
static int
1376
find_parens_sub(pcre_uchar **ptrptr, compile_data *cd, const pcre_uchar *name, int lorn,
1377
  BOOL xmode, BOOL utf, int *count)
1378
0
{
1379
0
pcre_uchar *ptr = *ptrptr;
1380
0
int start_count = *count;
1381
0
int hwm_count = start_count;
1382
0
BOOL dup_parens = FALSE;
1383
1384
/* If the first character is a parenthesis, check on the type of group we are
1385
dealing with. The very first call may not start with a parenthesis. */
1386
1387
0
if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1388
0
  {
1389
  /* Handle specials such as (*SKIP) or (*UTF8) etc. */
1390
1391
0
  if (ptr[1] == CHAR_ASTERISK) ptr += 2;
1392
1393
  /* Handle a normal, unnamed capturing parenthesis. */
1394
1395
0
  else if (ptr[1] != CHAR_QUESTION_MARK)
1396
0
    {
1397
0
    *count += 1;
1398
0
    if (name == NULL && *count == lorn) return *count;
1399
0
    ptr++;
1400
0
    }
1401
1402
  /* All cases now have (? at the start. Remember when we are in a group
1403
  where the parenthesis numbers are duplicated. */
1404
1405
0
  else if (ptr[2] == CHAR_VERTICAL_LINE)
1406
0
    {
1407
0
    ptr += 3;
1408
0
    dup_parens = TRUE;
1409
0
    }
1410
1411
  /* Handle comments; all characters are allowed until a ket is reached. */
1412
1413
0
  else if (ptr[2] == CHAR_NUMBER_SIGN)
1414
0
    {
1415
0
    for (ptr += 3; *ptr != 0; ptr++) if (*ptr == CHAR_RIGHT_PARENTHESIS) break;
1416
0
    goto FAIL_EXIT;
1417
0
    }
1418
1419
  /* Handle a condition. If it is an assertion, just carry on so that it
1420
  is processed as normal. If not, skip to the closing parenthesis of the
1421
  condition (there can't be any nested parens). */
1422
1423
0
  else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1424
0
    {
1425
0
    ptr += 2;
1426
0
    if (ptr[1] != CHAR_QUESTION_MARK)
1427
0
      {
1428
0
      while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1429
0
      if (*ptr != 0) ptr++;
1430
0
      }
1431
0
    }
1432
1433
  /* Start with (? but not a condition. */
1434
1435
0
  else
1436
0
    {
1437
0
    ptr += 2;
1438
0
    if (*ptr == CHAR_P) ptr++;                      /* Allow optional P */
1439
1440
    /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1441
1442
0
    if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1443
0
        ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1444
0
      {
1445
0
      int term;
1446
0
      const pcre_uchar *thisname;
1447
0
      *count += 1;
1448
0
      if (name == NULL && *count == lorn) return *count;
1449
0
      term = *ptr++;
1450
0
      if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1451
0
      thisname = ptr;
1452
0
      while (*ptr != term) ptr++;
1453
0
      if (name != NULL && lorn == ptr - thisname &&
1454
0
          STRNCMP_UC_UC(name, thisname, lorn) == 0)
1455
0
        return *count;
1456
0
      term++;
1457
0
      }
1458
0
    }
1459
0
  }
1460
1461
/* Past any initial parenthesis handling, scan for parentheses or vertical
1462
bars. Stop if we get to cd->end_pattern. Note that this is important for the
1463
first-pass call when this value is temporarily adjusted to stop at the current
1464
position. So DO NOT change this to a test for binary zero. */
1465
1466
0
for (; ptr < cd->end_pattern; ptr++)
1467
0
  {
1468
  /* Skip over backslashed characters and also entire \Q...\E */
1469
1470
0
  if (*ptr == CHAR_BACKSLASH)
1471
0
    {
1472
0
    if (*(++ptr) == 0) goto FAIL_EXIT;
1473
0
    if (*ptr == CHAR_Q) for (;;)
1474
0
      {
1475
0
      while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1476
0
      if (*ptr == 0) goto FAIL_EXIT;
1477
0
      if (*(++ptr) == CHAR_E) break;
1478
0
      }
1479
0
    continue;
1480
0
    }
1481
1482
  /* Skip over character classes; this logic must be similar to the way they
1483
  are handled for real. If the first character is '^', skip it. Also, if the
1484
  first few characters (either before or after ^) are \Q\E or \E we skip them
1485
  too. This makes for compatibility with Perl. Note the use of STR macros to
1486
  encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1487
1488
0
  if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1489
0
    {
1490
0
    BOOL negate_class = FALSE;
1491
0
    for (;;)
1492
0
      {
1493
0
      if (ptr[1] == CHAR_BACKSLASH)
1494
0
        {
1495
0
        if (ptr[2] == CHAR_E)
1496
0
          ptr+= 2;
1497
0
        else if (STRNCMP_UC_C8(ptr + 2,
1498
0
                 STR_Q STR_BACKSLASH STR_E, 3) == 0)
1499
0
          ptr += 4;
1500
0
        else
1501
0
          break;
1502
0
        }
1503
0
      else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1504
0
        {
1505
0
        negate_class = TRUE;
1506
0
        ptr++;
1507
0
        }
1508
0
      else break;
1509
0
      }
1510
1511
    /* If the next character is ']', it is a data character that must be
1512
    skipped, except in JavaScript compatibility mode. */
1513
1514
0
    if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1515
0
        (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1516
0
      ptr++;
1517
1518
0
    while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1519
0
      {
1520
0
      if (*ptr == 0) return -1;
1521
0
      if (*ptr == CHAR_BACKSLASH)
1522
0
        {
1523
0
        if (*(++ptr) == 0) goto FAIL_EXIT;
1524
0
        if (*ptr == CHAR_Q) for (;;)
1525
0
          {
1526
0
          while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1527
0
          if (*ptr == 0) goto FAIL_EXIT;
1528
0
          if (*(++ptr) == CHAR_E) break;
1529
0
          }
1530
0
        continue;
1531
0
        }
1532
0
      }
1533
0
    continue;
1534
0
    }
1535
1536
  /* Skip comments in /x mode */
1537
1538
0
  if (xmode && *ptr == CHAR_NUMBER_SIGN)
1539
0
    {
1540
0
    ptr++;
1541
0
    while (*ptr != 0)
1542
0
      {
1543
0
      if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
1544
0
      ptr++;
1545
0
#ifdef SUPPORT_UTF
1546
0
      if (utf) FORWARDCHAR(ptr);
1547
0
#endif
1548
0
      }
1549
0
    if (*ptr == 0) goto FAIL_EXIT;
1550
0
    continue;
1551
0
    }
1552
1553
  /* Check for the special metacharacters */
1554
1555
0
  if (*ptr == CHAR_LEFT_PARENTHESIS)
1556
0
    {
1557
0
    int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, count);
1558
0
    if (rc > 0) return rc;
1559
0
    if (*ptr == 0) goto FAIL_EXIT;
1560
0
    }
1561
1562
0
  else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1563
0
    {
1564
0
    if (dup_parens && *count < hwm_count) *count = hwm_count;
1565
0
    goto FAIL_EXIT;
1566
0
    }
1567
1568
0
  else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1569
0
    {
1570
0
    if (*count > hwm_count) hwm_count = *count;
1571
0
    *count = start_count;
1572
0
    }
1573
0
  }
1574
1575
0
FAIL_EXIT:
1576
0
*ptrptr = ptr;
1577
0
return -1;
1578
0
}
1579
1580
1581
1582
1583
/*************************************************
1584
*       Find forward referenced subpattern       *
1585
*************************************************/
1586
1587
/* This function scans along a pattern's text looking for capturing
1588
subpatterns, and counting them. If it finds a named pattern that matches the
1589
name it is given, it returns its number. Alternatively, if the name is NULL, it
1590
returns when it reaches a given numbered subpattern. This is used for forward
1591
references to subpatterns. We used to be able to start this scan from the
1592
current compiling point, using the current count value from cd->bracount, and
1593
do it all in a single loop, but the addition of the possibility of duplicate
1594
subpattern numbers means that we have to scan from the very start, in order to
1595
take account of such duplicates, and to use a recursive function to keep track
1596
of the different types of group.
1597
1598
Arguments:
1599
  cd           compile background data
1600
  name         name to seek, or NULL if seeking a numbered subpattern
1601
  lorn         name length, or subpattern number if name is NULL
1602
  xmode        TRUE if we are in /x mode
1603
  utf          TRUE if we are in UTF-8 / UTF-16 mode
1604
1605
Returns:       the number of the found subpattern, or -1 if not found
1606
*/
1607
1608
static int
1609
find_parens(compile_data *cd, const pcre_uchar *name, int lorn, BOOL xmode,
1610
  BOOL utf)
1611
0
{
1612
0
pcre_uchar *ptr = (pcre_uchar *)cd->start_pattern;
1613
0
int count = 0;
1614
0
int rc;
1615
1616
/* If the pattern does not start with an opening parenthesis, the first call
1617
to find_parens_sub() will scan right to the end (if necessary). However, if it
1618
does start with a parenthesis, find_parens_sub() will return when it hits the
1619
matching closing parens. That is why we have to have a loop. */
1620
1621
0
for (;;)
1622
0
  {
1623
0
  rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, &count);
1624
0
  if (rc > 0 || *ptr++ == 0) break;
1625
0
  }
1626
1627
0
return rc;
1628
0
}
1629
1630
1631
1632
1633
/*************************************************
1634
*      Find first significant op code            *
1635
*************************************************/
1636
1637
/* This is called by several functions that scan a compiled expression looking
1638
for a fixed first character, or an anchoring op code etc. It skips over things
1639
that do not influence this. For some calls, it makes sense to skip negative
1640
forward and all backward assertions, and also the \b assertion; for others it
1641
does not.
1642
1643
Arguments:
1644
  code         pointer to the start of the group
1645
  skipassert   TRUE if certain assertions are to be skipped
1646
1647
Returns:       pointer to the first significant opcode
1648
*/
1649
1650
static const pcre_uchar*
1651
first_significant_code(const pcre_uchar *code, BOOL skipassert)
1652
0
{
1653
0
for (;;)
1654
0
  {
1655
0
  switch ((int)*code)
1656
0
    {
1657
0
    case OP_ASSERT_NOT:
1658
0
    case OP_ASSERTBACK:
1659
0
    case OP_ASSERTBACK_NOT:
1660
0
    if (!skipassert) return code;
1661
0
    do code += GET(code, 1); while (*code == OP_ALT);
1662
0
    code += PRIV(OP_lengths)[*code];
1663
0
    break;
1664
1665
0
    case OP_WORD_BOUNDARY:
1666
0
    case OP_NOT_WORD_BOUNDARY:
1667
0
    if (!skipassert) return code;
1668
    /* Fall through */
1669
1670
0
    case OP_CALLOUT:
1671
0
    case OP_CREF:
1672
0
    case OP_NCREF:
1673
0
    case OP_RREF:
1674
0
    case OP_NRREF:
1675
0
    case OP_DEF:
1676
0
    code += PRIV(OP_lengths)[*code];
1677
0
    break;
1678
1679
0
    default:
1680
0
    return code;
1681
0
    }
1682
0
  }
1683
/* Control never reaches here */
1684
0
}
1685
1686
1687
1688
1689
/*************************************************
1690
*        Find the fixed length of a branch       *
1691
*************************************************/
1692
1693
/* Scan a branch and compute the fixed length of subject that will match it,
1694
if the length is fixed. This is needed for dealing with backward assertions.
1695
In UTF8 mode, the result is in characters rather than bytes. The branch is
1696
temporarily terminated with OP_END when this function is called.
1697
1698
This function is called when a backward assertion is encountered, so that if it
1699
fails, the error message can point to the correct place in the pattern.
1700
However, we cannot do this when the assertion contains subroutine calls,
1701
because they can be forward references. We solve this by remembering this case
1702
and doing the check at the end; a flag specifies which mode we are running in.
1703
1704
Arguments:
1705
  code     points to the start of the pattern (the bracket)
1706
  utf      TRUE in UTF-8 / UTF-16 mode
1707
  atend    TRUE if called when the pattern is complete
1708
  cd       the "compile data" structure
1709
1710
Returns:   the fixed length,
1711
             or -1 if there is no fixed length,
1712
             or -2 if \C was encountered (in UTF-8 mode only)
1713
             or -3 if an OP_RECURSE item was encountered and atend is FALSE
1714
             or -4 if an unknown opcode was encountered (internal error)
1715
*/
1716
1717
static int
1718
find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd)
1719
0
{
1720
0
int length = -1;
1721
1722
0
int branchlength = 0;
1723
0
pcre_uchar *cc = code + 1 + LINK_SIZE;
1724
1725
/* Scan along the opcodes for this branch. If we get to the end of the
1726
branch, check the length against that of the other branches. */
1727
1728
0
for (;;)
1729
0
  {
1730
0
  int d;
1731
0
  pcre_uchar *ce, *cs;
1732
0
  int op = *cc;
1733
1734
0
  switch (op)
1735
0
    {
1736
    /* We only need to continue for OP_CBRA (normal capturing bracket) and
1737
    OP_BRA (normal non-capturing bracket) because the other variants of these
1738
    opcodes are all concerned with unlimited repeated groups, which of course
1739
    are not of fixed length. */
1740
1741
0
    case OP_CBRA:
1742
0
    case OP_BRA:
1743
0
    case OP_ONCE:
1744
0
    case OP_ONCE_NC:
1745
0
    case OP_COND:
1746
0
    d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd);
1747
0
    if (d < 0) return d;
1748
0
    branchlength += d;
1749
0
    do cc += GET(cc, 1); while (*cc == OP_ALT);
1750
0
    cc += 1 + LINK_SIZE;
1751
0
    break;
1752
1753
    /* Reached end of a branch; if it's a ket it is the end of a nested call.
1754
    If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
1755
    an ALT. If it is END it's the end of the outer call. All can be handled by
1756
    the same code. Note that we must not include the OP_KETRxxx opcodes here,
1757
    because they all imply an unlimited repeat. */
1758
1759
0
    case OP_ALT:
1760
0
    case OP_KET:
1761
0
    case OP_END:
1762
0
    case OP_ACCEPT:
1763
0
    case OP_ASSERT_ACCEPT:
1764
0
    if (length < 0) length = branchlength;
1765
0
      else if (length != branchlength) return -1;
1766
0
    if (*cc != OP_ALT) return length;
1767
0
    cc += 1 + LINK_SIZE;
1768
0
    branchlength = 0;
1769
0
    break;
1770
1771
    /* A true recursion implies not fixed length, but a subroutine call may
1772
    be OK. If the subroutine is a forward reference, we can't deal with
1773
    it until the end of the pattern, so return -3. */
1774
1775
0
    case OP_RECURSE:
1776
0
    if (!atend) return -3;
1777
0
    cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
1778
0
    do ce += GET(ce, 1); while (*ce == OP_ALT);           /* End subpattern */
1779
0
    if (cc > cs && cc < ce) return -1;                    /* Recursion */
1780
0
    d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd);
1781
0
    if (d < 0) return d;
1782
0
    branchlength += d;
1783
0
    cc += 1 + LINK_SIZE;
1784
0
    break;
1785
1786
    /* Skip over assertive subpatterns */
1787
1788
0
    case OP_ASSERT:
1789
0
    case OP_ASSERT_NOT:
1790
0
    case OP_ASSERTBACK:
1791
0
    case OP_ASSERTBACK_NOT:
1792
0
    do cc += GET(cc, 1); while (*cc == OP_ALT);
1793
0
    cc += PRIV(OP_lengths)[*cc];
1794
0
    break;
1795
1796
    /* Skip over things that don't match chars */
1797
1798
0
    case OP_MARK:
1799
0
    case OP_PRUNE_ARG:
1800
0
    case OP_SKIP_ARG:
1801
0
    case OP_THEN_ARG:
1802
0
    cc += cc[1] + PRIV(OP_lengths)[*cc];
1803
0
    break;
1804
1805
0
    case OP_CALLOUT:
1806
0
    case OP_CIRC:
1807
0
    case OP_CIRCM:
1808
0
    case OP_CLOSE:
1809
0
    case OP_COMMIT:
1810
0
    case OP_CREF:
1811
0
    case OP_DEF:
1812
0
    case OP_DOLL:
1813
0
    case OP_DOLLM:
1814
0
    case OP_EOD:
1815
0
    case OP_EODN:
1816
0
    case OP_FAIL:
1817
0
    case OP_NCREF:
1818
0
    case OP_NRREF:
1819
0
    case OP_NOT_WORD_BOUNDARY:
1820
0
    case OP_PRUNE:
1821
0
    case OP_REVERSE:
1822
0
    case OP_RREF:
1823
0
    case OP_SET_SOM:
1824
0
    case OP_SKIP:
1825
0
    case OP_SOD:
1826
0
    case OP_SOM:
1827
0
    case OP_THEN:
1828
0
    case OP_WORD_BOUNDARY:
1829
0
    cc += PRIV(OP_lengths)[*cc];
1830
0
    break;
1831
1832
    /* Handle literal characters */
1833
1834
0
    case OP_CHAR:
1835
0
    case OP_CHARI:
1836
0
    case OP_NOT:
1837
0
    case OP_NOTI:
1838
0
    branchlength++;
1839
0
    cc += 2;
1840
0
#ifdef SUPPORT_UTF
1841
0
    if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1842
0
#endif
1843
0
    break;
1844
1845
    /* Handle exact repetitions. The count is already in characters, but we
1846
    need to skip over a multibyte character in UTF8 mode.  */
1847
1848
0
    case OP_EXACT:
1849
0
    case OP_EXACTI:
1850
0
    case OP_NOTEXACT:
1851
0
    case OP_NOTEXACTI:
1852
0
    branchlength += GET2(cc,1);
1853
0
    cc += 2 + IMM2_SIZE;
1854
0
#ifdef SUPPORT_UTF
1855
0
    if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1856
0
#endif
1857
0
    break;
1858
1859
0
    case OP_TYPEEXACT:
1860
0
    branchlength += GET2(cc,1);
1861
0
    if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
1862
0
    cc += 1 + IMM2_SIZE + 1;
1863
0
    break;
1864
1865
    /* Handle single-char matchers */
1866
1867
0
    case OP_PROP:
1868
0
    case OP_NOTPROP:
1869
0
    cc += 2;
1870
    /* Fall through */
1871
1872
0
    case OP_HSPACE:
1873
0
    case OP_VSPACE:
1874
0
    case OP_NOT_HSPACE:
1875
0
    case OP_NOT_VSPACE:
1876
0
    case OP_NOT_DIGIT:
1877
0
    case OP_DIGIT:
1878
0
    case OP_NOT_WHITESPACE:
1879
0
    case OP_WHITESPACE:
1880
0
    case OP_NOT_WORDCHAR:
1881
0
    case OP_WORDCHAR:
1882
0
    case OP_ANY:
1883
0
    case OP_ALLANY:
1884
0
    branchlength++;
1885
0
    cc++;
1886
0
    break;
1887
1888
    /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1889
    otherwise \C is coded as OP_ALLANY. */
1890
1891
0
    case OP_ANYBYTE:
1892
0
    return -2;
1893
1894
    /* Check a class for variable quantification */
1895
1896
0
#if defined SUPPORT_UTF || defined COMPILE_PCRE16
1897
0
    case OP_XCLASS:
1898
0
    cc += GET(cc, 1) - PRIV(OP_lengths)[OP_CLASS];
1899
    /* Fall through */
1900
0
#endif
1901
1902
0
    case OP_CLASS:
1903
0
    case OP_NCLASS:
1904
0
    cc += PRIV(OP_lengths)[OP_CLASS];
1905
1906
0
    switch (*cc)
1907
0
      {
1908
0
      case OP_CRPLUS:
1909
0
      case OP_CRMINPLUS:
1910
0
      case OP_CRSTAR:
1911
0
      case OP_CRMINSTAR:
1912
0
      case OP_CRQUERY:
1913
0
      case OP_CRMINQUERY:
1914
0
      return -1;
1915
1916
0
      case OP_CRRANGE:
1917
0
      case OP_CRMINRANGE:
1918
0
      if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
1919
0
      branchlength += GET2(cc,1);
1920
0
      cc += 1 + 2 * IMM2_SIZE;
1921
0
      break;
1922
1923
0
      default:
1924
0
      branchlength++;
1925
0
      }
1926
0
    break;
1927
1928
    /* Anything else is variable length */
1929
1930
0
    case OP_ANYNL:
1931
0
    case OP_BRAMINZERO:
1932
0
    case OP_BRAPOS:
1933
0
    case OP_BRAPOSZERO:
1934
0
    case OP_BRAZERO:
1935
0
    case OP_CBRAPOS:
1936
0
    case OP_EXTUNI:
1937
0
    case OP_KETRMAX:
1938
0
    case OP_KETRMIN:
1939
0
    case OP_KETRPOS:
1940
0
    case OP_MINPLUS:
1941
0
    case OP_MINPLUSI:
1942
0
    case OP_MINQUERY:
1943
0
    case OP_MINQUERYI:
1944
0
    case OP_MINSTAR:
1945
0
    case OP_MINSTARI:
1946
0
    case OP_MINUPTO:
1947
0
    case OP_MINUPTOI:
1948
0
    case OP_NOTMINPLUS:
1949
0
    case OP_NOTMINPLUSI:
1950
0
    case OP_NOTMINQUERY:
1951
0
    case OP_NOTMINQUERYI:
1952
0
    case OP_NOTMINSTAR:
1953
0
    case OP_NOTMINSTARI:
1954
0
    case OP_NOTMINUPTO:
1955
0
    case OP_NOTMINUPTOI:
1956
0
    case OP_NOTPLUS:
1957
0
    case OP_NOTPLUSI:
1958
0
    case OP_NOTPOSPLUS:
1959
0
    case OP_NOTPOSPLUSI:
1960
0
    case OP_NOTPOSQUERY:
1961
0
    case OP_NOTPOSQUERYI:
1962
0
    case OP_NOTPOSSTAR:
1963
0
    case OP_NOTPOSSTARI:
1964
0
    case OP_NOTPOSUPTO:
1965
0
    case OP_NOTPOSUPTOI:
1966
0
    case OP_NOTQUERY:
1967
0
    case OP_NOTQUERYI:
1968
0
    case OP_NOTSTAR:
1969
0
    case OP_NOTSTARI:
1970
0
    case OP_NOTUPTO:
1971
0
    case OP_NOTUPTOI:
1972
0
    case OP_PLUS:
1973
0
    case OP_PLUSI:
1974
0
    case OP_POSPLUS:
1975
0
    case OP_POSPLUSI:
1976
0
    case OP_POSQUERY:
1977
0
    case OP_POSQUERYI:
1978
0
    case OP_POSSTAR:
1979
0
    case OP_POSSTARI:
1980
0
    case OP_POSUPTO:
1981
0
    case OP_POSUPTOI:
1982
0
    case OP_QUERY:
1983
0
    case OP_QUERYI:
1984
0
    case OP_REF:
1985
0
    case OP_REFI:
1986
0
    case OP_SBRA:
1987
0
    case OP_SBRAPOS:
1988
0
    case OP_SCBRA:
1989
0
    case OP_SCBRAPOS:
1990
0
    case OP_SCOND:
1991
0
    case OP_SKIPZERO:
1992
0
    case OP_STAR:
1993
0
    case OP_STARI:
1994
0
    case OP_TYPEMINPLUS:
1995
0
    case OP_TYPEMINQUERY:
1996
0
    case OP_TYPEMINSTAR:
1997
0
    case OP_TYPEMINUPTO:
1998
0
    case OP_TYPEPLUS:
1999
0
    case OP_TYPEPOSPLUS:
2000
0
    case OP_TYPEPOSQUERY:
2001
0
    case OP_TYPEPOSSTAR:
2002
0
    case OP_TYPEPOSUPTO:
2003
0
    case OP_TYPEQUERY:
2004
0
    case OP_TYPESTAR:
2005
0
    case OP_TYPEUPTO:
2006
0
    case OP_UPTO:
2007
0
    case OP_UPTOI:
2008
0
    return -1;
2009
2010
    /* Catch unrecognized opcodes so that when new ones are added they
2011
    are not forgotten, as has happened in the past. */
2012
2013
0
    default:
2014
0
    return -4;
2015
0
    }
2016
0
  }
2017
/* Control never gets here */
2018
0
}
2019
2020
2021
2022
2023
/*************************************************
2024
*    Scan compiled regex for specific bracket    *
2025
*************************************************/
2026
2027
/* This little function scans through a compiled pattern until it finds a
2028
capturing bracket with the given number, or, if the number is negative, an
2029
instance of OP_REVERSE for a lookbehind. The function is global in the C sense
2030
so that it can be called from pcre_study() when finding the minimum matching
2031
length.
2032
2033
Arguments:
2034
  code        points to start of expression
2035
  utf         TRUE in UTF-8 / UTF-16 mode
2036
  number      the required bracket number or negative to find a lookbehind
2037
2038
Returns:      pointer to the opcode for the bracket, or NULL if not found
2039
*/
2040
2041
const pcre_uchar *
2042
PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
2043
0
{
2044
0
for (;;)
2045
0
  {
2046
0
  int c = *code;
2047
2048
0
  if (c == OP_END) return NULL;
2049
2050
  /* XCLASS is used for classes that cannot be represented just by a bit
2051
  map. This includes negated single high-valued characters. The length in
2052
  the table is zero; the actual length is stored in the compiled code. */
2053
2054
0
  if (c == OP_XCLASS) code += GET(code, 1);
2055
2056
  /* Handle recursion */
2057
2058
0
  else if (c == OP_REVERSE)
2059
0
    {
2060
0
    if (number < 0) return (pcre_uchar *)code;
2061
0
    code += PRIV(OP_lengths)[c];
2062
0
    }
2063
2064
  /* Handle capturing bracket */
2065
2066
0
  else if (c == OP_CBRA || c == OP_SCBRA ||
2067
0
           c == OP_CBRAPOS || c == OP_SCBRAPOS)
2068
0
    {
2069
0
    int n = GET2(code, 1+LINK_SIZE);
2070
0
    if (n == number) return (pcre_uchar *)code;
2071
0
    code += PRIV(OP_lengths)[c];
2072
0
    }
2073
2074
  /* Otherwise, we can get the item's length from the table, except that for
2075
  repeated character types, we have to test for \p and \P, which have an extra
2076
  two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2077
  must add in its length. */
2078
2079
0
  else
2080
0
    {
2081
0
    switch(c)
2082
0
      {
2083
0
      case OP_TYPESTAR:
2084
0
      case OP_TYPEMINSTAR:
2085
0
      case OP_TYPEPLUS:
2086
0
      case OP_TYPEMINPLUS:
2087
0
      case OP_TYPEQUERY:
2088
0
      case OP_TYPEMINQUERY:
2089
0
      case OP_TYPEPOSSTAR:
2090
0
      case OP_TYPEPOSPLUS:
2091
0
      case OP_TYPEPOSQUERY:
2092
0
      if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2093
0
      break;
2094
2095
0
      case OP_TYPEUPTO:
2096
0
      case OP_TYPEMINUPTO:
2097
0
      case OP_TYPEEXACT:
2098
0
      case OP_TYPEPOSUPTO:
2099
0
      if (code[1 + IMM2_SIZE] == OP_PROP
2100
0
        || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
2101
0
      break;
2102
2103
0
      case OP_MARK:
2104
0
      case OP_PRUNE_ARG:
2105
0
      case OP_SKIP_ARG:
2106
0
      code += code[1];
2107
0
      break;
2108
2109
0
      case OP_THEN_ARG:
2110
0
      code += code[1];
2111
0
      break;
2112
0
      }
2113
2114
    /* Add in the fixed length from the table */
2115
2116
0
    code += PRIV(OP_lengths)[c];
2117
2118
  /* In UTF-8 mode, opcodes that are followed by a character may be followed by
2119
  a multi-byte character. The length in the table is a minimum, so we have to
2120
  arrange to skip the extra bytes. */
2121
2122
0
#ifdef SUPPORT_UTF
2123
0
    if (utf) switch(c)
2124
0
      {
2125
0
      case OP_CHAR:
2126
0
      case OP_CHARI:
2127
0
      case OP_EXACT:
2128
0
      case OP_EXACTI:
2129
0
      case OP_UPTO:
2130
0
      case OP_UPTOI:
2131
0
      case OP_MINUPTO:
2132
0
      case OP_MINUPTOI:
2133
0
      case OP_POSUPTO:
2134
0
      case OP_POSUPTOI:
2135
0
      case OP_STAR:
2136
0
      case OP_STARI:
2137
0
      case OP_MINSTAR:
2138
0
      case OP_MINSTARI:
2139
0
      case OP_POSSTAR:
2140
0
      case OP_POSSTARI:
2141
0
      case OP_PLUS:
2142
0
      case OP_PLUSI:
2143
0
      case OP_MINPLUS:
2144
0
      case OP_MINPLUSI:
2145
0
      case OP_POSPLUS:
2146
0
      case OP_POSPLUSI:
2147
0
      case OP_QUERY:
2148
0
      case OP_QUERYI:
2149
0
      case OP_MINQUERY:
2150
0
      case OP_MINQUERYI:
2151
0
      case OP_POSQUERY:
2152
0
      case OP_POSQUERYI:
2153
0
      if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2154
0
      break;
2155
0
      }
2156
#else
2157
    (void)(utf);  /* Keep compiler happy by referencing function argument */
2158
#endif
2159
0
    }
2160
0
  }
2161
0
}
2162
2163
2164
2165
/*************************************************
2166
*   Scan compiled regex for recursion reference  *
2167
*************************************************/
2168
2169
/* This little function scans through a compiled pattern until it finds an
2170
instance of OP_RECURSE.
2171
2172
Arguments:
2173
  code        points to start of expression
2174
  utf         TRUE in UTF-8 / UTF-16 mode
2175
2176
Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
2177
*/
2178
2179
static const pcre_uchar *
2180
find_recurse(const pcre_uchar *code, BOOL utf)
2181
0
{
2182
0
for (;;)
2183
0
  {
2184
0
  int c = *code;
2185
0
  if (c == OP_END) return NULL;
2186
0
  if (c == OP_RECURSE) return code;
2187
2188
  /* XCLASS is used for classes that cannot be represented just by a bit
2189
  map. This includes negated single high-valued characters. The length in
2190
  the table is zero; the actual length is stored in the compiled code. */
2191
2192
0
  if (c == OP_XCLASS) code += GET(code, 1);
2193
2194
  /* Otherwise, we can get the item's length from the table, except that for
2195
  repeated character types, we have to test for \p and \P, which have an extra
2196
  two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2197
  must add in its length. */
2198
2199
0
  else
2200
0
    {
2201
0
    switch(c)
2202
0
      {
2203
0
      case OP_TYPESTAR:
2204
0
      case OP_TYPEMINSTAR:
2205
0
      case OP_TYPEPLUS:
2206
0
      case OP_TYPEMINPLUS:
2207
0
      case OP_TYPEQUERY:
2208
0
      case OP_TYPEMINQUERY:
2209
0
      case OP_TYPEPOSSTAR:
2210
0
      case OP_TYPEPOSPLUS:
2211
0
      case OP_TYPEPOSQUERY:
2212
0
      if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2213
0
      break;
2214
2215
0
      case OP_TYPEPOSUPTO:
2216
0
      case OP_TYPEUPTO:
2217
0
      case OP_TYPEMINUPTO:
2218
0
      case OP_TYPEEXACT:
2219
0
      if (code[1 + IMM2_SIZE] == OP_PROP
2220
0
        || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
2221
0
      break;
2222
2223
0
      case OP_MARK:
2224
0
      case OP_PRUNE_ARG:
2225
0
      case OP_SKIP_ARG:
2226
0
      code += code[1];
2227
0
      break;
2228
2229
0
      case OP_THEN_ARG:
2230
0
      code += code[1];
2231
0
      break;
2232
0
      }
2233
2234
    /* Add in the fixed length from the table */
2235
2236
0
    code += PRIV(OP_lengths)[c];
2237
2238
    /* In UTF-8 mode, opcodes that are followed by a character may be followed
2239
    by a multi-byte character. The length in the table is a minimum, so we have
2240
    to arrange to skip the extra bytes. */
2241
2242
0
#ifdef SUPPORT_UTF
2243
0
    if (utf) switch(c)
2244
0
      {
2245
0
      case OP_CHAR:
2246
0
      case OP_CHARI:
2247
0
      case OP_NOT:
2248
0
      case OP_NOTI:
2249
0
      case OP_EXACT:
2250
0
      case OP_EXACTI:
2251
0
      case OP_NOTEXACT:
2252
0
      case OP_NOTEXACTI:
2253
0
      case OP_UPTO:
2254
0
      case OP_UPTOI:
2255
0
      case OP_NOTUPTO:
2256
0
      case OP_NOTUPTOI:
2257
0
      case OP_MINUPTO:
2258
0
      case OP_MINUPTOI:
2259
0
      case OP_NOTMINUPTO:
2260
0
      case OP_NOTMINUPTOI:
2261
0
      case OP_POSUPTO:
2262
0
      case OP_POSUPTOI:
2263
0
      case OP_NOTPOSUPTO:
2264
0
      case OP_NOTPOSUPTOI:
2265
0
      case OP_STAR:
2266
0
      case OP_STARI:
2267
0
      case OP_NOTSTAR:
2268
0
      case OP_NOTSTARI:
2269
0
      case OP_MINSTAR:
2270
0
      case OP_MINSTARI:
2271
0
      case OP_NOTMINSTAR:
2272
0
      case OP_NOTMINSTARI:
2273
0
      case OP_POSSTAR:
2274
0
      case OP_POSSTARI:
2275
0
      case OP_NOTPOSSTAR:
2276
0
      case OP_NOTPOSSTARI:
2277
0
      case OP_PLUS:
2278
0
      case OP_PLUSI:
2279
0
      case OP_NOTPLUS:
2280
0
      case OP_NOTPLUSI:
2281
0
      case OP_MINPLUS:
2282
0
      case OP_MINPLUSI:
2283
0
      case OP_NOTMINPLUS:
2284
0
      case OP_NOTMINPLUSI:
2285
0
      case OP_POSPLUS:
2286
0
      case OP_POSPLUSI:
2287
0
      case OP_NOTPOSPLUS:
2288
0
      case OP_NOTPOSPLUSI:
2289
0
      case OP_QUERY:
2290
0
      case OP_QUERYI:
2291
0
      case OP_NOTQUERY:
2292
0
      case OP_NOTQUERYI:
2293
0
      case OP_MINQUERY:
2294
0
      case OP_MINQUERYI:
2295
0
      case OP_NOTMINQUERY:
2296
0
      case OP_NOTMINQUERYI:
2297
0
      case OP_POSQUERY:
2298
0
      case OP_POSQUERYI:
2299
0
      case OP_NOTPOSQUERY:
2300
0
      case OP_NOTPOSQUERYI:
2301
0
      if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2302
0
      break;
2303
0
      }
2304
#else
2305
    (void)(utf);  /* Keep compiler happy by referencing function argument */
2306
#endif
2307
0
    }
2308
0
  }
2309
0
}
2310
2311
2312
2313
/*************************************************
2314
*    Scan compiled branch for non-emptiness      *
2315
*************************************************/
2316
2317
/* This function scans through a branch of a compiled pattern to see whether it
2318
can match the empty string or not. It is called from could_be_empty()
2319
below and from compile_branch() when checking for an unlimited repeat of a
2320
group that can match nothing. Note that first_significant_code() skips over
2321
backward and negative forward assertions when its final argument is TRUE. If we
2322
hit an unclosed bracket, we return "empty" - this means we've struck an inner
2323
bracket whose current branch will already have been scanned.
2324
2325
Arguments:
2326
  code        points to start of search
2327
  endcode     points to where to stop
2328
  utf         TRUE if in UTF-8 / UTF-16 mode
2329
  cd          contains pointers to tables etc.
2330
2331
Returns:      TRUE if what is matched could be empty
2332
*/
2333
2334
static BOOL
2335
could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
2336
  BOOL utf, compile_data *cd)
2337
0
{
2338
0
int c;
2339
0
for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
2340
0
     code < endcode;
2341
0
     code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
2342
0
  {
2343
0
  const pcre_uchar *ccode;
2344
2345
0
  c = *code;
2346
2347
  /* Skip over forward assertions; the other assertions are skipped by
2348
  first_significant_code() with a TRUE final argument. */
2349
2350
0
  if (c == OP_ASSERT)
2351
0
    {
2352
0
    do code += GET(code, 1); while (*code == OP_ALT);
2353
0
    c = *code;
2354
0
    continue;
2355
0
    }
2356
2357
  /* For a recursion/subroutine call, if its end has been reached, which
2358
  implies a backward reference subroutine call, we can scan it. If it's a
2359
  forward reference subroutine call, we can't. To detect forward reference
2360
  we have to scan up the list that is kept in the workspace. This function is
2361
  called only when doing the real compile, not during the pre-compile that
2362
  measures the size of the compiled pattern. */
2363
2364
0
  if (c == OP_RECURSE)
2365
0
    {
2366
0
    const pcre_uchar *scode;
2367
0
    BOOL empty_branch;
2368
2369
    /* Test for forward reference */
2370
2371
0
    for (scode = cd->start_workspace; scode < cd->hwm; scode += LINK_SIZE)
2372
0
      if (GET(scode, 0) == code + 1 - cd->start_code) return TRUE;
2373
2374
    /* Not a forward reference, test for completed backward reference */
2375
2376
0
    empty_branch = FALSE;
2377
0
    scode = cd->start_code + GET(code, 1);
2378
0
    if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */
2379
2380
    /* Completed backwards reference */
2381
2382
0
    do
2383
0
      {
2384
0
      if (could_be_empty_branch(scode, endcode, utf, cd))
2385
0
        {
2386
0
        empty_branch = TRUE;
2387
0
        break;
2388
0
        }
2389
0
      scode += GET(scode, 1);
2390
0
      }
2391
0
    while (*scode == OP_ALT);
2392
2393
0
    if (!empty_branch) return FALSE;  /* All branches are non-empty */
2394
0
    continue;
2395
0
    }
2396
2397
  /* Groups with zero repeats can of course be empty; skip them. */
2398
2399
0
  if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2400
0
      c == OP_BRAPOSZERO)
2401
0
    {
2402
0
    code += PRIV(OP_lengths)[c];
2403
0
    do code += GET(code, 1); while (*code == OP_ALT);
2404
0
    c = *code;
2405
0
    continue;
2406
0
    }
2407
2408
  /* A nested group that is already marked as "could be empty" can just be
2409
  skipped. */
2410
2411
0
  if (c == OP_SBRA  || c == OP_SBRAPOS ||
2412
0
      c == OP_SCBRA || c == OP_SCBRAPOS)
2413
0
    {
2414
0
    do code += GET(code, 1); while (*code == OP_ALT);
2415
0
    c = *code;
2416
0
    continue;
2417
0
    }
2418
2419
  /* For other groups, scan the branches. */
2420
2421
0
  if (c == OP_BRA  || c == OP_BRAPOS ||
2422
0
      c == OP_CBRA || c == OP_CBRAPOS ||
2423
0
      c == OP_ONCE || c == OP_ONCE_NC ||
2424
0
      c == OP_COND)
2425
0
    {
2426
0
    BOOL empty_branch;
2427
0
    if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
2428
2429
    /* If a conditional group has only one branch, there is a second, implied,
2430
    empty branch, so just skip over the conditional, because it could be empty.
2431
    Otherwise, scan the individual branches of the group. */
2432
2433
0
    if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
2434
0
      code += GET(code, 1);
2435
0
    else
2436
0
      {
2437
0
      empty_branch = FALSE;
2438
0
      do
2439
0
        {
2440
0
        if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd))
2441
0
          empty_branch = TRUE;
2442
0
        code += GET(code, 1);
2443
0
        }
2444
0
      while (*code == OP_ALT);
2445
0
      if (!empty_branch) return FALSE;   /* All branches are non-empty */
2446
0
      }
2447
2448
0
    c = *code;
2449
0
    continue;
2450
0
    }
2451
2452
  /* Handle the other opcodes */
2453
2454
0
  switch (c)
2455
0
    {
2456
    /* Check for quantifiers after a class. XCLASS is used for classes that
2457
    cannot be represented just by a bit map. This includes negated single
2458
    high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
2459
    actual length is stored in the compiled code, so we must update "code"
2460
    here. */
2461
2462
0
#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2463
0
    case OP_XCLASS:
2464
0
    ccode = code += GET(code, 1);
2465
0
    goto CHECK_CLASS_REPEAT;
2466
0
#endif
2467
2468
0
    case OP_CLASS:
2469
0
    case OP_NCLASS:
2470
0
    ccode = code + PRIV(OP_lengths)[OP_CLASS];
2471
2472
0
#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2473
0
    CHECK_CLASS_REPEAT:
2474
0
#endif
2475
2476
0
    switch (*ccode)
2477
0
      {
2478
0
      case OP_CRSTAR:            /* These could be empty; continue */
2479
0
      case OP_CRMINSTAR:
2480
0
      case OP_CRQUERY:
2481
0
      case OP_CRMINQUERY:
2482
0
      break;
2483
2484
0
      default:                   /* Non-repeat => class must match */
2485
0
      case OP_CRPLUS:            /* These repeats aren't empty */
2486
0
      case OP_CRMINPLUS:
2487
0
      return FALSE;
2488
2489
0
      case OP_CRRANGE:
2490
0
      case OP_CRMINRANGE:
2491
0
      if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
2492
0
      break;
2493
0
      }
2494
0
    break;
2495
2496
    /* Opcodes that must match a character */
2497
2498
0
    case OP_PROP:
2499
0
    case OP_NOTPROP:
2500
0
    case OP_EXTUNI:
2501
0
    case OP_NOT_DIGIT:
2502
0
    case OP_DIGIT:
2503
0
    case OP_NOT_WHITESPACE:
2504
0
    case OP_WHITESPACE:
2505
0
    case OP_NOT_WORDCHAR:
2506
0
    case OP_WORDCHAR:
2507
0
    case OP_ANY:
2508
0
    case OP_ALLANY:
2509
0
    case OP_ANYBYTE:
2510
0
    case OP_CHAR:
2511
0
    case OP_CHARI:
2512
0
    case OP_NOT:
2513
0
    case OP_NOTI:
2514
0
    case OP_PLUS:
2515
0
    case OP_MINPLUS:
2516
0
    case OP_POSPLUS:
2517
0
    case OP_EXACT:
2518
0
    case OP_NOTPLUS:
2519
0
    case OP_NOTMINPLUS:
2520
0
    case OP_NOTPOSPLUS:
2521
0
    case OP_NOTEXACT:
2522
0
    case OP_TYPEPLUS:
2523
0
    case OP_TYPEMINPLUS:
2524
0
    case OP_TYPEPOSPLUS:
2525
0
    case OP_TYPEEXACT:
2526
0
    return FALSE;
2527
2528
    /* These are going to continue, as they may be empty, but we have to
2529
    fudge the length for the \p and \P cases. */
2530
2531
0
    case OP_TYPESTAR:
2532
0
    case OP_TYPEMINSTAR:
2533
0
    case OP_TYPEPOSSTAR:
2534
0
    case OP_TYPEQUERY:
2535
0
    case OP_TYPEMINQUERY:
2536
0
    case OP_TYPEPOSQUERY:
2537
0
    if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2538
0
    break;
2539
2540
    /* Same for these */
2541
2542
0
    case OP_TYPEUPTO:
2543
0
    case OP_TYPEMINUPTO:
2544
0
    case OP_TYPEPOSUPTO:
2545
0
    if (code[1 + IMM2_SIZE] == OP_PROP
2546
0
      || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
2547
0
    break;
2548
2549
    /* End of branch */
2550
2551
0
    case OP_KET:
2552
0
    case OP_KETRMAX:
2553
0
    case OP_KETRMIN:
2554
0
    case OP_KETRPOS:
2555
0
    case OP_ALT:
2556
0
    return TRUE;
2557
2558
    /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2559
    MINUPTO, and POSUPTO may be followed by a multibyte character */
2560
2561
0
#ifdef SUPPORT_UTF
2562
0
    case OP_STAR:
2563
0
    case OP_STARI:
2564
0
    case OP_MINSTAR:
2565
0
    case OP_MINSTARI:
2566
0
    case OP_POSSTAR:
2567
0
    case OP_POSSTARI:
2568
0
    case OP_QUERY:
2569
0
    case OP_QUERYI:
2570
0
    case OP_MINQUERY:
2571
0
    case OP_MINQUERYI:
2572
0
    case OP_POSQUERY:
2573
0
    case OP_POSQUERYI:
2574
0
    if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
2575
0
    break;
2576
2577
0
    case OP_UPTO:
2578
0
    case OP_UPTOI:
2579
0
    case OP_MINUPTO:
2580
0
    case OP_MINUPTOI:
2581
0
    case OP_POSUPTO:
2582
0
    case OP_POSUPTOI:
2583
0
    if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
2584
0
    break;
2585
0
#endif
2586
2587
    /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2588
    string. */
2589
2590
0
    case OP_MARK:
2591
0
    case OP_PRUNE_ARG:
2592
0
    case OP_SKIP_ARG:
2593
0
    code += code[1];
2594
0
    break;
2595
2596
0
    case OP_THEN_ARG:
2597
0
    code += code[1];
2598
0
    break;
2599
2600
    /* None of the remaining opcodes are required to match a character. */
2601
2602
0
    default:
2603
0
    break;
2604
0
    }
2605
0
  }
2606
2607
0
return TRUE;
2608
0
}
2609
2610
2611
2612
/*************************************************
2613
*    Scan compiled regex for non-emptiness       *
2614
*************************************************/
2615
2616
/* This function is called to check for left recursive calls. We want to check
2617
the current branch of the current pattern to see if it could match the empty
2618
string. If it could, we must look outwards for branches at other levels,
2619
stopping when we pass beyond the bracket which is the subject of the recursion.
2620
This function is called only during the real compile, not during the
2621
pre-compile.
2622
2623
Arguments:
2624
  code        points to start of the recursion
2625
  endcode     points to where to stop (current RECURSE item)
2626
  bcptr       points to the chain of current (unclosed) branch starts
2627
  utf         TRUE if in UTF-8 / UTF-16 mode
2628
  cd          pointers to tables etc
2629
2630
Returns:      TRUE if what is matched could be empty
2631
*/
2632
2633
static BOOL
2634
could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
2635
  branch_chain *bcptr, BOOL utf, compile_data *cd)
2636
0
{
2637
0
while (bcptr != NULL && bcptr->current_branch >= code)
2638
0
  {
2639
0
  if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd))
2640
0
    return FALSE;
2641
0
  bcptr = bcptr->outer;
2642
0
  }
2643
0
return TRUE;
2644
0
}
2645
2646
2647
2648
/*************************************************
2649
*           Check for POSIX class syntax         *
2650
*************************************************/
2651
2652
/* This function is called when the sequence "[:" or "[." or "[=" is
2653
encountered in a character class. It checks whether this is followed by a
2654
sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2655
reach an unescaped ']' without the special preceding character, return FALSE.
2656
2657
Originally, this function only recognized a sequence of letters between the
2658
terminators, but it seems that Perl recognizes any sequence of characters,
2659
though of course unknown POSIX names are subsequently rejected. Perl gives an
2660
"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2661
didn't consider this to be a POSIX class. Likewise for [:1234:].
2662
2663
The problem in trying to be exactly like Perl is in the handling of escapes. We
2664
have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2665
class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2666
below handles the special case of \], but does not try to do any other escape
2667
processing. This makes it different from Perl for cases such as [:l\ower:]
2668
where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
2669
"l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
2670
I think.
2671
2672
A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2673
It seems that the appearance of a nested POSIX class supersedes an apparent
2674
external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2675
a digit.
2676
2677
In Perl, unescaped square brackets may also appear as part of class names. For
2678
example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2679
[:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2680
seem right at all. PCRE does not allow closing square brackets in POSIX class
2681
names.
2682
2683
Arguments:
2684
  ptr      pointer to the initial [
2685
  endptr   where to return the end pointer
2686
2687
Returns:   TRUE or FALSE
2688
*/
2689
2690
static BOOL
2691
check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
2692
0
{
2693
0
int terminator;          /* Don't combine these lines; the Solaris cc */
2694
0
terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
2695
0
for (++ptr; *ptr != 0; ptr++)
2696
0
  {
2697
0
  if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2698
0
    ptr++;
2699
0
  else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2700
0
  else
2701
0
    {
2702
0
    if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2703
0
      {
2704
0
      *endptr = ptr;
2705
0
      return TRUE;
2706
0
      }
2707
0
    if (*ptr == CHAR_LEFT_SQUARE_BRACKET &&
2708
0
         (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2709
0
          ptr[1] == CHAR_EQUALS_SIGN) &&
2710
0
        check_posix_syntax(ptr, endptr))
2711
0
      return FALSE;
2712
0
    }
2713
0
  }
2714
0
return FALSE;
2715
0
}
2716
2717
2718
2719
2720
/*************************************************
2721
*          Check POSIX class name                *
2722
*************************************************/
2723
2724
/* This function is called to check the name given in a POSIX-style class entry
2725
such as [:alnum:].
2726
2727
Arguments:
2728
  ptr        points to the first letter
2729
  len        the length of the name
2730
2731
Returns:     a value representing the name, or -1 if unknown
2732
*/
2733
2734
static int
2735
check_posix_name(const pcre_uchar *ptr, int len)
2736
0
{
2737
0
const char *pn = posix_names;
2738
0
int yield = 0;
2739
0
while (posix_name_lengths[yield] != 0)
2740
0
  {
2741
0
  if (len == posix_name_lengths[yield] &&
2742
0
    STRNCMP_UC_C8(ptr, pn, len) == 0) return yield;
2743
0
  pn += posix_name_lengths[yield] + 1;
2744
0
  yield++;
2745
0
  }
2746
0
return -1;
2747
0
}
2748
2749
2750
/*************************************************
2751
*    Adjust OP_RECURSE items in repeated group   *
2752
*************************************************/
2753
2754
/* OP_RECURSE items contain an offset from the start of the regex to the group
2755
that is referenced. This means that groups can be replicated for fixed
2756
repetition simply by copying (because the recursion is allowed to refer to
2757
earlier groups that are outside the current group). However, when a group is
2758
optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2759
inserted before it, after it has been compiled. This means that any OP_RECURSE
2760
items within it that refer to the group itself or any contained groups have to
2761
have their offsets adjusted. That is one of the jobs of this function. Before it
2762
is called, the partially compiled regex must be temporarily terminated with
2763
OP_END.
2764
2765
This function has been extended with the possibility of forward references for
2766
recursions and subroutine calls. It must also check the list of such references
2767
for the group we are dealing with. If it finds that one of the recursions in
2768
the current group is on this list, it adjusts the offset in the list, not the
2769
value in the reference (which is a group number).
2770
2771
Arguments:
2772
  group      points to the start of the group
2773
  adjust     the amount by which the group is to be moved
2774
  utf        TRUE in UTF-8 / UTF-16 mode
2775
  cd         contains pointers to tables etc.
2776
  save_hwm   the hwm forward reference pointer at the start of the group
2777
2778
Returns:     nothing
2779
*/
2780
2781
static void
2782
adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
2783
  pcre_uchar *save_hwm)
2784
0
{
2785
0
pcre_uchar *ptr = group;
2786
2787
0
while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
2788
0
  {
2789
0
  int offset;
2790
0
  pcre_uchar *hc;
2791
2792
  /* See if this recursion is on the forward reference list. If so, adjust the
2793
  reference. */
2794
2795
0
  for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2796
0
    {
2797
0
    offset = GET(hc, 0);
2798
0
    if (cd->start_code + offset == ptr + 1)
2799
0
      {
2800
0
      PUT(hc, 0, offset + adjust);
2801
0
      break;
2802
0
      }
2803
0
    }
2804
2805
  /* Otherwise, adjust the recursion offset if it's after the start of this
2806
  group. */
2807
2808
0
  if (hc >= cd->hwm)
2809
0
    {
2810
0
    offset = GET(ptr, 1);
2811
0
    if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2812
0
    }
2813
2814
0
  ptr += 1 + LINK_SIZE;
2815
0
  }
2816
0
}
2817
2818
2819
2820
/*************************************************
2821
*        Insert an automatic callout point       *
2822
*************************************************/
2823
2824
/* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2825
callout points before each pattern item.
2826
2827
Arguments:
2828
  code           current code pointer
2829
  ptr            current pattern pointer
2830
  cd             pointers to tables etc
2831
2832
Returns:         new code pointer
2833
*/
2834
2835
static pcre_uchar *
2836
auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
2837
0
{
2838
0
*code++ = OP_CALLOUT;
2839
0
*code++ = 255;
2840
0
PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */
2841
0
PUT(code, LINK_SIZE, 0);                       /* Default length */
2842
0
return code + 2 * LINK_SIZE;
2843
0
}
2844
2845
2846
2847
/*************************************************
2848
*         Complete a callout item                *
2849
*************************************************/
2850
2851
/* A callout item contains the length of the next item in the pattern, which
2852
we can't fill in till after we have reached the relevant point. This is used
2853
for both automatic and manual callouts.
2854
2855
Arguments:
2856
  previous_callout   points to previous callout item
2857
  ptr                current pattern pointer
2858
  cd                 pointers to tables etc
2859
2860
Returns:             nothing
2861
*/
2862
2863
static void
2864
complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
2865
0
{
2866
0
int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
2867
0
PUT(previous_callout, 2 + LINK_SIZE, length);
2868
0
}
2869
2870
2871
2872
#ifdef SUPPORT_UCP
2873
/*************************************************
2874
*           Get othercase range                  *
2875
*************************************************/
2876
2877
/* This function is passed the start and end of a class range, in UTF-8 mode
2878
with UCP support. It searches up the characters, looking for internal ranges of
2879
characters in the "other" case. Each call returns the next one, updating the
2880
start address.
2881
2882
Arguments:
2883
  cptr        points to starting character value; updated
2884
  d           end value
2885
  ocptr       where to put start of othercase range
2886
  odptr       where to put end of othercase range
2887
2888
Yield:        TRUE when range returned; FALSE when no more
2889
*/
2890
2891
static BOOL
2892
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
2893
  unsigned int *odptr)
2894
0
{
2895
0
unsigned int c, othercase, next;
2896
2897
0
for (c = *cptr; c <= d; c++)
2898
0
  { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
2899
2900
0
if (c > d) return FALSE;
2901
2902
0
*ocptr = othercase;
2903
0
next = othercase + 1;
2904
2905
0
for (++c; c <= d; c++)
2906
0
  {
2907
0
  if (UCD_OTHERCASE(c) != next) break;
2908
0
  next++;
2909
0
  }
2910
2911
0
*odptr = next - 1;
2912
0
*cptr = c;
2913
2914
0
return TRUE;
2915
0
}
2916
2917
2918
2919
/*************************************************
2920
*        Check a character and a property        *
2921
*************************************************/
2922
2923
/* This function is called by check_auto_possessive() when a property item
2924
is adjacent to a fixed character.
2925
2926
Arguments:
2927
  c            the character
2928
  ptype        the property type
2929
  pdata        the data for the type
2930
  negated      TRUE if it's a negated property (\P or \p{^)
2931
2932
Returns:       TRUE if auto-possessifying is OK
2933
*/
2934
2935
static BOOL
2936
check_char_prop(int c, int ptype, int pdata, BOOL negated)
2937
0
{
2938
0
const pcre_uint8 chartype = UCD_CHARTYPE(c);
2939
0
switch(ptype)
2940
0
  {
2941
0
  case PT_LAMP:
2942
0
  return (chartype == ucp_Lu ||
2943
0
          chartype == ucp_Ll ||
2944
0
          chartype == ucp_Lt) == negated;
2945
2946
0
  case PT_GC:
2947
0
  return (pdata == PRIV(ucp_gentype)[chartype]) == negated;
2948
2949
0
  case PT_PC:
2950
0
  return (pdata == chartype) == negated;
2951
2952
0
  case PT_SC:
2953
0
  return (pdata == UCD_SCRIPT(c)) == negated;
2954
2955
  /* These are specials */
2956
2957
0
  case PT_ALNUM:
2958
0
  return (PRIV(ucp_gentype)[chartype] == ucp_L ||
2959
0
          PRIV(ucp_gentype)[chartype] == ucp_N) == negated;
2960
2961
0
  case PT_SPACE:    /* Perl space */
2962
0
  return (PRIV(ucp_gentype)[chartype] == ucp_Z ||
2963
0
          c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2964
0
          == negated;
2965
2966
0
  case PT_PXSPACE:  /* POSIX space */
2967
0
  return (PRIV(ucp_gentype)[chartype] == ucp_Z ||
2968
0
          c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2969
0
          c == CHAR_FF || c == CHAR_CR)
2970
0
          == negated;
2971
2972
0
  case PT_WORD:
2973
0
  return (PRIV(ucp_gentype)[chartype] == ucp_L ||
2974
0
          PRIV(ucp_gentype)[chartype] == ucp_N ||
2975
0
          c == CHAR_UNDERSCORE) == negated;
2976
0
  }
2977
0
return FALSE;
2978
0
}
2979
#endif  /* SUPPORT_UCP */
2980
2981
2982
2983
/*************************************************
2984
*     Check if auto-possessifying is possible    *
2985
*************************************************/
2986
2987
/* This function is called for unlimited repeats of certain items, to see
2988
whether the next thing could possibly match the repeated item. If not, it makes
2989
sense to automatically possessify the repeated item.
2990
2991
Arguments:
2992
  previous      pointer to the repeated opcode
2993
  utf           TRUE in UTF-8 / UTF-16 mode
2994
  ptr           next character in pattern
2995
  options       options bits
2996
  cd            contains pointers to tables etc.
2997
2998
Returns:        TRUE if possessifying is wanted
2999
*/
3000
3001
static BOOL
3002
check_auto_possessive(const pcre_uchar *previous, BOOL utf,
3003
  const pcre_uchar *ptr, int options, compile_data *cd)
3004
0
{
3005
0
pcre_int32 c, next;
3006
0
int op_code = *previous++;
3007
3008
/* Skip whitespace and comments in extended mode */
3009
3010
0
if ((options & PCRE_EXTENDED) != 0)
3011
0
  {
3012
0
  for (;;)
3013
0
    {
3014
0
    while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
3015
0
    if (*ptr == CHAR_NUMBER_SIGN)
3016
0
      {
3017
0
      ptr++;
3018
0
      while (*ptr != 0)
3019
0
        {
3020
0
        if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
3021
0
        ptr++;
3022
0
#ifdef SUPPORT_UTF
3023
0
        if (utf) FORWARDCHAR(ptr);
3024
0
#endif
3025
0
        }
3026
0
      }
3027
0
    else break;
3028
0
    }
3029
0
  }
3030
3031
/* If the next item is one that we can handle, get its value. A non-negative
3032
value is a character, a negative value is an escape value. */
3033
3034
0
if (*ptr == CHAR_BACKSLASH)
3035
0
  {
3036
0
  int temperrorcode = 0;
3037
0
  next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
3038
0
  if (temperrorcode != 0) return FALSE;
3039
0
  ptr++;    /* Point after the escape sequence */
3040
0
  }
3041
0
else if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_meta) == 0)
3042
0
  {
3043
0
#ifdef SUPPORT_UTF
3044
0
  if (utf) { GETCHARINC(next, ptr); } else
3045
0
#endif
3046
0
  next = *ptr++;
3047
0
  }
3048
0
else return FALSE;
3049
3050
/* Skip whitespace and comments in extended mode */
3051
3052
0
if ((options & PCRE_EXTENDED) != 0)
3053
0
  {
3054
0
  for (;;)
3055
0
    {
3056
0
    while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
3057
0
    if (*ptr == CHAR_NUMBER_SIGN)
3058
0
      {
3059
0
      ptr++;
3060
0
      while (*ptr != 0)
3061
0
        {
3062
0
        if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
3063
0
        ptr++;
3064
0
#ifdef SUPPORT_UTF
3065
0
        if (utf) FORWARDCHAR(ptr);
3066
0
#endif
3067
0
        }
3068
0
      }
3069
0
    else break;
3070
0
    }
3071
0
  }
3072
3073
/* If the next thing is itself optional, we have to give up. */
3074
3075
0
if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
3076
0
  STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
3077
0
    return FALSE;
3078
3079
/* Now compare the next item with the previous opcode. First, handle cases when
3080
the next item is a character. */
3081
3082
0
if (next >= 0) switch(op_code)
3083
0
  {
3084
0
  case OP_CHAR:
3085
0
#ifdef SUPPORT_UTF
3086
0
  GETCHARTEST(c, previous);
3087
#else
3088
  c = *previous;
3089
#endif
3090
0
  return c != next;
3091
3092
  /* For CHARI (caseless character) we must check the other case. If we have
3093
  Unicode property support, we can use it to test the other case of
3094
  high-valued characters. */
3095
3096
0
  case OP_CHARI:
3097
0
#ifdef SUPPORT_UTF
3098
0
  GETCHARTEST(c, previous);
3099
#else
3100
  c = *previous;
3101
#endif
3102
0
  if (c == next) return FALSE;
3103
0
#ifdef SUPPORT_UTF
3104
0
  if (utf)
3105
0
    {
3106
0
    unsigned int othercase;
3107
0
    if (next < 128) othercase = cd->fcc[next]; else
3108
0
#ifdef SUPPORT_UCP
3109
0
    othercase = UCD_OTHERCASE((unsigned int)next);
3110
#else
3111
    othercase = NOTACHAR;
3112
#endif
3113
0
    return (unsigned int)c != othercase;
3114
0
    }
3115
0
  else
3116
0
#endif  /* SUPPORT_UTF */
3117
0
  return (c != TABLE_GET((unsigned int)next, cd->fcc, next));  /* Non-UTF-8 mode */
3118
3119
0
  case OP_NOT:
3120
0
#ifdef SUPPORT_UTF
3121
0
  GETCHARTEST(c, previous);
3122
#else
3123
  c = *previous;
3124
#endif
3125
0
  return c == next;
3126
3127
0
  case OP_NOTI:
3128
0
#ifdef SUPPORT_UTF
3129
0
  GETCHARTEST(c, previous);
3130
#else
3131
  c = *previous;
3132
#endif
3133
0
  if (c == next) return TRUE;
3134
0
#ifdef SUPPORT_UTF
3135
0
  if (utf)
3136
0
    {
3137
0
    unsigned int othercase;
3138
0
    if (next < 128) othercase = cd->fcc[next]; else
3139
0
#ifdef SUPPORT_UCP
3140
0
    othercase = UCD_OTHERCASE((unsigned int)next);
3141
#else
3142
    othercase = NOTACHAR;
3143
#endif
3144
0
    return (unsigned int)c == othercase;
3145
0
    }
3146
0
  else
3147
0
#endif  /* SUPPORT_UTF */
3148
0
  return (c == TABLE_GET((unsigned int)next, cd->fcc, next));  /* Non-UTF-8 mode */
3149
3150
  /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
3151
  When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
3152
3153
0
  case OP_DIGIT:
3154
0
  return next > 255 || (cd->ctypes[next] & ctype_digit) == 0;
3155
3156
0
  case OP_NOT_DIGIT:
3157
0
  return next <= 255 && (cd->ctypes[next] & ctype_digit) != 0;
3158
3159
0
  case OP_WHITESPACE:
3160
0
  return next > 255 || (cd->ctypes[next] & ctype_space) == 0;
3161
3162
0
  case OP_NOT_WHITESPACE:
3163
0
  return next <= 255 && (cd->ctypes[next] & ctype_space) != 0;
3164
3165
0
  case OP_WORDCHAR:
3166
0
  return next > 255 || (cd->ctypes[next] & ctype_word) == 0;
3167
3168
0
  case OP_NOT_WORDCHAR:
3169
0
  return next <= 255 && (cd->ctypes[next] & ctype_word) != 0;
3170
3171
0
  case OP_HSPACE:
3172
0
  case OP_NOT_HSPACE:
3173
0
  switch(next)
3174
0
    {
3175
0
    case 0x09:
3176
0
    case 0x20:
3177
0
    case 0xa0:
3178
0
    case 0x1680:
3179
0
    case 0x180e:
3180
0
    case 0x2000:
3181
0
    case 0x2001:
3182
0
    case 0x2002:
3183
0
    case 0x2003:
3184
0
    case 0x2004:
3185
0
    case 0x2005:
3186
0
    case 0x2006:
3187
0
    case 0x2007:
3188
0
    case 0x2008:
3189
0
    case 0x2009:
3190
0
    case 0x200A:
3191
0
    case 0x202f:
3192
0
    case 0x205f:
3193
0
    case 0x3000:
3194
0
    return op_code == OP_NOT_HSPACE;
3195
0
    default:
3196
0
    return op_code != OP_NOT_HSPACE;
3197
0
    }
3198
3199
0
  case OP_ANYNL:
3200
0
  case OP_VSPACE:
3201
0
  case OP_NOT_VSPACE:
3202
0
  switch(next)
3203
0
    {
3204
0
    case 0x0a:
3205
0
    case 0x0b:
3206
0
    case 0x0c:
3207
0
    case 0x0d:
3208
0
    case 0x85:
3209
0
    case 0x2028:
3210
0
    case 0x2029:
3211
0
    return op_code == OP_NOT_VSPACE;
3212
0
    default:
3213
0
    return op_code != OP_NOT_VSPACE;
3214
0
    }
3215
3216
0
#ifdef SUPPORT_UCP
3217
0
  case OP_PROP:
3218
0
  return check_char_prop(next, previous[0], previous[1], FALSE);
3219
3220
0
  case OP_NOTPROP:
3221
0
  return check_char_prop(next, previous[0], previous[1], TRUE);
3222
0
#endif
3223
3224
0
  default:
3225
0
  return FALSE;
3226
0
  }
3227
3228
3229
/* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP
3230
is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are
3231
generated only when PCRE_UCP is *not* set, that is, when only ASCII
3232
characteristics are recognized. Similarly, the opcodes OP_DIGIT etc. are
3233
replaced by OP_PROP codes when PCRE_UCP is set. */
3234
3235
0
switch(op_code)
3236
0
  {
3237
0
  case OP_CHAR:
3238
0
  case OP_CHARI:
3239
0
#ifdef SUPPORT_UTF
3240
0
  GETCHARTEST(c, previous);
3241
#else
3242
  c = *previous;
3243
#endif
3244
0
  switch(-next)
3245
0
    {
3246
0
    case ESC_d:
3247
0
    return c > 255 || (cd->ctypes[c] & ctype_digit) == 0;
3248
3249
0
    case ESC_D:
3250
0
    return c <= 255 && (cd->ctypes[c] & ctype_digit) != 0;
3251
3252
0
    case ESC_s:
3253
0
    return c > 255 || (cd->ctypes[c] & ctype_space) == 0;
3254
3255
0
    case ESC_S:
3256
0
    return c <= 255 && (cd->ctypes[c] & ctype_space) != 0;
3257
3258
0
    case ESC_w:
3259
0
    return c > 255 || (cd->ctypes[c] & ctype_word) == 0;
3260
3261
0
    case ESC_W:
3262
0
    return c <= 255 && (cd->ctypes[c] & ctype_word) != 0;
3263
3264
0
    case ESC_h:
3265
0
    case ESC_H:
3266
0
    switch(c)
3267
0
      {
3268
0
      case 0x09:
3269
0
      case 0x20:
3270
0
      case 0xa0:
3271
0
      case 0x1680:
3272
0
      case 0x180e:
3273
0
      case 0x2000:
3274
0
      case 0x2001:
3275
0
      case 0x2002:
3276
0
      case 0x2003:
3277
0
      case 0x2004:
3278
0
      case 0x2005:
3279
0
      case 0x2006:
3280
0
      case 0x2007:
3281
0
      case 0x2008:
3282
0
      case 0x2009:
3283
0
      case 0x200A:
3284
0
      case 0x202f:
3285
0
      case 0x205f:
3286
0
      case 0x3000:
3287
0
      return -next != ESC_h;
3288
0
      default:
3289
0
      return -next == ESC_h;
3290
0
      }
3291
3292
0
    case ESC_v:
3293
0
    case ESC_V:
3294
0
    switch(c)
3295
0
      {
3296
0
      case 0x0a:
3297
0
      case 0x0b:
3298
0
      case 0x0c:
3299
0
      case 0x0d:
3300
0
      case 0x85:
3301
0
      case 0x2028:
3302
0
      case 0x2029:
3303
0
      return -next != ESC_v;
3304
0
      default:
3305
0
      return -next == ESC_v;
3306
0
      }
3307
3308
    /* When PCRE_UCP is set, these values get generated for \d etc. Find
3309
    their substitutions and process them. The result will always be either
3310
    -ESC_p or -ESC_P. Then fall through to process those values. */
3311
3312
0
#ifdef SUPPORT_UCP
3313
0
    case ESC_du:
3314
0
    case ESC_DU:
3315
0
    case ESC_wu:
3316
0
    case ESC_WU:
3317
0
    case ESC_su:
3318
0
    case ESC_SU:
3319
0
      {
3320
0
      int temperrorcode = 0;
3321
0
      ptr = substitutes[-next - ESC_DU];
3322
0
      next = check_escape(&ptr, &temperrorcode, 0, options, FALSE);
3323
0
      if (temperrorcode != 0) return FALSE;
3324
0
      ptr++;    /* For compatibility */
3325
0
      }
3326
    /* Fall through */
3327
3328
0
    case ESC_p:
3329
0
    case ESC_P:
3330
0
      {
3331
0
      int ptype, pdata, errorcodeptr;
3332
0
      BOOL negated;
3333
3334
0
      ptr--;      /* Make ptr point at the p or P */
3335
0
      ptype = get_ucp(&ptr, &negated, &pdata, &errorcodeptr);
3336
0
      if (ptype < 0) return FALSE;
3337
0
      ptr++;      /* Point past the final curly ket */
3338
3339
      /* If the property item is optional, we have to give up. (When generated
3340
      from \d etc by PCRE_UCP, this test will have been applied much earlier,
3341
      to the original \d etc. At this point, ptr will point to a zero byte. */
3342
3343
0
      if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
3344
0
        STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
3345
0
          return FALSE;
3346
3347
      /* Do the property check. */
3348
3349
0
      return check_char_prop(c, ptype, pdata, (next == -ESC_P) != negated);
3350
0
      }
3351
0
#endif
3352
3353
0
    default:
3354
0
    return FALSE;
3355
0
    }
3356
3357
  /* In principle, support for Unicode properties should be integrated here as
3358
  well. It means re-organizing the above code so as to get hold of the property
3359
  values before switching on the op-code. However, I wonder how many patterns
3360
  combine ASCII \d etc with Unicode properties? (Note that if PCRE_UCP is set,
3361
  these op-codes are never generated.) */
3362
3363
0
  case OP_DIGIT:
3364
0
  return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
3365
0
         next == -ESC_h || next == -ESC_v || next == -ESC_R;
3366
3367
0
  case OP_NOT_DIGIT:
3368
0
  return next == -ESC_d;
3369
3370
0
  case OP_WHITESPACE:
3371
0
  return next == -ESC_S || next == -ESC_d || next == -ESC_w;
3372
3373
0
  case OP_NOT_WHITESPACE:
3374
0
  return next == -ESC_s || next == -ESC_h || next == -ESC_v || next == -ESC_R;
3375
3376
0
  case OP_HSPACE:
3377
0
  return next == -ESC_S || next == -ESC_H || next == -ESC_d ||
3378
0
         next == -ESC_w || next == -ESC_v || next == -ESC_R;
3379
3380
0
  case OP_NOT_HSPACE:
3381
0
  return next == -ESC_h;
3382
3383
  /* Can't have \S in here because VT matches \S (Perl anomaly) */
3384
0
  case OP_ANYNL:
3385
0
  case OP_VSPACE:
3386
0
  return next == -ESC_V || next == -ESC_d || next == -ESC_w;
3387
3388
0
  case OP_NOT_VSPACE:
3389
0
  return next == -ESC_v || next == -ESC_R;
3390
3391
0
  case OP_WORDCHAR:
3392
0
  return next == -ESC_W || next == -ESC_s || next == -ESC_h ||
3393
0
         next == -ESC_v || next == -ESC_R;
3394
3395
0
  case OP_NOT_WORDCHAR:
3396
0
  return next == -ESC_w || next == -ESC_d;
3397
3398
0
  default:
3399
0
  return FALSE;
3400
0
  }
3401
3402
/* Control does not reach here */
3403
0
}
3404
3405
3406
3407
/*************************************************
3408
*           Compile one branch                   *
3409
*************************************************/
3410
3411
/* Scan the pattern, compiling it into a vector. If the options are
3412
changed during the branch, the pointer is used to change the external options
3413
bits. This function is used during the pre-compile phase when we are trying
3414
to find out the amount of memory needed, as well as during the real compile
3415
phase. The value of lengthptr distinguishes the two phases.
3416
3417
Arguments:
3418
  optionsptr     pointer to the option bits
3419
  codeptr        points to the pointer to the current code point
3420
  ptrptr         points to the current pattern pointer
3421
  errorcodeptr   points to error code variable
3422
  firstcharptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
3423
  reqcharptr     set to the last literal character required, else < 0
3424
  bcptr          points to current branch chain
3425
  cond_depth     conditional nesting depth
3426
  cd             contains pointers to tables etc.
3427
  lengthptr      NULL during the real compile phase
3428
                 points to length accumulator during pre-compile phase
3429
3430
Returns:         TRUE on success
3431
                 FALSE, with *errorcodeptr set non-zero on error
3432
*/
3433
3434
static BOOL
3435
compile_branch(int *optionsptr, pcre_uchar **codeptr,
3436
  const pcre_uchar **ptrptr, int *errorcodeptr, pcre_int32 *firstcharptr,
3437
  pcre_int32 *reqcharptr, branch_chain *bcptr, int cond_depth,
3438
  compile_data *cd, int *lengthptr)
3439
0
{
3440
0
int repeat_type, op_type;
3441
0
int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
3442
0
int bravalue = 0;
3443
0
int greedy_default, greedy_non_default;
3444
0
pcre_int32 firstchar, reqchar;
3445
0
pcre_int32 zeroreqchar, zerofirstchar;
3446
0
pcre_int32 req_caseopt, reqvary, tempreqvary;
3447
0
int options = *optionsptr;               /* May change dynamically */
3448
0
int after_manual_callout = 0;
3449
0
int length_prevgroup = 0;
3450
0
int c;
3451
0
pcre_uchar *code = *codeptr;
3452
0
pcre_uchar *last_code = code;
3453
0
pcre_uchar *orig_code = code;
3454
0
pcre_uchar *tempcode;
3455
0
BOOL inescq = FALSE;
3456
0
BOOL groupsetfirstchar = FALSE;
3457
0
const pcre_uchar *ptr = *ptrptr;
3458
0
const pcre_uchar *tempptr;
3459
0
const pcre_uchar *nestptr = NULL;
3460
0
pcre_uchar *previous = NULL;
3461
0
pcre_uchar *previous_callout = NULL;
3462
0
pcre_uchar *save_hwm = NULL;
3463
0
pcre_uint8 classbits[32];
3464
3465
/* We can fish out the UTF-8 setting once and for all into a BOOL, but we
3466
must not do this for other options (e.g. PCRE_EXTENDED) because they may change
3467
dynamically as we process the pattern. */
3468
3469
0
#ifdef SUPPORT_UTF
3470
/* PCRE_UTF16 has the same value as PCRE_UTF8. */
3471
0
BOOL utf = (options & PCRE_UTF8) != 0;
3472
0
pcre_uchar utf_chars[6];
3473
#else
3474
BOOL utf = FALSE;
3475
#endif
3476
3477
/* Helper variables for OP_XCLASS opcode (for characters > 255). */
3478
3479
0
#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3480
0
BOOL xclass;
3481
0
pcre_uchar *class_uchardata;
3482
0
pcre_uchar *class_uchardata_base;
3483
0
#endif
3484
3485
#ifdef PCRE_DEBUG
3486
if (lengthptr != NULL) DPRINTF((">> start branch\n"));
3487
#endif
3488
3489
/* Set up the default and non-default settings for greediness */
3490
3491
0
greedy_default = ((options & PCRE_UNGREEDY) != 0);
3492
0
greedy_non_default = greedy_default ^ 1;
3493
3494
/* Initialize no first byte, no required byte. REQ_UNSET means "no char
3495
matching encountered yet". It gets changed to REQ_NONE if we hit something that
3496
matches a non-fixed char first char; reqchar just remains unset if we never
3497
find one.
3498
3499
When we hit a repeat whose minimum is zero, we may have to adjust these values
3500
to take the zero repeat into account. This is implemented by setting them to
3501
zerofirstchar and zeroreqchar when such a repeat is encountered. The individual
3502
item types that can be repeated set these backoff variables appropriately. */
3503
3504
0
firstchar = reqchar = zerofirstchar = zeroreqchar = REQ_UNSET;
3505
3506
/* The variable req_caseopt contains either the REQ_CASELESS value
3507
or zero, according to the current setting of the caseless flag. The
3508
REQ_CASELESS leaves the lower 28 bit empty. It is added into the
3509
firstchar or reqchar variables to record the case status of the
3510
value. This is used only for ASCII characters. */
3511
3512
0
req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
3513
3514
/* Switch on next character until the end of the branch */
3515
3516
0
for (;; ptr++)
3517
0
  {
3518
0
  BOOL negate_class;
3519
0
  BOOL should_flip_negation;
3520
0
  BOOL possessive_quantifier;
3521
0
  BOOL is_quantifier;
3522
0
  BOOL is_recurse;
3523
0
  BOOL reset_bracount;
3524
0
  int class_has_8bitchar;
3525
0
  int class_single_char;
3526
0
  int newoptions;
3527
0
  int recno;
3528
0
  int refsign;
3529
0
  int skipbytes;
3530
0
  int subreqchar;
3531
0
  int subfirstchar;
3532
0
  int terminator;
3533
0
  int mclength;
3534
0
  int tempbracount;
3535
0
  pcre_uchar mcbuffer[8];
3536
3537
  /* Get next character in the pattern */
3538
3539
0
  c = *ptr;
3540
3541
  /* If we are at the end of a nested substitution, revert to the outer level
3542
  string. Nesting only happens one level deep. */
3543
3544
0
  if (c == 0 && nestptr != NULL)
3545
0
    {
3546
0
    ptr = nestptr;
3547
0
    nestptr = NULL;
3548
0
    c = *ptr;
3549
0
    }
3550
3551
  /* If we are in the pre-compile phase, accumulate the length used for the
3552
  previous cycle of this loop. */
3553
3554
0
  if (lengthptr != NULL)
3555
0
    {
3556
#ifdef PCRE_DEBUG
3557
    if (code > cd->hwm) cd->hwm = code;                 /* High water info */
3558
#endif
3559
0
    if (code > cd->start_workspace + cd->workspace_size -
3560
0
        WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
3561
0
      {
3562
0
      *errorcodeptr = ERR52;
3563
0
      goto FAILED;
3564
0
      }
3565
3566
    /* There is at least one situation where code goes backwards: this is the
3567
    case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
3568
    the class is simply eliminated. However, it is created first, so we have to
3569
    allow memory for it. Therefore, don't ever reduce the length at this point.
3570
    */
3571
3572
0
    if (code < last_code) code = last_code;
3573
3574
    /* Paranoid check for integer overflow */
3575
3576
0
    if (OFLOW_MAX - *lengthptr < code - last_code)
3577
0
      {
3578
0
      *errorcodeptr = ERR20;
3579
0
      goto FAILED;
3580
0
      }
3581
3582
0
    *lengthptr += (int)(code - last_code);
3583
0
    DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
3584
0
      (int)(code - last_code), c, c));
3585
3586
    /* If "previous" is set and it is not at the start of the work space, move
3587
    it back to there, in order to avoid filling up the work space. Otherwise,
3588
    if "previous" is NULL, reset the current code pointer to the start. */
3589
3590
0
    if (previous != NULL)
3591
0
      {
3592
0
      if (previous > orig_code)
3593
0
        {
3594
0
        memmove(orig_code, previous, IN_UCHARS(code - previous));
3595
0
        code -= previous - orig_code;
3596
0
        previous = orig_code;
3597
0
        }
3598
0
      }
3599
0
    else code = orig_code;
3600
3601
    /* Remember where this code item starts so we can pick up the length
3602
    next time round. */
3603
3604
0
    last_code = code;
3605
0
    }
3606
3607
  /* In the real compile phase, just check the workspace used by the forward
3608
  reference list. */
3609
3610
0
  else if (cd->hwm > cd->start_workspace + cd->workspace_size -
3611
0
           WORK_SIZE_SAFETY_MARGIN)
3612
0
    {
3613
0
    *errorcodeptr = ERR52;
3614
0
    goto FAILED;
3615
0
    }
3616
3617
  /* If in \Q...\E, check for the end; if not, we have a literal */
3618
3619
0
  if (inescq && c != 0)
3620
0
    {
3621
0
    if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3622
0
      {
3623
0
      inescq = FALSE;
3624
0
      ptr++;
3625
0
      continue;
3626
0
      }
3627
0
    else
3628
0
      {
3629
0
      if (previous_callout != NULL)
3630
0
        {
3631
0
        if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
3632
0
          complete_callout(previous_callout, ptr, cd);
3633
0
        previous_callout = NULL;
3634
0
        }
3635
0
      if ((options & PCRE_AUTO_CALLOUT) != 0)
3636
0
        {
3637
0
        previous_callout = code;
3638
0
        code = auto_callout(code, ptr, cd);
3639
0
        }
3640
0
      goto NORMAL_CHAR;
3641
0
      }
3642
0
    }
3643
3644
  /* Fill in length of a previous callout, except when the next thing is
3645
  a quantifier. */
3646
3647
0
  is_quantifier =
3648
0
    c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
3649
0
    (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
3650
3651
0
  if (!is_quantifier && previous_callout != NULL &&
3652
0
       after_manual_callout-- <= 0)
3653
0
    {
3654
0
    if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
3655
0
      complete_callout(previous_callout, ptr, cd);
3656
0
    previous_callout = NULL;
3657
0
    }
3658
3659
  /* In extended mode, skip white space and comments. */
3660
3661
0
  if ((options & PCRE_EXTENDED) != 0)
3662
0
    {
3663
0
    if (MAX_255(*ptr) && (cd->ctypes[c] & ctype_space) != 0) continue;
3664
0
    if (c == CHAR_NUMBER_SIGN)
3665
0
      {
3666
0
      ptr++;
3667
0
      while (*ptr != 0)
3668
0
        {
3669
0
        if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
3670
0
        ptr++;
3671
0
#ifdef SUPPORT_UTF
3672
0
        if (utf) FORWARDCHAR(ptr);
3673
0
#endif
3674
0
        }
3675
0
      if (*ptr != 0) continue;
3676
3677
      /* Else fall through to handle end of string */
3678
0
      c = 0;
3679
0
      }
3680
0
    }
3681
3682
  /* No auto callout for quantifiers. */
3683
3684
0
  if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
3685
0
    {
3686
0
    previous_callout = code;
3687
0
    code = auto_callout(code, ptr, cd);
3688
0
    }
3689
3690
0
  switch(c)
3691
0
    {
3692
    /* ===================================================================*/
3693
0
    case 0:                        /* The branch terminates at string end */
3694
0
    case CHAR_VERTICAL_LINE:       /* or | or ) */
3695
0
    case CHAR_RIGHT_PARENTHESIS:
3696
0
    *firstcharptr = firstchar;
3697
0
    *reqcharptr = reqchar;
3698
0
    *codeptr = code;
3699
0
    *ptrptr = ptr;
3700
0
    if (lengthptr != NULL)
3701
0
      {
3702
0
      if (OFLOW_MAX - *lengthptr < code - last_code)
3703
0
        {
3704
0
        *errorcodeptr = ERR20;
3705
0
        goto FAILED;
3706
0
        }
3707
0
      *lengthptr += (int)(code - last_code);   /* To include callout length */
3708
0
      DPRINTF((">> end branch\n"));
3709
0
      }
3710
0
    return TRUE;
3711
3712
3713
    /* ===================================================================*/
3714
    /* Handle single-character metacharacters. In multiline mode, ^ disables
3715
    the setting of any following char as a first character. */
3716
3717
0
    case CHAR_CIRCUMFLEX_ACCENT:
3718
0
    previous = NULL;
3719
0
    if ((options & PCRE_MULTILINE) != 0)
3720
0
      {
3721
0
      if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
3722
0
      *code++ = OP_CIRCM;
3723
0
      }
3724
0
    else *code++ = OP_CIRC;
3725
0
    break;
3726
3727
0
    case CHAR_DOLLAR_SIGN:
3728
0
    previous = NULL;
3729
0
    *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
3730
0
    break;
3731
3732
    /* There can never be a first char if '.' is first, whatever happens about
3733
    repeats. The value of reqchar doesn't change either. */
3734
3735
0
    case CHAR_DOT:
3736
0
    if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
3737
0
    zerofirstchar = firstchar;
3738
0
    zeroreqchar = reqchar;
3739
0
    previous = code;
3740
0
    *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
3741
0
    break;
3742
3743
3744
    /* ===================================================================*/
3745
    /* Character classes. If the included characters are all < 256, we build a
3746
    32-byte bitmap of the permitted characters, except in the special case
3747
    where there is only one such character. For negated classes, we build the
3748
    map as usual, then invert it at the end. However, we use a different opcode
3749
    so that data characters > 255 can be handled correctly.
3750
3751
    If the class contains characters outside the 0-255 range, a different
3752
    opcode is compiled. It may optionally have a bit map for characters < 256,
3753
    but those above are explicitly listed afterwards. A flag byte tells
3754
    whether the bitmap is present, and whether this is a negated class or not.
3755
3756
    In JavaScript compatibility mode, an isolated ']' causes an error. In
3757
    default (Perl) mode, it is treated as a data character. */
3758
3759
0
    case CHAR_RIGHT_SQUARE_BRACKET:
3760
0
    if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3761
0
      {
3762
0
      *errorcodeptr = ERR64;
3763
0
      goto FAILED;
3764
0
      }
3765
0
    goto NORMAL_CHAR;
3766
3767
0
    case CHAR_LEFT_SQUARE_BRACKET:
3768
0
    previous = code;
3769
3770
    /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3771
    they are encountered at the top level, so we'll do that too. */
3772
3773
0
    if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3774
0
         ptr[1] == CHAR_EQUALS_SIGN) &&
3775
0
        check_posix_syntax(ptr, &tempptr))
3776
0
      {
3777
0
      *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
3778
0
      goto FAILED;
3779
0
      }
3780
3781
    /* If the first character is '^', set the negation flag and skip it. Also,
3782
    if the first few characters (either before or after ^) are \Q\E or \E we
3783
    skip them too. This makes for compatibility with Perl. */
3784
3785
0
    negate_class = FALSE;
3786
0
    for (;;)
3787
0
      {
3788
0
      c = *(++ptr);
3789
0
      if (c == CHAR_BACKSLASH)
3790
0
        {
3791
0
        if (ptr[1] == CHAR_E)
3792
0
          ptr++;
3793
0
        else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
3794
0
          ptr += 3;
3795
0
        else
3796
0
          break;
3797
0
        }
3798
0
      else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3799
0
        negate_class = TRUE;
3800
0
      else break;
3801
0
      }
3802
3803
    /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
3804
    an initial ']' is taken as a data character -- the code below handles
3805
    that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
3806
    [^] must match any character, so generate OP_ALLANY. */
3807
3808
0
    if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3809
0
        (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3810
0
      {
3811
0
      *code++ = negate_class? OP_ALLANY : OP_FAIL;
3812
0
      if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
3813
0
      zerofirstchar = firstchar;
3814
0
      break;
3815
0
      }
3816
3817
    /* If a class contains a negative special such as \S, we need to flip the
3818
    negation flag at the end, so that support for characters > 255 works
3819
    correctly (they are all included in the class). */
3820
3821
0
    should_flip_negation = FALSE;
3822
3823
    /* For optimization purposes, we track some properties of the class.
3824
    class_has_8bitchar will be non-zero, if the class contains at least one
3825
    < 256 character. class_single_char will be 1 if the class contains only
3826
    a single character. */
3827
3828
0
    class_has_8bitchar = 0;
3829
0
    class_single_char = 0;
3830
3831
    /* Initialize the 32-char bit map to all zeros. We build the map in a
3832
    temporary bit of memory, in case the class contains only 1 character (less
3833
    than 256), because in that case the compiled code doesn't use the bit map.
3834
    */
3835
3836
0
    memset(classbits, 0, 32 * sizeof(pcre_uint8));
3837
3838
0
#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3839
0
    xclass = FALSE;                           /* No chars >= 256 */
3840
0
    class_uchardata = code + LINK_SIZE + 2;   /* For UTF-8 items */
3841
0
    class_uchardata_base = class_uchardata;   /* For resetting in pass 1 */
3842
0
#endif
3843
3844
    /* Process characters until ] is reached. By writing this as a "do" it
3845
    means that an initial ] is taken as a data character. At the start of the
3846
    loop, c contains the first byte of the character. */
3847
3848
0
    if (c != 0) do
3849
0
      {
3850
0
      const pcre_uchar *oldptr;
3851
3852
0
#ifdef SUPPORT_UTF
3853
0
      if (utf && HAS_EXTRALEN(c))
3854
0
        {                           /* Braces are required because the */
3855
0
        GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
3856
0
        }
3857
0
#endif
3858
3859
0
#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3860
      /* In the pre-compile phase, accumulate the length of any extra
3861
      data and reset the pointer. This is so that very large classes that
3862
      contain a zillion > 255 characters no longer overwrite the work space
3863
      (which is on the stack). */
3864
3865
0
      if (lengthptr != NULL)
3866
0
        {
3867
0
        *lengthptr += class_uchardata - class_uchardata_base;
3868
0
        class_uchardata = class_uchardata_base;
3869
0
        }
3870
0
#endif
3871
3872
      /* Inside \Q...\E everything is literal except \E */
3873
3874
0
      if (inescq)
3875
0
        {
3876
0
        if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)  /* If we are at \E */
3877
0
          {
3878
0
          inescq = FALSE;                   /* Reset literal state */
3879
0
          ptr++;                            /* Skip the 'E' */
3880
0
          continue;                         /* Carry on with next */
3881
0
          }
3882
0
        goto CHECK_RANGE;                   /* Could be range if \E follows */
3883
0
        }
3884
3885
      /* Handle POSIX class names. Perl allows a negation extension of the
3886
      form [:^name:]. A square bracket that doesn't match the syntax is
3887
      treated as a literal. We also recognize the POSIX constructions
3888
      [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3889
      5.6 and 5.8 do. */
3890
3891
0
      if (c == CHAR_LEFT_SQUARE_BRACKET &&
3892
0
          (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3893
0
           ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3894
0
        {
3895
0
        BOOL local_negate = FALSE;
3896
0
        int posix_class, taboffset, tabopt;
3897
0
        const pcre_uint8 *cbits = cd->cbits;
3898
0
        pcre_uint8 pbits[32];
3899
3900
0
        if (ptr[1] != CHAR_COLON)
3901
0
          {
3902
0
          *errorcodeptr = ERR31;
3903
0
          goto FAILED;
3904
0
          }
3905
3906
0
        ptr += 2;
3907
0
        if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
3908
0
          {
3909
0
          local_negate = TRUE;
3910
0
          should_flip_negation = TRUE;  /* Note negative special */
3911
0
          ptr++;
3912
0
          }
3913
3914
0
        posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
3915
0
        if (posix_class < 0)
3916
0
          {
3917
0
          *errorcodeptr = ERR30;
3918
0
          goto FAILED;
3919
0
          }
3920
3921
        /* If matching is caseless, upper and lower are converted to
3922
        alpha. This relies on the fact that the class table starts with
3923
        alpha, lower, upper as the first 3 entries. */
3924
3925
0
        if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3926
0
          posix_class = 0;
3927
3928
        /* When PCRE_UCP is set, some of the POSIX classes are converted to
3929
        different escape sequences that use Unicode properties. */
3930
3931
0
#ifdef SUPPORT_UCP
3932
0
        if ((options & PCRE_UCP) != 0)
3933
0
          {
3934
0
          int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
3935
0
          if (posix_substitutes[pc] != NULL)
3936
0
            {
3937
0
            nestptr = tempptr + 1;
3938
0
            ptr = posix_substitutes[pc] - 1;
3939
0
            continue;
3940
0
            }
3941
0
          }
3942
0
#endif
3943
        /* In the non-UCP case, we build the bit map for the POSIX class in a
3944
        chunk of local store because we may be adding and subtracting from it,
3945
        and we don't want to subtract bits that may be in the main map already.
3946
        At the end we or the result into the bit map that is being built. */
3947
3948
0
        posix_class *= 3;
3949
3950
        /* Copy in the first table (always present) */
3951
3952
0
        memcpy(pbits, cbits + posix_class_maps[posix_class],
3953
0
          32 * sizeof(pcre_uint8));
3954
3955
        /* If there is a second table, add or remove it as required. */
3956
3957
0
        taboffset = posix_class_maps[posix_class + 1];
3958
0
        tabopt = posix_class_maps[posix_class + 2];
3959
3960
0
        if (taboffset >= 0)
3961
0
          {
3962
0
          if (tabopt >= 0)
3963
0
            for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
3964
0
          else
3965
0
            for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
3966
0
          }
3967
3968
        /* Now see if we need to remove any special characters. An option
3969
        value of 1 removes vertical space and 2 removes underscore. */
3970
3971
0
        if (tabopt < 0) tabopt = -tabopt;
3972
0
        if (tabopt == 1) pbits[1] &= ~0x3c;
3973
0
          else if (tabopt == 2) pbits[11] &= 0x7f;
3974
3975
        /* Add the POSIX table or its complement into the main table that is
3976
        being built and we are done. */
3977
3978
0
        if (local_negate)
3979
0
          for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
3980
0
        else
3981
0
          for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3982
3983
0
        ptr = tempptr + 1;
3984
        /* Every class contains at least one < 256 character. */
3985
0
        class_has_8bitchar = 1;
3986
        /* Every class contains at least two characters. */
3987
0
        class_single_char = 2;
3988
0
        continue;    /* End of POSIX syntax handling */
3989
0
        }
3990
3991
      /* Backslash may introduce a single character, or it may introduce one
3992
      of the specials, which just set a flag. The sequence \b is a special
3993
      case. Inside a class (and only there) it is treated as backspace. We
3994
      assume that other escapes have more than one character in them, so
3995
      speculatively set both class_has_8bitchar and class_single_char bigger
3996
      than one. Unrecognized escapes fall through and are either treated
3997
      as literal characters (by default), or are faulted if
3998
      PCRE_EXTRA is set. */
3999
4000
0
      if (c == CHAR_BACKSLASH)
4001
0
        {
4002
0
        c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
4003
0
        if (*errorcodeptr != 0) goto FAILED;
4004
4005
0
        if (-c == ESC_b) c = CHAR_BS;    /* \b is backspace in a class */
4006
0
        else if (-c == ESC_N)            /* \N is not supported in a class */
4007
0
          {
4008
0
          *errorcodeptr = ERR71;
4009
0
          goto FAILED;
4010
0
          }
4011
0
        else if (-c == ESC_Q)            /* Handle start of quoted string */
4012
0
          {
4013
0
          if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
4014
0
            {
4015
0
            ptr += 2; /* avoid empty string */
4016
0
            }
4017
0
          else inescq = TRUE;
4018
0
          continue;
4019
0
          }
4020
0
        else if (-c == ESC_E) continue;  /* Ignore orphan \E */
4021
4022
0
        if (c < 0)
4023
0
          {
4024
0
          const pcre_uint8 *cbits = cd->cbits;
4025
          /* Every class contains at least two < 256 characters. */
4026
0
          class_has_8bitchar++;
4027
          /* Every class contains at least two characters. */
4028
0
          class_single_char += 2;
4029
4030
0
          switch (-c)
4031
0
            {
4032
0
#ifdef SUPPORT_UCP
4033
0
            case ESC_du:     /* These are the values given for \d etc */
4034
0
            case ESC_DU:     /* when PCRE_UCP is set. We replace the */
4035
0
            case ESC_wu:     /* escape sequence with an appropriate \p */
4036
0
            case ESC_WU:     /* or \P to test Unicode properties instead */
4037
0
            case ESC_su:     /* of the default ASCII testing. */
4038
0
            case ESC_SU:
4039
0
            nestptr = ptr;
4040
0
            ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */
4041
0
            class_has_8bitchar--;                /* Undo! */
4042
0
            continue;
4043
0
#endif
4044
0
            case ESC_d:
4045
0
            for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
4046
0
            continue;
4047
4048
0
            case ESC_D:
4049
0
            should_flip_negation = TRUE;
4050
0
            for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
4051
0
            continue;
4052
4053
0
            case ESC_w:
4054
0
            for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
4055
0
            continue;
4056
4057
0
            case ESC_W:
4058
0
            should_flip_negation = TRUE;
4059
0
            for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
4060
0
            continue;
4061
4062
            /* Perl 5.004 onwards omits VT from \s, but we must preserve it
4063
            if it was previously set by something earlier in the character
4064
            class. */
4065
4066
0
            case ESC_s:
4067
0
            classbits[0] |= cbits[cbit_space];
4068
0
            classbits[1] |= cbits[cbit_space+1] & ~0x08;
4069
0
            for (c = 2; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
4070
0
            continue;
4071
4072
0
            case ESC_S:
4073
0
            should_flip_negation = TRUE;
4074
0
            for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
4075
0
            classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
4076
0
            continue;
4077
4078
0
            case ESC_h:
4079
0
            SETBIT(classbits, 0x09); /* HT */
4080
0
            SETBIT(classbits, 0x20); /* SPACE */
4081
0
            SETBIT(classbits, 0xa0); /* NBSP */
4082
#ifndef COMPILE_PCRE8
4083
            xclass = TRUE;
4084
            *class_uchardata++ = XCL_SINGLE;
4085
            *class_uchardata++ = 0x1680;
4086
            *class_uchardata++ = XCL_SINGLE;
4087
            *class_uchardata++ = 0x180e;
4088
            *class_uchardata++ = XCL_RANGE;
4089
            *class_uchardata++ = 0x2000;
4090
            *class_uchardata++ = 0x200a;
4091
            *class_uchardata++ = XCL_SINGLE;
4092
            *class_uchardata++ = 0x202f;
4093
            *class_uchardata++ = XCL_SINGLE;
4094
            *class_uchardata++ = 0x205f;
4095
            *class_uchardata++ = XCL_SINGLE;
4096
            *class_uchardata++ = 0x3000;
4097
#elif defined SUPPORT_UTF
4098
0
            if (utf)
4099
0
              {
4100
0
              xclass = TRUE;
4101
0
              *class_uchardata++ = XCL_SINGLE;
4102
0
              class_uchardata += PRIV(ord2utf)(0x1680, class_uchardata);
4103
0
              *class_uchardata++ = XCL_SINGLE;
4104
0
              class_uchardata += PRIV(ord2utf)(0x180e, class_uchardata);
4105
0
              *class_uchardata++ = XCL_RANGE;
4106
0
              class_uchardata += PRIV(ord2utf)(0x2000, class_uchardata);
4107
0
              class_uchardata += PRIV(ord2utf)(0x200a, class_uchardata);
4108
0
              *class_uchardata++ = XCL_SINGLE;
4109
0
              class_uchardata += PRIV(ord2utf)(0x202f, class_uchardata);
4110
0
              *class_uchardata++ = XCL_SINGLE;
4111
0
              class_uchardata += PRIV(ord2utf)(0x205f, class_uchardata);
4112
0
              *class_uchardata++ = XCL_SINGLE;
4113
0
              class_uchardata += PRIV(ord2utf)(0x3000, class_uchardata);
4114
0
              }
4115
0
#endif
4116
0
            continue;
4117
4118
0
            case ESC_H:
4119
0
            for (c = 0; c < 32; c++)
4120
0
              {
4121
0
              int x = 0xff;
4122
0
              switch (c)
4123
0
                {
4124
0
                case 0x09/8: x ^= 1 << (0x09%8); break;
4125
0
                case 0x20/8: x ^= 1 << (0x20%8); break;
4126
0
                case 0xa0/8: x ^= 1 << (0xa0%8); break;
4127
0
                default: break;
4128
0
                }
4129
0
              classbits[c] |= x;
4130
0
              }
4131
#ifndef COMPILE_PCRE8
4132
            xclass = TRUE;
4133
            *class_uchardata++ = XCL_RANGE;
4134
            *class_uchardata++ = 0x0100;
4135
            *class_uchardata++ = 0x167f;
4136
            *class_uchardata++ = XCL_RANGE;
4137
            *class_uchardata++ = 0x1681;
4138
            *class_uchardata++ = 0x180d;
4139
            *class_uchardata++ = XCL_RANGE;
4140
            *class_uchardata++ = 0x180f;
4141
            *class_uchardata++ = 0x1fff;
4142
            *class_uchardata++ = XCL_RANGE;
4143
            *class_uchardata++ = 0x200b;
4144
            *class_uchardata++ = 0x202e;
4145
            *class_uchardata++ = XCL_RANGE;
4146
            *class_uchardata++ = 0x2030;
4147
            *class_uchardata++ = 0x205e;
4148
            *class_uchardata++ = XCL_RANGE;
4149
            *class_uchardata++ = 0x2060;
4150
            *class_uchardata++ = 0x2fff;
4151
            *class_uchardata++ = XCL_RANGE;
4152
            *class_uchardata++ = 0x3001;
4153
#ifdef SUPPORT_UTF
4154
            if (utf)
4155
              class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
4156
            else
4157
#endif
4158
              *class_uchardata++ = 0xffff;
4159
#elif defined SUPPORT_UTF
4160
0
            if (utf)
4161
0
              {
4162
0
              xclass = TRUE;
4163
0
              *class_uchardata++ = XCL_RANGE;
4164
0
              class_uchardata += PRIV(ord2utf)(0x0100, class_uchardata);
4165
0
              class_uchardata += PRIV(ord2utf)(0x167f, class_uchardata);
4166
0
              *class_uchardata++ = XCL_RANGE;
4167
0
              class_uchardata += PRIV(ord2utf)(0x1681, class_uchardata);
4168
0
              class_uchardata += PRIV(ord2utf)(0x180d, class_uchardata);
4169
0
              *class_uchardata++ = XCL_RANGE;
4170
0
              class_uchardata += PRIV(ord2utf)(0x180f, class_uchardata);
4171
0
              class_uchardata += PRIV(ord2utf)(0x1fff, class_uchardata);
4172
0
              *class_uchardata++ = XCL_RANGE;
4173
0
              class_uchardata += PRIV(ord2utf)(0x200b, class_uchardata);
4174
0
              class_uchardata += PRIV(ord2utf)(0x202e, class_uchardata);
4175
0
              *class_uchardata++ = XCL_RANGE;
4176
0
              class_uchardata += PRIV(ord2utf)(0x2030, class_uchardata);
4177
0
              class_uchardata += PRIV(ord2utf)(0x205e, class_uchardata);
4178
0
              *class_uchardata++ = XCL_RANGE;
4179
0
              class_uchardata += PRIV(ord2utf)(0x2060, class_uchardata);
4180
0
              class_uchardata += PRIV(ord2utf)(0x2fff, class_uchardata);
4181
0
              *class_uchardata++ = XCL_RANGE;
4182
0
              class_uchardata += PRIV(ord2utf)(0x3001, class_uchardata);
4183
0
              class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
4184
0
              }
4185
0
#endif
4186
0
            continue;
4187
4188
0
            case ESC_v:
4189
0
            SETBIT(classbits, 0x0a); /* LF */
4190
0
            SETBIT(classbits, 0x0b); /* VT */
4191
0
            SETBIT(classbits, 0x0c); /* FF */
4192
0
            SETBIT(classbits, 0x0d); /* CR */
4193
0
            SETBIT(classbits, 0x85); /* NEL */
4194
#ifndef COMPILE_PCRE8
4195
            xclass = TRUE;
4196
            *class_uchardata++ = XCL_RANGE;
4197
            *class_uchardata++ = 0x2028;
4198
            *class_uchardata++ = 0x2029;
4199
#elif defined SUPPORT_UTF
4200
0
            if (utf)
4201
0
              {
4202
0
              xclass = TRUE;
4203
0
              *class_uchardata++ = XCL_RANGE;
4204
0
              class_uchardata += PRIV(ord2utf)(0x2028, class_uchardata);
4205
0
              class_uchardata += PRIV(ord2utf)(0x2029, class_uchardata);
4206
0
              }
4207
0
#endif
4208
0
            continue;
4209
4210
0
            case ESC_V:
4211
0
            for (c = 0; c < 32; c++)
4212
0
              {
4213
0
              int x = 0xff;
4214
0
              switch (c)
4215
0
                {
4216
0
                case 0x0a/8: x ^= 1 << (0x0a%8);
4217
0
                             x ^= 1 << (0x0b%8);
4218
0
                             x ^= 1 << (0x0c%8);
4219
0
                             x ^= 1 << (0x0d%8);
4220
0
                             break;
4221
0
                case 0x85/8: x ^= 1 << (0x85%8); break;
4222
0
                default: break;
4223
0
                }
4224
0
              classbits[c] |= x;
4225
0
              }
4226
4227
#ifndef COMPILE_PCRE8
4228
            xclass = TRUE;
4229
            *class_uchardata++ = XCL_RANGE;
4230
            *class_uchardata++ = 0x0100;
4231
            *class_uchardata++ = 0x2027;
4232
            *class_uchardata++ = XCL_RANGE;
4233
            *class_uchardata++ = 0x202a;
4234
#ifdef SUPPORT_UTF
4235
            if (utf)
4236
              class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
4237
            else
4238
#endif
4239
              *class_uchardata++ = 0xffff;
4240
#elif defined SUPPORT_UTF
4241
0
            if (utf)
4242
0
              {
4243
0
              xclass = TRUE;
4244
0
              *class_uchardata++ = XCL_RANGE;
4245
0
              class_uchardata += PRIV(ord2utf)(0x0100, class_uchardata);
4246
0
              class_uchardata += PRIV(ord2utf)(0x2027, class_uchardata);
4247
0
              *class_uchardata++ = XCL_RANGE;
4248
0
              class_uchardata += PRIV(ord2utf)(0x202a, class_uchardata);
4249
0
              class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
4250
0
              }
4251
0
#endif
4252
0
            continue;
4253
4254
0
#ifdef SUPPORT_UCP
4255
0
            case ESC_p:
4256
0
            case ESC_P:
4257
0
              {
4258
0
              BOOL negated;
4259
0
              int pdata;
4260
0
              int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4261
0
              if (ptype < 0) goto FAILED;
4262
0
              xclass = TRUE;
4263
0
              *class_uchardata++ = ((-c == ESC_p) != negated)?
4264
0
                XCL_PROP : XCL_NOTPROP;
4265
0
              *class_uchardata++ = ptype;
4266
0
              *class_uchardata++ = pdata;
4267
0
              class_has_8bitchar--;                /* Undo! */
4268
0
              continue;
4269
0
              }
4270
0
#endif
4271
            /* Unrecognized escapes are faulted if PCRE is running in its
4272
            strict mode. By default, for compatibility with Perl, they are
4273
            treated as literals. */
4274
4275
0
            default:
4276
0
            if ((options & PCRE_EXTRA) != 0)
4277
0
              {
4278
0
              *errorcodeptr = ERR7;
4279
0
              goto FAILED;
4280
0
              }
4281
0
            class_has_8bitchar--;    /* Undo the speculative increase. */
4282
0
            class_single_char -= 2;  /* Undo the speculative increase. */
4283
0
            c = *ptr;                /* Get the final character and fall through */
4284
0
            break;
4285
0
            }
4286
0
          }
4287
4288
        /* Fall through if we have a single character (c >= 0). This may be
4289
        greater than 256. */
4290
4291
0
        }   /* End of backslash handling */
4292
4293
      /* A single character may be followed by '-' to form a range. However,
4294
      Perl does not permit ']' to be the end of the range. A '-' character
4295
      at the end is treated as a literal. Perl ignores orphaned \E sequences
4296
      entirely. The code for handling \Q and \E is messy. */
4297
4298
0
      CHECK_RANGE:
4299
0
      while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
4300
0
        {
4301
0
        inescq = FALSE;
4302
0
        ptr += 2;
4303
0
        }
4304
4305
0
      oldptr = ptr;
4306
4307
      /* Remember \r or \n */
4308
4309
0
      if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
4310
4311
      /* Check for range */
4312
4313
0
      if (!inescq && ptr[1] == CHAR_MINUS)
4314
0
        {
4315
0
        int d;
4316
0
        ptr += 2;
4317
0
        while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
4318
4319
        /* If we hit \Q (not followed by \E) at this point, go into escaped
4320
        mode. */
4321
4322
0
        while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
4323
0
          {
4324
0
          ptr += 2;
4325
0
          if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
4326
0
            { ptr += 2; continue; }
4327
0
          inescq = TRUE;
4328
0
          break;
4329
0
          }
4330
4331
0
        if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
4332
0
          {
4333
0
          ptr = oldptr;
4334
0
          goto LONE_SINGLE_CHARACTER;
4335
0
          }
4336
4337
0
#ifdef SUPPORT_UTF
4338
0
        if (utf)
4339
0
          {                           /* Braces are required because the */
4340
0
          GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
4341
0
          }
4342
0
        else
4343
0
#endif
4344
0
        d = *ptr;  /* Not UTF-8 mode */
4345
4346
        /* The second part of a range can be a single-character escape, but
4347
        not any of the other escapes. Perl 5.6 treats a hyphen as a literal
4348
        in such circumstances. */
4349
4350
0
        if (!inescq && d == CHAR_BACKSLASH)
4351
0
          {
4352
0
          d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
4353
0
          if (*errorcodeptr != 0) goto FAILED;
4354
4355
          /* \b is backspace; any other special means the '-' was literal */
4356
4357
0
          if (d < 0)
4358
0
            {
4359
0
            if (d == -ESC_b) d = CHAR_BS; else
4360
0
              {
4361
0
              ptr = oldptr;
4362
0
              goto LONE_SINGLE_CHARACTER;  /* A few lines below */
4363
0
              }
4364
0
            }
4365
0
          }
4366
4367
        /* Check that the two values are in the correct order. Optimize
4368
        one-character ranges */
4369
4370
0
        if (d < c)
4371
0
          {
4372
0
          *errorcodeptr = ERR8;
4373
0
          goto FAILED;
4374
0
          }
4375
4376
0
        if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
4377
4378
        /* Remember \r or \n */
4379
4380
0
        if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
4381
4382
        /* Since we found a character range, single character optimizations
4383
        cannot be done anymore. */
4384
0
        class_single_char = 2;
4385
4386
        /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
4387
        matching, we have to use an XCLASS with extra data items. Caseless
4388
        matching for characters > 127 is available only if UCP support is
4389
        available. */
4390
4391
#if defined SUPPORT_UTF && !(defined COMPILE_PCRE8)
4392
        if ((d > 255) || (utf && ((options & PCRE_CASELESS) != 0 && d > 127)))
4393
#elif defined  SUPPORT_UTF
4394
0
        if (utf && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
4395
#elif !(defined COMPILE_PCRE8)
4396
        if (d > 255)
4397
#endif
4398
0
#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
4399
0
          {
4400
0
          xclass = TRUE;
4401
4402
          /* With UCP support, we can find the other case equivalents of
4403
          the relevant characters. There may be several ranges. Optimize how
4404
          they fit with the basic range. */
4405
4406
0
#ifdef SUPPORT_UCP
4407
#ifndef COMPILE_PCRE8
4408
          if (utf && (options & PCRE_CASELESS) != 0)
4409
#else
4410
0
          if ((options & PCRE_CASELESS) != 0)
4411
0
#endif
4412
0
            {
4413
0
            unsigned int occ, ocd;
4414
0
            unsigned int cc = c;
4415
0
            unsigned int origd = d;
4416
0
            while (get_othercase_range(&cc, origd, &occ, &ocd))
4417
0
              {
4418
0
              if (occ >= (unsigned int)c &&
4419
0
                  ocd <= (unsigned int)d)
4420
0
                continue;                          /* Skip embedded ranges */
4421
4422
0
              if (occ < (unsigned int)c  &&
4423
0
                  ocd >= (unsigned int)c - 1)      /* Extend the basic range */
4424
0
                {                                  /* if there is overlap,   */
4425
0
                c = occ;                           /* noting that if occ < c */
4426
0
                continue;                          /* we can't have ocd > d  */
4427
0
                }                                  /* because a subrange is  */
4428
0
              if (ocd > (unsigned int)d &&
4429
0
                  occ <= (unsigned int)d + 1)      /* always shorter than    */
4430
0
                {                                  /* the basic range.       */
4431
0
                d = ocd;
4432
0
                continue;
4433
0
                }
4434
4435
0
              if (occ == ocd)
4436
0
                {
4437
0
                *class_uchardata++ = XCL_SINGLE;
4438
0
                }
4439
0
              else
4440
0
                {
4441
0
                *class_uchardata++ = XCL_RANGE;
4442
0
                class_uchardata += PRIV(ord2utf)(occ, class_uchardata);
4443
0
                }
4444
0
              class_uchardata += PRIV(ord2utf)(ocd, class_uchardata);
4445
0
              }
4446
0
            }
4447
0
#endif  /* SUPPORT_UCP */
4448
4449
          /* Now record the original range, possibly modified for UCP caseless
4450
          overlapping ranges. */
4451
4452
0
          *class_uchardata++ = XCL_RANGE;
4453
0
#ifdef SUPPORT_UTF
4454
#ifndef COMPILE_PCRE8
4455
          if (utf)
4456
            {
4457
            class_uchardata += PRIV(ord2utf)(c, class_uchardata);
4458
            class_uchardata += PRIV(ord2utf)(d, class_uchardata);
4459
            }
4460
          else
4461
            {
4462
            *class_uchardata++ = c;
4463
            *class_uchardata++ = d;
4464
            }
4465
#else
4466
0
          class_uchardata += PRIV(ord2utf)(c, class_uchardata);
4467
0
          class_uchardata += PRIV(ord2utf)(d, class_uchardata);
4468
0
#endif
4469
#else /* SUPPORT_UTF */
4470
          *class_uchardata++ = c;
4471
          *class_uchardata++ = d;
4472
#endif /* SUPPORT_UTF */
4473
4474
          /* With UCP support, we are done. Without UCP support, there is no
4475
          caseless matching for UTF characters > 127; we can use the bit map
4476
          for the smaller ones. As for 16 bit characters without UTF, we
4477
          can still use the bit map for the part of the range that is <= 255. */
4478
4479
0
#ifdef SUPPORT_UCP
4480
#ifndef COMPILE_PCRE8
4481
          if (utf)
4482
#endif
4483
0
            continue;    /* With next character in the class */
4484
0
#endif  /* SUPPORT_UCP */
4485
4486
#if defined SUPPORT_UTF && !defined(SUPPORT_UCP) && !(defined COMPILE_PCRE8)
4487
          if (utf)
4488
            {
4489
            if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
4490
            /* Adjust upper limit and fall through to set up the map */
4491
            d = 127;
4492
            }
4493
          else
4494
            {
4495
            if (c > 255) continue;
4496
            /* Adjust upper limit and fall through to set up the map */
4497
            d = 255;
4498
            }
4499
#elif defined SUPPORT_UTF && !defined(SUPPORT_UCP)
4500
          if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
4501
          /* Adjust upper limit and fall through to set up the map */
4502
          d = 127;
4503
#else
4504
0
          if (c > 255) continue;
4505
          /* Adjust upper limit and fall through to set up the map */
4506
0
          d = 255;
4507
0
#endif  /* SUPPORT_UTF && !SUPPORT_UCP && !COMPILE_PCRE8 */
4508
0
          }
4509
0
#endif  /* SUPPORT_UTF || !COMPILE_PCRE8 */
4510
4511
        /* We use the bit map for 8 bit mode, or when the characters fall
4512
        partially or entirely within the [0-255] ([0-127] for UCP) ranges. */
4513
4514
0
        class_has_8bitchar = 1;
4515
4516
        /* We can save a bit of time by skipping this in the pre-compile. */
4517
4518
0
        if (lengthptr == NULL) for (; c <= d; c++)
4519
0
          {
4520
0
          classbits[c/8] |= (1 << (c&7));
4521
0
          if ((options & PCRE_CASELESS) != 0)
4522
0
            {
4523
0
            int uc = cd->fcc[c]; /* flip case */
4524
0
            classbits[uc/8] |= (1 << (uc&7));
4525
0
            }
4526
0
          }
4527
4528
0
        continue;   /* Go get the next char in the class */
4529
0
        }
4530
4531
      /* Handle a lone single character - we can get here for a normal
4532
      non-escape char, or after \ that introduces a single character or for an
4533
      apparent range that isn't. */
4534
4535
0
      LONE_SINGLE_CHARACTER:
4536
4537
      /* Only the value of 1 matters for class_single_char. */
4538
4539
0
      if (class_single_char < 2) class_single_char++;
4540
4541
      /* If class_single_char is 1, we saw precisely one character. As long as
4542
      there was no use of \p or \P, in other words, no use of any XCLASS
4543
      features, we can optimize.
4544
4545
      The optimization throws away the bit map. We turn the item into a
4546
      1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative.
4547
      In the positive case, it can cause firstchar to be set. Otherwise, there
4548
      can be no first char if this item is first, whatever repeat count may
4549
      follow. In the case of reqchar, save the previous value for reinstating. */
4550
4551
0
      if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
4552
0
        {
4553
0
        ptr++;
4554
0
        zeroreqchar = reqchar;
4555
4556
0
        if (negate_class)
4557
0
          {
4558
0
          if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
4559
0
          zerofirstchar = firstchar;
4560
0
          *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
4561
0
#ifdef SUPPORT_UTF
4562
0
          if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
4563
0
            code += PRIV(ord2utf)(c, code);
4564
0
          else
4565
0
#endif
4566
0
            *code++ = c;
4567
0
          goto NOT_CHAR;
4568
0
          }
4569
4570
        /* For a single, positive character, get the value into mcbuffer, and
4571
        then we can handle this with the normal one-character code. */
4572
4573
0
#ifdef SUPPORT_UTF
4574
0
        if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
4575
0
          mclength = PRIV(ord2utf)(c, mcbuffer);
4576
0
        else
4577
0
#endif
4578
0
          {
4579
0
          mcbuffer[0] = c;
4580
0
          mclength = 1;
4581
0
          }
4582
0
        goto ONE_CHAR;
4583
0
        }       /* End of 1-char optimization */
4584
4585
      /* Handle a character that cannot go in the bit map. */
4586
4587
#if defined SUPPORT_UTF && !(defined COMPILE_PCRE8)
4588
      if ((c > 255) || (utf && ((options & PCRE_CASELESS) != 0 && c > 127)))
4589
#elif defined SUPPORT_UTF
4590
0
      if (utf && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
4591
#elif !(defined COMPILE_PCRE8)
4592
      if (c > 255)
4593
#endif
4594
4595
0
#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
4596
0
        {
4597
0
        xclass = TRUE;
4598
0
        *class_uchardata++ = XCL_SINGLE;
4599
0
#ifdef SUPPORT_UTF
4600
#ifndef COMPILE_PCRE8
4601
        /* In non 8 bit mode, we can get here even if we are not in UTF mode. */
4602
        if (!utf)
4603
          *class_uchardata++ = c;
4604
        else
4605
#endif
4606
0
          class_uchardata += PRIV(ord2utf)(c, class_uchardata);
4607
#else /* SUPPORT_UTF */
4608
        *class_uchardata++ = c;
4609
#endif /* SUPPORT_UTF */
4610
4611
0
#ifdef SUPPORT_UCP
4612
0
#ifdef COMPILE_PCRE8
4613
0
        if ((options & PCRE_CASELESS) != 0)
4614
#else
4615
        /* In non 8 bit mode, we can get here even if we are not in UTF mode. */
4616
        if (utf && (options & PCRE_CASELESS) != 0)
4617
#endif
4618
0
          {
4619
0
          unsigned int othercase;
4620
0
          if ((int)(othercase = UCD_OTHERCASE(c)) != c)
4621
0
            {
4622
0
            *class_uchardata++ = XCL_SINGLE;
4623
0
            class_uchardata += PRIV(ord2utf)(othercase, class_uchardata);
4624
0
            }
4625
0
          }
4626
0
#endif  /* SUPPORT_UCP */
4627
4628
0
        }
4629
0
      else
4630
0
#endif  /* SUPPORT_UTF || COMPILE_PCRE16 */
4631
4632
      /* Handle a single-byte character */
4633
0
        {
4634
0
        class_has_8bitchar = 1;
4635
0
        classbits[c/8] |= (1 << (c&7));
4636
0
        if ((options & PCRE_CASELESS) != 0)
4637
0
          {
4638
0
          c = cd->fcc[c]; /* flip case */
4639
0
          classbits[c/8] |= (1 << (c&7));
4640
0
          }
4641
0
        }
4642
0
      }
4643
4644
    /* Loop until ']' reached. This "while" is the end of the "do" far above.
4645
    If we are at the end of an internal nested string, revert to the outer
4646
    string. */
4647
4648
0
    while (((c = *(++ptr)) != 0 ||
4649
0
           (nestptr != NULL &&
4650
0
             (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != 0)) &&
4651
0
           (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
4652
4653
    /* Check for missing terminating ']' */
4654
4655
0
    if (c == 0)
4656
0
      {
4657
0
      *errorcodeptr = ERR6;
4658
0
      goto FAILED;
4659
0
      }
4660
4661
    /* If this is the first thing in the branch, there can be no first char
4662
    setting, whatever the repeat count. Any reqchar setting must remain
4663
    unchanged after any kind of repeat. */
4664
4665
0
    if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
4666
0
    zerofirstchar = firstchar;
4667
0
    zeroreqchar = reqchar;
4668
4669
    /* If there are characters with values > 255, we have to compile an
4670
    extended class, with its own opcode, unless there was a negated special
4671
    such as \S in the class, and PCRE_UCP is not set, because in that case all
4672
    characters > 255 are in the class, so any that were explicitly given as
4673
    well can be ignored. If (when there are explicit characters > 255 that must
4674
    be listed) there are no characters < 256, we can omit the bitmap in the
4675
    actual compiled code. */
4676
4677
0
#ifdef SUPPORT_UTF
4678
0
    if (xclass && (!should_flip_negation || (options & PCRE_UCP) != 0))
4679
#elif !defined COMPILE_PCRE8
4680
    if (xclass && !should_flip_negation)
4681
#endif
4682
0
#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4683
0
      {
4684
0
      *class_uchardata++ = XCL_END;    /* Marks the end of extra data */
4685
0
      *code++ = OP_XCLASS;
4686
0
      code += LINK_SIZE;
4687
0
      *code = negate_class? XCL_NOT:0;
4688
4689
      /* If the map is required, move up the extra data to make room for it;
4690
      otherwise just move the code pointer to the end of the extra data. */
4691
4692
0
      if (class_has_8bitchar > 0)
4693
0
        {
4694
0
        *code++ |= XCL_MAP;
4695
0
        memmove(code + (32 / sizeof(pcre_uchar)), code,
4696
0
          IN_UCHARS(class_uchardata - code));
4697
0
        memcpy(code, classbits, 32);
4698
0
        code = class_uchardata + (32 / sizeof(pcre_uchar));
4699
0
        }
4700
0
      else code = class_uchardata;
4701
4702
      /* Now fill in the complete length of the item */
4703
4704
0
      PUT(previous, 1, (int)(code - previous));
4705
0
      break;   /* End of class handling */
4706
0
      }
4707
0
#endif
4708
4709
    /* If there are no characters > 255, or they are all to be included or
4710
    excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
4711
    whole class was negated and whether there were negative specials such as \S
4712
    (non-UCP) in the class. Then copy the 32-byte map into the code vector,
4713
    negating it if necessary. */
4714
4715
0
    *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
4716
0
    if (lengthptr == NULL)    /* Save time in the pre-compile phase */
4717
0
      {
4718
0
      if (negate_class)
4719
0
        for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
4720
0
      memcpy(code, classbits, 32);
4721
0
      }
4722
0
    code += 32 / sizeof(pcre_uchar);
4723
0
    NOT_CHAR:
4724
0
    break;
4725
4726
4727
    /* ===================================================================*/
4728
    /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
4729
    has been tested above. */
4730
4731
0
    case CHAR_LEFT_CURLY_BRACKET:
4732
0
    if (!is_quantifier) goto NORMAL_CHAR;
4733
0
    ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
4734
0
    if (*errorcodeptr != 0) goto FAILED;
4735
0
    goto REPEAT;
4736
4737
0
    case CHAR_ASTERISK:
4738
0
    repeat_min = 0;
4739
0
    repeat_max = -1;
4740
0
    goto REPEAT;
4741
4742
0
    case CHAR_PLUS:
4743
0
    repeat_min = 1;
4744
0
    repeat_max = -1;
4745
0
    goto REPEAT;
4746
4747
0
    case CHAR_QUESTION_MARK:
4748
0
    repeat_min = 0;
4749
0
    repeat_max = 1;
4750
4751
0
    REPEAT:
4752
0
    if (previous == NULL)
4753
0
      {
4754
0
      *errorcodeptr = ERR9;
4755
0
      goto FAILED;
4756
0
      }
4757
4758
0
    if (repeat_min == 0)
4759
0
      {
4760
0
      firstchar = zerofirstchar;    /* Adjust for zero repeat */
4761
0
      reqchar = zeroreqchar;        /* Ditto */
4762
0
      }
4763
4764
    /* Remember whether this is a variable length repeat */
4765
4766
0
    reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
4767
4768
0
    op_type = 0;                    /* Default single-char op codes */
4769
0
    possessive_quantifier = FALSE;  /* Default not possessive quantifier */
4770
4771
    /* Save start of previous item, in case we have to move it up in order to
4772
    insert something before it. */
4773
4774
0
    tempcode = previous;
4775
4776
    /* If the next character is '+', we have a possessive quantifier. This
4777
    implies greediness, whatever the setting of the PCRE_UNGREEDY option.
4778
    If the next character is '?' this is a minimizing repeat, by default,
4779
    but if PCRE_UNGREEDY is set, it works the other way round. We change the
4780
    repeat type to the non-default. */
4781
4782
0
    if (ptr[1] == CHAR_PLUS)
4783
0
      {
4784
0
      repeat_type = 0;                  /* Force greedy */
4785
0
      possessive_quantifier = TRUE;
4786
0
      ptr++;
4787
0
      }
4788
0
    else if (ptr[1] == CHAR_QUESTION_MARK)
4789
0
      {
4790
0
      repeat_type = greedy_non_default;
4791
0
      ptr++;
4792
0
      }
4793
0
    else repeat_type = greedy_default;
4794
4795
    /* If previous was a recursion call, wrap it in atomic brackets so that
4796
    previous becomes the atomic group. All recursions were so wrapped in the
4797
    past, but it no longer happens for non-repeated recursions. In fact, the
4798
    repeated ones could be re-implemented independently so as not to need this,
4799
    but for the moment we rely on the code for repeating groups. */
4800
4801
0
    if (*previous == OP_RECURSE)
4802
0
      {
4803
0
      memmove(previous + 1 + LINK_SIZE, previous, IN_UCHARS(1 + LINK_SIZE));
4804
0
      *previous = OP_ONCE;
4805
0
      PUT(previous, 1, 2 + 2*LINK_SIZE);
4806
0
      previous[2 + 2*LINK_SIZE] = OP_KET;
4807
0
      PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
4808
0
      code += 2 + 2 * LINK_SIZE;
4809
0
      length_prevgroup = 3 + 3*LINK_SIZE;
4810
4811
      /* When actually compiling, we need to check whether this was a forward
4812
      reference, and if so, adjust the offset. */
4813
4814
0
      if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE)
4815
0
        {
4816
0
        int offset = GET(cd->hwm, -LINK_SIZE);
4817
0
        if (offset == previous + 1 - cd->start_code)
4818
0
          PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE);
4819
0
        }
4820
0
      }
4821
4822
    /* Now handle repetition for the different types of item. */
4823
4824
    /* If previous was a character or negated character match, abolish the item
4825
    and generate a repeat item instead. If a char item has a minimum of more
4826
    than one, ensure that it is set in reqchar - it might not be if a sequence
4827
    such as x{3} is the first thing in a branch because the x will have gone
4828
    into firstchar instead.  */
4829
4830
0
    if (*previous == OP_CHAR || *previous == OP_CHARI
4831
0
        || *previous == OP_NOT || *previous == OP_NOTI)
4832
0
      {
4833
0
      switch (*previous)
4834
0
        {
4835
0
        default: /* Make compiler happy. */
4836
0
        case OP_CHAR:  op_type = OP_STAR - OP_STAR; break;
4837
0
        case OP_CHARI: op_type = OP_STARI - OP_STAR; break;
4838
0
        case OP_NOT:   op_type = OP_NOTSTAR - OP_STAR; break;
4839
0
        case OP_NOTI:  op_type = OP_NOTSTARI - OP_STAR; break;
4840
0
        }
4841
4842
      /* Deal with UTF characters that take up more than one character. It's
4843
      easier to write this out separately than try to macrify it. Use c to
4844
      hold the length of the character in bytes, plus UTF_LENGTH to flag that
4845
      it's a length rather than a small character. */
4846
4847
0
#ifdef SUPPORT_UTF
4848
0
      if (utf && NOT_FIRSTCHAR(code[-1]))
4849
0
        {
4850
0
        pcre_uchar *lastchar = code - 1;
4851
0
        BACKCHAR(lastchar);
4852
0
        c = (int)(code - lastchar);     /* Length of UTF-8 character */
4853
0
        memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */
4854
0
        c |= UTF_LENGTH;                /* Flag c as a length */
4855
0
        }
4856
0
      else
4857
0
#endif /* SUPPORT_UTF */
4858
4859
      /* Handle the case of a single character - either with no UTF support, or
4860
      with UTF disabled, or for a single character UTF character. */
4861
0
        {
4862
0
        c = code[-1];
4863
0
        if (*previous <= OP_CHARI && repeat_min > 1)
4864
0
          reqchar = c | req_caseopt | cd->req_varyopt;
4865
0
        }
4866
4867
      /* If the repetition is unlimited, it pays to see if the next thing on
4868
      the line is something that cannot possibly match this character. If so,
4869
      automatically possessifying this item gains some performance in the case
4870
      where the match fails. */
4871
4872
0
      if (!possessive_quantifier &&
4873
0
          repeat_max < 0 &&
4874
0
          check_auto_possessive(previous, utf, ptr + 1, options, cd))
4875
0
        {
4876
0
        repeat_type = 0;    /* Force greedy */
4877
0
        possessive_quantifier = TRUE;
4878
0
        }
4879
4880
0
      goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
4881
0
      }
4882
4883
    /* If previous was a character type match (\d or similar), abolish it and
4884
    create a suitable repeat item. The code is shared with single-character
4885
    repeats by setting op_type to add a suitable offset into repeat_type. Note
4886
    that the Unicode property types will be present only when SUPPORT_UCP is
4887
    defined, but we don't wrap the little bits of code here because it just
4888
    makes it horribly messy. */
4889
4890
0
    else if (*previous < OP_EODN)
4891
0
      {
4892
0
      pcre_uchar *oldcode;
4893
0
      int prop_type, prop_value;
4894
0
      op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
4895
0
      c = *previous;
4896
4897
0
      if (!possessive_quantifier &&
4898
0
          repeat_max < 0 &&
4899
0
          check_auto_possessive(previous, utf, ptr + 1, options, cd))
4900
0
        {
4901
0
        repeat_type = 0;    /* Force greedy */
4902
0
        possessive_quantifier = TRUE;
4903
0
        }
4904
4905
0
      OUTPUT_SINGLE_REPEAT:
4906
0
      if (*previous == OP_PROP || *previous == OP_NOTPROP)
4907
0
        {
4908
0
        prop_type = previous[1];
4909
0
        prop_value = previous[2];
4910
0
        }
4911
0
      else prop_type = prop_value = -1;
4912
4913
0
      oldcode = code;
4914
0
      code = previous;                  /* Usually overwrite previous item */
4915
4916
      /* If the maximum is zero then the minimum must also be zero; Perl allows
4917
      this case, so we do too - by simply omitting the item altogether. */
4918
4919
0
      if (repeat_max == 0) goto END_REPEAT;
4920
4921
      /*--------------------------------------------------------------------*/
4922
      /* This code is obsolete from release 8.00; the restriction was finally
4923
      removed: */
4924
4925
      /* All real repeats make it impossible to handle partial matching (maybe
4926
      one day we will be able to remove this restriction). */
4927
4928
      /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
4929
      /*--------------------------------------------------------------------*/
4930
4931
      /* Combine the op_type with the repeat_type */
4932
4933
0
      repeat_type += op_type;
4934
4935
      /* A minimum of zero is handled either as the special case * or ?, or as
4936
      an UPTO, with the maximum given. */
4937
4938
0
      if (repeat_min == 0)
4939
0
        {
4940
0
        if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
4941
0
          else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
4942
0
        else
4943
0
          {
4944
0
          *code++ = OP_UPTO + repeat_type;
4945
0
          PUT2INC(code, 0, repeat_max);
4946
0
          }
4947
0
        }
4948
4949
      /* A repeat minimum of 1 is optimized into some special cases. If the
4950
      maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
4951
      left in place and, if the maximum is greater than 1, we use OP_UPTO with
4952
      one less than the maximum. */
4953
4954
0
      else if (repeat_min == 1)
4955
0
        {
4956
0
        if (repeat_max == -1)
4957
0
          *code++ = OP_PLUS + repeat_type;
4958
0
        else
4959
0
          {
4960
0
          code = oldcode;                 /* leave previous item in place */
4961
0
          if (repeat_max == 1) goto END_REPEAT;
4962
0
          *code++ = OP_UPTO + repeat_type;
4963
0
          PUT2INC(code, 0, repeat_max - 1);
4964
0
          }
4965
0
        }
4966
4967
      /* The case {n,n} is just an EXACT, while the general case {n,m} is
4968
      handled as an EXACT followed by an UPTO. */
4969
4970
0
      else
4971
0
        {
4972
0
        *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
4973
0
        PUT2INC(code, 0, repeat_min);
4974
4975
        /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
4976
        we have to insert the character for the previous code. For a repeated
4977
        Unicode property match, there are two extra bytes that define the
4978
        required property. In UTF-8 mode, long characters have their length in
4979
        c, with the UTF_LENGTH bit as a flag. */
4980
4981
0
        if (repeat_max < 0)
4982
0
          {
4983
0
#ifdef SUPPORT_UTF
4984
0
          if (utf && (c & UTF_LENGTH) != 0)
4985
0
            {
4986
0
            memcpy(code, utf_chars, IN_UCHARS(c & 7));
4987
0
            code += c & 7;
4988
0
            }
4989
0
          else
4990
0
#endif
4991
0
            {
4992
0
            *code++ = c;
4993
0
            if (prop_type >= 0)
4994
0
              {
4995
0
              *code++ = prop_type;
4996
0
              *code++ = prop_value;
4997
0
              }
4998
0
            }
4999
0
          *code++ = OP_STAR + repeat_type;
5000
0
          }
5001
5002
        /* Else insert an UPTO if the max is greater than the min, again
5003
        preceded by the character, for the previously inserted code. If the
5004
        UPTO is just for 1 instance, we can use QUERY instead. */
5005
5006
0
        else if (repeat_max != repeat_min)
5007
0
          {
5008
0
#ifdef SUPPORT_UTF
5009
0
          if (utf && (c & UTF_LENGTH) != 0)
5010
0
            {
5011
0
            memcpy(code, utf_chars, IN_UCHARS(c & 7));
5012
0
            code += c & 7;
5013
0
            }
5014
0
          else
5015
0
#endif
5016
0
          *code++ = c;
5017
0
          if (prop_type >= 0)
5018
0
            {
5019
0
            *code++ = prop_type;
5020
0
            *code++ = prop_value;
5021
0
            }
5022
0
          repeat_max -= repeat_min;
5023
5024
0
          if (repeat_max == 1)
5025
0
            {
5026
0
            *code++ = OP_QUERY + repeat_type;
5027
0
            }
5028
0
          else
5029
0
            {
5030
0
            *code++ = OP_UPTO + repeat_type;
5031
0
            PUT2INC(code, 0, repeat_max);
5032
0
            }
5033
0
          }
5034
0
        }
5035
5036
      /* The character or character type itself comes last in all cases. */
5037
5038
0
#ifdef SUPPORT_UTF
5039
0
      if (utf && (c & UTF_LENGTH) != 0)
5040
0
        {
5041
0
        memcpy(code, utf_chars, IN_UCHARS(c & 7));
5042
0
        code += c & 7;
5043
0
        }
5044
0
      else
5045
0
#endif
5046
0
      *code++ = c;
5047
5048
      /* For a repeated Unicode property match, there are two extra bytes that
5049
      define the required property. */
5050
5051
0
#ifdef SUPPORT_UCP
5052
0
      if (prop_type >= 0)
5053
0
        {
5054
0
        *code++ = prop_type;
5055
0
        *code++ = prop_value;
5056
0
        }
5057
0
#endif
5058
0
      }
5059
5060
    /* If previous was a character class or a back reference, we put the repeat
5061
    stuff after it, but just skip the item if the repeat was {0,0}. */
5062
5063
0
    else if (*previous == OP_CLASS ||
5064
0
             *previous == OP_NCLASS ||
5065
0
#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5066
0
             *previous == OP_XCLASS ||
5067
0
#endif
5068
0
             *previous == OP_REF ||
5069
0
             *previous == OP_REFI)
5070
0
      {
5071
0
      if (repeat_max == 0)
5072
0
        {
5073
0
        code = previous;
5074
0
        goto END_REPEAT;
5075
0
        }
5076
5077
      /*--------------------------------------------------------------------*/
5078
      /* This code is obsolete from release 8.00; the restriction was finally
5079
      removed: */
5080
5081
      /* All real repeats make it impossible to handle partial matching (maybe
5082
      one day we will be able to remove this restriction). */
5083
5084
      /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
5085
      /*--------------------------------------------------------------------*/
5086
5087
0
      if (repeat_min == 0 && repeat_max == -1)
5088
0
        *code++ = OP_CRSTAR + repeat_type;
5089
0
      else if (repeat_min == 1 && repeat_max == -1)
5090
0
        *code++ = OP_CRPLUS + repeat_type;
5091
0
      else if (repeat_min == 0 && repeat_max == 1)
5092
0
        *code++ = OP_CRQUERY + repeat_type;
5093
0
      else
5094
0
        {
5095
0
        *code++ = OP_CRRANGE + repeat_type;
5096
0
        PUT2INC(code, 0, repeat_min);
5097
0
        if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
5098
0
        PUT2INC(code, 0, repeat_max);
5099
0
        }
5100
0
      }
5101
5102
    /* If previous was a bracket group, we may have to replicate it in certain
5103
    cases. Note that at this point we can encounter only the "basic" bracket
5104
    opcodes such as BRA and CBRA, as this is the place where they get converted
5105
    into the more special varieties such as BRAPOS and SBRA. A test for >=
5106
    OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
5107
    ASSERTBACK_NOT, ONCE, BRA, CBRA, and COND. Originally, PCRE did not allow
5108
    repetition of assertions, but now it does, for Perl compatibility. */
5109
5110
0
    else if (*previous >= OP_ASSERT && *previous <= OP_COND)
5111
0
      {
5112
0
      int i;
5113
0
      int len = (int)(code - previous);
5114
0
      pcre_uchar *bralink = NULL;
5115
0
      pcre_uchar *brazeroptr = NULL;
5116
5117
      /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so
5118
      we just ignore the repeat. */
5119
5120
0
      if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
5121
0
        goto END_REPEAT;
5122
5123
      /* There is no sense in actually repeating assertions. The only potential
5124
      use of repetition is in cases when the assertion is optional. Therefore,
5125
      if the minimum is greater than zero, just ignore the repeat. If the
5126
      maximum is not zero or one, set it to 1. */
5127
5128
0
      if (*previous < OP_ONCE)    /* Assertion */
5129
0
        {
5130
0
        if (repeat_min > 0) goto END_REPEAT;
5131
0
        if (repeat_max < 0 || repeat_max > 1) repeat_max = 1;
5132
0
        }
5133
5134
      /* The case of a zero minimum is special because of the need to stick
5135
      OP_BRAZERO in front of it, and because the group appears once in the
5136
      data, whereas in other cases it appears the minimum number of times. For
5137
      this reason, it is simplest to treat this case separately, as otherwise
5138
      the code gets far too messy. There are several special subcases when the
5139
      minimum is zero. */
5140
5141
0
      if (repeat_min == 0)
5142
0
        {
5143
        /* If the maximum is also zero, we used to just omit the group from the
5144
        output altogether, like this:
5145
5146
        ** if (repeat_max == 0)
5147
        **   {
5148
        **   code = previous;
5149
        **   goto END_REPEAT;
5150
        **   }
5151
5152
        However, that fails when a group or a subgroup within it is referenced
5153
        as a subroutine from elsewhere in the pattern, so now we stick in
5154
        OP_SKIPZERO in front of it so that it is skipped on execution. As we
5155
        don't have a list of which groups are referenced, we cannot do this
5156
        selectively.
5157
5158
        If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
5159
        and do no more at this point. However, we do need to adjust any
5160
        OP_RECURSE calls inside the group that refer to the group itself or any
5161
        internal or forward referenced group, because the offset is from the
5162
        start of the whole regex. Temporarily terminate the pattern while doing
5163
        this. */
5164
5165
0
        if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
5166
0
          {
5167
0
          *code = OP_END;
5168
0
          adjust_recurse(previous, 1, utf, cd, save_hwm);
5169
0
          memmove(previous + 1, previous, IN_UCHARS(len));
5170
0
          code++;
5171
0
          if (repeat_max == 0)
5172
0
            {
5173
0
            *previous++ = OP_SKIPZERO;
5174
0
            goto END_REPEAT;
5175
0
            }
5176
0
          brazeroptr = previous;    /* Save for possessive optimizing */
5177
0
          *previous++ = OP_BRAZERO + repeat_type;
5178
0
          }
5179
5180
        /* If the maximum is greater than 1 and limited, we have to replicate
5181
        in a nested fashion, sticking OP_BRAZERO before each set of brackets.
5182
        The first one has to be handled carefully because it's the original
5183
        copy, which has to be moved up. The remainder can be handled by code
5184
        that is common with the non-zero minimum case below. We have to
5185
        adjust the value of repeat_max, since one less copy is required. Once
5186
        again, we may have to adjust any OP_RECURSE calls inside the group. */
5187
5188
0
        else
5189
0
          {
5190
0
          int offset;
5191
0
          *code = OP_END;
5192
0
          adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, save_hwm);
5193
0
          memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len));
5194
0
          code += 2 + LINK_SIZE;
5195
0
          *previous++ = OP_BRAZERO + repeat_type;
5196
0
          *previous++ = OP_BRA;
5197
5198
          /* We chain together the bracket offset fields that have to be
5199
          filled in later when the ends of the brackets are reached. */
5200
5201
0
          offset = (bralink == NULL)? 0 : (int)(previous - bralink);
5202
0
          bralink = previous;
5203
0
          PUTINC(previous, 0, offset);
5204
0
          }
5205
5206
0
        repeat_max--;
5207
0
        }
5208
5209
      /* If the minimum is greater than zero, replicate the group as many
5210
      times as necessary, and adjust the maximum to the number of subsequent
5211
      copies that we need. If we set a first char from the group, and didn't
5212
      set a required char, copy the latter from the former. If there are any
5213
      forward reference subroutine calls in the group, there will be entries on
5214
      the workspace list; replicate these with an appropriate increment. */
5215
5216
0
      else
5217
0
        {
5218
0
        if (repeat_min > 1)
5219
0
          {
5220
          /* In the pre-compile phase, we don't actually do the replication. We
5221
          just adjust the length as if we had. Do some paranoid checks for
5222
          potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
5223
          integer type when available, otherwise double. */
5224
5225
0
          if (lengthptr != NULL)
5226
0
            {
5227
0
            int delta = (repeat_min - 1)*length_prevgroup;
5228
0
            if ((INT64_OR_DOUBLE)(repeat_min - 1)*
5229
0
                  (INT64_OR_DOUBLE)length_prevgroup >
5230
0
                    (INT64_OR_DOUBLE)INT_MAX ||
5231
0
                OFLOW_MAX - *lengthptr < delta)
5232
0
              {
5233
0
              *errorcodeptr = ERR20;
5234
0
              goto FAILED;
5235
0
              }
5236
0
            *lengthptr += delta;
5237
0
            }
5238
5239
          /* This is compiling for real. If there is a set first byte for
5240
          the group, and we have not yet set a "required byte", set it. Make
5241
          sure there is enough workspace for copying forward references before
5242
          doing the copy. */
5243
5244
0
          else
5245
0
            {
5246
0
            if (groupsetfirstchar && reqchar < 0) reqchar = firstchar;
5247
5248
0
            for (i = 1; i < repeat_min; i++)
5249
0
              {
5250
0
              pcre_uchar *hc;
5251
0
              pcre_uchar *this_hwm = cd->hwm;
5252
0
              memcpy(code, previous, IN_UCHARS(len));
5253
5254
0
              while (cd->hwm > cd->start_workspace + cd->workspace_size -
5255
0
                     WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
5256
0
                {
5257
0
                int save_offset = save_hwm - cd->start_workspace;
5258
0
                int this_offset = this_hwm - cd->start_workspace;
5259
0
                *errorcodeptr = expand_workspace(cd);
5260
0
                if (*errorcodeptr != 0) goto FAILED;
5261
0
                save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
5262
0
                this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
5263
0
                }
5264
5265
0
              for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
5266
0
                {
5267
0
                PUT(cd->hwm, 0, GET(hc, 0) + len);
5268
0
                cd->hwm += LINK_SIZE;
5269
0
                }
5270
0
              save_hwm = this_hwm;
5271
0
              code += len;
5272
0
              }
5273
0
            }
5274
0
          }
5275
5276
0
        if (repeat_max > 0) repeat_max -= repeat_min;
5277
0
        }
5278
5279
      /* This code is common to both the zero and non-zero minimum cases. If
5280
      the maximum is limited, it replicates the group in a nested fashion,
5281
      remembering the bracket starts on a stack. In the case of a zero minimum,
5282
      the first one was set up above. In all cases the repeat_max now specifies
5283
      the number of additional copies needed. Again, we must remember to
5284
      replicate entries on the forward reference list. */
5285
5286
0
      if (repeat_max >= 0)
5287
0
        {
5288
        /* In the pre-compile phase, we don't actually do the replication. We
5289
        just adjust the length as if we had. For each repetition we must add 1
5290
        to the length for BRAZERO and for all but the last repetition we must
5291
        add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
5292
        paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
5293
        a 64-bit integer type when available, otherwise double. */
5294
5295
0
        if (lengthptr != NULL && repeat_max > 0)
5296
0
          {
5297
0
          int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
5298
0
                      2 - 2*LINK_SIZE;   /* Last one doesn't nest */
5299
0
          if ((INT64_OR_DOUBLE)repeat_max *
5300
0
                (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
5301
0
                  > (INT64_OR_DOUBLE)INT_MAX ||
5302
0
              OFLOW_MAX - *lengthptr < delta)
5303
0
            {
5304
0
            *errorcodeptr = ERR20;
5305
0
            goto FAILED;
5306
0
            }
5307
0
          *lengthptr += delta;
5308
0
          }
5309
5310
        /* This is compiling for real */
5311
5312
0
        else for (i = repeat_max - 1; i >= 0; i--)
5313
0
          {
5314
0
          pcre_uchar *hc;
5315
0
          pcre_uchar *this_hwm = cd->hwm;
5316
5317
0
          *code++ = OP_BRAZERO + repeat_type;
5318
5319
          /* All but the final copy start a new nesting, maintaining the
5320
          chain of brackets outstanding. */
5321
5322
0
          if (i != 0)
5323
0
            {
5324
0
            int offset;
5325
0
            *code++ = OP_BRA;
5326
0
            offset = (bralink == NULL)? 0 : (int)(code - bralink);
5327
0
            bralink = code;
5328
0
            PUTINC(code, 0, offset);
5329
0
            }
5330
5331
0
          memcpy(code, previous, IN_UCHARS(len));
5332
5333
          /* Ensure there is enough workspace for forward references before
5334
          copying them. */
5335
5336
0
          while (cd->hwm > cd->start_workspace + cd->workspace_size -
5337
0
                 WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
5338
0
            {
5339
0
            int save_offset = save_hwm - cd->start_workspace;
5340
0
            int this_offset = this_hwm - cd->start_workspace;
5341
0
            *errorcodeptr = expand_workspace(cd);
5342
0
            if (*errorcodeptr != 0) goto FAILED;
5343
0
            save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
5344
0
            this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
5345
0
            }
5346
5347
0
          for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
5348
0
            {
5349
0
            PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
5350
0
            cd->hwm += LINK_SIZE;
5351
0
            }
5352
0
          save_hwm = this_hwm;
5353
0
          code += len;
5354
0
          }
5355
5356
        /* Now chain through the pending brackets, and fill in their length
5357
        fields (which are holding the chain links pro tem). */
5358
5359
0
        while (bralink != NULL)
5360
0
          {
5361
0
          int oldlinkoffset;
5362
0
          int offset = (int)(code - bralink + 1);
5363
0
          pcre_uchar *bra = code - offset;
5364
0
          oldlinkoffset = GET(bra, 1);
5365
0
          bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
5366
0
          *code++ = OP_KET;
5367
0
          PUTINC(code, 0, offset);
5368
0
          PUT(bra, 1, offset);
5369
0
          }
5370
0
        }
5371
5372
      /* If the maximum is unlimited, set a repeater in the final copy. For
5373
      ONCE brackets, that's all we need to do. However, possessively repeated
5374
      ONCE brackets can be converted into non-capturing brackets, as the
5375
      behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
5376
      deal with possessive ONCEs specially.
5377
5378
      Otherwise, when we are doing the actual compile phase, check to see
5379
      whether this group is one that could match an empty string. If so,
5380
      convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
5381
      that runtime checking can be done. [This check is also applied to ONCE
5382
      groups at runtime, but in a different way.]
5383
5384
      Then, if the quantifier was possessive and the bracket is not a
5385
      conditional, we convert the BRA code to the POS form, and the KET code to
5386
      KETRPOS. (It turns out to be convenient at runtime to detect this kind of
5387
      subpattern at both the start and at the end.) The use of special opcodes
5388
      makes it possible to reduce greatly the stack usage in pcre_exec(). If
5389
      the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
5390
5391
      Then, if the minimum number of matches is 1 or 0, cancel the possessive
5392
      flag so that the default action below, of wrapping everything inside
5393
      atomic brackets, does not happen. When the minimum is greater than 1,
5394
      there will be earlier copies of the group, and so we still have to wrap
5395
      the whole thing. */
5396
5397
0
      else
5398
0
        {
5399
0
        pcre_uchar *ketcode = code - 1 - LINK_SIZE;
5400
0
        pcre_uchar *bracode = ketcode - GET(ketcode, 1);
5401
5402
        /* Convert possessive ONCE brackets to non-capturing */
5403
5404
0
        if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
5405
0
            possessive_quantifier) *bracode = OP_BRA;
5406
5407
        /* For non-possessive ONCE brackets, all we need to do is to
5408
        set the KET. */
5409
5410
0
        if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
5411
0
          *ketcode = OP_KETRMAX + repeat_type;
5412
5413
        /* Handle non-ONCE brackets and possessive ONCEs (which have been
5414
        converted to non-capturing above). */
5415
5416
0
        else
5417
0
          {
5418
          /* In the compile phase, check for empty string matching. */
5419
5420
0
          if (lengthptr == NULL)
5421
0
            {
5422
0
            pcre_uchar *scode = bracode;
5423
0
            do
5424
0
              {
5425
0
              if (could_be_empty_branch(scode, ketcode, utf, cd))
5426
0
                {
5427
0
                *bracode += OP_SBRA - OP_BRA;
5428
0
                break;
5429
0
                }
5430
0
              scode += GET(scode, 1);
5431
0
              }
5432
0
            while (*scode == OP_ALT);
5433
0
            }
5434
5435
          /* Handle possessive quantifiers. */
5436
5437
0
          if (possessive_quantifier)
5438
0
            {
5439
            /* For COND brackets, we wrap the whole thing in a possessively
5440
            repeated non-capturing bracket, because we have not invented POS
5441
            versions of the COND opcodes. Because we are moving code along, we
5442
            must ensure that any pending recursive references are updated. */
5443
5444
0
            if (*bracode == OP_COND || *bracode == OP_SCOND)
5445
0
              {
5446
0
              int nlen = (int)(code - bracode);
5447
0
              *code = OP_END;
5448
0
              adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, save_hwm);
5449
0
              memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen));
5450
0
              code += 1 + LINK_SIZE;
5451
0
              nlen += 1 + LINK_SIZE;
5452
0
              *bracode = OP_BRAPOS;
5453
0
              *code++ = OP_KETRPOS;
5454
0
              PUTINC(code, 0, nlen);
5455
0
              PUT(bracode, 1, nlen);
5456
0
              }
5457
5458
            /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
5459
5460
0
            else
5461
0
              {
5462
0
              *bracode += 1;              /* Switch to xxxPOS opcodes */
5463
0
              *ketcode = OP_KETRPOS;
5464
0
              }
5465
5466
            /* If the minimum is zero, mark it as possessive, then unset the
5467
            possessive flag when the minimum is 0 or 1. */
5468
5469
0
            if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
5470
0
            if (repeat_min < 2) possessive_quantifier = FALSE;
5471
0
            }
5472
5473
          /* Non-possessive quantifier */
5474
5475
0
          else *ketcode = OP_KETRMAX + repeat_type;
5476
0
          }
5477
0
        }
5478
0
      }
5479
5480
    /* If previous is OP_FAIL, it was generated by an empty class [] in
5481
    JavaScript mode. The other ways in which OP_FAIL can be generated, that is
5482
    by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
5483
    error above. We can just ignore the repeat in JS case. */
5484
5485
0
    else if (*previous == OP_FAIL) goto END_REPEAT;
5486
5487
    /* Else there's some kind of shambles */
5488
5489
0
    else
5490
0
      {
5491
0
      *errorcodeptr = ERR11;
5492
0
      goto FAILED;
5493
0
      }
5494
5495
    /* If the character following a repeat is '+', or if certain optimization
5496
    tests above succeeded, possessive_quantifier is TRUE. For some opcodes,
5497
    there are special alternative opcodes for this case. For anything else, we
5498
    wrap the entire repeated item inside OP_ONCE brackets. Logically, the '+'
5499
    notation is just syntactic sugar, taken from Sun's Java package, but the
5500
    special opcodes can optimize it.
5501
5502
    Some (but not all) possessively repeated subpatterns have already been
5503
    completely handled in the code just above. For them, possessive_quantifier
5504
    is always FALSE at this stage.
5505
5506
    Note that the repeated item starts at tempcode, not at previous, which
5507
    might be the first part of a string whose (former) last char we repeated.
5508
5509
    Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
5510
    an 'upto' may follow. We skip over an 'exact' item, and then test the
5511
    length of what remains before proceeding. */
5512
5513
0
    if (possessive_quantifier)
5514
0
      {
5515
0
      int len;
5516
5517
0
      if (*tempcode == OP_TYPEEXACT)
5518
0
        tempcode += PRIV(OP_lengths)[*tempcode] +
5519
0
          ((tempcode[1 + IMM2_SIZE] == OP_PROP
5520
0
          || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
5521
5522
0
      else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)
5523
0
        {
5524
0
        tempcode += PRIV(OP_lengths)[*tempcode];
5525
0
#ifdef SUPPORT_UTF
5526
0
        if (utf && HAS_EXTRALEN(tempcode[-1]))
5527
0
          tempcode += GET_EXTRALEN(tempcode[-1]);
5528
0
#endif
5529
0
        }
5530
5531
0
      len = (int)(code - tempcode);
5532
0
      if (len > 0) switch (*tempcode)
5533
0
        {
5534
0
        case OP_STAR:  *tempcode = OP_POSSTAR; break;
5535
0
        case OP_PLUS:  *tempcode = OP_POSPLUS; break;
5536
0
        case OP_QUERY: *tempcode = OP_POSQUERY; break;
5537
0
        case OP_UPTO:  *tempcode = OP_POSUPTO; break;
5538
5539
0
        case OP_STARI:  *tempcode = OP_POSSTARI; break;
5540
0
        case OP_PLUSI:  *tempcode = OP_POSPLUSI; break;
5541
0
        case OP_QUERYI: *tempcode = OP_POSQUERYI; break;
5542
0
        case OP_UPTOI:  *tempcode = OP_POSUPTOI; break;
5543
5544
0
        case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
5545
0
        case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
5546
0
        case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
5547
0
        case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
5548
5549
0
        case OP_NOTSTARI:  *tempcode = OP_NOTPOSSTARI; break;
5550
0
        case OP_NOTPLUSI:  *tempcode = OP_NOTPOSPLUSI; break;
5551
0
        case OP_NOTQUERYI: *tempcode = OP_NOTPOSQUERYI; break;
5552
0
        case OP_NOTUPTOI:  *tempcode = OP_NOTPOSUPTOI; break;
5553
5554
0
        case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
5555
0
        case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
5556
0
        case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
5557
0
        case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
5558
5559
        /* Because we are moving code along, we must ensure that any
5560
        pending recursive references are updated. */
5561
5562
0
        default:
5563
0
        *code = OP_END;
5564
0
        adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm);
5565
0
        memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
5566
0
        code += 1 + LINK_SIZE;
5567
0
        len += 1 + LINK_SIZE;
5568
0
        tempcode[0] = OP_ONCE;
5569
0
        *code++ = OP_KET;
5570
0
        PUTINC(code, 0, len);
5571
0
        PUT(tempcode, 1, len);
5572
0
        break;
5573
0
        }
5574
0
      }
5575
5576
    /* In all cases we no longer have a previous item. We also set the
5577
    "follows varying string" flag for subsequently encountered reqchars if
5578
    it isn't already set and we have just passed a varying length item. */
5579
5580
0
    END_REPEAT:
5581
0
    previous = NULL;
5582
0
    cd->req_varyopt |= reqvary;
5583
0
    break;
5584
5585
5586
    /* ===================================================================*/
5587
    /* Start of nested parenthesized sub-expression, or comment or lookahead or
5588
    lookbehind or option setting or condition or all the other extended
5589
    parenthesis forms.  */
5590
5591
0
    case CHAR_LEFT_PARENTHESIS:
5592
0
    newoptions = options;
5593
0
    skipbytes = 0;
5594
0
    bravalue = OP_CBRA;
5595
0
    save_hwm = cd->hwm;
5596
0
    reset_bracount = FALSE;
5597
5598
    /* First deal with various "verbs" that can be introduced by '*'. */
5599
5600
0
    ptr++;
5601
0
    if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':'
5602
0
         || (MAX_255(ptr[1]) && ((cd->ctypes[ptr[1]] & ctype_letter) != 0))))
5603
0
      {
5604
0
      int i, namelen;
5605
0
      int arglen = 0;
5606
0
      const char *vn = verbnames;
5607
0
      const pcre_uchar *name = ptr + 1;
5608
0
      const pcre_uchar *arg = NULL;
5609
0
      previous = NULL;
5610
0
      ptr++;
5611
0
      while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
5612
0
      namelen = (int)(ptr - name);
5613
5614
      /* It appears that Perl allows any characters whatsoever, other than
5615
      a closing parenthesis, to appear in arguments, so we no longer insist on
5616
      letters, digits, and underscores. */
5617
5618
0
      if (*ptr == CHAR_COLON)
5619
0
        {
5620
0
        arg = ++ptr;
5621
0
        while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
5622
0
        arglen = (int)(ptr - arg);
5623
0
        if (arglen > (int)MAX_MARK)
5624
0
          {
5625
0
          *errorcodeptr = ERR75;
5626
0
          goto FAILED;
5627
0
          }
5628
0
        }
5629
5630
0
      if (*ptr != CHAR_RIGHT_PARENTHESIS)
5631
0
        {
5632
0
        *errorcodeptr = ERR60;
5633
0
        goto FAILED;
5634
0
        }
5635
5636
      /* Scan the table of verb names */
5637
5638
0
      for (i = 0; i < verbcount; i++)
5639
0
        {
5640
0
        if (namelen == verbs[i].len &&
5641
0
            STRNCMP_UC_C8(name, vn, namelen) == 0)
5642
0
          {
5643
          /* Check for open captures before ACCEPT and convert it to
5644
          ASSERT_ACCEPT if in an assertion. */
5645
5646
0
          if (verbs[i].op == OP_ACCEPT)
5647
0
            {
5648
0
            open_capitem *oc;
5649
0
            if (arglen != 0)
5650
0
              {
5651
0
              *errorcodeptr = ERR59;
5652
0
              goto FAILED;
5653
0
              }
5654
0
            cd->had_accept = TRUE;
5655
0
            for (oc = cd->open_caps; oc != NULL; oc = oc->next)
5656
0
              {
5657
0
              *code++ = OP_CLOSE;
5658
0
              PUT2INC(code, 0, oc->number);
5659
0
              }
5660
0
            *code++ = (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
5661
5662
            /* Do not set firstchar after *ACCEPT */
5663
0
            if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
5664
0
            }
5665
5666
          /* Handle other cases with/without an argument */
5667
5668
0
          else if (arglen == 0)
5669
0
            {
5670
0
            if (verbs[i].op < 0)   /* Argument is mandatory */
5671
0
              {
5672
0
              *errorcodeptr = ERR66;
5673
0
              goto FAILED;
5674
0
              }
5675
0
            *code = verbs[i].op;
5676
0
            if (*code++ == OP_THEN) cd->external_flags |= PCRE_HASTHEN;
5677
0
            }
5678
5679
0
          else
5680
0
            {
5681
0
            if (verbs[i].op_arg < 0)   /* Argument is forbidden */
5682
0
              {
5683
0
              *errorcodeptr = ERR59;
5684
0
              goto FAILED;
5685
0
              }
5686
0
            *code = verbs[i].op_arg;
5687
0
            if (*code++ == OP_THEN_ARG) cd->external_flags |= PCRE_HASTHEN;
5688
0
            *code++ = arglen;
5689
0
            memcpy(code, arg, IN_UCHARS(arglen));
5690
0
            code += arglen;
5691
0
            *code++ = 0;
5692
0
            }
5693
5694
0
          break;  /* Found verb, exit loop */
5695
0
          }
5696
5697
0
        vn += verbs[i].len + 1;
5698
0
        }
5699
5700
0
      if (i < verbcount) continue;    /* Successfully handled a verb */
5701
0
      *errorcodeptr = ERR60;          /* Verb not recognized */
5702
0
      goto FAILED;
5703
0
      }
5704
5705
    /* Deal with the extended parentheses; all are introduced by '?', and the
5706
    appearance of any of them means that this is not a capturing group. */
5707
5708
0
    else if (*ptr == CHAR_QUESTION_MARK)
5709
0
      {
5710
0
      int i, set, unset, namelen;
5711
0
      int *optset;
5712
0
      const pcre_uchar *name;
5713
0
      pcre_uchar *slot;
5714
5715
0
      switch (*(++ptr))
5716
0
        {
5717
0
        case CHAR_NUMBER_SIGN:                 /* Comment; skip to ket */
5718
0
        ptr++;
5719
0
        while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
5720
0
        if (*ptr == 0)
5721
0
          {
5722
0
          *errorcodeptr = ERR18;
5723
0
          goto FAILED;
5724
0
          }
5725
0
        continue;
5726
5727
5728
        /* ------------------------------------------------------------ */
5729
0
        case CHAR_VERTICAL_LINE:  /* Reset capture count for each branch */
5730
0
        reset_bracount = TRUE;
5731
        /* Fall through */
5732
5733
        /* ------------------------------------------------------------ */
5734
0
        case CHAR_COLON:          /* Non-capturing bracket */
5735
0
        bravalue = OP_BRA;
5736
0
        ptr++;
5737
0
        break;
5738
5739
5740
        /* ------------------------------------------------------------ */
5741
0
        case CHAR_LEFT_PARENTHESIS:
5742
0
        bravalue = OP_COND;       /* Conditional group */
5743
5744
        /* A condition can be an assertion, a number (referring to a numbered
5745
        group), a name (referring to a named group), or 'R', referring to
5746
        recursion. R<digits> and R&name are also permitted for recursion tests.
5747
5748
        There are several syntaxes for testing a named group: (?(name)) is used
5749
        by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
5750
5751
        There are two unfortunate ambiguities, caused by history. (a) 'R' can
5752
        be the recursive thing or the name 'R' (and similarly for 'R' followed
5753
        by digits), and (b) a number could be a name that consists of digits.
5754
        In both cases, we look for a name first; if not found, we try the other
5755
        cases. */
5756
5757
        /* For conditions that are assertions, check the syntax, and then exit
5758
        the switch. This will take control down to where bracketed groups,
5759
        including assertions, are processed. */
5760
5761
0
        if (ptr[1] == CHAR_QUESTION_MARK && (ptr[2] == CHAR_EQUALS_SIGN ||
5762
0
            ptr[2] == CHAR_EXCLAMATION_MARK || ptr[2] == CHAR_LESS_THAN_SIGN))
5763
0
          break;
5764
5765
        /* Most other conditions use OP_CREF (a couple change to OP_RREF
5766
        below), and all need to skip 1+IMM2_SIZE bytes at the start of the group. */
5767
5768
0
        code[1+LINK_SIZE] = OP_CREF;
5769
0
        skipbytes = 1+IMM2_SIZE;
5770
0
        refsign = -1;
5771
5772
        /* Check for a test for recursion in a named group. */
5773
5774
0
        if (ptr[1] == CHAR_R && ptr[2] == CHAR_AMPERSAND)
5775
0
          {
5776
0
          terminator = -1;
5777
0
          ptr += 2;
5778
0
          code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
5779
0
          }
5780
5781
        /* Check for a test for a named group's having been set, using the Perl
5782
        syntax (?(<name>) or (?('name') */
5783
5784
0
        else if (ptr[1] == CHAR_LESS_THAN_SIGN)
5785
0
          {
5786
0
          terminator = CHAR_GREATER_THAN_SIGN;
5787
0
          ptr++;
5788
0
          }
5789
0
        else if (ptr[1] == CHAR_APOSTROPHE)
5790
0
          {
5791
0
          terminator = CHAR_APOSTROPHE;
5792
0
          ptr++;
5793
0
          }
5794
0
        else
5795
0
          {
5796
0
          terminator = 0;
5797
0
          if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);
5798
0
          }
5799
5800
        /* We now expect to read a name; anything else is an error */
5801
5802
0
        if (!MAX_255(ptr[1]) || (cd->ctypes[ptr[1]] & ctype_word) == 0)
5803
0
          {
5804
0
          ptr += 1;  /* To get the right offset */
5805
0
          *errorcodeptr = ERR28;
5806
0
          goto FAILED;
5807
0
          }
5808
5809
        /* Read the name, but also get it as a number if it's all digits */
5810
5811
0
        recno = 0;
5812
0
        name = ++ptr;
5813
0
        while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)
5814
0
          {
5815
0
          if (recno >= 0)
5816
0
            recno = (IS_DIGIT(*ptr))? recno * 10 + *ptr - CHAR_0 : -1;
5817
0
          ptr++;
5818
0
          }
5819
0
        namelen = (int)(ptr - name);
5820
5821
0
        if ((terminator > 0 && *ptr++ != terminator) ||
5822
0
            *ptr++ != CHAR_RIGHT_PARENTHESIS)
5823
0
          {
5824
0
          ptr--;      /* Error offset */
5825
0
          *errorcodeptr = ERR26;
5826
0
          goto FAILED;
5827
0
          }
5828
5829
        /* Do no further checking in the pre-compile phase. */
5830
5831
0
        if (lengthptr != NULL) break;
5832
5833
        /* In the real compile we do the work of looking for the actual
5834
        reference. If the string started with "+" or "-" we require the rest to
5835
        be digits, in which case recno will be set. */
5836
5837
0
        if (refsign > 0)
5838
0
          {
5839
0
          if (recno <= 0)
5840
0
            {
5841
0
            *errorcodeptr = ERR58;
5842
0
            goto FAILED;
5843
0
            }
5844
0
          recno = (refsign == CHAR_MINUS)?
5845
0
            cd->bracount - recno + 1 : recno +cd->bracount;
5846
0
          if (recno <= 0 || recno > cd->final_bracount)
5847
0
            {
5848
0
            *errorcodeptr = ERR15;
5849
0
            goto FAILED;
5850
0
            }
5851
0
          PUT2(code, 2+LINK_SIZE, recno);
5852
0
          break;
5853
0
          }
5854
5855
        /* Otherwise (did not start with "+" or "-"), start by looking for the
5856
        name. If we find a name, add one to the opcode to change OP_CREF or
5857
        OP_RREF into OP_NCREF or OP_NRREF. These behave exactly the same,
5858
        except they record that the reference was originally to a name. The
5859
        information is used to check duplicate names. */
5860
5861
0
        slot = cd->name_table;
5862
0
        for (i = 0; i < cd->names_found; i++)
5863
0
          {
5864
0
          if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0) break;
5865
0
          slot += cd->name_entry_size;
5866
0
          }
5867
5868
        /* Found a previous named subpattern */
5869
5870
0
        if (i < cd->names_found)
5871
0
          {
5872
0
          recno = GET2(slot, 0);
5873
0
          PUT2(code, 2+LINK_SIZE, recno);
5874
0
          code[1+LINK_SIZE]++;
5875
0
          }
5876
5877
        /* Search the pattern for a forward reference */
5878
5879
0
        else if ((i = find_parens(cd, name, namelen,
5880
0
                        (options & PCRE_EXTENDED) != 0, utf)) > 0)
5881
0
          {
5882
0
          PUT2(code, 2+LINK_SIZE, i);
5883
0
          code[1+LINK_SIZE]++;
5884
0
          }
5885
5886
        /* If terminator == 0 it means that the name followed directly after
5887
        the opening parenthesis [e.g. (?(abc)...] and in this case there are
5888
        some further alternatives to try. For the cases where terminator != 0
5889
        [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
5890
        now checked all the possibilities, so give an error. */
5891
5892
0
        else if (terminator != 0)
5893
0
          {
5894
0
          *errorcodeptr = ERR15;
5895
0
          goto FAILED;
5896
0
          }
5897
5898
        /* Check for (?(R) for recursion. Allow digits after R to specify a
5899
        specific group number. */
5900
5901
0
        else if (*name == CHAR_R)
5902
0
          {
5903
0
          recno = 0;
5904
0
          for (i = 1; i < namelen; i++)
5905
0
            {
5906
0
            if (!IS_DIGIT(name[i]))
5907
0
              {
5908
0
              *errorcodeptr = ERR15;
5909
0
              goto FAILED;
5910
0
              }
5911
0
            recno = recno * 10 + name[i] - CHAR_0;
5912
0
            }
5913
0
          if (recno == 0) recno = RREF_ANY;
5914
0
          code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
5915
0
          PUT2(code, 2+LINK_SIZE, recno);
5916
0
          }
5917
5918
        /* Similarly, check for the (?(DEFINE) "condition", which is always
5919
        false. */
5920
5921
0
        else if (namelen == 6 && STRNCMP_UC_C8(name, STRING_DEFINE, 6) == 0)
5922
0
          {
5923
0
          code[1+LINK_SIZE] = OP_DEF;
5924
0
          skipbytes = 1;
5925
0
          }
5926
5927
        /* Check for the "name" actually being a subpattern number. We are
5928
        in the second pass here, so final_bracount is set. */
5929
5930
0
        else if (recno > 0 && recno <= cd->final_bracount)
5931
0
          {
5932
0
          PUT2(code, 2+LINK_SIZE, recno);
5933
0
          }
5934
5935
        /* Either an unidentified subpattern, or a reference to (?(0) */
5936
5937
0
        else
5938
0
          {
5939
0
          *errorcodeptr = (recno == 0)? ERR35: ERR15;
5940
0
          goto FAILED;
5941
0
          }
5942
0
        break;
5943
5944
5945
        /* ------------------------------------------------------------ */
5946
0
        case CHAR_EQUALS_SIGN:                 /* Positive lookahead */
5947
0
        bravalue = OP_ASSERT;
5948
0
        cd->assert_depth += 1;
5949
0
        ptr++;
5950
0
        break;
5951
5952
5953
        /* ------------------------------------------------------------ */
5954
0
        case CHAR_EXCLAMATION_MARK:            /* Negative lookahead */
5955
0
        ptr++;
5956
0
        if (*ptr == CHAR_RIGHT_PARENTHESIS)    /* Optimize (?!) */
5957
0
          {
5958
0
          *code++ = OP_FAIL;
5959
0
          previous = NULL;
5960
0
          continue;
5961
0
          }
5962
0
        bravalue = OP_ASSERT_NOT;
5963
0
        cd->assert_depth += 1;
5964
0
        break;
5965
5966
5967
        /* ------------------------------------------------------------ */
5968
0
        case CHAR_LESS_THAN_SIGN:              /* Lookbehind or named define */
5969
0
        switch (ptr[1])
5970
0
          {
5971
0
          case CHAR_EQUALS_SIGN:               /* Positive lookbehind */
5972
0
          bravalue = OP_ASSERTBACK;
5973
0
          cd->assert_depth += 1;
5974
0
          ptr += 2;
5975
0
          break;
5976
5977
0
          case CHAR_EXCLAMATION_MARK:          /* Negative lookbehind */
5978
0
          bravalue = OP_ASSERTBACK_NOT;
5979
0
          cd->assert_depth += 1;
5980
0
          ptr += 2;
5981
0
          break;
5982
5983
0
          default:                /* Could be name define, else bad */
5984
0
          if (MAX_255(ptr[1]) && (cd->ctypes[ptr[1]] & ctype_word) != 0)
5985
0
            goto DEFINE_NAME;
5986
0
          ptr++;                  /* Correct offset for error */
5987
0
          *errorcodeptr = ERR24;
5988
0
          goto FAILED;
5989
0
          }
5990
0
        break;
5991
5992
5993
        /* ------------------------------------------------------------ */
5994
0
        case CHAR_GREATER_THAN_SIGN:           /* One-time brackets */
5995
0
        bravalue = OP_ONCE;
5996
0
        ptr++;
5997
0
        break;
5998
5999
6000
        /* ------------------------------------------------------------ */
6001
0
        case CHAR_C:                 /* Callout - may be followed by digits; */
6002
0
        previous_callout = code;     /* Save for later completion */
6003
0
        after_manual_callout = 1;    /* Skip one item before completing */
6004
0
        *code++ = OP_CALLOUT;
6005
0
          {
6006
0
          int n = 0;
6007
0
          ptr++;
6008
0
          while(IS_DIGIT(*ptr))
6009
0
            n = n * 10 + *ptr++ - CHAR_0;
6010
0
          if (*ptr != CHAR_RIGHT_PARENTHESIS)
6011
0
            {
6012
0
            *errorcodeptr = ERR39;
6013
0
            goto FAILED;
6014
0
            }
6015
0
          if (n > 255)
6016
0
            {
6017
0
            *errorcodeptr = ERR38;
6018
0
            goto FAILED;
6019
0
            }
6020
0
          *code++ = n;
6021
0
          PUT(code, 0, (int)(ptr - cd->start_pattern + 1)); /* Pattern offset */
6022
0
          PUT(code, LINK_SIZE, 0);                          /* Default length */
6023
0
          code += 2 * LINK_SIZE;
6024
0
          }
6025
0
        previous = NULL;
6026
0
        continue;
6027
6028
6029
        /* ------------------------------------------------------------ */
6030
0
        case CHAR_P:              /* Python-style named subpattern handling */
6031
0
        if (*(++ptr) == CHAR_EQUALS_SIGN ||
6032
0
            *ptr == CHAR_GREATER_THAN_SIGN)  /* Reference or recursion */
6033
0
          {
6034
0
          is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
6035
0
          terminator = CHAR_RIGHT_PARENTHESIS;
6036
0
          goto NAMED_REF_OR_RECURSE;
6037
0
          }
6038
0
        else if (*ptr != CHAR_LESS_THAN_SIGN)  /* Test for Python-style defn */
6039
0
          {
6040
0
          *errorcodeptr = ERR41;
6041
0
          goto FAILED;
6042
0
          }
6043
        /* Fall through to handle (?P< as (?< is handled */
6044
6045
6046
        /* ------------------------------------------------------------ */
6047
0
        DEFINE_NAME:    /* Come here from (?< handling */
6048
0
        case CHAR_APOSTROPHE:
6049
0
          {
6050
0
          terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
6051
0
            CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
6052
0
          name = ++ptr;
6053
6054
0
          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
6055
0
          namelen = (int)(ptr - name);
6056
6057
          /* In the pre-compile phase, just do a syntax check. */
6058
6059
0
          if (lengthptr != NULL)
6060
0
            {
6061
0
            if (*ptr != terminator)
6062
0
              {
6063
0
              *errorcodeptr = ERR42;
6064
0
              goto FAILED;
6065
0
              }
6066
0
            if (cd->names_found >= MAX_NAME_COUNT)
6067
0
              {
6068
0
              *errorcodeptr = ERR49;
6069
0
              goto FAILED;
6070
0
              }
6071
0
            if (namelen + IMM2_SIZE + 1 > cd->name_entry_size)
6072
0
              {
6073
0
              cd->name_entry_size = namelen + IMM2_SIZE + 1;
6074
0
              if (namelen > MAX_NAME_SIZE)
6075
0
                {
6076
0
                *errorcodeptr = ERR48;
6077
0
                goto FAILED;
6078
0
                }
6079
0
              }
6080
0
            }
6081
6082
          /* In the real compile, create the entry in the table, maintaining
6083
          alphabetical order. Duplicate names for different numbers are
6084
          permitted only if PCRE_DUPNAMES is set. Duplicate names for the same
6085
          number are always OK. (An existing number can be re-used if (?|
6086
          appears in the pattern.) In either event, a duplicate name results in
6087
          a duplicate entry in the table, even if the number is the same. This
6088
          is because the number of names, and hence the table size, is computed
6089
          in the pre-compile, and it affects various numbers and pointers which
6090
          would all have to be modified, and the compiled code moved down, if
6091
          duplicates with the same number were omitted from the table. This
6092
          doesn't seem worth the hassle. However, *different* names for the
6093
          same number are not permitted. */
6094
6095
0
          else
6096
0
            {
6097
0
            BOOL dupname = FALSE;
6098
0
            slot = cd->name_table;
6099
6100
0
            for (i = 0; i < cd->names_found; i++)
6101
0
              {
6102
0
              int crc = memcmp(name, slot+IMM2_SIZE, IN_UCHARS(namelen));
6103
0
              if (crc == 0)
6104
0
                {
6105
0
                if (slot[IMM2_SIZE+namelen] == 0)
6106
0
                  {
6107
0
                  if (GET2(slot, 0) != cd->bracount + 1 &&
6108
0
                      (options & PCRE_DUPNAMES) == 0)
6109
0
                    {
6110
0
                    *errorcodeptr = ERR43;
6111
0
                    goto FAILED;
6112
0
                    }
6113
0
                  else dupname = TRUE;
6114
0
                  }
6115
0
                else crc = -1;      /* Current name is a substring */
6116
0
                }
6117
6118
              /* Make space in the table and break the loop for an earlier
6119
              name. For a duplicate or later name, carry on. We do this for
6120
              duplicates so that in the simple case (when (?| is not used) they
6121
              are in order of their numbers. */
6122
6123
0
              if (crc < 0)
6124
0
                {
6125
0
                memmove(slot + cd->name_entry_size, slot,
6126
0
                  IN_UCHARS((cd->names_found - i) * cd->name_entry_size));
6127
0
                break;
6128
0
                }
6129
6130
              /* Continue the loop for a later or duplicate name */
6131
6132
0
              slot += cd->name_entry_size;
6133
0
              }
6134
6135
            /* For non-duplicate names, check for a duplicate number before
6136
            adding the new name. */
6137
6138
0
            if (!dupname)
6139
0
              {
6140
0
              pcre_uchar *cslot = cd->name_table;
6141
0
              for (i = 0; i < cd->names_found; i++)
6142
0
                {
6143
0
                if (cslot != slot)
6144
0
                  {
6145
0
                  if (GET2(cslot, 0) == cd->bracount + 1)
6146
0
                    {
6147
0
                    *errorcodeptr = ERR65;
6148
0
                    goto FAILED;
6149
0
                    }
6150
0
                  }
6151
0
                else i--;
6152
0
                cslot += cd->name_entry_size;
6153
0
                }
6154
0
              }
6155
6156
0
            PUT2(slot, 0, cd->bracount + 1);
6157
0
            memcpy(slot + IMM2_SIZE, name, IN_UCHARS(namelen));
6158
0
            slot[IMM2_SIZE + namelen] = 0;
6159
0
            }
6160
0
          }
6161
6162
        /* In both pre-compile and compile, count the number of names we've
6163
        encountered. */
6164
6165
0
        cd->names_found++;
6166
0
        ptr++;                    /* Move past > or ' */
6167
0
        goto NUMBERED_GROUP;
6168
6169
6170
        /* ------------------------------------------------------------ */
6171
0
        case CHAR_AMPERSAND:            /* Perl recursion/subroutine syntax */
6172
0
        terminator = CHAR_RIGHT_PARENTHESIS;
6173
0
        is_recurse = TRUE;
6174
        /* Fall through */
6175
6176
        /* We come here from the Python syntax above that handles both
6177
        references (?P=name) and recursion (?P>name), as well as falling
6178
        through from the Perl recursion syntax (?&name). We also come here from
6179
        the Perl \k<name> or \k'name' back reference syntax and the \k{name}
6180
        .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
6181
6182
0
        NAMED_REF_OR_RECURSE:
6183
0
        name = ++ptr;
6184
0
        while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
6185
0
        namelen = (int)(ptr - name);
6186
6187
        /* In the pre-compile phase, do a syntax check. We used to just set
6188
        a dummy reference number, because it was not used in the first pass.
6189
        However, with the change of recursive back references to be atomic,
6190
        we have to look for the number so that this state can be identified, as
6191
        otherwise the incorrect length is computed. If it's not a backwards
6192
        reference, the dummy number will do. */
6193
6194
0
        if (lengthptr != NULL)
6195
0
          {
6196
0
          const pcre_uchar *temp;
6197
6198
0
          if (namelen == 0)
6199
0
            {
6200
0
            *errorcodeptr = ERR62;
6201
0
            goto FAILED;
6202
0
            }
6203
0
          if (*ptr != terminator)
6204
0
            {
6205
0
            *errorcodeptr = ERR42;
6206
0
            goto FAILED;
6207
0
            }
6208
0
          if (namelen > MAX_NAME_SIZE)
6209
0
            {
6210
0
            *errorcodeptr = ERR48;
6211
0
            goto FAILED;
6212
0
            }
6213
6214
          /* The name table does not exist in the first pass, so we cannot
6215
          do a simple search as in the code below. Instead, we have to scan the
6216
          pattern to find the number. It is important that we scan it only as
6217
          far as we have got because the syntax of named subpatterns has not
6218
          been checked for the rest of the pattern, and find_parens() assumes
6219
          correct syntax. In any case, it's a waste of resources to scan
6220
          further. We stop the scan at the current point by temporarily
6221
          adjusting the value of cd->end_pattern. */
6222
6223
0
          temp = cd->end_pattern;
6224
0
          cd->end_pattern = ptr;
6225
0
          recno = find_parens(cd, name, namelen,
6226
0
            (options & PCRE_EXTENDED) != 0, utf);
6227
0
          cd->end_pattern = temp;
6228
0
          if (recno < 0) recno = 0;    /* Forward ref; set dummy number */
6229
0
          }
6230
6231
        /* In the real compile, seek the name in the table. We check the name
6232
        first, and then check that we have reached the end of the name in the
6233
        table. That way, if the name is longer than any in the table,
6234
        the comparison will fail without reading beyond the table entry. */
6235
6236
0
        else
6237
0
          {
6238
0
          slot = cd->name_table;
6239
0
          for (i = 0; i < cd->names_found; i++)
6240
0
            {
6241
0
            if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0 &&
6242
0
                slot[IMM2_SIZE+namelen] == 0)
6243
0
              break;
6244
0
            slot += cd->name_entry_size;
6245
0
            }
6246
6247
0
          if (i < cd->names_found)         /* Back reference */
6248
0
            {
6249
0
            recno = GET2(slot, 0);
6250
0
            }
6251
0
          else if ((recno =                /* Forward back reference */
6252
0
                    find_parens(cd, name, namelen,
6253
0
                      (options & PCRE_EXTENDED) != 0, utf)) <= 0)
6254
0
            {
6255
0
            *errorcodeptr = ERR15;
6256
0
            goto FAILED;
6257
0
            }
6258
0
          }
6259
6260
        /* In both phases, we can now go to the code that handles numerical
6261
        recursion or backreferences. */
6262
6263
0
        if (is_recurse) goto HANDLE_RECURSION;
6264
0
          else goto HANDLE_REFERENCE;
6265
6266
6267
        /* ------------------------------------------------------------ */
6268
0
        case CHAR_R:              /* Recursion */
6269
0
        ptr++;                    /* Same as (?0)      */
6270
        /* Fall through */
6271
6272
6273
        /* ------------------------------------------------------------ */
6274
0
        case CHAR_MINUS: case CHAR_PLUS:  /* Recursion or subroutine */
6275
0
        case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
6276
0
        case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
6277
0
          {
6278
0
          const pcre_uchar *called;
6279
0
          terminator = CHAR_RIGHT_PARENTHESIS;
6280
6281
          /* Come here from the \g<...> and \g'...' code (Oniguruma
6282
          compatibility). However, the syntax has been checked to ensure that
6283
          the ... are a (signed) number, so that neither ERR63 nor ERR29 will
6284
          be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
6285
          ever be taken. */
6286
6287
0
          HANDLE_NUMERICAL_RECURSION:
6288
6289
0
          if ((refsign = *ptr) == CHAR_PLUS)
6290
0
            {
6291
0
            ptr++;
6292
0
            if (!IS_DIGIT(*ptr))
6293
0
              {
6294
0
              *errorcodeptr = ERR63;
6295
0
              goto FAILED;
6296
0
              }
6297
0
            }
6298
0
          else if (refsign == CHAR_MINUS)
6299
0
            {
6300
0
            if (!IS_DIGIT(ptr[1]))
6301
0
              goto OTHER_CHAR_AFTER_QUERY;
6302
0
            ptr++;
6303
0
            }
6304
6305
0
          recno = 0;
6306
0
          while(IS_DIGIT(*ptr))
6307
0
            recno = recno * 10 + *ptr++ - CHAR_0;
6308
6309
0
          if (*ptr != terminator)
6310
0
            {
6311
0
            *errorcodeptr = ERR29;
6312
0
            goto FAILED;
6313
0
            }
6314
6315
0
          if (refsign == CHAR_MINUS)
6316
0
            {
6317
0
            if (recno == 0)
6318
0
              {
6319
0
              *errorcodeptr = ERR58;
6320
0
              goto FAILED;
6321
0
              }
6322
0
            recno = cd->bracount - recno + 1;
6323
0
            if (recno <= 0)
6324
0
              {
6325
0
              *errorcodeptr = ERR15;
6326
0
              goto FAILED;
6327
0
              }
6328
0
            }
6329
0
          else if (refsign == CHAR_PLUS)
6330
0
            {
6331
0
            if (recno == 0)
6332
0
              {
6333
0
              *errorcodeptr = ERR58;
6334
0
              goto FAILED;
6335
0
              }
6336
0
            recno += cd->bracount;
6337
0
            }
6338
6339
          /* Come here from code above that handles a named recursion */
6340
6341
0
          HANDLE_RECURSION:
6342
6343
0
          previous = code;
6344
0
          called = cd->start_code;
6345
6346
          /* When we are actually compiling, find the bracket that is being
6347
          referenced. Temporarily end the regex in case it doesn't exist before
6348
          this point. If we end up with a forward reference, first check that
6349
          the bracket does occur later so we can give the error (and position)
6350
          now. Then remember this forward reference in the workspace so it can
6351
          be filled in at the end. */
6352
6353
0
          if (lengthptr == NULL)
6354
0
            {
6355
0
            *code = OP_END;
6356
0
            if (recno != 0)
6357
0
              called = PRIV(find_bracket)(cd->start_code, utf, recno);
6358
6359
            /* Forward reference */
6360
6361
0
            if (called == NULL)
6362
0
              {
6363
0
              if (find_parens(cd, NULL, recno,
6364
0
                    (options & PCRE_EXTENDED) != 0, utf) < 0)
6365
0
                {
6366
0
                *errorcodeptr = ERR15;
6367
0
                goto FAILED;
6368
0
                }
6369
6370
              /* Fudge the value of "called" so that when it is inserted as an
6371
              offset below, what it actually inserted is the reference number
6372
              of the group. Then remember the forward reference. */
6373
6374
0
              called = cd->start_code + recno;
6375
0
              if (cd->hwm >= cd->start_workspace + cd->workspace_size -
6376
0
                  WORK_SIZE_SAFETY_MARGIN)
6377
0
                {
6378
0
                *errorcodeptr = expand_workspace(cd);
6379
0
                if (*errorcodeptr != 0) goto FAILED;
6380
0
                }
6381
0
              PUTINC(cd->hwm, 0, (int)(code + 1 - cd->start_code));
6382
0
              }
6383
6384
            /* If not a forward reference, and the subpattern is still open,
6385
            this is a recursive call. We check to see if this is a left
6386
            recursion that could loop for ever, and diagnose that case. We
6387
            must not, however, do this check if we are in a conditional
6388
            subpattern because the condition might be testing for recursion in
6389
            a pattern such as /(?(R)a+|(?R)b)/, which is perfectly valid.
6390
            Forever loops are also detected at runtime, so those that occur in
6391
            conditional subpatterns will be picked up then. */
6392
6393
0
            else if (GET(called, 1) == 0 && cond_depth <= 0 &&
6394
0
                     could_be_empty(called, code, bcptr, utf, cd))
6395
0
              {
6396
0
              *errorcodeptr = ERR40;
6397
0
              goto FAILED;
6398
0
              }
6399
0
            }
6400
6401
          /* Insert the recursion/subroutine item. It does not have a set first
6402
          character (relevant if it is repeated, because it will then be
6403
          wrapped with ONCE brackets). */
6404
6405
0
          *code = OP_RECURSE;
6406
0
          PUT(code, 1, (int)(called - cd->start_code));
6407
0
          code += 1 + LINK_SIZE;
6408
0
          groupsetfirstchar = FALSE;
6409
0
          }
6410
6411
        /* Can't determine a first byte now */
6412
6413
0
        if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
6414
0
        continue;
6415
6416
6417
        /* ------------------------------------------------------------ */
6418
0
        default:              /* Other characters: check option setting */
6419
0
        OTHER_CHAR_AFTER_QUERY:
6420
0
        set = unset = 0;
6421
0
        optset = &set;
6422
6423
0
        while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
6424
0
          {
6425
0
          switch (*ptr++)
6426
0
            {
6427
0
            case CHAR_MINUS: optset = &unset; break;
6428
6429
0
            case CHAR_J:    /* Record that it changed in the external options */
6430
0
            *optset |= PCRE_DUPNAMES;
6431
0
            cd->external_flags |= PCRE_JCHANGED;
6432
0
            break;
6433
6434
0
            case CHAR_i: *optset |= PCRE_CASELESS; break;
6435
0
            case CHAR_m: *optset |= PCRE_MULTILINE; break;
6436
0
            case CHAR_s: *optset |= PCRE_DOTALL; break;
6437
0
            case CHAR_x: *optset |= PCRE_EXTENDED; break;
6438
0
            case CHAR_U: *optset |= PCRE_UNGREEDY; break;
6439
0
            case CHAR_X: *optset |= PCRE_EXTRA; break;
6440
6441
0
            default:  *errorcodeptr = ERR12;
6442
0
                      ptr--;    /* Correct the offset */
6443
0
                      goto FAILED;
6444
0
            }
6445
0
          }
6446
6447
        /* Set up the changed option bits, but don't change anything yet. */
6448
6449
0
        newoptions = (options | set) & (~unset);
6450
6451
        /* If the options ended with ')' this is not the start of a nested
6452
        group with option changes, so the options change at this level. If this
6453
        item is right at the start of the pattern, the options can be
6454
        abstracted and made external in the pre-compile phase, and ignored in
6455
        the compile phase. This can be helpful when matching -- for instance in
6456
        caseless checking of required bytes.
6457
6458
        If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
6459
        definitely *not* at the start of the pattern because something has been
6460
        compiled. In the pre-compile phase, however, the code pointer can have
6461
        that value after the start, because it gets reset as code is discarded
6462
        during the pre-compile. However, this can happen only at top level - if
6463
        we are within parentheses, the starting BRA will still be present. At
6464
        any parenthesis level, the length value can be used to test if anything
6465
        has been compiled at that level. Thus, a test for both these conditions
6466
        is necessary to ensure we correctly detect the start of the pattern in
6467
        both phases.
6468
6469
        If we are not at the pattern start, reset the greedy defaults and the
6470
        case value for firstchar and reqchar. */
6471
6472
0
        if (*ptr == CHAR_RIGHT_PARENTHESIS)
6473
0
          {
6474
0
          if (code == cd->start_code + 1 + LINK_SIZE &&
6475
0
               (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
6476
0
            {
6477
0
            cd->external_options = newoptions;
6478
0
            }
6479
0
          else
6480
0
            {
6481
0
            greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
6482
0
            greedy_non_default = greedy_default ^ 1;
6483
0
            req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
6484
0
            }
6485
6486
          /* Change options at this level, and pass them back for use
6487
          in subsequent branches. */
6488
6489
0
          *optionsptr = options = newoptions;
6490
0
          previous = NULL;       /* This item can't be repeated */
6491
0
          continue;              /* It is complete */
6492
0
          }
6493
6494
        /* If the options ended with ':' we are heading into a nested group
6495
        with possible change of options. Such groups are non-capturing and are
6496
        not assertions of any kind. All we need to do is skip over the ':';
6497
        the newoptions value is handled below. */
6498
6499
0
        bravalue = OP_BRA;
6500
0
        ptr++;
6501
0
        }     /* End of switch for character following (? */
6502
0
      }       /* End of (? handling */
6503
6504
    /* Opening parenthesis not followed by '*' or '?'. If PCRE_NO_AUTO_CAPTURE
6505
    is set, all unadorned brackets become non-capturing and behave like (?:...)
6506
    brackets. */
6507
6508
0
    else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
6509
0
      {
6510
0
      bravalue = OP_BRA;
6511
0
      }
6512
6513
    /* Else we have a capturing group. */
6514
6515
0
    else
6516
0
      {
6517
0
      NUMBERED_GROUP:
6518
0
      cd->bracount += 1;
6519
0
      PUT2(code, 1+LINK_SIZE, cd->bracount);
6520
0
      skipbytes = IMM2_SIZE;
6521
0
      }
6522
6523
    /* Process nested bracketed regex. Assertions used not to be repeatable,
6524
    but this was changed for Perl compatibility, so all kinds can now be
6525
    repeated. We copy code into a non-register variable (tempcode) in order to
6526
    be able to pass its address because some compilers complain otherwise. */
6527
6528
0
    previous = code;                      /* For handling repetition */
6529
0
    *code = bravalue;
6530
0
    tempcode = code;
6531
0
    tempreqvary = cd->req_varyopt;        /* Save value before bracket */
6532
0
    tempbracount = cd->bracount;          /* Save value before bracket */
6533
0
    length_prevgroup = 0;                 /* Initialize for pre-compile phase */
6534
6535
0
    if (!compile_regex(
6536
0
         newoptions,                      /* The complete new option state */
6537
0
         &tempcode,                       /* Where to put code (updated) */
6538
0
         &ptr,                            /* Input pointer (updated) */
6539
0
         errorcodeptr,                    /* Where to put an error message */
6540
0
         (bravalue == OP_ASSERTBACK ||
6541
0
          bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
6542
0
         reset_bracount,                  /* True if (?| group */
6543
0
         skipbytes,                       /* Skip over bracket number */
6544
0
         cond_depth +
6545
0
           ((bravalue == OP_COND)?1:0),   /* Depth of condition subpatterns */
6546
0
         &subfirstchar,                   /* For possible first char */
6547
0
         &subreqchar,                     /* For possible last char */
6548
0
         bcptr,                           /* Current branch chain */
6549
0
         cd,                              /* Tables block */
6550
0
         (lengthptr == NULL)? NULL :      /* Actual compile phase */
6551
0
           &length_prevgroup              /* Pre-compile phase */
6552
0
         ))
6553
0
      goto FAILED;
6554
6555
    /* If this was an atomic group and there are no capturing groups within it,
6556
    generate OP_ONCE_NC instead of OP_ONCE. */
6557
6558
0
    if (bravalue == OP_ONCE && cd->bracount <= tempbracount)
6559
0
      *code = OP_ONCE_NC;
6560
6561
0
    if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT)
6562
0
      cd->assert_depth -= 1;
6563
6564
    /* At the end of compiling, code is still pointing to the start of the
6565
    group, while tempcode has been updated to point past the end of the group.
6566
    The pattern pointer (ptr) is on the bracket.
6567
6568
    If this is a conditional bracket, check that there are no more than
6569
    two branches in the group, or just one if it's a DEFINE group. We do this
6570
    in the real compile phase, not in the pre-pass, where the whole group may
6571
    not be available. */
6572
6573
0
    if (bravalue == OP_COND && lengthptr == NULL)
6574
0
      {
6575
0
      pcre_uchar *tc = code;
6576
0
      int condcount = 0;
6577
6578
0
      do {
6579
0
         condcount++;
6580
0
         tc += GET(tc,1);
6581
0
         }
6582
0
      while (*tc != OP_KET);
6583
6584
      /* A DEFINE group is never obeyed inline (the "condition" is always
6585
      false). It must have only one branch. */
6586
6587
0
      if (code[LINK_SIZE+1] == OP_DEF)
6588
0
        {
6589
0
        if (condcount > 1)
6590
0
          {
6591
0
          *errorcodeptr = ERR54;
6592
0
          goto FAILED;
6593
0
          }
6594
0
        bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
6595
0
        }
6596
6597
      /* A "normal" conditional group. If there is just one branch, we must not
6598
      make use of its firstchar or reqchar, because this is equivalent to an
6599
      empty second branch. */
6600
6601
0
      else
6602
0
        {
6603
0
        if (condcount > 2)
6604
0
          {
6605
0
          *errorcodeptr = ERR27;
6606
0
          goto FAILED;
6607
0
          }
6608
0
        if (condcount == 1) subfirstchar = subreqchar = REQ_NONE;
6609
0
        }
6610
0
      }
6611
6612
    /* Error if hit end of pattern */
6613
6614
0
    if (*ptr != CHAR_RIGHT_PARENTHESIS)
6615
0
      {
6616
0
      *errorcodeptr = ERR14;
6617
0
      goto FAILED;
6618
0
      }
6619
6620
    /* In the pre-compile phase, update the length by the length of the group,
6621
    less the brackets at either end. Then reduce the compiled code to just a
6622
    set of non-capturing brackets so that it doesn't use much memory if it is
6623
    duplicated by a quantifier.*/
6624
6625
0
    if (lengthptr != NULL)
6626
0
      {
6627
0
      if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
6628
0
        {
6629
0
        *errorcodeptr = ERR20;
6630
0
        goto FAILED;
6631
0
        }
6632
0
      *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
6633
0
      code++;   /* This already contains bravalue */
6634
0
      PUTINC(code, 0, 1 + LINK_SIZE);
6635
0
      *code++ = OP_KET;
6636
0
      PUTINC(code, 0, 1 + LINK_SIZE);
6637
0
      break;    /* No need to waste time with special character handling */
6638
0
      }
6639
6640
    /* Otherwise update the main code pointer to the end of the group. */
6641
6642
0
    code = tempcode;
6643
6644
    /* For a DEFINE group, required and first character settings are not
6645
    relevant. */
6646
6647
0
    if (bravalue == OP_DEF) break;
6648
6649
    /* Handle updating of the required and first characters for other types of
6650
    group. Update for normal brackets of all kinds, and conditions with two
6651
    branches (see code above). If the bracket is followed by a quantifier with
6652
    zero repeat, we have to back off. Hence the definition of zeroreqchar and
6653
    zerofirstchar outside the main loop so that they can be accessed for the
6654
    back off. */
6655
6656
0
    zeroreqchar = reqchar;
6657
0
    zerofirstchar = firstchar;
6658
0
    groupsetfirstchar = FALSE;
6659
6660
0
    if (bravalue >= OP_ONCE)
6661
0
      {
6662
      /* If we have not yet set a firstchar in this branch, take it from the
6663
      subpattern, remembering that it was set here so that a repeat of more
6664
      than one can replicate it as reqchar if necessary. If the subpattern has
6665
      no firstchar, set "none" for the whole branch. In both cases, a zero
6666
      repeat forces firstchar to "none". */
6667
6668
0
      if (firstchar == REQ_UNSET)
6669
0
        {
6670
0
        if (subfirstchar >= 0)
6671
0
          {
6672
0
          firstchar = subfirstchar;
6673
0
          groupsetfirstchar = TRUE;
6674
0
          }
6675
0
        else firstchar = REQ_NONE;
6676
0
        zerofirstchar = REQ_NONE;
6677
0
        }
6678
6679
      /* If firstchar was previously set, convert the subpattern's firstchar
6680
      into reqchar if there wasn't one, using the vary flag that was in
6681
      existence beforehand. */
6682
6683
0
      else if (subfirstchar >= 0 && subreqchar < 0)
6684
0
        subreqchar = subfirstchar | tempreqvary;
6685
6686
      /* If the subpattern set a required byte (or set a first byte that isn't
6687
      really the first byte - see above), set it. */
6688
6689
0
      if (subreqchar >= 0) reqchar = subreqchar;
6690
0
      }
6691
6692
    /* For a forward assertion, we take the reqchar, if set. This can be
6693
    helpful if the pattern that follows the assertion doesn't set a different
6694
    char. For example, it's useful for /(?=abcde).+/. We can't set firstchar
6695
    for an assertion, however because it leads to incorrect effect for patterns
6696
    such as /(?=a)a.+/ when the "real" "a" would then become a reqchar instead
6697
    of a firstchar. This is overcome by a scan at the end if there's no
6698
    firstchar, looking for an asserted first char. */
6699
6700
0
    else if (bravalue == OP_ASSERT && subreqchar >= 0) reqchar = subreqchar;
6701
0
    break;     /* End of processing '(' */
6702
6703
6704
    /* ===================================================================*/
6705
    /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
6706
    are arranged to be the negation of the corresponding OP_values in the
6707
    default case when PCRE_UCP is not set. For the back references, the values
6708
    are ESC_REF plus the reference number. Only back references and those types
6709
    that consume a character may be repeated. We can test for values between
6710
    ESC_b and ESC_Z for the latter; this may have to change if any new ones are
6711
    ever created. */
6712
6713
0
    case CHAR_BACKSLASH:
6714
0
    tempptr = ptr;
6715
0
    c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
6716
0
    if (*errorcodeptr != 0) goto FAILED;
6717
6718
0
    if (c < 0)
6719
0
      {
6720
0
      if (-c == ESC_Q)            /* Handle start of quoted string */
6721
0
        {
6722
0
        if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
6723
0
          ptr += 2;               /* avoid empty string */
6724
0
            else inescq = TRUE;
6725
0
        continue;
6726
0
        }
6727
6728
0
      if (-c == ESC_E) continue;  /* Perl ignores an orphan \E */
6729
6730
      /* For metasequences that actually match a character, we disable the
6731
      setting of a first character if it hasn't already been set. */
6732
6733
0
      if (firstchar == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
6734
0
        firstchar = REQ_NONE;
6735
6736
      /* Set values to reset to if this is followed by a zero repeat. */
6737
6738
0
      zerofirstchar = firstchar;
6739
0
      zeroreqchar = reqchar;
6740
6741
      /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
6742
      is a subroutine call by number (Oniguruma syntax). In fact, the value
6743
      -ESC_g is returned only for these cases. So we don't need to check for <
6744
      or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is
6745
      -ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as
6746
      that is a synonym for a named back reference). */
6747
6748
0
      if (-c == ESC_g)
6749
0
        {
6750
0
        const pcre_uchar *p;
6751
0
        save_hwm = cd->hwm;   /* Normally this is set when '(' is read */
6752
0
        terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
6753
0
          CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
6754
6755
        /* These two statements stop the compiler from warning about possibly
6756
        unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
6757
        fact, because we actually check for a number below, the paths that
6758
        would actually be in error are never taken. */
6759
6760
0
        skipbytes = 0;
6761
0
        reset_bracount = FALSE;
6762
6763
        /* Test for a name */
6764
6765
0
        if (ptr[1] != CHAR_PLUS && ptr[1] != CHAR_MINUS)
6766
0
          {
6767
0
          BOOL is_a_number = TRUE;
6768
0
          for (p = ptr + 1; *p != 0 && *p != terminator; p++)
6769
0
            {
6770
0
            if (!MAX_255(*p)) { is_a_number = FALSE; break; }
6771
0
            if ((cd->ctypes[*p] & ctype_digit) == 0) is_a_number = FALSE;
6772
0
            if ((cd->ctypes[*p] & ctype_word) == 0) break;
6773
0
            }
6774
0
          if (*p != terminator)
6775
0
            {
6776
0
            *errorcodeptr = ERR57;
6777
0
            break;
6778
0
            }
6779
0
          if (is_a_number)
6780
0
            {
6781
0
            ptr++;
6782
0
            goto HANDLE_NUMERICAL_RECURSION;
6783
0
            }
6784
0
          is_recurse = TRUE;
6785
0
          goto NAMED_REF_OR_RECURSE;
6786
0
          }
6787
6788
        /* Test a signed number in angle brackets or quotes. */
6789
6790
0
        p = ptr + 2;
6791
0
        while (IS_DIGIT(*p)) p++;
6792
0
        if (*p != terminator)
6793
0
          {
6794
0
          *errorcodeptr = ERR57;
6795
0
          break;
6796
0
          }
6797
0
        ptr++;
6798
0
        goto HANDLE_NUMERICAL_RECURSION;
6799
0
        }
6800
6801
      /* \k<name> or \k'name' is a back reference by name (Perl syntax).
6802
      We also support \k{name} (.NET syntax).  */
6803
6804
0
      if (-c == ESC_k)
6805
0
        {
6806
0
        if ((ptr[1] != CHAR_LESS_THAN_SIGN &&
6807
0
          ptr[1] != CHAR_APOSTROPHE && ptr[1] != CHAR_LEFT_CURLY_BRACKET))
6808
0
          {
6809
0
          *errorcodeptr = ERR69;
6810
0
          break;
6811
0
          }
6812
0
        is_recurse = FALSE;
6813
0
        terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
6814
0
          CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
6815
0
          CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
6816
0
        goto NAMED_REF_OR_RECURSE;
6817
0
        }
6818
6819
      /* Back references are handled specially; must disable firstchar if
6820
      not set to cope with cases like (?=(\w+))\1: which would otherwise set
6821
      ':' later. */
6822
6823
0
      if (-c >= ESC_REF)
6824
0
        {
6825
0
        open_capitem *oc;
6826
0
        recno = -c - ESC_REF;
6827
6828
0
        HANDLE_REFERENCE:    /* Come here from named backref handling */
6829
0
        if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
6830
0
        previous = code;
6831
0
        *code++ = ((options & PCRE_CASELESS) != 0)? OP_REFI : OP_REF;
6832
0
        PUT2INC(code, 0, recno);
6833
0
        cd->backref_map |= (recno < 32)? (1 << recno) : 1;
6834
0
        if (recno > cd->top_backref) cd->top_backref = recno;
6835
6836
        /* Check to see if this back reference is recursive, that is, it
6837
        is inside the group that it references. A flag is set so that the
6838
        group can be made atomic. */
6839
6840
0
        for (oc = cd->open_caps; oc != NULL; oc = oc->next)
6841
0
          {
6842
0
          if (oc->number == recno)
6843
0
            {
6844
0
            oc->flag = TRUE;
6845
0
            break;
6846
0
            }
6847
0
          }
6848
0
        }
6849
6850
      /* So are Unicode property matches, if supported. */
6851
6852
0
#ifdef SUPPORT_UCP
6853
0
      else if (-c == ESC_P || -c == ESC_p)
6854
0
        {
6855
0
        BOOL negated;
6856
0
        int pdata;
6857
0
        int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
6858
0
        if (ptype < 0) goto FAILED;
6859
0
        previous = code;
6860
0
        *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
6861
0
        *code++ = ptype;
6862
0
        *code++ = pdata;
6863
0
        }
6864
#else
6865
6866
      /* If Unicode properties are not supported, \X, \P, and \p are not
6867
      allowed. */
6868
6869
      else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
6870
        {
6871
        *errorcodeptr = ERR45;
6872
        goto FAILED;
6873
        }
6874
#endif
6875
6876
      /* For the rest (including \X when Unicode properties are supported), we
6877
      can obtain the OP value by negating the escape value in the default
6878
      situation when PCRE_UCP is not set. When it *is* set, we substitute
6879
      Unicode property tests. Note that \b and \B do a one-character
6880
      lookbehind. */
6881
6882
0
      else
6883
0
        {
6884
0
        if ((-c == ESC_b || -c == ESC_B) && cd->max_lookbehind == 0)
6885
0
          cd->max_lookbehind = 1;
6886
0
#ifdef SUPPORT_UCP
6887
0
        if (-c >= ESC_DU && -c <= ESC_wu)
6888
0
          {
6889
0
          nestptr = ptr + 1;                   /* Where to resume */
6890
0
          ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */
6891
0
          }
6892
0
        else
6893
0
#endif
6894
        /* In non-UTF-8 mode, we turn \C into OP_ALLANY instead of OP_ANYBYTE
6895
        so that it works in DFA mode and in lookbehinds. */
6896
6897
0
          {
6898
0
          previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
6899
0
          *code++ = (!utf && c == -ESC_C)? OP_ALLANY : -c;
6900
0
          }
6901
0
        }
6902
0
      continue;
6903
0
      }
6904
6905
    /* We have a data character whose value is in c. In UTF-8 mode it may have
6906
    a value > 127. We set its representation in the length/buffer, and then
6907
    handle it as a data character. */
6908
6909
0
#ifdef SUPPORT_UTF
6910
0
    if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
6911
0
      mclength = PRIV(ord2utf)(c, mcbuffer);
6912
0
    else
6913
0
#endif
6914
6915
0
     {
6916
0
     mcbuffer[0] = c;
6917
0
     mclength = 1;
6918
0
     }
6919
0
    goto ONE_CHAR;
6920
6921
6922
    /* ===================================================================*/
6923
    /* Handle a literal character. It is guaranteed not to be whitespace or #
6924
    when the extended flag is set. If we are in UTF-8 mode, it may be a
6925
    multi-byte literal character. */
6926
6927
0
    default:
6928
0
    NORMAL_CHAR:
6929
0
    mclength = 1;
6930
0
    mcbuffer[0] = c;
6931
6932
0
#ifdef SUPPORT_UTF
6933
0
    if (utf && HAS_EXTRALEN(c))
6934
0
      ACROSSCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr));
6935
0
#endif
6936
6937
    /* At this point we have the character's bytes in mcbuffer, and the length
6938
    in mclength. When not in UTF-8 mode, the length is always 1. */
6939
6940
0
    ONE_CHAR:
6941
0
    previous = code;
6942
0
    *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARI : OP_CHAR;
6943
0
    for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
6944
6945
    /* Remember if \r or \n were seen */
6946
6947
0
    if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
6948
0
      cd->external_flags |= PCRE_HASCRORLF;
6949
6950
    /* Set the first and required bytes appropriately. If no previous first
6951
    byte, set it from this character, but revert to none on a zero repeat.
6952
    Otherwise, leave the firstchar value alone, and don't change it on a zero
6953
    repeat. */
6954
6955
0
    if (firstchar == REQ_UNSET)
6956
0
      {
6957
0
      zerofirstchar = REQ_NONE;
6958
0
      zeroreqchar = reqchar;
6959
6960
      /* If the character is more than one byte long, we can set firstchar
6961
      only if it is not to be matched caselessly. */
6962
6963
0
      if (mclength == 1 || req_caseopt == 0)
6964
0
        {
6965
0
        firstchar = mcbuffer[0] | req_caseopt;
6966
0
        if (mclength != 1) reqchar = code[-1] | cd->req_varyopt;
6967
0
        }
6968
0
      else firstchar = reqchar = REQ_NONE;
6969
0
      }
6970
6971
    /* firstchar was previously set; we can set reqchar only if the length is
6972
    1 or the matching is caseful. */
6973
6974
0
    else
6975
0
      {
6976
0
      zerofirstchar = firstchar;
6977
0
      zeroreqchar = reqchar;
6978
0
      if (mclength == 1 || req_caseopt == 0)
6979
0
        reqchar = code[-1] | req_caseopt | cd->req_varyopt;
6980
0
      }
6981
6982
0
    break;            /* End of literal character handling */
6983
0
    }
6984
0
  }                   /* end of big loop */
6985
6986
6987
/* Control never reaches here by falling through, only by a goto for all the
6988
error states. Pass back the position in the pattern so that it can be displayed
6989
to the user for diagnosing the error. */
6990
6991
0
FAILED:
6992
0
*ptrptr = ptr;
6993
0
return FALSE;
6994
0
}
6995
6996
6997
6998
6999
/*************************************************
7000
*     Compile sequence of alternatives           *
7001
*************************************************/
7002
7003
/* On entry, ptr is pointing past the bracket character, but on return it
7004
points to the closing bracket, or vertical bar, or end of string. The code
7005
variable is pointing at the byte into which the BRA operator has been stored.
7006
This function is used during the pre-compile phase when we are trying to find
7007
out the amount of memory needed, as well as during the real compile phase. The
7008
value of lengthptr distinguishes the two phases.
7009
7010
Arguments:
7011
  options        option bits, including any changes for this subpattern
7012
  codeptr        -> the address of the current code pointer
7013
  ptrptr         -> the address of the current pattern pointer
7014
  errorcodeptr   -> pointer to error code variable
7015
  lookbehind     TRUE if this is a lookbehind assertion
7016
  reset_bracount TRUE to reset the count for each branch
7017
  skipbytes      skip this many bytes at start (for brackets and OP_COND)
7018
  cond_depth     depth of nesting for conditional subpatterns
7019
  firstcharptr   place to put the first required character, or a negative number
7020
  reqcharptr     place to put the last required character, or a negative number
7021
  bcptr          pointer to the chain of currently open branches
7022
  cd             points to the data block with tables pointers etc.
7023
  lengthptr      NULL during the real compile phase
7024
                 points to length accumulator during pre-compile phase
7025
7026
Returns:         TRUE on success
7027
*/
7028
7029
static BOOL
7030
compile_regex(int options, pcre_uchar **codeptr, const pcre_uchar **ptrptr,
7031
  int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
7032
  int cond_depth, pcre_int32 *firstcharptr, pcre_int32 *reqcharptr,
7033
  branch_chain *bcptr, compile_data *cd, int *lengthptr)
7034
0
{
7035
0
const pcre_uchar *ptr = *ptrptr;
7036
0
pcre_uchar *code = *codeptr;
7037
0
pcre_uchar *last_branch = code;
7038
0
pcre_uchar *start_bracket = code;
7039
0
pcre_uchar *reverse_count = NULL;
7040
0
open_capitem capitem;
7041
0
int capnumber = 0;
7042
0
pcre_int32 firstchar, reqchar;
7043
0
pcre_int32 branchfirstchar, branchreqchar;
7044
0
int length;
7045
0
int orig_bracount;
7046
0
int max_bracount;
7047
0
branch_chain bc;
7048
7049
0
bc.outer = bcptr;
7050
0
bc.current_branch = code;
7051
7052
0
firstchar = reqchar = REQ_UNSET;
7053
7054
/* Accumulate the length for use in the pre-compile phase. Start with the
7055
length of the BRA and KET and any extra bytes that are required at the
7056
beginning. We accumulate in a local variable to save frequent testing of
7057
lenthptr for NULL. We cannot do this by looking at the value of code at the
7058
start and end of each alternative, because compiled items are discarded during
7059
the pre-compile phase so that the work space is not exceeded. */
7060
7061
0
length = 2 + 2*LINK_SIZE + skipbytes;
7062
7063
/* WARNING: If the above line is changed for any reason, you must also change
7064
the code that abstracts option settings at the start of the pattern and makes
7065
them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
7066
pre-compile phase to find out whether anything has yet been compiled or not. */
7067
7068
/* If this is a capturing subpattern, add to the chain of open capturing items
7069
so that we can detect them if (*ACCEPT) is encountered. This is also used to
7070
detect groups that contain recursive back references to themselves. Note that
7071
only OP_CBRA need be tested here; changing this opcode to one of its variants,
7072
e.g. OP_SCBRAPOS, happens later, after the group has been compiled. */
7073
7074
0
if (*code == OP_CBRA)
7075
0
  {
7076
0
  capnumber = GET2(code, 1 + LINK_SIZE);
7077
0
  capitem.number = capnumber;
7078
0
  capitem.next = cd->open_caps;
7079
0
  capitem.flag = FALSE;
7080
0
  cd->open_caps = &capitem;
7081
0
  }
7082
7083
/* Offset is set zero to mark that this bracket is still open */
7084
7085
0
PUT(code, 1, 0);
7086
0
code += 1 + LINK_SIZE + skipbytes;
7087
7088
/* Loop for each alternative branch */
7089
7090
0
orig_bracount = max_bracount = cd->bracount;
7091
0
for (;;)
7092
0
  {
7093
  /* For a (?| group, reset the capturing bracket count so that each branch
7094
  uses the same numbers. */
7095
7096
0
  if (reset_bracount) cd->bracount = orig_bracount;
7097
7098
  /* Set up dummy OP_REVERSE if lookbehind assertion */
7099
7100
0
  if (lookbehind)
7101
0
    {
7102
0
    *code++ = OP_REVERSE;
7103
0
    reverse_count = code;
7104
0
    PUTINC(code, 0, 0);
7105
0
    length += 1 + LINK_SIZE;
7106
0
    }
7107
7108
  /* Now compile the branch; in the pre-compile phase its length gets added
7109
  into the length. */
7110
7111
0
  if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstchar,
7112
0
        &branchreqchar, &bc, cond_depth, cd,
7113
0
        (lengthptr == NULL)? NULL : &length))
7114
0
    {
7115
0
    *ptrptr = ptr;
7116
0
    return FALSE;
7117
0
    }
7118
7119
  /* Keep the highest bracket count in case (?| was used and some branch
7120
  has fewer than the rest. */
7121
7122
0
  if (cd->bracount > max_bracount) max_bracount = cd->bracount;
7123
7124
  /* In the real compile phase, there is some post-processing to be done. */
7125
7126
0
  if (lengthptr == NULL)
7127
0
    {
7128
    /* If this is the first branch, the firstchar and reqchar values for the
7129
    branch become the values for the regex. */
7130
7131
0
    if (*last_branch != OP_ALT)
7132
0
      {
7133
0
      firstchar = branchfirstchar;
7134
0
      reqchar = branchreqchar;
7135
0
      }
7136
7137
    /* If this is not the first branch, the first char and reqchar have to
7138
    match the values from all the previous branches, except that if the
7139
    previous value for reqchar didn't have REQ_VARY set, it can still match,
7140
    and we set REQ_VARY for the regex. */
7141
7142
0
    else
7143
0
      {
7144
      /* If we previously had a firstchar, but it doesn't match the new branch,
7145
      we have to abandon the firstchar for the regex, but if there was
7146
      previously no reqchar, it takes on the value of the old firstchar. */
7147
7148
0
      if (firstchar >= 0 && firstchar != branchfirstchar)
7149
0
        {
7150
0
        if (reqchar < 0) reqchar = firstchar;
7151
0
        firstchar = REQ_NONE;
7152
0
        }
7153
7154
      /* If we (now or from before) have no firstchar, a firstchar from the
7155
      branch becomes a reqchar if there isn't a branch reqchar. */
7156
7157
0
      if (firstchar < 0 && branchfirstchar >= 0 && branchreqchar < 0)
7158
0
          branchreqchar = branchfirstchar;
7159
7160
      /* Now ensure that the reqchars match */
7161
7162
0
      if ((reqchar & ~REQ_VARY) != (branchreqchar & ~REQ_VARY))
7163
0
        reqchar = REQ_NONE;
7164
0
      else reqchar |= branchreqchar;   /* To "or" REQ_VARY */
7165
0
      }
7166
7167
    /* If lookbehind, check that this branch matches a fixed-length string, and
7168
    put the length into the OP_REVERSE item. Temporarily mark the end of the
7169
    branch with OP_END. If the branch contains OP_RECURSE, the result is -3
7170
    because there may be forward references that we can't check here. Set a
7171
    flag to cause another lookbehind check at the end. Why not do it all at the
7172
    end? Because common, erroneous checks are picked up here and the offset of
7173
    the problem can be shown. */
7174
7175
0
    if (lookbehind)
7176
0
      {
7177
0
      int fixed_length;
7178
0
      *code = OP_END;
7179
0
      fixed_length = find_fixedlength(last_branch,  (options & PCRE_UTF8) != 0,
7180
0
        FALSE, cd);
7181
0
      DPRINTF(("fixed length = %d\n", fixed_length));
7182
0
      if (fixed_length == -3)
7183
0
        {
7184
0
        cd->check_lookbehind = TRUE;
7185
0
        }
7186
0
      else if (fixed_length < 0)
7187
0
        {
7188
0
        *errorcodeptr = (fixed_length == -2)? ERR36 :
7189
0
                        (fixed_length == -4)? ERR70: ERR25;
7190
0
        *ptrptr = ptr;
7191
0
        return FALSE;
7192
0
        }
7193
0
      else
7194
0
        {
7195
0
        if (fixed_length > cd->max_lookbehind)
7196
0
          cd->max_lookbehind = fixed_length;
7197
0
        PUT(reverse_count, 0, fixed_length);
7198
0
        }
7199
0
      }
7200
0
    }
7201
7202
  /* Reached end of expression, either ')' or end of pattern. In the real
7203
  compile phase, go back through the alternative branches and reverse the chain
7204
  of offsets, with the field in the BRA item now becoming an offset to the
7205
  first alternative. If there are no alternatives, it points to the end of the
7206
  group. The length in the terminating ket is always the length of the whole
7207
  bracketed item. Return leaving the pointer at the terminating char. */
7208
7209
0
  if (*ptr != CHAR_VERTICAL_LINE)
7210
0
    {
7211
0
    if (lengthptr == NULL)
7212
0
      {
7213
0
      int branch_length = (int)(code - last_branch);
7214
0
      do
7215
0
        {
7216
0
        int prev_length = GET(last_branch, 1);
7217
0
        PUT(last_branch, 1, branch_length);
7218
0
        branch_length = prev_length;
7219
0
        last_branch -= branch_length;
7220
0
        }
7221
0
      while (branch_length > 0);
7222
0
      }
7223
7224
    /* Fill in the ket */
7225
7226
0
    *code = OP_KET;
7227
0
    PUT(code, 1, (int)(code - start_bracket));
7228
0
    code += 1 + LINK_SIZE;
7229
7230
    /* If it was a capturing subpattern, check to see if it contained any
7231
    recursive back references. If so, we must wrap it in atomic brackets.
7232
    In any event, remove the block from the chain. */
7233
7234
0
    if (capnumber > 0)
7235
0
      {
7236
0
      if (cd->open_caps->flag)
7237
0
        {
7238
0
        memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
7239
0
          IN_UCHARS(code - start_bracket));
7240
0
        *start_bracket = OP_ONCE;
7241
0
        code += 1 + LINK_SIZE;
7242
0
        PUT(start_bracket, 1, (int)(code - start_bracket));
7243
0
        *code = OP_KET;
7244
0
        PUT(code, 1, (int)(code - start_bracket));
7245
0
        code += 1 + LINK_SIZE;
7246
0
        length += 2 + 2*LINK_SIZE;
7247
0
        }
7248
0
      cd->open_caps = cd->open_caps->next;
7249
0
      }
7250
7251
    /* Retain the highest bracket number, in case resetting was used. */
7252
7253
0
    cd->bracount = max_bracount;
7254
7255
    /* Set values to pass back */
7256
7257
0
    *codeptr = code;
7258
0
    *ptrptr = ptr;
7259
0
    *firstcharptr = firstchar;
7260
0
    *reqcharptr = reqchar;
7261
0
    if (lengthptr != NULL)
7262
0
      {
7263
0
      if (OFLOW_MAX - *lengthptr < length)
7264
0
        {
7265
0
        *errorcodeptr = ERR20;
7266
0
        return FALSE;
7267
0
        }
7268
0
      *lengthptr += length;
7269
0
      }
7270
0
    return TRUE;
7271
0
    }
7272
7273
  /* Another branch follows. In the pre-compile phase, we can move the code
7274
  pointer back to where it was for the start of the first branch. (That is,
7275
  pretend that each branch is the only one.)
7276
7277
  In the real compile phase, insert an ALT node. Its length field points back
7278
  to the previous branch while the bracket remains open. At the end the chain
7279
  is reversed. It's done like this so that the start of the bracket has a
7280
  zero offset until it is closed, making it possible to detect recursion. */
7281
7282
0
  if (lengthptr != NULL)
7283
0
    {
7284
0
    code = *codeptr + 1 + LINK_SIZE + skipbytes;
7285
0
    length += 1 + LINK_SIZE;
7286
0
    }
7287
0
  else
7288
0
    {
7289
0
    *code = OP_ALT;
7290
0
    PUT(code, 1, (int)(code - last_branch));
7291
0
    bc.current_branch = last_branch = code;
7292
0
    code += 1 + LINK_SIZE;
7293
0
    }
7294
7295
0
  ptr++;
7296
0
  }
7297
/* Control never reaches here */
7298
0
}
7299
7300
7301
7302
7303
/*************************************************
7304
*          Check for anchored expression         *
7305
*************************************************/
7306
7307
/* Try to find out if this is an anchored regular expression. Consider each
7308
alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
7309
all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
7310
it's anchored. However, if this is a multiline pattern, then only OP_SOD will
7311
be found, because ^ generates OP_CIRCM in that mode.
7312
7313
We can also consider a regex to be anchored if OP_SOM starts all its branches.
7314
This is the code for \G, which means "match at start of match position, taking
7315
into account the match offset".
7316
7317
A branch is also implicitly anchored if it starts with .* and DOTALL is set,
7318
because that will try the rest of the pattern at all possible matching points,
7319
so there is no point trying again.... er ....
7320
7321
.... except when the .* appears inside capturing parentheses, and there is a
7322
subsequent back reference to those parentheses. We haven't enough information
7323
to catch that case precisely.
7324
7325
At first, the best we could do was to detect when .* was in capturing brackets
7326
and the highest back reference was greater than or equal to that level.
7327
However, by keeping a bitmap of the first 31 back references, we can catch some
7328
of the more common cases more precisely.
7329
7330
Arguments:
7331
  code           points to start of expression (the bracket)
7332
  bracket_map    a bitmap of which brackets we are inside while testing; this
7333
                  handles up to substring 31; after that we just have to take
7334
                  the less precise approach
7335
  backref_map    the back reference bitmap
7336
7337
Returns:     TRUE or FALSE
7338
*/
7339
7340
static BOOL
7341
is_anchored(const pcre_uchar *code, unsigned int bracket_map,
7342
  unsigned int backref_map)
7343
0
{
7344
0
do {
7345
0
   const pcre_uchar *scode = first_significant_code(
7346
0
     code + PRIV(OP_lengths)[*code], FALSE);
7347
0
   int op = *scode;
7348
7349
   /* Non-capturing brackets */
7350
7351
0
   if (op == OP_BRA  || op == OP_BRAPOS ||
7352
0
       op == OP_SBRA || op == OP_SBRAPOS)
7353
0
     {
7354
0
     if (!is_anchored(scode, bracket_map, backref_map)) return FALSE;
7355
0
     }
7356
7357
   /* Capturing brackets */
7358
7359
0
   else if (op == OP_CBRA  || op == OP_CBRAPOS ||
7360
0
            op == OP_SCBRA || op == OP_SCBRAPOS)
7361
0
     {
7362
0
     int n = GET2(scode, 1+LINK_SIZE);
7363
0
     int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
7364
0
     if (!is_anchored(scode, new_map, backref_map)) return FALSE;
7365
0
     }
7366
7367
   /* Other brackets */
7368
7369
0
   else if (op == OP_ASSERT || op == OP_ONCE || op == OP_ONCE_NC ||
7370
0
            op == OP_COND)
7371
0
     {
7372
0
     if (!is_anchored(scode, bracket_map, backref_map)) return FALSE;
7373
0
     }
7374
7375
   /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
7376
   it isn't in brackets that are or may be referenced. */
7377
7378
0
   else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
7379
0
             op == OP_TYPEPOSSTAR))
7380
0
     {
7381
0
     if (scode[1] != OP_ALLANY || (bracket_map & backref_map) != 0)
7382
0
       return FALSE;
7383
0
     }
7384
7385
   /* Check for explicit anchoring */
7386
7387
0
   else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
7388
0
   code += GET(code, 1);
7389
0
   }
7390
0
while (*code == OP_ALT);   /* Loop for each alternative */
7391
0
return TRUE;
7392
0
}
7393
7394
7395
7396
/*************************************************
7397
*         Check for starting with ^ or .*        *
7398
*************************************************/
7399
7400
/* This is called to find out if every branch starts with ^ or .* so that
7401
"first char" processing can be done to speed things up in multiline
7402
matching and for non-DOTALL patterns that start with .* (which must start at
7403
the beginning or after \n). As in the case of is_anchored() (see above), we
7404
have to take account of back references to capturing brackets that contain .*
7405
because in that case we can't make the assumption.
7406
7407
Arguments:
7408
  code           points to start of expression (the bracket)
7409
  bracket_map    a bitmap of which brackets we are inside while testing; this
7410
                  handles up to substring 31; after that we just have to take
7411
                  the less precise approach
7412
  backref_map    the back reference bitmap
7413
7414
Returns:         TRUE or FALSE
7415
*/
7416
7417
static BOOL
7418
is_startline(const pcre_uchar *code, unsigned int bracket_map,
7419
  unsigned int backref_map)
7420
0
{
7421
0
do {
7422
0
   const pcre_uchar *scode = first_significant_code(
7423
0
     code + PRIV(OP_lengths)[*code], FALSE);
7424
0
   int op = *scode;
7425
7426
   /* If we are at the start of a conditional assertion group, *both* the
7427
   conditional assertion *and* what follows the condition must satisfy the test
7428
   for start of line. Other kinds of condition fail. Note that there may be an
7429
   auto-callout at the start of a condition. */
7430
7431
0
   if (op == OP_COND)
7432
0
     {
7433
0
     scode += 1 + LINK_SIZE;
7434
0
     if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
7435
0
     switch (*scode)
7436
0
       {
7437
0
       case OP_CREF:
7438
0
       case OP_NCREF:
7439
0
       case OP_RREF:
7440
0
       case OP_NRREF:
7441
0
       case OP_DEF:
7442
0
       return FALSE;
7443
7444
0
       default:     /* Assertion */
7445
0
       if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
7446
0
       do scode += GET(scode, 1); while (*scode == OP_ALT);
7447
0
       scode += 1 + LINK_SIZE;
7448
0
       break;
7449
0
       }
7450
0
     scode = first_significant_code(scode, FALSE);
7451
0
     op = *scode;
7452
0
     }
7453
7454
   /* Non-capturing brackets */
7455
7456
0
   if (op == OP_BRA  || op == OP_BRAPOS ||
7457
0
       op == OP_SBRA || op == OP_SBRAPOS)
7458
0
     {
7459
0
     if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
7460
0
     }
7461
7462
   /* Capturing brackets */
7463
7464
0
   else if (op == OP_CBRA  || op == OP_CBRAPOS ||
7465
0
            op == OP_SCBRA || op == OP_SCBRAPOS)
7466
0
     {
7467
0
     int n = GET2(scode, 1+LINK_SIZE);
7468
0
     int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
7469
0
     if (!is_startline(scode, new_map, backref_map)) return FALSE;
7470
0
     }
7471
7472
   /* Other brackets */
7473
7474
0
   else if (op == OP_ASSERT || op == OP_ONCE || op == OP_ONCE_NC)
7475
0
     {
7476
0
     if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
7477
0
     }
7478
7479
   /* .* means "start at start or after \n" if it isn't in brackets that
7480
   may be referenced. */
7481
7482
0
   else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
7483
0
     {
7484
0
     if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
7485
0
     }
7486
7487
   /* Check for explicit circumflex */
7488
7489
0
   else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
7490
7491
   /* Move on to the next alternative */
7492
7493
0
   code += GET(code, 1);
7494
0
   }
7495
0
while (*code == OP_ALT);  /* Loop for each alternative */
7496
0
return TRUE;
7497
0
}
7498
7499
7500
7501
/*************************************************
7502
*       Check for asserted fixed first char      *
7503
*************************************************/
7504
7505
/* During compilation, the "first char" settings from forward assertions are
7506
discarded, because they can cause conflicts with actual literals that follow.
7507
However, if we end up without a first char setting for an unanchored pattern,
7508
it is worth scanning the regex to see if there is an initial asserted first
7509
char. If all branches start with the same asserted char, or with a bracket all
7510
of whose alternatives start with the same asserted char (recurse ad lib), then
7511
we return that char, otherwise -1.
7512
7513
Arguments:
7514
  code       points to start of expression (the bracket)
7515
  inassert   TRUE if in an assertion
7516
7517
Returns:     -1 or the fixed first char
7518
*/
7519
7520
static int
7521
find_firstassertedchar(const pcre_uchar *code, BOOL inassert)
7522
0
{
7523
0
int c = -1;
7524
0
do {
7525
0
   int d;
7526
0
   int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
7527
0
             *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
7528
0
   const pcre_uchar *scode = first_significant_code(code + 1+LINK_SIZE + xl,
7529
0
     TRUE);
7530
0
   int op = *scode;
7531
7532
0
   switch(op)
7533
0
     {
7534
0
     default:
7535
0
     return -1;
7536
7537
0
     case OP_BRA:
7538
0
     case OP_BRAPOS:
7539
0
     case OP_CBRA:
7540
0
     case OP_SCBRA:
7541
0
     case OP_CBRAPOS:
7542
0
     case OP_SCBRAPOS:
7543
0
     case OP_ASSERT:
7544
0
     case OP_ONCE:
7545
0
     case OP_ONCE_NC:
7546
0
     case OP_COND:
7547
0
     if ((d = find_firstassertedchar(scode, op == OP_ASSERT)) < 0)
7548
0
       return -1;
7549
0
     if (c < 0) c = d; else if (c != d) return -1;
7550
0
     break;
7551
7552
0
     case OP_EXACT:
7553
0
     scode += IMM2_SIZE;
7554
     /* Fall through */
7555
7556
0
     case OP_CHAR:
7557
0
     case OP_PLUS:
7558
0
     case OP_MINPLUS:
7559
0
     case OP_POSPLUS:
7560
0
     if (!inassert) return -1;
7561
0
     if (c < 0) c = scode[1];
7562
0
       else if (c != scode[1]) return -1;
7563
0
     break;
7564
7565
0
     case OP_EXACTI:
7566
0
     scode += IMM2_SIZE;
7567
     /* Fall through */
7568
7569
0
     case OP_CHARI:
7570
0
     case OP_PLUSI:
7571
0
     case OP_MINPLUSI:
7572
0
     case OP_POSPLUSI:
7573
0
     if (!inassert) return -1;
7574
0
     if (c < 0) c = scode[1] | REQ_CASELESS;
7575
0
       else if (c != scode[1]) return -1;
7576
0
     break;
7577
0
     }
7578
7579
0
   code += GET(code, 1);
7580
0
   }
7581
0
while (*code == OP_ALT);
7582
0
return c;
7583
0
}
7584
7585
7586
7587
/*************************************************
7588
*        Compile a Regular Expression            *
7589
*************************************************/
7590
7591
/* This function takes a string and returns a pointer to a block of store
7592
holding a compiled version of the expression. The original API for this
7593
function had no error code return variable; it is retained for backwards
7594
compatibility. The new function is given a new name.
7595
7596
Arguments:
7597
  pattern       the regular expression
7598
  options       various option bits
7599
  errorcodeptr  pointer to error code variable (pcre_compile2() only)
7600
                  can be NULL if you don't want a code value
7601
  errorptr      pointer to pointer to error text
7602
  erroroffset   ptr offset in pattern where error was detected
7603
  tables        pointer to character tables or NULL
7604
7605
Returns:        pointer to compiled data block, or NULL on error,
7606
                with errorptr and erroroffset set
7607
*/
7608
7609
#ifdef COMPILE_PCRE8
7610
PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
7611
pcre_compile(const char *pattern, int options, const char **errorptr,
7612
  int *erroroffset, const unsigned char *tables)
7613
#else
7614
PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION
7615
pcre16_compile(PCRE_SPTR16 pattern, int options, const char **errorptr,
7616
  int *erroroffset, const unsigned char *tables)
7617
#endif
7618
0
{
7619
0
#ifdef COMPILE_PCRE8
7620
0
return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
7621
#else
7622
return pcre16_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
7623
#endif
7624
0
}
7625
7626
7627
/* Compile a pattern, with an optional error code variable.

The work is done in these steps:
  1. Validate the arguments, pick the character tables, and process any
     leading "(*...)" items (UTF, UCP, NO_START_OPT, newline and \R settings).
  2. Run compile_regex() once into the on-stack workspace with a non-NULL
     length pointer, purely to compute the amount of memory needed.
  3. Obtain the data block via PUBL(malloc), fill in its header fields, and
     run compile_regex() a second time to generate the real code.
  4. Fill in forward references, set the lengths of lookbehinds that were
     flagged for checking, and then set the anchoring, first character and
     required character information.

If errorptr is NULL the function returns NULL at once, storing 99 through
errorcodeptr if that is non-NULL. On any other failure it returns NULL with
*errorptr set to the error text and *errorcodeptr (if non-NULL) set to the
error code; *erroroffset is also set, except on the path taken when erroroffset
itself is NULL. On success the result is the data block cast to the public
type. */

#ifdef COMPILE_PCRE8
7628
PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
7629
pcre_compile2(const char *pattern, int options, int *errorcodeptr,
7630
  const char **errorptr, int *erroroffset, const unsigned char *tables)
7631
#else
7632
PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION
7633
pcre16_compile2(PCRE_SPTR16 pattern, int options, int *errorcodeptr,
7634
  const char **errorptr, int *erroroffset, const unsigned char *tables)
7635
#endif
7636
0
{
7637
0
REAL_PCRE *re;
7638
0
int length = 1;  /* For final END opcode */
7639
0
pcre_int32 firstchar, reqchar;
7640
0
int newline;
7641
0
int errorcode = 0;
7642
0
int skipatstart = 0;
7643
0
BOOL utf;
7644
0
size_t size;
7645
0
pcre_uchar *code;
7646
0
const pcre_uchar *codestart;
7647
0
const pcre_uchar *ptr;
7648
0
compile_data compile_block;
7649
0
compile_data *cd = &compile_block;
7650
7651
/* This space is used for "compiling" into during the first phase, when we are
7652
computing the amount of memory that is needed. Compiled items are thrown away
7653
as soon as possible, so that a fairly large buffer should be sufficient for
7654
this purpose. The same space is used in the second phase for remembering where
7655
to fill in forward references to subpatterns. That may overflow, in which case
7656
new memory is obtained from malloc(). */
7657
7658
0
pcre_uchar cworkspace[COMPILE_WORK_SIZE];
7659
7660
/* Set this early so that early errors get offset 0. */
7661
7662
0
ptr = (const pcre_uchar *)pattern;
7663
7664
/* We can't pass back an error message if errorptr is NULL; I guess the best we
7665
can do is just return NULL, but we can set a code value if there is a code
7666
pointer. */
7667
7668
0
if (errorptr == NULL)
7669
0
  {
7670
0
  /* NOTE(review): 99 is a bare literal here, not one of the ERRnn names used
  for every other error in this function. */
  if (errorcodeptr != NULL) *errorcodeptr = 99;
7671
0
  return NULL;
7672
0
  }
7673
7674
0
*errorptr = NULL;
7675
0
if (errorcodeptr != NULL) *errorcodeptr = ERR0;
7676
7677
/* However, we can give a message for this error */
7678
7679
0
if (erroroffset == NULL)
7680
0
  {
7681
0
  errorcode = ERR16;
7682
0
  goto PCRE_EARLY_ERROR_RETURN2;
7683
0
  }
7684
7685
0
*erroroffset = 0;
7686
7687
/* Set up pointers to the individual character tables */
7688
7689
0
if (tables == NULL) tables = PRIV(default_tables);
7690
0
cd->lcc = tables + lcc_offset;
7691
0
cd->fcc = tables + fcc_offset;
7692
0
cd->cbits = tables + cbits_offset;
7693
0
cd->ctypes = tables + ctypes_offset;
7694
7695
/* Check that all undefined public option bits are zero */
7696
7697
0
if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
7698
0
  {
7699
0
  errorcode = ERR17;
7700
0
  goto PCRE_EARLY_ERROR_RETURN;
7701
0
  }
7702
7703
/* Check for global one-time settings at the start of the pattern, and remember
7704
the offset for later. */
7705
7706
0
while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
7707
0
       ptr[skipatstart+1] == CHAR_ASTERISK)
7708
0
  {
7709
0
  int newnl = 0;
7710
0
  int newbsr = 0;
7711
7712
0
#ifdef COMPILE_PCRE8
7713
0
  if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF_RIGHTPAR, 5) == 0)
7714
0
    { skipatstart += 7; options |= PCRE_UTF8; continue; }
7715
0
#endif
7716
#ifdef COMPILE_PCRE16
7717
  if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF_RIGHTPAR, 6) == 0)
7718
    { skipatstart += 8; options |= PCRE_UTF16; continue; }
7719
#endif
7720
0
  else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UCP_RIGHTPAR, 4) == 0)
7721
0
    { skipatstart += 6; options |= PCRE_UCP; continue; }
7722
0
  else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_START_OPT_RIGHTPAR, 13) == 0)
7723
0
    { skipatstart += 15; options |= PCRE_NO_START_OPTIMIZE; continue; }
7724
7725
0
  if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_CR_RIGHTPAR, 3) == 0)
7726
0
    { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
7727
0
  else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LF_RIGHTPAR, 3)  == 0)
7728
0
    { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
7729
0
  else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_CRLF_RIGHTPAR, 5)  == 0)
7730
0
    { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
7731
0
  else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_ANY_RIGHTPAR, 4) == 0)
7732
0
    { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
7733
0
  else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_ANYCRLF_RIGHTPAR, 8) == 0)
7734
0
    { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
7735
7736
0
  else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_BSR_ANYCRLF_RIGHTPAR, 12) == 0)
7737
0
    { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
7738
0
  else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_BSR_UNICODE_RIGHTPAR, 12) == 0)
7739
0
    { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
7740
7741
0
  /* A recognized newline or \R item replaces the corresponding option bits;
  anything unrecognized ends the scan of leading items. */
  if (newnl != 0)
7742
0
    options = (options & ~PCRE_NEWLINE_BITS) | newnl;
7743
0
  else if (newbsr != 0)
7744
0
    options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
7745
0
  else break;
7746
0
  }
7747
7748
/* PCRE_UTF16 has the same value as PCRE_UTF8. */
7749
0
utf = (options & PCRE_UTF8) != 0;
7750
7751
/* Can't support UTF unless PCRE has been compiled to include the code. The
7752
return of an error code from PRIV(valid_utf)() is a new feature, introduced in
7753
release 8.13. It is passed back from pcre_[dfa_]exec(), but at the moment is
7754
not used here. */
7755
7756
0
#ifdef SUPPORT_UTF
7757
0
if (utf && (options & PCRE_NO_UTF8_CHECK) == 0 &&
7758
0
     (errorcode = PRIV(valid_utf)((PCRE_PUCHAR)pattern, -1, erroroffset)) != 0)
7759
0
  {
7760
0
#ifdef COMPILE_PCRE8
7761
0
  errorcode = ERR44;
7762
#else
7763
  errorcode = ERR74;
7764
#endif
7765
0
  goto PCRE_EARLY_ERROR_RETURN2;
7766
0
  }
7767
#else
7768
if (utf)
7769
  {
7770
  errorcode = ERR32;
7771
  goto PCRE_EARLY_ERROR_RETURN;
7772
  }
7773
#endif
7774
7775
/* Can't support UCP unless PCRE has been compiled to include the code. */
7776
7777
#ifndef SUPPORT_UCP
7778
if ((options & PCRE_UCP) != 0)
7779
  {
7780
  errorcode = ERR67;
7781
  goto PCRE_EARLY_ERROR_RETURN;
7782
  }
7783
#endif
7784
7785
/* Check validity of \R options. */
7786
7787
0
if ((options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) ==
7788
0
     (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
7789
0
  {
7790
0
  errorcode = ERR56;
7791
0
  goto PCRE_EARLY_ERROR_RETURN;
7792
0
  }
7793
7794
/* Handle different types of newline. The three bits give seven cases. The
7795
current code allows for fixed one- or two-byte sequences, plus "any" and
7796
"anycrlf". */
7797
7798
0
switch (options & PCRE_NEWLINE_BITS)
7799
0
  {
7800
0
  case 0: newline = NEWLINE; break;   /* Build-time default */
7801
0
  case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
7802
0
  case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
7803
0
  case PCRE_NEWLINE_CR+
7804
0
       PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
7805
0
  case PCRE_NEWLINE_ANY: newline = -1; break;
7806
0
  case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
7807
0
  default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
7808
0
  }
7809
7810
0
if (newline == -2)
7811
0
  {
7812
0
  cd->nltype = NLTYPE_ANYCRLF;
7813
0
  }
7814
0
else if (newline < 0)
7815
0
  {
7816
0
  cd->nltype = NLTYPE_ANY;
7817
0
  }
7818
0
else
7819
0
  {
7820
0
  cd->nltype = NLTYPE_FIXED;
7821
0
  /* A value above 255 is a two-character sequence packed as (first << 8) |
  second, as built for CRLF in the switch above. */
  if (newline > 255)
7822
0
    {
7823
0
    cd->nllen = 2;
7824
0
    cd->nl[0] = (newline >> 8) & 255;
7825
0
    cd->nl[1] = newline & 255;
7826
0
    }
7827
0
  else
7828
0
    {
7829
0
    cd->nllen = 1;
7830
0
    cd->nl[0] = newline;
7831
0
    }
7832
0
  }
7833
7834
/* Maximum back reference and backref bitmap. The bitmap records up to 31 back
7835
references to help in deciding whether (.*) can be treated as anchored or not.
7836
*/
7837
7838
0
cd->top_backref = 0;
7839
0
cd->backref_map = 0;
7840
7841
/* Reflect pattern for debugging output */
7842
7843
0
DPRINTF(("------------------------------------------------------------------\n"));
7844
#ifdef PCRE_DEBUG
7845
print_puchar(stdout, (PCRE_PUCHAR)pattern);
7846
#endif
7847
0
DPRINTF(("\n"));
7848
7849
/* Pretend to compile the pattern while actually just accumulating the length
7850
of memory required. This behaviour is triggered by passing a non-NULL final
7851
argument to compile_regex(). We pass a block of workspace (cworkspace) for it
7852
to compile parts of the pattern into; the compiled code is discarded when it is
7853
no longer needed, so hopefully this workspace will never overflow, though there
7854
is a test for its doing so. */
7855
7856
0
cd->bracount = cd->final_bracount = 0;
7857
0
cd->names_found = 0;
7858
0
cd->name_entry_size = 0;
7859
0
cd->name_table = NULL;
7860
0
cd->start_code = cworkspace;
7861
0
cd->hwm = cworkspace;
7862
0
cd->start_workspace = cworkspace;
7863
0
cd->workspace_size = COMPILE_WORK_SIZE;
7864
0
cd->start_pattern = (const pcre_uchar *)pattern;
7865
0
cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));
7866
0
cd->req_varyopt = 0;
7867
0
cd->assert_depth = 0;
7868
0
cd->max_lookbehind = 0;
7869
0
cd->external_options = options;
7870
0
cd->external_flags = 0;
7871
0
cd->open_caps = NULL;
7872
7873
/* Now do the pre-compile. On error, errorcode will be set non-zero, so we
7874
don't need to look at the result of the function here. The initial options have
7875
been put into the cd block so that they can be changed if an option setting is
7876
found within the regex right at the beginning. Bringing initial option settings
7877
outside can help speed up starting point checks. */
7878
7879
0
ptr += skipatstart;
7880
0
code = cworkspace;
7881
0
*code = OP_BRA;
7882
0
(void)compile_regex(cd->external_options, &code, &ptr, &errorcode, FALSE,
7883
0
  FALSE, 0, 0, &firstchar, &reqchar, NULL, cd, &length);
7884
0
if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
7885
7886
0
DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
7887
0
  (int)(cd->hwm - cworkspace)));
7888
7889
0
if (length > MAX_PATTERN_SIZE)
7890
0
  {
7891
0
  errorcode = ERR20;
7892
0
  goto PCRE_EARLY_ERROR_RETURN;
7893
0
  }
7894
7895
/* Compute the size of data block needed and get it, either from malloc or
7896
externally provided function. Integer overflow should no longer be possible
7897
because nowadays we limit the maximum value of cd->names_found and
7898
cd->name_entry_size. */
7899
7900
0
size = sizeof(REAL_PCRE) + (length + cd->names_found * cd->name_entry_size) * sizeof(pcre_uchar);
7901
0
re = (REAL_PCRE *)(PUBL(malloc))(size);
7902
7903
0
if (re == NULL)
7904
0
  {
7905
0
  errorcode = ERR21;
7906
0
  goto PCRE_EARLY_ERROR_RETURN;
7907
0
  }
7908
7909
/* Put in the magic number, and save the sizes, initial options, internal
7910
flags, and character table pointer. NULL is used for the default character
7911
tables. The nullpad field is at the end; it's there to help in the case when a
7912
regex compiled on a system with 4-byte pointers is run on another with 8-byte
7913
pointers. */
7914
7915
0
re->magic_number = MAGIC_NUMBER;
7916
0
re->size = (int)size;
7917
0
re->options = cd->external_options;
7918
0
re->flags = cd->external_flags;
7919
0
re->first_char = 0;
7920
0
re->req_char = 0;
7921
0
re->name_table_offset = sizeof(REAL_PCRE) / sizeof(pcre_uchar);
7922
0
re->name_entry_size = cd->name_entry_size;
7923
0
re->name_count = cd->names_found;
7924
0
re->ref_count = 0;
7925
0
re->tables = (tables == PRIV(default_tables))? NULL : tables;
7926
0
re->nullpad = NULL;
7927
7928
/* The starting points of the name/number translation table and of the code are
7929
passed around in the compile data block. The start/end pattern and initial
7930
options are already set from the pre-compile phase, as is the name_entry_size
7931
field. Reset the bracket count and the names_found field. Also reset the hwm
7932
field; this time it's used for remembering forward references to subpatterns.
7933
*/
7934
7935
0
cd->final_bracount = cd->bracount;  /* Save for checking forward references */
7936
0
cd->assert_depth = 0;
7937
0
cd->bracount = 0;
7938
0
cd->max_lookbehind = 0;
7939
0
cd->names_found = 0;
7940
0
cd->name_table = (pcre_uchar *)re + re->name_table_offset;
7941
0
codestart = cd->name_table + re->name_entry_size * re->name_count;
7942
0
cd->start_code = codestart;
7943
0
cd->hwm = (pcre_uchar *)(cd->start_workspace);
7944
0
cd->req_varyopt = 0;
7945
0
cd->had_accept = FALSE;
7946
0
cd->check_lookbehind = FALSE;
7947
0
cd->open_caps = NULL;
7948
7949
/* Set up a starting, non-extracting bracket, then compile the expression. On
7950
error, errorcode will be set non-zero, so we don't need to look at the result
7951
of the function here. */
7952
7953
0
ptr = (const pcre_uchar *)pattern + skipatstart;
7954
0
code = (pcre_uchar *)codestart;
7955
0
*code = OP_BRA;
7956
0
(void)compile_regex(re->options, &code, &ptr, &errorcode, FALSE, FALSE, 0, 0,
7957
0
  &firstchar, &reqchar, NULL, cd, NULL);
7958
0
re->top_bracket = cd->bracount;
7959
0
re->top_backref = cd->top_backref;
7960
0
re->max_lookbehind = cd->max_lookbehind;
7961
0
re->flags = cd->external_flags | PCRE_MODE;
7962
7963
0
if (cd->had_accept) reqchar = REQ_NONE;   /* Must disable after (*ACCEPT) */
7964
7965
/* If not reached end of pattern on success, there's an excess bracket. */
7966
7967
0
if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
7968
7969
/* Fill in the terminating state and check for disastrous overflow, but
7970
if debugging, leave the test till after things are printed out. */
7971
7972
0
*code++ = OP_END;
7973
7974
0
#ifndef PCRE_DEBUG
7975
0
if (code - codestart > length) errorcode = ERR23;
7976
0
#endif
7977
7978
/* Fill in any forward references that are required. There may be repeated
7979
references; optimize for them, as searching a large regex takes time. */
7980
7981
0
if (cd->hwm > cd->start_workspace)
7982
0
  {
7983
0
  int prev_recno = -1;
7984
0
  const pcre_uchar *groupptr = NULL;
7985
0
  while (errorcode == 0 && cd->hwm > cd->start_workspace)
7986
0
    {
7987
0
    int offset, recno;
7988
0
    cd->hwm -= LINK_SIZE;
7989
0
    offset = GET(cd->hwm, 0);
7990
0
    recno = GET(codestart, offset);
7991
0
    /* Only search for the group again when the number differs from the one
    looked up on the previous iteration. */
    if (recno != prev_recno)
7992
0
      {
7993
0
      groupptr = PRIV(find_bracket)(codestart, utf, recno);
7994
0
      prev_recno = recno;
7995
0
      }
7996
0
    if (groupptr == NULL) errorcode = ERR53;
7997
0
      else PUT(((pcre_uchar *)codestart), offset, (int)(groupptr - codestart));
7998
0
    }
7999
0
  }
8000
8001
/* If the workspace had to be expanded, free the new memory. */
8002
8003
0
if (cd->workspace_size > COMPILE_WORK_SIZE)
8004
0
  (PUBL(free))((void *)cd->start_workspace);
8005
8006
/* Give an error if there's back reference to a non-existent capturing
8007
subpattern. */
8008
8009
0
if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
8010
8011
/* If there were any lookbehind assertions that contained OP_RECURSE
8012
(recursions or subroutine calls), a flag is set for them to be checked here,
8013
because they may contain forward references. Actual recursions can't be fixed
8014
length, but subroutine calls can. It is done like this so that those without
8015
OP_RECURSE that are not fixed length get a diagnostic with a useful offset. The
8016
exceptional ones forgo this. We scan the pattern to check that they are fixed
8017
length, and set their lengths. */
8018
8019
0
if (cd->check_lookbehind)
8020
0
  {
8021
0
  /* The initial value given here is overwritten by the for-loop initializer
  below before it is ever read. */
  pcre_uchar *cc = (pcre_uchar *)codestart;
8022
8023
  /* Loop, searching for OP_REVERSE items, and process those that do not have
8024
  their length set. (Actually, it will also re-process any that have a length
8025
  of zero, but that is a pathological case, and it does no harm.) When we find
8026
  one, we temporarily terminate the branch it is in while we scan it. */
8027
8028
0
  for (cc = (pcre_uchar *)PRIV(find_bracket)(codestart, utf, -1);
8029
0
       cc != NULL;
8030
0
       cc = (pcre_uchar *)PRIV(find_bracket)(cc, utf, -1))
8031
0
    {
8032
0
    if (GET(cc, 1) == 0)
8033
0
      {
8034
0
      int fixed_length;
8035
0
      pcre_uchar *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE);
8036
0
      int end_op = *be;
8037
0
      /* Temporarily overwrite the opcode at be with OP_END for the duration
      of the scan; it is restored immediately afterwards. */
      *be = OP_END;
8038
0
      fixed_length = find_fixedlength(cc, (re->options & PCRE_UTF8) != 0, TRUE,
8039
0
        cd);
8040
0
      *be = end_op;
8041
0
      DPRINTF(("fixed length = %d\n", fixed_length));
8042
0
      if (fixed_length < 0)
8043
0
        {
8044
0
        errorcode = (fixed_length == -2)? ERR36 :
8045
0
                    (fixed_length == -4)? ERR70 : ERR25;
8046
0
        break;
8047
0
        }
8048
0
      if (fixed_length > cd->max_lookbehind) cd->max_lookbehind = fixed_length;
8049
0
      PUT(cc, 1, fixed_length);
8050
0
      }
8051
0
    cc += 1 + LINK_SIZE;
8052
0
    }
8053
0
  }
8054
8055
/* Failed to compile, or error while post-processing */
8056
8057
0
if (errorcode != 0)
8058
0
  {
8059
0
  (PUBL(free))(re);
8060
0
  /* Target of the earlier gotos taken before the data block exists (or when
  obtaining it failed); those jumps skip the free() above. */
  PCRE_EARLY_ERROR_RETURN:
8061
0
  *erroroffset = (int)(ptr - (const pcre_uchar *)pattern);
8062
0
  /* Target used when erroroffset must not be written here: either it is NULL,
  or it was passed to PRIV(valid_utf)() for the UTF validity check. */
  PCRE_EARLY_ERROR_RETURN2:
8063
0
  *errorptr = find_error_text(errorcode);
8064
0
  if (errorcodeptr != NULL) *errorcodeptr = errorcode;
8065
0
  return NULL;
8066
0
  }
8067
8068
/* If the anchored option was not passed, set the flag if we can determine that
8069
the pattern is anchored by virtue of ^ characters or \A or anything else (such
8070
as starting with .* when DOTALL is set).
8071
8072
Otherwise, if we know what the first byte has to be, save it, because that
8073
speeds up unanchored matches no end. If not, see if we can set the
8074
PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
8075
start with ^. and also when all branches start with .* for non-DOTALL matches.
8076
*/
8077
8078
0
if ((re->options & PCRE_ANCHORED) == 0)
8079
0
  {
8080
0
  if (is_anchored(codestart, 0, cd->backref_map))
8081
0
    re->options |= PCRE_ANCHORED;
8082
0
  else
8083
0
    {
8084
0
    if (firstchar < 0)
8085
0
      firstchar = find_firstassertedchar(codestart, FALSE);
8086
0
    if (firstchar >= 0)   /* Remove caseless flag for non-caseable chars */
8087
0
      {
8088
0
#ifdef COMPILE_PCRE8
8089
0
      re->first_char = firstchar & 0xff;
8090
#else
8091
#ifdef COMPILE_PCRE16
8092
      re->first_char = firstchar & 0xffff;
8093
#endif
8094
#endif
8095
0
      if ((firstchar & REQ_CASELESS) != 0)
8096
0
        {
8097
#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
8098
        /* We ignore non-ASCII first chars in 8 bit mode. */
8099
        if (utf)
8100
          {
8101
          if (re->first_char < 128)
8102
            {
8103
            if (cd->fcc[re->first_char] != re->first_char)
8104
              re->flags |= PCRE_FCH_CASELESS;
8105
            }
8106
          else if (UCD_OTHERCASE(re->first_char) != re->first_char)
8107
            re->flags |= PCRE_FCH_CASELESS;
8108
          }
8109
        else
8110
#endif
8111
0
        if (MAX_255(re->first_char)
8112
0
            && cd->fcc[re->first_char] != re->first_char)
8113
0
          re->flags |= PCRE_FCH_CASELESS;
8114
0
        }
8115
8116
0
      re->flags |= PCRE_FIRSTSET;
8117
0
      }
8118
0
    else if (is_startline(codestart, 0, cd->backref_map))
8119
0
      re->flags |= PCRE_STARTLINE;
8120
0
    }
8121
0
  }
8122
8123
/* For an anchored pattern, we use the "required byte" only if it follows a
8124
variable length item in the regex. Remove the caseless flag for non-caseable
8125
bytes. */
8126
8127
0
if (reqchar >= 0 &&
8128
0
     ((re->options & PCRE_ANCHORED) == 0 || (reqchar & REQ_VARY) != 0))
8129
0
  {
8130
0
#ifdef COMPILE_PCRE8
8131
0
  re->req_char = reqchar & 0xff;
8132
#else
8133
#ifdef COMPILE_PCRE16
8134
  re->req_char = reqchar & 0xffff;
8135
#endif
8136
#endif
8137
0
  if ((reqchar & REQ_CASELESS) != 0)
8138
0
    {
8139
#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
8140
    /* We ignore non-ASCII first chars in 8 bit mode. */
8141
    if (utf)
8142
      {
8143
      if (re->req_char < 128)
8144
        {
8145
        if (cd->fcc[re->req_char] != re->req_char)
8146
          re->flags |= PCRE_RCH_CASELESS;
8147
        }
8148
      else if (UCD_OTHERCASE(re->req_char) != re->req_char)
8149
        re->flags |= PCRE_RCH_CASELESS;
8150
      }
8151
    else
8152
#endif
8153
0
    if (MAX_255(re->req_char) && cd->fcc[re->req_char] != re->req_char)
8154
0
      re->flags |= PCRE_RCH_CASELESS;
8155
0
    }
8156
8157
0
  re->flags |= PCRE_REQCHSET;
8158
0
  }
8159
8160
/* Print out the compiled data if debugging is enabled. This is never the
8161
case when building a production library. */
8162
8163
#ifdef PCRE_DEBUG
8164
printf("Length = %d top_bracket = %d top_backref = %d\n",
8165
  length, re->top_bracket, re->top_backref);
8166
8167
printf("Options=%08x\n", re->options);
8168
8169
if ((re->flags & PCRE_FIRSTSET) != 0)
8170
  {
8171
  pcre_uchar ch = re->first_char;
8172
  const char *caseless =
8173
    ((re->flags & PCRE_FCH_CASELESS) == 0)? "" : " (caseless)";
8174
  if (PRINTABLE(ch)) printf("First char = %c%s\n", ch, caseless);
8175
    else printf("First char = \\x%02x%s\n", ch, caseless);
8176
  }
8177
8178
if ((re->flags & PCRE_REQCHSET) != 0)
8179
  {
8180
  pcre_uchar ch = re->req_char;
8181
  const char *caseless =
8182
    ((re->flags & PCRE_RCH_CASELESS) == 0)? "" : " (caseless)";
8183
  if (PRINTABLE(ch)) printf("Req char = %c%s\n", ch, caseless);
8184
    else printf("Req char = \\x%02x%s\n", ch, caseless);
8185
  }
8186
8187
#ifdef COMPILE_PCRE8
8188
pcre_printint((pcre *)re, stdout, TRUE);
8189
#else
8190
pcre16_printint((pcre *)re, stdout, TRUE);
8191
#endif
8192
8193
/* This check is done here in the debugging case so that the code that
8194
was compiled can be seen. */
8195
8196
if (code - codestart > length)
8197
  {
8198
  (PUBL(free))(re);
8199
  *errorptr = find_error_text(ERR23);
8200
  *erroroffset = ptr - (pcre_uchar *)pattern;
8201
  if (errorcodeptr != NULL) *errorcodeptr = ERR23;
8202
  return NULL;
8203
  }
8204
#endif   /* PCRE_DEBUG */
8205
8206
0
#ifdef COMPILE_PCRE8
8207
0
return (pcre *)re;
8208
#else
8209
return (pcre16 *)re;
8210
#endif
8211
0
}
8212
8213
/* End of pcre_compile.c */