Coverage Report

Created: 2023-09-23 07:09

/src/augeas/gnulib/lib/regcomp.c
Line
Count
Source (jump to first uncovered line)
1
/* Extended regular expression matching and search library.
2
   Copyright (C) 2002-2022 Free Software Foundation, Inc.
3
   This file is part of the GNU C Library.
4
   Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6
   The GNU C Library is free software; you can redistribute it and/or
7
   modify it under the terms of the GNU Lesser General Public
8
   License as published by the Free Software Foundation; either
9
   version 2.1 of the License, or (at your option) any later version.
10
11
   The GNU C Library is distributed in the hope that it will be useful,
12
   but WITHOUT ANY WARRANTY; without even the implied warranty of
13
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
   Lesser General Public License for more details.
15
16
   You should have received a copy of the GNU Lesser General Public
17
   License along with the GNU C Library; if not, see
18
   <https://www.gnu.org/licenses/>.  */
19
20
#ifdef _LIBC
21
# include <locale/weight.h>
22
#endif
23
24
static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
25
            size_t length, reg_syntax_t syntax);
26
static void re_compile_fastmap_iter (regex_t *bufp,
27
             const re_dfastate_t *init_state,
28
             char *fastmap);
29
static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
30
static void free_charset (re_charset_t *cset);
31
static void free_workarea_compile (regex_t *preg);
32
static reg_errcode_t create_initial_state (re_dfa_t *dfa);
33
static void optimize_utf8 (re_dfa_t *dfa);
34
static reg_errcode_t analyze (regex_t *preg);
35
static reg_errcode_t preorder (bin_tree_t *root,
36
             reg_errcode_t (fn (void *, bin_tree_t *)),
37
             void *extra);
38
static reg_errcode_t postorder (bin_tree_t *root,
39
        reg_errcode_t (fn (void *, bin_tree_t *)),
40
        void *extra);
41
static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
42
static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
43
static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
44
         bin_tree_t *node);
45
static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
46
static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
47
static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
48
static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
49
static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
50
           unsigned int constraint);
51
static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
52
static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
53
           Idx node, bool root);
54
static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
55
static Idx fetch_number (re_string_t *input, re_token_t *token,
56
       reg_syntax_t syntax);
57
static int peek_token (re_token_t *token, re_string_t *input,
58
      reg_syntax_t syntax);
59
static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
60
        reg_syntax_t syntax, reg_errcode_t *err);
61
static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
62
          re_token_t *token, reg_syntax_t syntax,
63
          Idx nest, reg_errcode_t *err);
64
static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
65
         re_token_t *token, reg_syntax_t syntax,
66
         Idx nest, reg_errcode_t *err);
67
static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
68
             re_token_t *token, reg_syntax_t syntax,
69
             Idx nest, reg_errcode_t *err);
70
static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
71
          re_token_t *token, reg_syntax_t syntax,
72
          Idx nest, reg_errcode_t *err);
73
static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
74
         re_dfa_t *dfa, re_token_t *token,
75
         reg_syntax_t syntax, reg_errcode_t *err);
76
static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
77
              re_token_t *token, reg_syntax_t syntax,
78
              reg_errcode_t *err);
79
static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
80
              re_string_t *regexp,
81
              re_token_t *token, int token_len,
82
              re_dfa_t *dfa,
83
              reg_syntax_t syntax,
84
              bool accept_hyphen);
85
static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
86
            re_string_t *regexp,
87
            re_token_t *token);
88
static reg_errcode_t build_equiv_class (bitset_t sbcset,
89
          re_charset_t *mbcset,
90
          Idx *equiv_class_alloc,
91
          const unsigned char *name);
92
static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
93
              bitset_t sbcset,
94
              re_charset_t *mbcset,
95
              Idx *char_class_alloc,
96
              const char *class_name,
97
              reg_syntax_t syntax);
98
static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
99
               RE_TRANSLATE_TYPE trans,
100
               const char *class_name,
101
               const char *extra,
102
               bool non_match, reg_errcode_t *err);
103
static bin_tree_t *create_tree (re_dfa_t *dfa,
104
        bin_tree_t *left, bin_tree_t *right,
105
        re_token_type_t type);
106
static bin_tree_t *create_token_tree (re_dfa_t *dfa,
107
              bin_tree_t *left, bin_tree_t *right,
108
              const re_token_t *token);
109
static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
110
static void free_token (re_token_t *node);
111
static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
112
static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
113

114
/* This table gives an error message for each of the error codes listed
115
   in regex.h.  Obviously the order here has to be same as there.
116
   POSIX doesn't require that we do anything for REG_NOERROR,
117
   but why not be nice?  */
118
119
static const char __re_error_msgid[] =
120
  {
121
#define REG_NOERROR_IDX 0
122
    gettext_noop ("Success")  /* REG_NOERROR */
123
    "\0"
124
#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
125
    gettext_noop ("No match") /* REG_NOMATCH */
126
    "\0"
127
#define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
128
    gettext_noop ("Invalid regular expression") /* REG_BADPAT */
129
    "\0"
130
#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
131
    gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
132
    "\0"
133
#define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
134
    gettext_noop ("Invalid character class name") /* REG_ECTYPE */
135
    "\0"
136
#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
137
    gettext_noop ("Trailing backslash") /* REG_EESCAPE */
138
    "\0"
139
#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
140
    gettext_noop ("Invalid back reference") /* REG_ESUBREG */
141
    "\0"
142
#define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
143
    gettext_noop ("Unmatched [, [^, [:, [., or [=") /* REG_EBRACK */
144
    "\0"
145
#define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
146
    gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
147
    "\0"
148
#define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
149
    gettext_noop ("Unmatched \\{") /* REG_EBRACE */
150
    "\0"
151
#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
152
    gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
153
    "\0"
154
#define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
155
    gettext_noop ("Invalid range end")  /* REG_ERANGE */
156
    "\0"
157
#define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
158
    gettext_noop ("Memory exhausted") /* REG_ESPACE */
159
    "\0"
160
#define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
161
    gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
162
    "\0"
163
#define REG_EEND_IDX  (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
164
    gettext_noop ("Premature end of regular expression") /* REG_EEND */
165
    "\0"
166
#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
167
    gettext_noop ("Regular expression too big") /* REG_ESIZE */
168
    "\0"
169
#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
170
    gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
171
  };
172
173
static const size_t __re_error_msgid_idx[] =
174
  {
175
    REG_NOERROR_IDX,
176
    REG_NOMATCH_IDX,
177
    REG_BADPAT_IDX,
178
    REG_ECOLLATE_IDX,
179
    REG_ECTYPE_IDX,
180
    REG_EESCAPE_IDX,
181
    REG_ESUBREG_IDX,
182
    REG_EBRACK_IDX,
183
    REG_EPAREN_IDX,
184
    REG_EBRACE_IDX,
185
    REG_BADBR_IDX,
186
    REG_ERANGE_IDX,
187
    REG_ESPACE_IDX,
188
    REG_BADRPT_IDX,
189
    REG_EEND_IDX,
190
    REG_ESIZE_IDX,
191
    REG_ERPAREN_IDX
192
  };
193

194
/* Entry points for GNU code.  */
195
196
/* re_compile_pattern is the GNU regular expression compiler: it
197
   compiles PATTERN (of length LENGTH) and puts the result in BUFP.
198
   Returns 0 if the pattern was valid, otherwise an error string.
199
200
   Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
201
   are set in BUFP on entry.  */
202
203
const char *
204
re_compile_pattern (const char *pattern, size_t length,
205
        struct re_pattern_buffer *bufp)
206
12.7k
{
207
12.7k
  reg_errcode_t ret;
208
209
  /* And GNU code determines whether or not to get register information
210
     by passing null for the REGS argument to re_match, etc., not by
211
     setting no_sub, unless RE_NO_SUB is set.  */
212
12.7k
  bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
213
214
  /* Match anchors at newline.  */
215
12.7k
  bufp->newline_anchor = 1;
216
217
12.7k
  ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
218
219
12.7k
  if (!ret)
220
12.7k
    return NULL;
221
54
  return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
222
12.7k
}
223
weak_alias (__re_compile_pattern, re_compile_pattern)
224
225
/* Set by 're_set_syntax' to the current regexp syntax to recognize.  Can
226
   also be assigned to arbitrarily: each pattern buffer stores its own
227
   syntax, so it can be changed between regex compilations.  */
228
/* This has no initializer because initialized variables in Emacs
229
   become read-only after dumping.  */
230
reg_syntax_t re_syntax_options;
231
232
233
/* Specify the precise syntax of regexps for compilation.  This provides
234
   for compatibility for various utilities which historically have
235
   different, incompatible syntaxes.
236
237
   The argument SYNTAX is a bit mask comprised of the various bits
238
   defined in regex.h.  We return the old syntax.  */
239
240
reg_syntax_t
241
re_set_syntax (reg_syntax_t syntax)
242
0
{
243
0
  reg_syntax_t ret = re_syntax_options;
244
245
0
  re_syntax_options = syntax;
246
0
  return ret;
247
0
}
248
weak_alias (__re_set_syntax, re_set_syntax)
249
250
int
251
re_compile_fastmap (struct re_pattern_buffer *bufp)
252
0
{
253
0
  re_dfa_t *dfa = bufp->buffer;
254
0
  char *fastmap = bufp->fastmap;
255
256
0
  memset (fastmap, '\0', sizeof (char) * SBC_MAX);
257
0
  re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
258
0
  if (dfa->init_state != dfa->init_state_word)
259
0
    re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
260
0
  if (dfa->init_state != dfa->init_state_nl)
261
0
    re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
262
0
  if (dfa->init_state != dfa->init_state_begbuf)
263
0
    re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
264
0
  bufp->fastmap_accurate = 1;
265
0
  return 0;
266
0
}
267
weak_alias (__re_compile_fastmap, re_compile_fastmap)
268
269
static __always_inline void
270
re_set_fastmap (char *fastmap, bool icase, int ch)
271
0
{
272
0
  fastmap[ch] = 1;
273
0
  if (icase)
274
0
    fastmap[tolower (ch)] = 1;
275
0
}
276
277
/* Helper function for re_compile_fastmap.
278
   Compile fastmap for the initial_state INIT_STATE.  */
279
280
static void
281
re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
282
       char *fastmap)
283
0
{
284
0
  re_dfa_t *dfa = bufp->buffer;
285
0
  Idx node_cnt;
286
0
  bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
287
0
  for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
288
0
    {
289
0
      Idx node = init_state->nodes.elems[node_cnt];
290
0
      re_token_type_t type = dfa->nodes[node].type;
291
292
0
      if (type == CHARACTER)
293
0
  {
294
0
    re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
295
0
    if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
296
0
      {
297
0
        unsigned char buf[MB_LEN_MAX];
298
0
        unsigned char *p;
299
0
        wchar_t wc;
300
0
        mbstate_t state;
301
302
0
        p = buf;
303
0
        *p++ = dfa->nodes[node].opr.c;
304
0
        while (++node < dfa->nodes_len
305
0
         && dfa->nodes[node].type == CHARACTER
306
0
         && dfa->nodes[node].mb_partial)
307
0
    *p++ = dfa->nodes[node].opr.c;
308
0
        memset (&state, '\0', sizeof (state));
309
0
        if (__mbrtowc (&wc, (const char *) buf, p - buf,
310
0
           &state) == p - buf
311
0
      && (__wcrtomb ((char *) buf, __towlower (wc), &state)
312
0
          != (size_t) -1))
313
0
    re_set_fastmap (fastmap, false, buf[0]);
314
0
      }
315
0
  }
316
0
      else if (type == SIMPLE_BRACKET)
317
0
  {
318
0
    int i, ch;
319
0
    for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
320
0
      {
321
0
        int j;
322
0
        bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
323
0
        for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
324
0
    if (w & ((bitset_word_t) 1 << j))
325
0
      re_set_fastmap (fastmap, icase, ch);
326
0
      }
327
0
  }
328
0
      else if (type == COMPLEX_BRACKET)
329
0
  {
330
0
    re_charset_t *cset = dfa->nodes[node].opr.mbcset;
331
0
    Idx i;
332
333
#ifdef _LIBC
334
    /* See if we have to try all bytes which start multiple collation
335
       elements.
336
       e.g. In da_DK, we want to catch 'a' since "aa" is a valid
337
      collation element, and don't catch 'b' since 'b' is
338
      the only collation element which starts from 'b' (and
339
      it is caught by SIMPLE_BRACKET).  */
340
        if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
341
      && (cset->ncoll_syms || cset->nranges))
342
    {
343
      const int32_t *table = (const int32_t *)
344
        _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
345
      for (i = 0; i < SBC_MAX; ++i)
346
        if (table[i] < 0)
347
          re_set_fastmap (fastmap, icase, i);
348
    }
349
#endif /* _LIBC */
350
351
    /* See if we have to start the match at all multibyte characters,
352
       i.e. where we would not find an invalid sequence.  This only
353
       applies to multibyte character sets; for single byte character
354
       sets, the SIMPLE_BRACKET again suffices.  */
355
0
    if (dfa->mb_cur_max > 1
356
0
        && (cset->nchar_classes || cset->non_match || cset->nranges
357
#ifdef _LIBC
358
      || cset->nequiv_classes
359
#endif /* _LIBC */
360
0
     ))
361
0
      {
362
0
        unsigned char c = 0;
363
0
        do
364
0
    {
365
0
      mbstate_t mbs;
366
0
      memset (&mbs, 0, sizeof (mbs));
367
0
      if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
368
0
        re_set_fastmap (fastmap, false, (int) c);
369
0
    }
370
0
        while (++c != 0);
371
0
      }
372
373
0
    else
374
0
      {
375
        /* ... Else catch all bytes which can start the mbchars.  */
376
0
        for (i = 0; i < cset->nmbchars; ++i)
377
0
    {
378
0
      char buf[256];
379
0
      mbstate_t state;
380
0
      memset (&state, '\0', sizeof (state));
381
0
      if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
382
0
        re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
383
0
      if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
384
0
        {
385
0
          if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
386
0
        != (size_t) -1)
387
0
      re_set_fastmap (fastmap, false, *(unsigned char *) buf);
388
0
        }
389
0
    }
390
0
      }
391
0
  }
392
0
      else if (type == OP_PERIOD || type == OP_UTF8_PERIOD || type == END_OF_RE)
393
0
  {
394
0
    memset (fastmap, '\1', sizeof (char) * SBC_MAX);
395
0
    if (type == END_OF_RE)
396
0
      bufp->can_be_null = 1;
397
0
    return;
398
0
  }
399
0
    }
400
0
}
401

402
/* Entry point for POSIX code.  */
403
/* regcomp takes a regular expression as a string and compiles it.
404
405
   PREG is a regex_t *.  We do not expect any fields to be initialized,
406
   since POSIX says we shouldn't.  Thus, we set
407
408
     'buffer' to the compiled pattern;
409
     'used' to the length of the compiled pattern;
410
     'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
411
       REG_EXTENDED bit in CFLAGS is set; otherwise, to
412
       RE_SYNTAX_POSIX_BASIC;
413
     'newline_anchor' to REG_NEWLINE being set in CFLAGS;
414
     'fastmap' to an allocated space for the fastmap;
415
     'fastmap_accurate' to zero;
416
     're_nsub' to the number of subexpressions in PATTERN.
417
418
   PATTERN is the address of the pattern string.
419
420
   CFLAGS is a series of bits which affect compilation.
421
422
     If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
423
     use POSIX basic syntax.
424
425
     If REG_NEWLINE is set, then . and [^...] don't match newline.
426
     Also, regexec will try a match beginning after every newline.
427
428
     If REG_ICASE is set, then we considers upper- and lowercase
429
     versions of letters to be equivalent when matching.
430
431
     If REG_NOSUB is set, then when PREG is passed to regexec, that
432
     routine will report only success or failure, and nothing about the
433
     registers.
434
435
   It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
436
   the return codes and their meanings.)  */
437
438
int
439
regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
440
0
{
441
0
  reg_errcode_t ret;
442
0
  reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
443
0
       : RE_SYNTAX_POSIX_BASIC);
444
445
0
  preg->buffer = NULL;
446
0
  preg->allocated = 0;
447
0
  preg->used = 0;
448
449
  /* Try to allocate space for the fastmap.  */
450
0
  preg->fastmap = re_malloc (char, SBC_MAX);
451
0
  if (__glibc_unlikely (preg->fastmap == NULL))
452
0
    return REG_ESPACE;
453
454
0
  syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
455
456
  /* If REG_NEWLINE is set, newlines are treated differently.  */
457
0
  if (cflags & REG_NEWLINE)
458
0
    { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
459
0
      syntax &= ~RE_DOT_NEWLINE;
460
0
      syntax |= RE_HAT_LISTS_NOT_NEWLINE;
461
      /* It also changes the matching behavior.  */
462
0
      preg->newline_anchor = 1;
463
0
    }
464
0
  else
465
0
    preg->newline_anchor = 0;
466
0
  preg->no_sub = !!(cflags & REG_NOSUB);
467
0
  preg->translate = NULL;
468
469
0
  ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
470
471
  /* POSIX doesn't distinguish between an unmatched open-group and an
472
     unmatched close-group: both are REG_EPAREN.  */
473
0
  if (ret == REG_ERPAREN)
474
0
    ret = REG_EPAREN;
475
476
  /* We have already checked preg->fastmap != NULL.  */
477
0
  if (__glibc_likely (ret == REG_NOERROR))
478
    /* Compute the fastmap now, since regexec cannot modify the pattern
479
       buffer.  This function never fails in this implementation.  */
480
0
    (void) re_compile_fastmap (preg);
481
0
  else
482
0
    {
483
      /* Some error occurred while compiling the expression.  */
484
0
      re_free (preg->fastmap);
485
0
      preg->fastmap = NULL;
486
0
    }
487
488
0
  return (int) ret;
489
0
}
490
libc_hidden_def (__regcomp)
491
weak_alias (__regcomp, regcomp)
492
493
/* Returns a message corresponding to an error code, ERRCODE, returned
494
   from either regcomp or regexec.   We don't use PREG here.  */
495
496
size_t
497
regerror (int errcode, const regex_t *__restrict preg, char *__restrict errbuf,
498
    size_t errbuf_size)
499
0
{
500
0
  const char *msg;
501
0
  size_t msg_size;
502
0
  int nerrcodes = sizeof __re_error_msgid_idx / sizeof __re_error_msgid_idx[0];
503
504
0
  if (__glibc_unlikely (errcode < 0 || errcode >= nerrcodes))
505
    /* Only error codes returned by the rest of the code should be passed
506
       to this routine.  If we are given anything else, or if other regex
507
       code generates an invalid error code, then the program has a bug.
508
       Dump core so we can fix it.  */
509
0
    abort ();
510
511
0
  msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
512
513
0
  msg_size = strlen (msg) + 1; /* Includes the null.  */
514
515
0
  if (__glibc_likely (errbuf_size != 0))
516
0
    {
517
0
      size_t cpy_size = msg_size;
518
0
      if (__glibc_unlikely (msg_size > errbuf_size))
519
0
  {
520
0
    cpy_size = errbuf_size - 1;
521
0
    errbuf[cpy_size] = '\0';
522
0
  }
523
0
      memcpy (errbuf, msg, cpy_size);
524
0
    }
525
526
0
  return msg_size;
527
0
}
528
weak_alias (__regerror, regerror)
529
530
531
/* This static array is used for the map to single-byte characters when
532
   UTF-8 is used.  Otherwise we would allocate memory just to initialize
533
   it the same all the time.  UTF-8 is the preferred encoding so this is
534
   a worthwhile optimization.  */
535
static const bitset_t utf8_sb_map =
536
{
537
  /* Set the first 128 bits.  */
538
#if (defined __GNUC__ || __clang_major__ >= 4) && !defined __STRICT_ANSI__
539
  [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
540
#else
541
# if 4 * BITSET_WORD_BITS < ASCII_CHARS
542
#  error "bitset_word_t is narrower than 32 bits"
543
# elif 3 * BITSET_WORD_BITS < ASCII_CHARS
544
  BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
545
# elif 2 * BITSET_WORD_BITS < ASCII_CHARS
546
  BITSET_WORD_MAX, BITSET_WORD_MAX,
547
# elif 1 * BITSET_WORD_BITS < ASCII_CHARS
548
  BITSET_WORD_MAX,
549
# endif
550
  (BITSET_WORD_MAX
551
   >> (SBC_MAX % BITSET_WORD_BITS == 0
552
       ? 0
553
       : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
554
#endif
555
};
556
557
558
static void
559
free_dfa_content (re_dfa_t *dfa)
560
12.7k
{
561
12.7k
  Idx i, j;
562
563
12.7k
  if (dfa->nodes)
564
5.04M
    for (i = 0; i < dfa->nodes_len; ++i)
565
5.03M
      free_token (dfa->nodes + i);
566
12.7k
  re_free (dfa->nexts);
567
5.04M
  for (i = 0; i < dfa->nodes_len; ++i)
568
5.03M
    {
569
5.03M
      if (dfa->eclosures != NULL)
570
5.03M
  re_node_set_free (dfa->eclosures + i);
571
5.03M
      if (dfa->inveclosures != NULL)
572
5.03M
  re_node_set_free (dfa->inveclosures + i);
573
5.03M
      if (dfa->edests != NULL)
574
5.03M
  re_node_set_free (dfa->edests + i);
575
5.03M
    }
576
12.7k
  re_free (dfa->edests);
577
12.7k
  re_free (dfa->eclosures);
578
12.7k
  re_free (dfa->inveclosures);
579
12.7k
  re_free (dfa->nodes);
580
581
12.7k
  if (dfa->state_table)
582
116M
    for (i = 0; i <= dfa->state_hash_mask; ++i)
583
116M
      {
584
116M
  struct re_state_table_entry *entry = dfa->state_table + i;
585
116M
  for (j = 0; j < entry->num; ++j)
586
161k
    {
587
161k
      re_dfastate_t *state = entry->array[j];
588
161k
      free_state (state);
589
161k
    }
590
116M
  re_free (entry->array);
591
116M
      }
592
12.7k
  re_free (dfa->state_table);
593
12.7k
  if (dfa->sb_char != utf8_sb_map)
594
12.7k
    re_free (dfa->sb_char);
595
12.7k
  re_free (dfa->subexp_map);
596
#ifdef DEBUG
597
  re_free (dfa->re_str);
598
#endif
599
600
12.7k
  re_free (dfa);
601
12.7k
}
602
603
604
/* Free dynamically allocated space used by PREG.  */
605
606
void
607
regfree (regex_t *preg)
608
12.7k
{
609
12.7k
  re_dfa_t *dfa = preg->buffer;
610
12.7k
  if (__glibc_likely (dfa != NULL))
611
12.7k
    {
612
12.7k
      lock_fini (dfa->lock);
613
12.7k
      free_dfa_content (dfa);
614
12.7k
    }
615
12.7k
  preg->buffer = NULL;
616
12.7k
  preg->allocated = 0;
617
618
12.7k
  re_free (preg->fastmap);
619
12.7k
  preg->fastmap = NULL;
620
621
12.7k
  re_free (preg->translate);
622
12.7k
  preg->translate = NULL;
623
12.7k
}
624
libc_hidden_def (__regfree)
625
weak_alias (__regfree, regfree)
626

627
/* Entry points compatible with 4.2 BSD regex library.  We don't define
628
   them unless specifically requested.  */
629
630
#if defined _REGEX_RE_COMP || defined _LIBC
631
632
/* BSD has one and only one pattern buffer.  */
633
static struct re_pattern_buffer re_comp_buf;
634
635
char *
636
# ifdef _LIBC
637
/* Make these definitions weak in libc, so POSIX programs can redefine
638
   these names if they don't use our functions, and still use
639
   regcomp/regexec above without link errors.  */
640
weak_function
641
# endif
642
re_comp (const char *s)
643
{
644
  reg_errcode_t ret;
645
  char *fastmap;
646
647
  if (!s)
648
    {
649
      if (!re_comp_buf.buffer)
650
  return gettext ("No previous regular expression");
651
      return 0;
652
    }
653
654
  if (re_comp_buf.buffer)
655
    {
656
      fastmap = re_comp_buf.fastmap;
657
      re_comp_buf.fastmap = NULL;
658
      __regfree (&re_comp_buf);
659
      memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
660
      re_comp_buf.fastmap = fastmap;
661
    }
662
663
  if (re_comp_buf.fastmap == NULL)
664
    {
665
      re_comp_buf.fastmap = re_malloc (char, SBC_MAX);
666
      if (re_comp_buf.fastmap == NULL)
667
  return (char *) gettext (__re_error_msgid
668
         + __re_error_msgid_idx[(int) REG_ESPACE]);
669
    }
670
671
  /* Since 're_exec' always passes NULL for the 'regs' argument, we
672
     don't need to initialize the pattern buffer fields which affect it.  */
673
674
  /* Match anchors at newlines.  */
675
  re_comp_buf.newline_anchor = 1;
676
677
  ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
678
679
  if (!ret)
680
    return NULL;
681
682
  /* Yes, we're discarding 'const' here if !HAVE_LIBINTL.  */
683
  return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
684
}
685
686
#ifdef _LIBC
687
libc_freeres_fn (free_mem)
688
{
689
  __regfree (&re_comp_buf);
690
}
691
#endif
692
693
#endif /* _REGEX_RE_COMP */
694

695
/* Internal entry point.
696
   Compile the regular expression PATTERN, whose length is LENGTH.
697
   SYNTAX indicate regular expression's syntax.  */
698
699
static reg_errcode_t
700
re_compile_internal (regex_t *preg, const char * pattern, size_t length,
701
         reg_syntax_t syntax)
702
12.7k
{
703
12.7k
  reg_errcode_t err = REG_NOERROR;
704
12.7k
  re_dfa_t *dfa;
705
12.7k
  re_string_t regexp;
706
707
  /* Initialize the pattern buffer.  */
708
12.7k
  preg->fastmap_accurate = 0;
709
12.7k
  preg->syntax = syntax;
710
12.7k
  preg->not_bol = preg->not_eol = 0;
711
12.7k
  preg->used = 0;
712
12.7k
  preg->re_nsub = 0;
713
12.7k
  preg->can_be_null = 0;
714
12.7k
  preg->regs_allocated = REGS_UNALLOCATED;
715
716
  /* Initialize the dfa.  */
717
12.7k
  dfa = preg->buffer;
718
12.7k
  if (__glibc_unlikely (preg->allocated < sizeof (re_dfa_t)))
719
12.7k
    {
720
      /* If zero allocated, but buffer is non-null, try to realloc
721
   enough space.  This loses if buffer's address is bogus, but
722
   that is the user's responsibility.  If ->buffer is NULL this
723
   is a simple allocation.  */
724
12.7k
      dfa = re_realloc (preg->buffer, re_dfa_t, 1);
725
12.7k
      if (dfa == NULL)
726
0
  return REG_ESPACE;
727
12.7k
      preg->allocated = sizeof (re_dfa_t);
728
12.7k
      preg->buffer = dfa;
729
12.7k
    }
730
12.7k
  preg->used = sizeof (re_dfa_t);
731
732
12.7k
  err = init_dfa (dfa, length);
733
12.7k
  if (__glibc_unlikely (err == REG_NOERROR && lock_init (dfa->lock) != 0))
734
0
    err = REG_ESPACE;
735
12.7k
  if (__glibc_unlikely (err != REG_NOERROR))
736
0
    {
737
0
      free_dfa_content (dfa);
738
0
      preg->buffer = NULL;
739
0
      preg->allocated = 0;
740
0
      return err;
741
0
    }
742
#ifdef DEBUG
743
  /* Note: length+1 will not overflow since it is checked in init_dfa.  */
744
  dfa->re_str = re_malloc (char, length + 1);
745
  strncpy (dfa->re_str, pattern, length + 1);
746
#endif
747
748
12.7k
  err = re_string_construct (&regexp, pattern, length, preg->translate,
749
12.7k
           (syntax & RE_ICASE) != 0, dfa);
750
12.7k
  if (__glibc_unlikely (err != REG_NOERROR))
751
0
    {
752
54
    re_compile_internal_free_return:
753
54
      free_workarea_compile (preg);
754
54
      re_string_destruct (&regexp);
755
54
      lock_fini (dfa->lock);
756
54
      free_dfa_content (dfa);
757
54
      preg->buffer = NULL;
758
54
      preg->allocated = 0;
759
54
      return err;
760
0
    }
761
762
  /* Parse the regular expression, and build a structure tree.  */
763
12.7k
  preg->re_nsub = 0;
764
12.7k
  dfa->str_tree = parse (&regexp, preg, syntax, &err);
765
12.7k
  if (__glibc_unlikely (dfa->str_tree == NULL))
766
54
    goto re_compile_internal_free_return;
767
768
  /* Analyze the tree and create the nfa.  */
769
12.7k
  err = analyze (preg);
770
12.7k
  if (__glibc_unlikely (err != REG_NOERROR))
771
0
    goto re_compile_internal_free_return;
772
773
  /* If possible, do searching in single byte encoding to speed things up.  */
774
12.7k
  if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
775
0
    optimize_utf8 (dfa);
776
777
  /* Then create the initial state of the dfa.  */
778
12.7k
  err = create_initial_state (dfa);
779
780
  /* Release work areas.  */
781
12.7k
  free_workarea_compile (preg);
782
12.7k
  re_string_destruct (&regexp);
783
784
12.7k
  if (__glibc_unlikely (err != REG_NOERROR))
785
0
    {
786
0
      lock_fini (dfa->lock);
787
0
      free_dfa_content (dfa);
788
0
      preg->buffer = NULL;
789
0
      preg->allocated = 0;
790
0
    }
791
792
12.7k
  return err;
793
12.7k
}
794
795
/* Initialize DFA.  We use the length of the regular expression PAT_LEN
796
   as the initial length of some arrays.  */
797
798
static reg_errcode_t
799
init_dfa (re_dfa_t *dfa, size_t pat_len)
800
12.7k
{
801
12.7k
  __re_size_t table_size;
802
12.7k
#ifndef _LIBC
803
12.7k
  const char *codeset_name;
804
12.7k
#endif
805
12.7k
  size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
806
12.7k
  size_t max_object_size =
807
12.7k
    MAX (sizeof (struct re_state_table_entry),
808
12.7k
   MAX (sizeof (re_token_t),
809
12.7k
        MAX (sizeof (re_node_set),
810
12.7k
       MAX (sizeof (regmatch_t),
811
12.7k
      max_i18n_object_size))));
812
813
12.7k
  memset (dfa, '\0', sizeof (re_dfa_t));
814
815
  /* Force allocation of str_tree_storage the first time.  */
816
12.7k
  dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
817
818
  /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
819
     calculation below, and for similar doubling calculations
820
     elsewhere.  And it's <= rather than <, because some of the
821
     doubling calculations add 1 afterwards.  */
822
12.7k
  if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2
823
12.7k
      <= pat_len))
824
0
    return REG_ESPACE;
825
826
12.7k
  dfa->nodes_alloc = pat_len + 1;
827
12.7k
  dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
828
829
  /*  table_size = 2 ^ ceil(log pat_len) */
830
53.4k
  for (table_size = 1; ; table_size <<= 1)
831
66.2k
    if (table_size > pat_len)
832
12.7k
      break;
833
834
12.7k
  dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
835
12.7k
  dfa->state_hash_mask = table_size - 1;
836
837
12.7k
  dfa->mb_cur_max = MB_CUR_MAX;
838
#ifdef _LIBC
839
  if (dfa->mb_cur_max == 6
840
      && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
841
    dfa->is_utf8 = 1;
842
  dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
843
           != 0);
844
#else
845
12.7k
  codeset_name = nl_langinfo (CODESET);
846
12.7k
  if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
847
12.7k
      && (codeset_name[1] == 'T' || codeset_name[1] == 't')
848
12.7k
      && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
849
12.7k
      && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
850
0
    dfa->is_utf8 = 1;
851
852
  /* We check exhaustively in the loop below if this charset is a
853
     superset of ASCII.  */
854
12.7k
  dfa->map_notascii = 0;
855
12.7k
#endif
856
857
12.7k
  if (dfa->mb_cur_max > 1)
858
0
    {
859
0
      if (dfa->is_utf8)
860
0
  dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
861
0
      else
862
0
  {
863
0
    int i, j, ch;
864
865
0
    dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
866
0
    if (__glibc_unlikely (dfa->sb_char == NULL))
867
0
      return REG_ESPACE;
868
869
    /* Set the bits corresponding to single byte chars.  */
870
0
    for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
871
0
      for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
872
0
        {
873
0
    wint_t wch = __btowc (ch);
874
0
    if (wch != WEOF)
875
0
      dfa->sb_char[i] |= (bitset_word_t) 1 << j;
876
0
#ifndef _LIBC
877
0
    if (isascii (ch) && wch != ch)
878
0
      dfa->map_notascii = 1;
879
0
#endif
880
0
        }
881
0
  }
882
0
    }
883
884
12.7k
  if (__glibc_unlikely (dfa->nodes == NULL || dfa->state_table == NULL))
885
0
    return REG_ESPACE;
886
12.7k
  return REG_NOERROR;
887
12.7k
}
888
889
/* Initialize WORD_CHAR table, which indicate which character is
890
   "word".  In this case "word" means that it is the word construction
891
   character used by some operators like "\<", "\>", etc.  */
892
893
static void
894
init_word_char (re_dfa_t *dfa)
895
0
{
896
0
  int i = 0;
897
0
  int j;
898
0
  int ch = 0;
899
0
  dfa->word_ops_used = 1;
900
0
  if (__glibc_likely (dfa->map_notascii == 0))
901
0
    {
902
0
      bitset_word_t bits0 = 0x00000000;
903
0
      bitset_word_t bits1 = 0x03ff0000;
904
0
      bitset_word_t bits2 = 0x87fffffe;
905
0
      bitset_word_t bits3 = 0x07fffffe;
906
0
      if (BITSET_WORD_BITS == 64)
907
0
  {
908
    /* Pacify gcc -Woverflow on 32-bit platformns.  */
909
0
    dfa->word_char[0] = bits1 << 31 << 1 | bits0;
910
0
    dfa->word_char[1] = bits3 << 31 << 1 | bits2;
911
0
    i = 2;
912
0
  }
913
0
      else if (BITSET_WORD_BITS == 32)
914
0
  {
915
0
    dfa->word_char[0] = bits0;
916
0
    dfa->word_char[1] = bits1;
917
0
    dfa->word_char[2] = bits2;
918
0
    dfa->word_char[3] = bits3;
919
0
    i = 4;
920
0
  }
921
0
      else
922
0
        goto general_case;
923
0
      ch = 128;
924
925
0
      if (__glibc_likely (dfa->is_utf8))
926
0
  {
927
0
    memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
928
0
    return;
929
0
  }
930
0
    }
931
932
0
 general_case:
933
0
  for (; i < BITSET_WORDS; ++i)
934
0
    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
935
0
      if (isalnum (ch) || ch == '_')
936
0
  dfa->word_char[i] |= (bitset_word_t) 1 << j;
937
0
}
938
939
/* Free the work area which are only used while compiling.  */
940
941
static void
942
free_workarea_compile (regex_t *preg)
943
12.7k
{
944
12.7k
  re_dfa_t *dfa = preg->buffer;
945
12.7k
  bin_tree_storage_t *storage, *next;
946
1.09M
  for (storage = dfa->str_tree_storage; storage; storage = next)
947
1.07M
    {
948
1.07M
      next = storage->next;
949
1.07M
      re_free (storage);
950
1.07M
    }
951
12.7k
  dfa->str_tree_storage = NULL;
952
12.7k
  dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
953
12.7k
  dfa->str_tree = NULL;
954
12.7k
  re_free (dfa->org_indices);
955
12.7k
  dfa->org_indices = NULL;
956
12.7k
}
957
958
/* Create initial states for all contexts.  */
959
960
static reg_errcode_t
961
create_initial_state (re_dfa_t *dfa)
962
12.7k
{
963
12.7k
  Idx first, i;
964
12.7k
  reg_errcode_t err;
965
12.7k
  re_node_set init_nodes;
966
967
  /* Initial states have the epsilon closure of the node which is
968
     the first node of the regular expression.  */
969
12.7k
  first = dfa->str_tree->first->node_idx;
970
12.7k
  dfa->init_node = first;
971
12.7k
  err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
972
12.7k
  if (__glibc_unlikely (err != REG_NOERROR))
973
0
    return err;
974
975
  /* The back-references which are in initial states can epsilon transit,
976
     since in this case all of the subexpressions can be null.
977
     Then we add epsilon closures of the nodes which are the next nodes of
978
     the back-references.  */
979
12.7k
  if (dfa->nbackref > 0)
980
0
    for (i = 0; i < init_nodes.nelem; ++i)
981
0
      {
982
0
  Idx node_idx = init_nodes.elems[i];
983
0
  re_token_type_t type = dfa->nodes[node_idx].type;
984
985
0
  Idx clexp_idx;
986
0
  if (type != OP_BACK_REF)
987
0
    continue;
988
0
  for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
989
0
    {
990
0
      re_token_t *clexp_node;
991
0
      clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
992
0
      if (clexp_node->type == OP_CLOSE_SUBEXP
993
0
    && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
994
0
        break;
995
0
    }
996
0
  if (clexp_idx == init_nodes.nelem)
997
0
    continue;
998
999
0
  if (type == OP_BACK_REF)
1000
0
    {
1001
0
      Idx dest_idx = dfa->edests[node_idx].elems[0];
1002
0
      if (!re_node_set_contains (&init_nodes, dest_idx))
1003
0
        {
1004
0
    reg_errcode_t merge_err
1005
0
                  = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1006
0
    if (merge_err != REG_NOERROR)
1007
0
      return merge_err;
1008
0
    i = 0;
1009
0
        }
1010
0
    }
1011
0
      }
1012
1013
  /* It must be the first time to invoke acquire_state.  */
1014
12.7k
  dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1015
  /* We don't check ERR here, since the initial state must not be NULL.  */
1016
12.7k
  if (__glibc_unlikely (dfa->init_state == NULL))
1017
0
    return err;
1018
12.7k
  if (dfa->init_state->has_constraint)
1019
229
    {
1020
229
      dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1021
229
                   CONTEXT_WORD);
1022
229
      dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1023
229
                 CONTEXT_NEWLINE);
1024
229
      dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1025
229
               &init_nodes,
1026
229
               CONTEXT_NEWLINE
1027
229
               | CONTEXT_BEGBUF);
1028
229
      if (__glibc_unlikely (dfa->init_state_word == NULL
1029
229
          || dfa->init_state_nl == NULL
1030
229
          || dfa->init_state_begbuf == NULL))
1031
0
  return err;
1032
229
    }
1033
12.4k
  else
1034
12.4k
    dfa->init_state_word = dfa->init_state_nl
1035
12.4k
      = dfa->init_state_begbuf = dfa->init_state;
1036
1037
12.7k
  re_node_set_free (&init_nodes);
1038
12.7k
  return REG_NOERROR;
1039
12.7k
}
1040

1041
/* If it is possible to do searching in single byte encoding instead of UTF-8
1042
   to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1043
   DFA nodes where needed.  */
1044
1045
static void
1046
optimize_utf8 (re_dfa_t *dfa)
1047
0
{
1048
0
  Idx node;
1049
0
  int i;
1050
0
  bool mb_chars = false;
1051
0
  bool has_period = false;
1052
1053
0
  for (node = 0; node < dfa->nodes_len; ++node)
1054
0
    switch (dfa->nodes[node].type)
1055
0
      {
1056
0
      case CHARACTER:
1057
0
  if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1058
0
    mb_chars = true;
1059
0
  break;
1060
0
      case ANCHOR:
1061
0
  switch (dfa->nodes[node].opr.ctx_type)
1062
0
    {
1063
0
    case LINE_FIRST:
1064
0
    case LINE_LAST:
1065
0
    case BUF_FIRST:
1066
0
    case BUF_LAST:
1067
0
      break;
1068
0
    default:
1069
      /* Word anchors etc. cannot be handled.  It's okay to test
1070
         opr.ctx_type since constraints (for all DFA nodes) are
1071
         created by ORing one or more opr.ctx_type values.  */
1072
0
      return;
1073
0
    }
1074
0
  break;
1075
0
      case OP_PERIOD:
1076
0
  has_period = true;
1077
0
  break;
1078
0
      case OP_BACK_REF:
1079
0
      case OP_ALT:
1080
0
      case END_OF_RE:
1081
0
      case OP_DUP_ASTERISK:
1082
0
      case OP_OPEN_SUBEXP:
1083
0
      case OP_CLOSE_SUBEXP:
1084
0
  break;
1085
0
      case COMPLEX_BRACKET:
1086
0
  return;
1087
0
      case SIMPLE_BRACKET:
1088
  /* Just double check.  */
1089
0
  {
1090
0
    int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1091
0
      ? 0
1092
0
      : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1093
0
    for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1094
0
      {
1095
0
        if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1096
0
    return;
1097
0
        rshift = 0;
1098
0
      }
1099
0
  }
1100
0
  break;
1101
0
      default:
1102
0
  abort ();
1103
0
      }
1104
1105
0
  if (mb_chars || has_period)
1106
0
    for (node = 0; node < dfa->nodes_len; ++node)
1107
0
      {
1108
0
  if (dfa->nodes[node].type == CHARACTER
1109
0
      && dfa->nodes[node].opr.c >= ASCII_CHARS)
1110
0
    dfa->nodes[node].mb_partial = 0;
1111
0
  else if (dfa->nodes[node].type == OP_PERIOD)
1112
0
    dfa->nodes[node].type = OP_UTF8_PERIOD;
1113
0
      }
1114
1115
  /* The search can be in single byte locale.  */
1116
0
  dfa->mb_cur_max = 1;
1117
0
  dfa->is_utf8 = 0;
1118
0
  dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1119
0
}
1120

1121
/* Analyze the structure tree, and calculate "first", "next", "edest",
1122
   "eclosure", and "inveclosure".  */
1123
1124
static reg_errcode_t
1125
analyze (regex_t *preg)
1126
12.7k
{
1127
12.7k
  re_dfa_t *dfa = preg->buffer;
1128
12.7k
  reg_errcode_t ret;
1129
1130
  /* Allocate arrays.  */
1131
12.7k
  dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1132
12.7k
  dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1133
12.7k
  dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1134
12.7k
  dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1135
12.7k
  if (__glibc_unlikely (dfa->nexts == NULL || dfa->org_indices == NULL
1136
12.7k
      || dfa->edests == NULL || dfa->eclosures == NULL))
1137
0
    return REG_ESPACE;
1138
1139
12.7k
  dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1140
12.7k
  if (dfa->subexp_map != NULL)
1141
12.7k
    {
1142
12.7k
      Idx i;
1143
14.9k
      for (i = 0; i < preg->re_nsub; i++)
1144
2.19k
  dfa->subexp_map[i] = i;
1145
12.7k
      preorder (dfa->str_tree, optimize_subexps, dfa);
1146
14.7k
      for (i = 0; i < preg->re_nsub; i++)
1147
2.04k
  if (dfa->subexp_map[i] != i)
1148
16
    break;
1149
12.7k
      if (i == preg->re_nsub)
1150
12.6k
  {
1151
12.6k
    re_free (dfa->subexp_map);
1152
12.6k
    dfa->subexp_map = NULL;
1153
12.6k
  }
1154
12.7k
    }
1155
1156
12.7k
  ret = postorder (dfa->str_tree, lower_subexps, preg);
1157
12.7k
  if (__glibc_unlikely (ret != REG_NOERROR))
1158
0
    return ret;
1159
12.7k
  ret = postorder (dfa->str_tree, calc_first, dfa);
1160
12.7k
  if (__glibc_unlikely (ret != REG_NOERROR))
1161
0
    return ret;
1162
12.7k
  preorder (dfa->str_tree, calc_next, dfa);
1163
12.7k
  ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1164
12.7k
  if (__glibc_unlikely (ret != REG_NOERROR))
1165
0
    return ret;
1166
12.7k
  ret = calc_eclosure (dfa);
1167
12.7k
  if (__glibc_unlikely (ret != REG_NOERROR))
1168
0
    return ret;
1169
1170
  /* We only need this during the prune_impossible_nodes pass in regexec.c;
1171
     skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1172
12.7k
  if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1173
12.7k
      || dfa->nbackref)
1174
246
    {
1175
246
      dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1176
246
      if (__glibc_unlikely (dfa->inveclosures == NULL))
1177
0
  return REG_ESPACE;
1178
246
      ret = calc_inveclosure (dfa);
1179
246
    }
1180
1181
12.7k
  return ret;
1182
12.7k
}
1183
1184
/* Our parse trees are very unbalanced, so we cannot use a stack to
1185
   implement parse tree visits.  Instead, we use parent pointers and
1186
   some hairy code in these two functions.  */
1187
static reg_errcode_t
1188
postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1189
     void *extra)
1190
40.6k
{
1191
40.6k
  bin_tree_t *node, *prev;
1192
1193
40.6k
  for (node = root; ; )
1194
13.3M
    {
1195
      /* Descend down the tree, preferably to the left (or to the right
1196
   if that's the only child).  */
1197
28.3M
      while (node->left || node->right)
1198
14.9M
  if (node->left)
1199
14.9M
    node = node->left;
1200
230
  else
1201
230
    node = node->right;
1202
1203
13.3M
      do
1204
28.3M
  {
1205
28.3M
    reg_errcode_t err = fn (extra, node);
1206
28.3M
    if (__glibc_unlikely (err != REG_NOERROR))
1207
0
      return err;
1208
28.3M
    if (node->parent == NULL)
1209
40.6k
      return REG_NOERROR;
1210
28.2M
    prev = node;
1211
28.2M
    node = node->parent;
1212
28.2M
  }
1213
      /* Go up while we have a node that is reached from the right.  */
1214
28.2M
      while (node->right == prev || node->right == NULL);
1215
13.3M
      node = node->right;
1216
13.3M
    }
1217
40.6k
}
1218
1219
static reg_errcode_t
1220
preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1221
    void *extra)
1222
38.1k
{
1223
38.1k
  bin_tree_t *node;
1224
1225
38.1k
  for (node = root; ; )
1226
29.8M
    {
1227
29.8M
      reg_errcode_t err = fn (extra, node);
1228
29.8M
      if (__glibc_unlikely (err != REG_NOERROR))
1229
0
  return err;
1230
1231
      /* Go to the left node, or up and to the right.  */
1232
29.8M
      if (node->left)
1233
14.9M
  node = node->left;
1234
14.9M
      else
1235
14.9M
  {
1236
14.9M
    bin_tree_t *prev = NULL;
1237
44.7M
    while (node->right == prev || node->right == NULL)
1238
29.8M
      {
1239
29.8M
        prev = node;
1240
29.8M
        node = node->parent;
1241
29.8M
        if (!node)
1242
38.1k
    return REG_NOERROR;
1243
29.8M
      }
1244
14.8M
    node = node->right;
1245
14.8M
  }
1246
29.8M
    }
1247
38.1k
}
1248
1249
/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1250
   re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1251
   backreferences as well.  Requires a preorder visit.  */
1252
static reg_errcode_t
1253
optimize_subexps (void *extra, bin_tree_t *node)
1254
9.95M
{
1255
9.95M
  re_dfa_t *dfa = (re_dfa_t *) extra;
1256
1257
9.95M
  if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1258
0
    {
1259
0
      int idx = node->token.opr.idx;
1260
0
      node->token.opr.idx = dfa->subexp_map[idx];
1261
0
      dfa->used_bkref_map |= 1 << node->token.opr.idx;
1262
0
    }
1263
1264
9.95M
  else if (node->token.type == SUBEXP
1265
9.95M
     && node->left && node->left->token.type == SUBEXP)
1266
27
    {
1267
27
      Idx other_idx = node->left->token.opr.idx;
1268
1269
27
      node->left = node->left->left;
1270
27
      if (node->left)
1271
26
  node->left->parent = node;
1272
1273
27
      dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1274
27
      if (other_idx < BITSET_WORD_BITS)
1275
27
  dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1276
27
    }
1277
1278
9.95M
  return REG_NOERROR;
1279
9.95M
}
1280
1281
/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1282
   of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1283
static reg_errcode_t
1284
lower_subexps (void *extra, bin_tree_t *node)
1285
9.95M
{
1286
9.95M
  regex_t *preg = (regex_t *) extra;
1287
9.95M
  reg_errcode_t err = REG_NOERROR;
1288
1289
9.95M
  if (node->left && node->left->token.type == SUBEXP)
1290
327
    {
1291
327
      node->left = lower_subexp (&err, preg, node->left);
1292
327
      if (node->left)
1293
327
  node->left->parent = node;
1294
327
    }
1295
9.95M
  if (node->right && node->right->token.type == SUBEXP)
1296
1.86k
    {
1297
1.86k
      node->right = lower_subexp (&err, preg, node->right);
1298
1.86k
      if (node->right)
1299
1.86k
  node->right->parent = node;
1300
1.86k
    }
1301
1302
9.95M
  return err;
1303
9.95M
}
1304
1305
static bin_tree_t *
1306
lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1307
2.19k
{
1308
2.19k
  re_dfa_t *dfa = preg->buffer;
1309
2.19k
  bin_tree_t *body = node->left;
1310
2.19k
  bin_tree_t *op, *cls, *tree1, *tree;
1311
1312
2.19k
  if (preg->no_sub
1313
      /* We do not optimize empty subexpressions, because otherwise we may
1314
   have bad CONCAT nodes with NULL children.  This is obviously not
1315
   very common, so we do not lose much.  An example that triggers
1316
   this case is the sed "script" /\(\)/x.  */
1317
2.19k
      && node->left != NULL
1318
2.19k
      && (node->token.opr.idx >= BITSET_WORD_BITS
1319
0
    || !(dfa->used_bkref_map
1320
0
         & ((bitset_word_t) 1 << node->token.opr.idx))))
1321
0
    return node->left;
1322
1323
  /* Convert the SUBEXP node to the concatenation of an
1324
     OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1325
2.19k
  op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1326
2.19k
  cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1327
2.19k
  tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1328
2.19k
  tree = create_tree (dfa, op, tree1, CONCAT);
1329
2.19k
  if (__glibc_unlikely (tree == NULL || tree1 == NULL
1330
2.19k
      || op == NULL || cls == NULL))
1331
0
    {
1332
0
      *err = REG_ESPACE;
1333
0
      return NULL;
1334
0
    }
1335
1336
2.19k
  op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1337
2.19k
  op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1338
2.19k
  return tree;
1339
2.19k
}
1340
1341
/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1342
   nodes.  Requires a postorder visit.  */
1343
static reg_errcode_t
1344
calc_first (void *extra, bin_tree_t *node)
1345
9.95M
{
1346
9.95M
  re_dfa_t *dfa = (re_dfa_t *) extra;
1347
9.95M
  if (node->token.type == CONCAT)
1348
4.92M
    {
1349
4.92M
      node->first = node->left->first;
1350
4.92M
      node->node_idx = node->left->node_idx;
1351
4.92M
    }
1352
5.02M
  else
1353
5.02M
    {
1354
5.02M
      node->first = node;
1355
5.02M
      node->node_idx = re_dfa_add_node (dfa, node->token);
1356
5.02M
      if (__glibc_unlikely (node->node_idx == -1))
1357
0
  return REG_ESPACE;
1358
5.02M
      if (node->token.type == ANCHOR)
1359
439
  dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1360
5.02M
    }
1361
9.95M
  return REG_NOERROR;
1362
9.95M
}
1363
1364
/* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1365
static reg_errcode_t
1366
calc_next (void *extra, bin_tree_t *node)
1367
9.95M
{
1368
9.95M
  switch (node->token.type)
1369
9.95M
    {
1370
21.6k
    case OP_DUP_ASTERISK:
1371
21.6k
      node->left->next = node;
1372
21.6k
      break;
1373
4.92M
    case CONCAT:
1374
4.92M
      node->left->next = node->right->first;
1375
4.92M
      node->right->next = node->next;
1376
4.92M
      break;
1377
5.00M
    default:
1378
5.00M
      if (node->left)
1379
38.8k
  node->left->next = node->next;
1380
5.00M
      if (node->right)
1381
26.7k
  node->right->next = node->next;
1382
5.00M
      break;
1383
9.95M
    }
1384
9.95M
  return REG_NOERROR;
1385
9.95M
}
1386
1387
/* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1388
static reg_errcode_t
1389
link_nfa_nodes (void *extra, bin_tree_t *node)
1390
9.95M
{
1391
9.95M
  re_dfa_t *dfa = (re_dfa_t *) extra;
1392
9.95M
  Idx idx = node->node_idx;
1393
9.95M
  reg_errcode_t err = REG_NOERROR;
1394
1395
9.95M
  switch (node->token.type)
1396
9.95M
    {
1397
4.92M
    case CONCAT:
1398
4.92M
      break;
1399
1400
12.7k
    case END_OF_RE:
1401
12.7k
      DEBUG_ASSERT (node->next == NULL);
1402
0
      break;
1403
1404
21.6k
    case OP_DUP_ASTERISK:
1405
60.6k
    case OP_ALT:
1406
60.6k
      {
1407
60.6k
  Idx left, right;
1408
60.6k
  dfa->has_plural_match = 1;
1409
60.6k
  if (node->left != NULL)
1410
60.5k
    left = node->left->first->node_idx;
1411
109
  else
1412
109
    left = node->next->node_idx;
1413
60.6k
  if (node->right != NULL)
1414
26.7k
    right = node->right->first->node_idx;
1415
33.8k
  else
1416
33.8k
    right = node->next->node_idx;
1417
60.6k
  DEBUG_ASSERT (left > -1);
1418
60.6k
  DEBUG_ASSERT (right > -1);
1419
0
  err = re_node_set_init_2 (dfa->edests + idx, left, right);
1420
60.6k
      }
1421
0
      break;
1422
1423
439
    case ANCHOR:
1424
2.63k
    case OP_OPEN_SUBEXP:
1425
4.82k
    case OP_CLOSE_SUBEXP:
1426
4.82k
      err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1427
4.82k
      break;
1428
1429
0
    case OP_BACK_REF:
1430
0
      dfa->nexts[idx] = node->next->node_idx;
1431
0
      if (node->token.type == OP_BACK_REF)
1432
0
  err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1433
0
      break;
1434
1435
4.95M
    default:
1436
4.95M
      DEBUG_ASSERT (!IS_EPSILON_NODE (node->token.type));
1437
0
      dfa->nexts[idx] = node->next->node_idx;
1438
4.95M
      break;
1439
9.95M
    }
1440
1441
9.95M
  return err;
1442
9.95M
}
1443
1444
/* Duplicate the epsilon closure of the node ROOT_NODE.
1445
   Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1446
   to their own constraint.  */
1447
1448
static reg_errcode_t
1449
duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1450
      Idx root_node, unsigned int init_constraint)
1451
673
{
1452
673
  Idx org_node, clone_node;
1453
673
  bool ok;
1454
673
  unsigned int constraint = init_constraint;
1455
673
  for (org_node = top_org_node, clone_node = top_clone_node;;)
1456
1.69k
    {
1457
1.69k
      Idx org_dest, clone_dest;
1458
1.69k
      if (dfa->nodes[org_node].type == OP_BACK_REF)
1459
0
  {
1460
    /* If the back reference epsilon-transit, its destination must
1461
       also have the constraint.  Then duplicate the epsilon closure
1462
       of the destination of the back reference, and store it in
1463
       edests of the back reference.  */
1464
0
    org_dest = dfa->nexts[org_node];
1465
0
    re_node_set_empty (dfa->edests + clone_node);
1466
0
    clone_dest = duplicate_node (dfa, org_dest, constraint);
1467
0
    if (__glibc_unlikely (clone_dest == -1))
1468
0
      return REG_ESPACE;
1469
0
    dfa->nexts[clone_node] = dfa->nexts[org_node];
1470
0
    ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1471
0
    if (__glibc_unlikely (! ok))
1472
0
      return REG_ESPACE;
1473
0
  }
1474
1.69k
      else if (dfa->edests[org_node].nelem == 0)
1475
657
  {
1476
    /* In case of the node can't epsilon-transit, don't duplicate the
1477
       destination and store the original destination as the
1478
       destination of the node.  */
1479
657
    dfa->nexts[clone_node] = dfa->nexts[org_node];
1480
657
    break;
1481
657
  }
1482
1.03k
      else if (dfa->edests[org_node].nelem == 1)
1483
748
  {
1484
    /* In case of the node can epsilon-transit, and it has only one
1485
       destination.  */
1486
748
    org_dest = dfa->edests[org_node].elems[0];
1487
748
    re_node_set_empty (dfa->edests + clone_node);
1488
    /* If the node is root_node itself, it means the epsilon closure
1489
       has a loop.  Then tie it to the destination of the root_node.  */
1490
748
    if (org_node == root_node && clone_node != org_node)
1491
16
      {
1492
16
        ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1493
16
        if (__glibc_unlikely (! ok))
1494
0
          return REG_ESPACE;
1495
16
        break;
1496
16
      }
1497
    /* In case the node has another constraint, append it.  */
1498
732
    constraint |= dfa->nodes[org_node].constraint;
1499
732
    clone_dest = duplicate_node (dfa, org_dest, constraint);
1500
732
    if (__glibc_unlikely (clone_dest == -1))
1501
0
      return REG_ESPACE;
1502
732
    ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1503
732
    if (__glibc_unlikely (! ok))
1504
0
      return REG_ESPACE;
1505
732
  }
1506
288
      else /* dfa->edests[org_node].nelem == 2 */
1507
288
  {
1508
    /* In case of the node can epsilon-transit, and it has two
1509
       destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1510
288
    org_dest = dfa->edests[org_node].elems[0];
1511
288
    re_node_set_empty (dfa->edests + clone_node);
1512
    /* Search for a duplicated node which satisfies the constraint.  */
1513
288
    clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1514
288
    if (clone_dest == -1)
1515
234
      {
1516
        /* There is no such duplicated node, create a new one.  */
1517
234
        reg_errcode_t err;
1518
234
        clone_dest = duplicate_node (dfa, org_dest, constraint);
1519
234
        if (__glibc_unlikely (clone_dest == -1))
1520
0
    return REG_ESPACE;
1521
234
        ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1522
234
        if (__glibc_unlikely (! ok))
1523
0
    return REG_ESPACE;
1524
234
        err = duplicate_node_closure (dfa, org_dest, clone_dest,
1525
234
              root_node, constraint);
1526
234
        if (__glibc_unlikely (err != REG_NOERROR))
1527
0
    return err;
1528
234
      }
1529
54
    else
1530
54
      {
1531
        /* There is a duplicated node which satisfies the constraint,
1532
     use it to avoid infinite loop.  */
1533
54
        ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1534
54
        if (__glibc_unlikely (! ok))
1535
0
    return REG_ESPACE;
1536
54
      }
1537
1538
288
    org_dest = dfa->edests[org_node].elems[1];
1539
288
    clone_dest = duplicate_node (dfa, org_dest, constraint);
1540
288
    if (__glibc_unlikely (clone_dest == -1))
1541
0
      return REG_ESPACE;
1542
288
    ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1543
288
    if (__glibc_unlikely (! ok))
1544
0
      return REG_ESPACE;
1545
288
  }
1546
1.02k
      org_node = org_dest;
1547
1.02k
      clone_node = clone_dest;
1548
1.02k
    }
1549
673
  return REG_NOERROR;
1550
673
}
1551
1552
/* Search for a node which is duplicated from the node ORG_NODE, and
1553
   satisfies the constraint CONSTRAINT.  */
1554
1555
static Idx
1556
search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1557
      unsigned int constraint)
1558
288
{
1559
288
  Idx idx;
1560
4.62k
  for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1561
4.38k
    {
1562
4.38k
      if (org_node == dfa->org_indices[idx]
1563
4.38k
    && constraint == dfa->nodes[idx].constraint)
1564
54
  return idx; /* Found.  */
1565
4.38k
    }
1566
234
  return -1; /* Not found.  */
1567
288
}
1568
1569
/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1570
   Return the index of the new node, or -1 if insufficient storage is
1571
   available.  */
1572
1573
static Idx
1574
duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1575
1.25k
{
1576
1.25k
  Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1577
1.25k
  if (__glibc_likely (dup_idx != -1))
1578
1.25k
    {
1579
1.25k
      dfa->nodes[dup_idx].constraint = constraint;
1580
1.25k
      dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1581
1.25k
      dfa->nodes[dup_idx].duplicated = 1;
1582
1583
      /* Store the index of the original node.  */
1584
1.25k
      dfa->org_indices[dup_idx] = org_idx;
1585
1.25k
    }
1586
1.25k
  return dup_idx;
1587
1.25k
}
1588
1589
static reg_errcode_t
1590
calc_inveclosure (re_dfa_t *dfa)
1591
246
{
1592
246
  Idx src, idx;
1593
246
  bool ok;
1594
5.00M
  for (idx = 0; idx < dfa->nodes_len; ++idx)
1595
5.00M
    re_node_set_init_empty (dfa->inveclosures + idx);
1596
1597
5.00M
  for (src = 0; src < dfa->nodes_len; ++src)
1598
5.00M
    {
1599
5.00M
      Idx *elems = dfa->eclosures[src].elems;
1600
22.2M
      for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1601
17.2M
  {
1602
17.2M
    ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1603
17.2M
    if (__glibc_unlikely (! ok))
1604
0
      return REG_ESPACE;
1605
17.2M
  }
1606
5.00M
    }
1607
1608
246
  return REG_NOERROR;
1609
246
}
1610
1611
/* Calculate "eclosure" for all the node in DFA.  */
1612
1613
static reg_errcode_t
1614
calc_eclosure (re_dfa_t *dfa)
1615
12.7k
{
1616
12.7k
  Idx node_idx;
1617
12.7k
  bool incomplete;
1618
12.7k
  DEBUG_ASSERT (dfa->nodes_len > 0);
1619
0
  incomplete = false;
1620
  /* For each nodes, calculate epsilon closure.  */
1621
5.03M
  for (node_idx = 0; ; ++node_idx)
1622
5.04M
    {
1623
5.04M
      reg_errcode_t err;
1624
5.04M
      re_node_set eclosure_elem;
1625
5.04M
      if (node_idx == dfa->nodes_len)
1626
12.7k
  {
1627
12.7k
    if (!incomplete)
1628
12.7k
      break;
1629
0
    incomplete = false;
1630
0
    node_idx = 0;
1631
0
  }
1632
1633
5.03M
      DEBUG_ASSERT (dfa->eclosures[node_idx].nelem != -1);
1634
1635
      /* If we have already calculated, skip it.  */
1636
5.03M
      if (dfa->eclosures[node_idx].nelem != 0)
1637
84.0k
  continue;
1638
      /* Calculate epsilon closure of 'node_idx'.  */
1639
4.94M
      err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1640
4.94M
      if (__glibc_unlikely (err != REG_NOERROR))
1641
0
  return err;
1642
1643
4.94M
      if (dfa->eclosures[node_idx].nelem == 0)
1644
0
  {
1645
0
    incomplete = true;
1646
0
    re_node_set_free (&eclosure_elem);
1647
0
  }
1648
4.94M
    }
1649
12.7k
  return REG_NOERROR;
1650
12.7k
}
1651
1652
/* Calculate epsilon closure of NODE.  */
1653
1654
static reg_errcode_t
1655
calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1656
5.03M
{
1657
5.03M
  reg_errcode_t err;
1658
5.03M
  Idx i;
1659
5.03M
  re_node_set eclosure;
1660
5.03M
  bool incomplete = false;
1661
5.03M
  err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1662
5.03M
  if (__glibc_unlikely (err != REG_NOERROR))
1663
0
    return err;
1664
1665
  /* An epsilon closure includes itself.  */
1666
5.03M
  eclosure.elems[eclosure.nelem++] = node;
1667
1668
  /* This indicates that we are calculating this node now.
1669
     We reference this value to avoid infinite loop.  */
1670
5.03M
  dfa->eclosures[node].nelem = -1;
1671
1672
  /* If the current node has constraints, duplicate all nodes
1673
     since they must inherit the constraints.  */
1674
5.03M
  if (dfa->nodes[node].constraint
1675
5.03M
      && dfa->edests[node].nelem
1676
5.03M
      && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1677
439
    {
1678
439
      err = duplicate_node_closure (dfa, node, node, node,
1679
439
            dfa->nodes[node].constraint);
1680
439
      if (__glibc_unlikely (err != REG_NOERROR))
1681
0
  return err;
1682
439
    }
1683
1684
  /* Expand each epsilon destination nodes.  */
1685
5.03M
  if (IS_EPSILON_NODE(dfa->nodes[node].type))
1686
209k
    for (i = 0; i < dfa->edests[node].nelem; ++i)
1687
137k
      {
1688
137k
  re_node_set eclosure_elem;
1689
137k
  Idx edest = dfa->edests[node].elems[i];
1690
  /* If calculating the epsilon closure of 'edest' is in progress,
1691
     return intermediate result.  */
1692
137k
  if (dfa->eclosures[edest].nelem == -1)
1693
1.13k
    {
1694
1.13k
      incomplete = true;
1695
1.13k
      continue;
1696
1.13k
    }
1697
  /* If we haven't calculated the epsilon closure of 'edest' yet,
1698
     calculate now. Otherwise use calculated epsilon closure.  */
1699
136k
  if (dfa->eclosures[edest].nelem == 0)
1700
90.0k
    {
1701
90.0k
      err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1702
90.0k
      if (__glibc_unlikely (err != REG_NOERROR))
1703
0
        return err;
1704
90.0k
    }
1705
46.0k
  else
1706
46.0k
    eclosure_elem = dfa->eclosures[edest];
1707
  /* Merge the epsilon closure of 'edest'.  */
1708
136k
  err = re_node_set_merge (&eclosure, &eclosure_elem);
1709
136k
  if (__glibc_unlikely (err != REG_NOERROR))
1710
0
    return err;
1711
  /* If the epsilon closure of 'edest' is incomplete,
1712
     the epsilon closure of this node is also incomplete.  */
1713
136k
  if (dfa->eclosures[edest].nelem == 0)
1714
6.04k
    {
1715
6.04k
      incomplete = true;
1716
6.04k
      re_node_set_free (&eclosure_elem);
1717
6.04k
    }
1718
136k
      }
1719
1720
5.03M
  if (incomplete && !root)
1721
6.04k
    dfa->eclosures[node].nelem = 0;
1722
5.03M
  else
1723
5.03M
    dfa->eclosures[node] = eclosure;
1724
5.03M
  *new_set = eclosure;
1725
5.03M
  return REG_NOERROR;
1726
5.03M
}
1727

1728
/* Functions for token which are used in the parser.  */
1729
1730
/* Fetch a token from INPUT.
1731
   We must not use this function inside bracket expressions.  */
1732
1733
static void
1734
fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1735
8.02M
{
1736
8.02M
  re_string_skip_bytes (input, peek_token (result, input, syntax));
1737
8.02M
}
1738
1739
/* Peek a token from INPUT, and return the length of the token.
1740
   We must not use this function inside bracket expressions.  */
1741
1742
static int
1743
peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1744
3.87G
{
1745
3.87G
  unsigned char c;
1746
1747
3.87G
  if (re_string_eoi (input))
1748
12.7k
    {
1749
12.7k
      token->type = END_OF_RE;
1750
12.7k
      return 0;
1751
12.7k
    }
1752
1753
3.87G
  c = re_string_peek_byte (input, 0);
1754
3.87G
  token->opr.c = c;
1755
1756
3.87G
  token->word_char = 0;
1757
3.87G
  token->mb_partial = 0;
1758
3.87G
  if (input->mb_cur_max > 1
1759
3.87G
      && !re_string_first_byte (input, re_string_cur_idx (input)))
1760
0
    {
1761
0
      token->type = CHARACTER;
1762
0
      token->mb_partial = 1;
1763
0
      return 1;
1764
0
    }
1765
3.87G
  if (c == '\\')
1766
1.65k
    {
1767
1.65k
      unsigned char c2;
1768
1.65k
      if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1769
0
  {
1770
0
    token->type = BACK_SLASH;
1771
0
    return 1;
1772
0
  }
1773
1774
1.65k
      c2 = re_string_peek_byte_case (input, 1);
1775
1.65k
      token->opr.c = c2;
1776
1.65k
      token->type = CHARACTER;
1777
1.65k
      if (input->mb_cur_max > 1)
1778
0
  {
1779
0
    wint_t wc = re_string_wchar_at (input,
1780
0
            re_string_cur_idx (input) + 1);
1781
0
    token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1782
0
  }
1783
1.65k
      else
1784
1.65k
  token->word_char = IS_WORD_CHAR (c2) != 0;
1785
1786
1.65k
      switch (c2)
1787
1.65k
  {
1788
0
  case '|':
1789
0
    if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1790
0
      token->type = OP_ALT;
1791
0
    break;
1792
98
  case '1': case '2': case '3': case '4': case '5':
1793
98
  case '6': case '7': case '8': case '9':
1794
98
    if (!(syntax & RE_NO_BK_REFS))
1795
0
      {
1796
0
        token->type = OP_BACK_REF;
1797
0
        token->opr.idx = c2 - '1';
1798
0
      }
1799
98
    break;
1800
0
  case '<':
1801
0
    if (!(syntax & RE_NO_GNU_OPS))
1802
0
      {
1803
0
        token->type = ANCHOR;
1804
0
        token->opr.ctx_type = WORD_FIRST;
1805
0
      }
1806
0
    break;
1807
0
  case '>':
1808
0
    if (!(syntax & RE_NO_GNU_OPS))
1809
0
      {
1810
0
        token->type = ANCHOR;
1811
0
        token->opr.ctx_type = WORD_LAST;
1812
0
      }
1813
0
    break;
1814
0
  case 'b':
1815
0
    if (!(syntax & RE_NO_GNU_OPS))
1816
0
      {
1817
0
        token->type = ANCHOR;
1818
0
        token->opr.ctx_type = WORD_DELIM;
1819
0
      }
1820
0
    break;
1821
0
  case 'B':
1822
0
    if (!(syntax & RE_NO_GNU_OPS))
1823
0
      {
1824
0
        token->type = ANCHOR;
1825
0
        token->opr.ctx_type = NOT_WORD_DELIM;
1826
0
      }
1827
0
    break;
1828
0
  case 'w':
1829
0
    if (!(syntax & RE_NO_GNU_OPS))
1830
0
      token->type = OP_WORD;
1831
0
    break;
1832
0
  case 'W':
1833
0
    if (!(syntax & RE_NO_GNU_OPS))
1834
0
      token->type = OP_NOTWORD;
1835
0
    break;
1836
0
  case 's':
1837
0
    if (!(syntax & RE_NO_GNU_OPS))
1838
0
      token->type = OP_SPACE;
1839
0
    break;
1840
0
  case 'S':
1841
0
    if (!(syntax & RE_NO_GNU_OPS))
1842
0
      token->type = OP_NOTSPACE;
1843
0
    break;
1844
0
  case '`':
1845
0
    if (!(syntax & RE_NO_GNU_OPS))
1846
0
      {
1847
0
        token->type = ANCHOR;
1848
0
        token->opr.ctx_type = BUF_FIRST;
1849
0
      }
1850
0
    break;
1851
0
  case '\'':
1852
0
    if (!(syntax & RE_NO_GNU_OPS))
1853
0
      {
1854
0
        token->type = ANCHOR;
1855
0
        token->opr.ctx_type = BUF_LAST;
1856
0
      }
1857
0
    break;
1858
0
  case '(':
1859
0
    if (!(syntax & RE_NO_BK_PARENS))
1860
0
      token->type = OP_OPEN_SUBEXP;
1861
0
    break;
1862
0
  case ')':
1863
0
    if (!(syntax & RE_NO_BK_PARENS))
1864
0
      token->type = OP_CLOSE_SUBEXP;
1865
0
    break;
1866
0
  case '+':
1867
0
    if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1868
0
      token->type = OP_DUP_PLUS;
1869
0
    break;
1870
0
  case '?':
1871
0
    if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1872
0
      token->type = OP_DUP_QUESTION;
1873
0
    break;
1874
0
  case '{':
1875
0
    if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1876
0
      token->type = OP_OPEN_DUP_NUM;
1877
0
    break;
1878
132
  case '}':
1879
132
    if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1880
0
      token->type = OP_CLOSE_DUP_NUM;
1881
132
    break;
1882
1.42k
  default:
1883
1.42k
    break;
1884
1.65k
  }
1885
1.65k
      return 2;
1886
1.65k
    }
1887
1888
3.87G
  token->type = CHARACTER;
1889
3.87G
  if (input->mb_cur_max > 1)
1890
0
    {
1891
0
      wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1892
0
      token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1893
0
    }
1894
3.87G
  else
1895
3.87G
    token->word_char = IS_WORD_CHAR (token->opr.c);
1896
1897
3.87G
  switch (c)
1898
3.87G
    {
1899
709k
    case '\n':
1900
709k
      if (syntax & RE_NEWLINE_ALT)
1901
0
  token->type = OP_ALT;
1902
709k
      break;
1903
28.4k
    case '|':
1904
28.4k
      if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1905
28.4k
  token->type = OP_ALT;
1906
28.4k
      break;
1907
267k
    case '*':
1908
267k
      token->type = OP_DUP_ASTERISK;
1909
267k
      break;
1910
2.73k
    case '+':
1911
2.73k
      if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1912
2.73k
  token->type = OP_DUP_PLUS;
1913
2.73k
      break;
1914
12.2k
    case '?':
1915
12.2k
      if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1916
12.2k
  token->type = OP_DUP_QUESTION;
1917
12.2k
      break;
1918
0
    case '{':
1919
0
      if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1920
0
  token->type = OP_OPEN_DUP_NUM;
1921
0
      break;
1922
552
    case '}':
1923
552
      if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1924
552
  token->type = OP_CLOSE_DUP_NUM;
1925
552
      break;
1926
830k
    case '(':
1927
830k
      if (syntax & RE_NO_BK_PARENS)
1928
830k
  token->type = OP_OPEN_SUBEXP;
1929
830k
      break;
1930
815k
    case ')':
1931
815k
      if (syntax & RE_NO_BK_PARENS)
1932
815k
  token->type = OP_CLOSE_SUBEXP;
1933
815k
      break;
1934
17.0k
    case '[':
1935
17.0k
      token->type = OP_OPEN_BRACKET;
1936
17.0k
      break;
1937
28.8k
    case '.':
1938
28.8k
      token->type = OP_PERIOD;
1939
28.8k
      break;
1940
165
    case '^':
1941
165
      if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE))
1942
165
    && re_string_cur_idx (input) != 0)
1943
150
  {
1944
150
    char prev = re_string_peek_byte (input, -1);
1945
150
    if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1946
150
      break;
1947
150
  }
1948
15
      token->type = ANCHOR;
1949
15
      token->opr.ctx_type = LINE_FIRST;
1950
15
      break;
1951
3.86G
    case '$':
1952
3.86G
      if (!(syntax & RE_CONTEXT_INDEP_ANCHORS)
1953
3.86G
    && re_string_cur_idx (input) + 1 != re_string_length (input))
1954
3.86G
  {
1955
3.86G
    re_token_t next;
1956
3.86G
    re_string_skip_bytes (input, 1);
1957
3.86G
    peek_token (&next, input, syntax);
1958
3.86G
    re_string_skip_bytes (input, -1);
1959
3.86G
    if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1960
3.86G
      break;
1961
3.86G
  }
1962
415
      token->type = ANCHOR;
1963
415
      token->opr.ctx_type = LINE_LAST;
1964
415
      break;
1965
5.29M
    default:
1966
5.29M
      break;
1967
3.87G
    }
1968
3.87G
  return 1;
1969
3.87G
}
1970
1971
/* Peek a token from INPUT, and return the length of the token.
1972
   We must not use this function out of bracket expressions.  */
1973
1974
static int
1975
peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1976
62.2M
{
1977
62.2M
  unsigned char c;
1978
62.2M
  if (re_string_eoi (input))
1979
0
    {
1980
0
      token->type = END_OF_RE;
1981
0
      return 0;
1982
0
    }
1983
62.2M
  c = re_string_peek_byte (input, 0);
1984
62.2M
  token->opr.c = c;
1985
1986
62.2M
  if (input->mb_cur_max > 1
1987
62.2M
      && !re_string_first_byte (input, re_string_cur_idx (input)))
1988
0
    {
1989
0
      token->type = CHARACTER;
1990
0
      return 1;
1991
0
    }
1992
1993
62.2M
  if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
1994
62.2M
      && re_string_cur_idx (input) + 1 < re_string_length (input))
1995
0
    {
1996
      /* In this case, '\' escape a character.  */
1997
0
      unsigned char c2;
1998
0
      re_string_skip_bytes (input, 1);
1999
0
      c2 = re_string_peek_byte (input, 0);
2000
0
      token->opr.c = c2;
2001
0
      token->type = CHARACTER;
2002
0
      return 1;
2003
0
    }
2004
62.2M
  if (c == '[') /* '[' is a special char in a bracket exps.  */
2005
34.2k
    {
2006
34.2k
      unsigned char c2;
2007
34.2k
      int token_len;
2008
34.2k
      if (re_string_cur_idx (input) + 1 < re_string_length (input))
2009
34.2k
  c2 = re_string_peek_byte (input, 1);
2010
0
      else
2011
0
  c2 = 0;
2012
34.2k
      token->opr.c = c2;
2013
34.2k
      token_len = 2;
2014
34.2k
      switch (c2)
2015
34.2k
  {
2016
83
  case '.':
2017
83
    token->type = OP_OPEN_COLL_ELEM;
2018
83
    break;
2019
2020
0
  case '=':
2021
0
    token->type = OP_OPEN_EQUIV_CLASS;
2022
0
    break;
2023
2024
0
  case ':':
2025
0
    if (syntax & RE_CHAR_CLASSES)
2026
0
      {
2027
0
        token->type = OP_OPEN_CHAR_CLASS;
2028
0
        break;
2029
0
      }
2030
0
    FALLTHROUGH;
2031
34.1k
  default:
2032
34.1k
    token->type = CHARACTER;
2033
34.1k
    token->opr.c = c;
2034
34.1k
    token_len = 1;
2035
34.1k
    break;
2036
34.2k
  }
2037
34.2k
      return token_len;
2038
34.2k
    }
2039
62.2M
  switch (c)
2040
62.2M
    {
2041
20.6k
    case ']':
2042
20.6k
      token->type = OP_CLOSE_BRACKET;
2043
20.6k
      break;
2044
12.2k
    case '^':
2045
12.2k
      token->type = OP_NON_MATCH_LIST;
2046
12.2k
      break;
2047
16.4k
    case '-':
2048
      /* In V7 Unix grep and Unix awk and mawk, [...---...]
2049
         (3 adjacent minus signs) stands for a single minus sign.
2050
         Support that without breaking anything else.  */
2051
16.4k
      if (! (re_string_cur_idx (input) + 2 < re_string_length (input)
2052
16.4k
             && re_string_peek_byte (input, 1) == '-'
2053
16.4k
             && re_string_peek_byte (input, 2) == '-'))
2054
16.4k
        {
2055
16.4k
          token->type = OP_CHARSET_RANGE;
2056
16.4k
          break;
2057
16.4k
        }
2058
0
      re_string_skip_bytes (input, 2);
2059
0
      FALLTHROUGH;
2060
62.1M
    default:
2061
62.1M
      token->type = CHARACTER;
2062
62.2M
    }
2063
62.2M
  return 1;
2064
62.2M
}
2065

2066
/* Functions for parser.  */
2067
2068
/* Entry point of the parser.
2069
   Parse the regular expression REGEXP and return the structure tree.
2070
   If an error occurs, ERR is set by error code, and return NULL.
2071
   This function build the following tree, from regular expression <reg_exp>:
2072
     CAT
2073
     / \
2074
    /   \
2075
   <reg_exp>  EOR
2076
2077
   CAT means concatenation.
2078
   EOR means end of regular expression.  */
2079
2080
static bin_tree_t *
2081
parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2082
       reg_errcode_t *err)
2083
12.7k
{
2084
12.7k
  re_dfa_t *dfa = preg->buffer;
2085
12.7k
  bin_tree_t *tree, *eor, *root;
2086
12.7k
  re_token_t current_token;
2087
12.7k
  dfa->syntax = syntax;
2088
12.7k
  fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2089
12.7k
  tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2090
12.7k
  if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2091
54
    return NULL;
2092
12.7k
  eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2093
12.7k
  if (tree != NULL)
2094
12.7k
    root = create_tree (dfa, tree, eor, CONCAT);
2095
0
  else
2096
0
    root = eor;
2097
12.7k
  if (__glibc_unlikely (eor == NULL || root == NULL))
2098
0
    {
2099
0
      *err = REG_ESPACE;
2100
0
      return NULL;
2101
0
    }
2102
12.7k
  return root;
2103
12.7k
}
2104
2105
/* This function build the following tree, from regular expression
2106
   <branch1>|<branch2>:
2107
     ALT
2108
     / \
2109
    /   \
2110
   <branch1> <branch2>
2111
2112
   ALT means alternative, which represents the operator '|'.  */
2113
2114
static bin_tree_t *
2115
parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2116
         reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2117
566k
{
2118
566k
  re_dfa_t *dfa = preg->buffer;
2119
566k
  bin_tree_t *tree, *branch = NULL;
2120
566k
  bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
2121
566k
  tree = parse_branch (regexp, preg, token, syntax, nest, err);
2122
566k
  if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2123
15.2k
    return NULL;
2124
2125
579k
  while (token->type == OP_ALT)
2126
28.3k
    {
2127
28.3k
      fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2128
28.3k
      if (token->type != OP_ALT && token->type != END_OF_RE
2129
28.3k
    && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2130
28.2k
  {
2131
28.2k
    bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
2132
28.2k
    dfa->completed_bkref_map = initial_bkref_map;
2133
28.2k
    branch = parse_branch (regexp, preg, token, syntax, nest, err);
2134
28.2k
    if (__glibc_unlikely (*err != REG_NOERROR && branch == NULL))
2135
84
      {
2136
84
        if (tree != NULL)
2137
84
    postorder (tree, free_tree, NULL);
2138
84
        return NULL;
2139
84
      }
2140
28.1k
    dfa->completed_bkref_map |= accumulated_bkref_map;
2141
28.1k
  }
2142
28
      else
2143
28
  branch = NULL;
2144
28.2k
      tree = create_tree (dfa, tree, branch, OP_ALT);
2145
28.2k
      if (__glibc_unlikely (tree == NULL))
2146
0
  {
2147
0
    *err = REG_ESPACE;
2148
0
    return NULL;
2149
0
  }
2150
28.2k
    }
2151
551k
  return tree;
2152
551k
}
2153
2154
/* This function build the following tree, from regular expression
2155
   <exp1><exp2>:
2156
  CAT
2157
  / \
2158
       /   \
2159
   <exp1> <exp2>
2160
2161
   CAT means concatenation.  */
2162
2163
static bin_tree_t *
2164
parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2165
        reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2166
594k
{
2167
594k
  bin_tree_t *tree, *expr;
2168
594k
  re_dfa_t *dfa = preg->buffer;
2169
594k
  tree = parse_expression (regexp, preg, token, syntax, nest, err);
2170
594k
  if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2171
252
    return NULL;
2172
2173
7.12M
  while (token->type != OP_ALT && token->type != END_OF_RE
2174
7.12M
   && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2175
6.54M
    {
2176
6.54M
      expr = parse_expression (regexp, preg, token, syntax, nest, err);
2177
6.54M
      if (__glibc_unlikely (*err != REG_NOERROR && expr == NULL))
2178
15.0k
  {
2179
15.0k
    if (tree != NULL)
2180
15.0k
      postorder (tree, free_tree, NULL);
2181
15.0k
    return NULL;
2182
15.0k
  }
2183
6.52M
      if (tree != NULL && expr != NULL)
2184
6.52M
  {
2185
6.52M
    bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2186
6.52M
    if (newtree == NULL)
2187
0
      {
2188
0
        postorder (expr, free_tree, NULL);
2189
0
        postorder (tree, free_tree, NULL);
2190
0
        *err = REG_ESPACE;
2191
0
        return NULL;
2192
0
      }
2193
6.52M
    tree = newtree;
2194
6.52M
  }
2195
0
      else if (tree == NULL)
2196
0
  tree = expr;
2197
      /* Otherwise expr == NULL, we don't need to create new tree.  */
2198
6.52M
    }
2199
579k
  return tree;
2200
594k
}
2201
2202
/* This function build the following tree, from regular expression a*:
2203
   *
2204
   |
2205
   a
2206
*/
2207
2208
static bin_tree_t *
2209
parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2210
      reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2211
7.13M
{
2212
7.13M
  re_dfa_t *dfa = preg->buffer;
2213
7.13M
  bin_tree_t *tree;
2214
7.13M
  switch (token->type)
2215
7.13M
    {
2216
6.26M
    case CHARACTER:
2217
6.26M
      tree = create_token_tree (dfa, NULL, NULL, token);
2218
6.26M
      if (__glibc_unlikely (tree == NULL))
2219
0
  {
2220
0
    *err = REG_ESPACE;
2221
0
    return NULL;
2222
0
  }
2223
6.26M
      if (dfa->mb_cur_max > 1)
2224
0
  {
2225
0
    while (!re_string_eoi (regexp)
2226
0
     && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2227
0
      {
2228
0
        bin_tree_t *mbc_remain;
2229
0
        fetch_token (token, regexp, syntax);
2230
0
        mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2231
0
        tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2232
0
        if (__glibc_unlikely (mbc_remain == NULL || tree == NULL))
2233
0
    {
2234
0
      *err = REG_ESPACE;
2235
0
      return NULL;
2236
0
    }
2237
0
      }
2238
0
  }
2239
6.26M
      break;
2240
2241
6.26M
    case OP_OPEN_SUBEXP:
2242
830k
      tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2243
830k
      if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2244
15.2k
  return NULL;
2245
815k
      break;
2246
2247
815k
    case OP_OPEN_BRACKET:
2248
17.0k
      tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2249
17.0k
      if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2250
10
  return NULL;
2251
17.0k
      break;
2252
2253
17.0k
    case OP_BACK_REF:
2254
0
      if (!__glibc_likely (dfa->completed_bkref_map & (1 << token->opr.idx)))
2255
0
  {
2256
0
    *err = REG_ESUBREG;
2257
0
    return NULL;
2258
0
  }
2259
0
      dfa->used_bkref_map |= 1 << token->opr.idx;
2260
0
      tree = create_token_tree (dfa, NULL, NULL, token);
2261
0
      if (__glibc_unlikely (tree == NULL))
2262
0
  {
2263
0
    *err = REG_ESPACE;
2264
0
    return NULL;
2265
0
  }
2266
0
      ++dfa->nbackref;
2267
0
      dfa->has_mb_node = 1;
2268
0
      break;
2269
2270
0
    case OP_OPEN_DUP_NUM:
2271
0
      if (syntax & RE_CONTEXT_INVALID_DUP)
2272
0
  {
2273
0
    *err = REG_BADRPT;
2274
0
    return NULL;
2275
0
  }
2276
0
      FALLTHROUGH;
2277
30
    case OP_DUP_ASTERISK:
2278
30
    case OP_DUP_PLUS:
2279
30
    case OP_DUP_QUESTION:
2280
30
      if (syntax & RE_CONTEXT_INVALID_OPS)
2281
30
  {
2282
30
    *err = REG_BADRPT;
2283
30
    return NULL;
2284
30
  }
2285
0
      else if (syntax & RE_CONTEXT_INDEP_OPS)
2286
0
  {
2287
0
    fetch_token (token, regexp, syntax);
2288
0
    return parse_expression (regexp, preg, token, syntax, nest, err);
2289
0
  }
2290
30
      FALLTHROUGH;
2291
0
    case OP_CLOSE_SUBEXP:
2292
0
      if ((token->type == OP_CLOSE_SUBEXP)
2293
0
    && !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2294
0
  {
2295
0
    *err = REG_ERPAREN;
2296
0
    return NULL;
2297
0
  }
2298
0
      FALLTHROUGH;
2299
552
    case OP_CLOSE_DUP_NUM:
2300
      /* We treat it as a normal character.  */
2301
2302
      /* Then we can these characters as normal characters.  */
2303
552
      token->type = CHARACTER;
2304
      /* mb_partial and word_char bits should be initialized already
2305
   by peek_token.  */
2306
552
      tree = create_token_tree (dfa, NULL, NULL, token);
2307
552
      if (__glibc_unlikely (tree == NULL))
2308
0
  {
2309
0
    *err = REG_ESPACE;
2310
0
    return NULL;
2311
0
  }
2312
552
      break;
2313
2314
552
    case ANCHOR:
2315
430
      if ((token->opr.ctx_type
2316
430
     & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2317
430
    && dfa->word_ops_used == 0)
2318
0
  init_word_char (dfa);
2319
430
      if (token->opr.ctx_type == WORD_DELIM
2320
430
    || token->opr.ctx_type == NOT_WORD_DELIM)
2321
0
  {
2322
0
    bin_tree_t *tree_first, *tree_last;
2323
0
    if (token->opr.ctx_type == WORD_DELIM)
2324
0
      {
2325
0
        token->opr.ctx_type = WORD_FIRST;
2326
0
        tree_first = create_token_tree (dfa, NULL, NULL, token);
2327
0
        token->opr.ctx_type = WORD_LAST;
2328
0
      }
2329
0
    else
2330
0
      {
2331
0
        token->opr.ctx_type = INSIDE_WORD;
2332
0
        tree_first = create_token_tree (dfa, NULL, NULL, token);
2333
0
        token->opr.ctx_type = INSIDE_NOTWORD;
2334
0
      }
2335
0
    tree_last = create_token_tree (dfa, NULL, NULL, token);
2336
0
    tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2337
0
    if (__glibc_unlikely (tree_first == NULL || tree_last == NULL
2338
0
        || tree == NULL))
2339
0
      {
2340
0
        *err = REG_ESPACE;
2341
0
        return NULL;
2342
0
      }
2343
0
  }
2344
430
      else
2345
430
  {
2346
430
    tree = create_token_tree (dfa, NULL, NULL, token);
2347
430
    if (__glibc_unlikely (tree == NULL))
2348
0
      {
2349
0
        *err = REG_ESPACE;
2350
0
        return NULL;
2351
0
      }
2352
430
  }
2353
      /* We must return here, since ANCHORs can't be followed
2354
   by repetition operators.
2355
   eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2356
       it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2357
430
      fetch_token (token, regexp, syntax);
2358
430
      return tree;
2359
2360
28.8k
    case OP_PERIOD:
2361
28.8k
      tree = create_token_tree (dfa, NULL, NULL, token);
2362
28.8k
      if (__glibc_unlikely (tree == NULL))
2363
0
  {
2364
0
    *err = REG_ESPACE;
2365
0
    return NULL;
2366
0
  }
2367
28.8k
      if (dfa->mb_cur_max > 1)
2368
0
  dfa->has_mb_node = 1;
2369
28.8k
      break;
2370
2371
0
    case OP_WORD:
2372
0
    case OP_NOTWORD:
2373
0
      tree = build_charclass_op (dfa, regexp->trans,
2374
0
         "alnum",
2375
0
         "_",
2376
0
         token->type == OP_NOTWORD, err);
2377
0
      if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2378
0
  return NULL;
2379
0
      break;
2380
2381
0
    case OP_SPACE:
2382
0
    case OP_NOTSPACE:
2383
0
      tree = build_charclass_op (dfa, regexp->trans,
2384
0
         "space",
2385
0
         "",
2386
0
         token->type == OP_NOTSPACE, err);
2387
0
      if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2388
0
  return NULL;
2389
0
      break;
2390
2391
97
    case OP_ALT:
2392
97
    case END_OF_RE:
2393
97
      return NULL;
2394
2395
0
    case BACK_SLASH:
2396
0
      *err = REG_EESCAPE;
2397
0
      return NULL;
2398
2399
0
    default:
2400
      /* Must not happen?  */
2401
0
      DEBUG_ASSERT (false);
2402
0
      return NULL;
2403
7.13M
    }
2404
7.12M
  fetch_token (token, regexp, syntax);
2405
2406
7.15M
  while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2407
7.15M
   || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2408
33.9k
    {
2409
33.9k
      bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
2410
33.9k
             syntax, err);
2411
33.9k
      if (__glibc_unlikely (*err != REG_NOERROR && dup_tree == NULL))
2412
0
  {
2413
0
    if (tree != NULL)
2414
0
      postorder (tree, free_tree, NULL);
2415
0
    return NULL;
2416
0
  }
2417
33.9k
      tree = dup_tree;
2418
      /* In BRE consecutive duplications are not allowed.  */
2419
33.9k
      if ((syntax & RE_CONTEXT_INVALID_DUP)
2420
33.9k
    && (token->type == OP_DUP_ASTERISK
2421
33.9k
        || token->type == OP_OPEN_DUP_NUM))
2422
0
  {
2423
0
    if (tree != NULL)
2424
0
      postorder (tree, free_tree, NULL);
2425
0
    *err = REG_BADRPT;
2426
0
    return NULL;
2427
0
  }
2428
33.9k
    }
2429
2430
7.12M
  return tree;
2431
7.12M
}
2432
2433
/* This function build the following tree, from regular expression
2434
   (<reg_exp>):
2435
   SUBEXP
2436
      |
2437
  <reg_exp>
2438
*/
2439
2440
static bin_tree_t *
2441
parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2442
         reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2443
830k
{
2444
830k
  re_dfa_t *dfa = preg->buffer;
2445
830k
  bin_tree_t *tree;
2446
830k
  size_t cur_nsub;
2447
830k
  cur_nsub = preg->re_nsub++;
2448
2449
830k
  fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2450
2451
  /* The subexpression may be a null string.  */
2452
830k
  if (token->type == OP_CLOSE_SUBEXP)
2453
276k
    tree = NULL;
2454
553k
  else
2455
553k
    {
2456
553k
      tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2457
553k
      if (__glibc_unlikely (*err == REG_NOERROR
2458
553k
          && token->type != OP_CLOSE_SUBEXP))
2459
14
  {
2460
14
    if (tree != NULL)
2461
14
      postorder (tree, free_tree, NULL);
2462
14
    *err = REG_EPAREN;
2463
14
  }
2464
553k
      if (__glibc_unlikely (*err != REG_NOERROR))
2465
15.2k
  return NULL;
2466
553k
    }
2467
2468
815k
  if (cur_nsub <= '9' - '1')
2469
942
    dfa->completed_bkref_map |= 1 << cur_nsub;
2470
2471
815k
  tree = create_tree (dfa, tree, NULL, SUBEXP);
2472
815k
  if (__glibc_unlikely (tree == NULL))
2473
0
    {
2474
0
      *err = REG_ESPACE;
2475
0
      return NULL;
2476
0
    }
2477
815k
  tree->token.opr.idx = cur_nsub;
2478
815k
  return tree;
2479
815k
}
2480
2481
/* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2482
2483
static bin_tree_t *
2484
parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2485
        re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2486
33.9k
{
2487
33.9k
  bin_tree_t *tree = NULL, *old_tree = NULL;
2488
33.9k
  Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2489
33.9k
  re_token_t start_token = *token;
2490
2491
33.9k
  if (token->type == OP_OPEN_DUP_NUM)
2492
0
    {
2493
0
      end = 0;
2494
0
      start = fetch_number (regexp, token, syntax);
2495
0
      if (start == -1)
2496
0
  {
2497
0
    if (token->type == CHARACTER && token->opr.c == ',')
2498
0
      start = 0; /* We treat "{,m}" as "{0,m}".  */
2499
0
    else
2500
0
      {
2501
0
        *err = REG_BADBR; /* <re>{} is invalid.  */
2502
0
        return NULL;
2503
0
      }
2504
0
  }
2505
0
      if (__glibc_likely (start != -2))
2506
0
  {
2507
    /* We treat "{n}" as "{n,n}".  */
2508
0
    end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2509
0
     : ((token->type == CHARACTER && token->opr.c == ',')
2510
0
        ? fetch_number (regexp, token, syntax) : -2));
2511
0
  }
2512
0
      if (__glibc_unlikely (start == -2 || end == -2))
2513
0
  {
2514
    /* Invalid sequence.  */
2515
0
    if (__glibc_unlikely (!(syntax & RE_INVALID_INTERVAL_ORD)))
2516
0
      {
2517
0
        if (token->type == END_OF_RE)
2518
0
    *err = REG_EBRACE;
2519
0
        else
2520
0
    *err = REG_BADBR;
2521
2522
0
        return NULL;
2523
0
      }
2524
2525
    /* If the syntax bit is set, rollback.  */
2526
0
    re_string_set_index (regexp, start_idx);
2527
0
    *token = start_token;
2528
0
    token->type = CHARACTER;
2529
    /* mb_partial and word_char bits should be already initialized by
2530
       peek_token.  */
2531
0
    return elem;
2532
0
  }
2533
2534
0
      if (__glibc_unlikely ((end != -1 && start > end)
2535
0
          || token->type != OP_CLOSE_DUP_NUM))
2536
0
  {
2537
    /* First number greater than second.  */
2538
0
    *err = REG_BADBR;
2539
0
    return NULL;
2540
0
  }
2541
2542
0
      if (__glibc_unlikely (RE_DUP_MAX < (end == -1 ? start : end)))
2543
0
  {
2544
0
    *err = REG_ESIZE;
2545
0
    return NULL;
2546
0
  }
2547
0
    }
2548
33.9k
  else
2549
33.9k
    {
2550
33.9k
      start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2551
33.9k
      end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2552
33.9k
    }
2553
2554
33.9k
  fetch_token (token, regexp, syntax);
2555
2556
33.9k
  if (__glibc_unlikely (elem == NULL))
2557
0
    return NULL;
2558
33.9k
  if (__glibc_unlikely (start == 0 && end == 0))
2559
0
    {
2560
0
      postorder (elem, free_tree, NULL);
2561
0
      return NULL;
2562
0
    }
2563
2564
  /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2565
33.9k
  if (__glibc_unlikely (start > 0))
2566
2.73k
    {
2567
2.73k
      tree = elem;
2568
2.73k
      for (i = 2; i <= start; ++i)
2569
0
  {
2570
0
    elem = duplicate_tree (elem, dfa);
2571
0
    tree = create_tree (dfa, tree, elem, CONCAT);
2572
0
    if (__glibc_unlikely (elem == NULL || tree == NULL))
2573
0
      goto parse_dup_op_espace;
2574
0
  }
2575
2576
2.73k
      if (start == end)
2577
0
  return tree;
2578
2579
      /* Duplicate ELEM before it is marked optional.  */
2580
2.73k
      elem = duplicate_tree (elem, dfa);
2581
2.73k
      if (__glibc_unlikely (elem == NULL))
2582
0
        goto parse_dup_op_espace;
2583
2.73k
      old_tree = tree;
2584
2.73k
    }
2585
31.2k
  else
2586
31.2k
    old_tree = NULL;
2587
2588
33.9k
  if (elem->token.type == SUBEXP)
2589
79
    {
2590
79
      uintptr_t subidx = elem->token.opr.idx;
2591
79
      postorder (elem, mark_opt_subexp, (void *) subidx);
2592
79
    }
2593
2594
33.9k
  tree = create_tree (dfa, elem, NULL,
2595
33.9k
          (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2596
33.9k
  if (__glibc_unlikely (tree == NULL))
2597
0
    goto parse_dup_op_espace;
2598
2599
  /* This loop is actually executed only when end != -1,
2600
     to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2601
     already created the start+1-th copy.  */
2602
33.9k
  if (TYPE_SIGNED (Idx) || end != -1)
2603
33.9k
    for (i = start + 2; i <= end; ++i)
2604
0
      {
2605
0
  elem = duplicate_tree (elem, dfa);
2606
0
  tree = create_tree (dfa, tree, elem, CONCAT);
2607
0
  if (__glibc_unlikely (elem == NULL || tree == NULL))
2608
0
    goto parse_dup_op_espace;
2609
2610
0
  tree = create_tree (dfa, tree, NULL, OP_ALT);
2611
0
  if (__glibc_unlikely (tree == NULL))
2612
0
    goto parse_dup_op_espace;
2613
0
      }
2614
2615
33.9k
  if (old_tree)
2616
2.73k
    tree = create_tree (dfa, old_tree, tree, CONCAT);
2617
2618
33.9k
  return tree;
2619
2620
0
 parse_dup_op_espace:
2621
0
  *err = REG_ESPACE;
2622
0
  return NULL;
2623
33.9k
}
2624
2625
/* Size of the names for collating symbol/equivalence_class/character_class.
2626
   I'm not sure, but maybe enough.  */
2627
166
#define BRACKET_NAME_BUF_SIZE 32
2628
2629
#ifndef _LIBC
2630
2631
/* Convert the byte B to the corresponding wide character.  In a
2632
   unibyte locale, treat B as itself.  In a multibyte locale, return
2633
   WEOF if B is an encoding error.  */
2634
static wint_t
2635
parse_byte (unsigned char b, re_dfa_t const *dfa)
2636
32.9k
{
2637
32.9k
  return dfa->mb_cur_max > 1 ? __btowc (b) : b;
2638
32.9k
}
2639
2640
/* Local function for parse_bracket_exp used in _LIBC environment.
2641
   Build the range expression which starts from START_ELEM, and ends
2642
   at END_ELEM.  The result are written to MBCSET and SBCSET.
2643
   RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2644
   mbcset->range_ends, is a pointer argument since we may
2645
   update it.  */
2646
2647
static reg_errcode_t
2648
build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
2649
     bracket_elem_t *start_elem, bracket_elem_t *end_elem,
2650
     re_dfa_t *dfa, reg_syntax_t syntax, uint_fast32_t nrules,
2651
     const unsigned char *collseqmb, const char *collseqwc,
2652
     int_fast32_t table_size, const void *symb_table,
2653
     const unsigned char *extra)
2654
16.4k
{
2655
  /* Equivalence Classes and Character Classes can't be a range start/end.  */
2656
16.4k
  if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2657
16.4k
      || start_elem->type == CHAR_CLASS
2658
16.4k
      || end_elem->type == EQUIV_CLASS
2659
16.4k
      || end_elem->type == CHAR_CLASS))
2660
0
    return REG_ERANGE;
2661
2662
  /* We can handle no multi character collating elements without libc
2663
     support.  */
2664
16.4k
  if (__glibc_unlikely ((start_elem->type == COLL_SYM
2665
16.4k
       && strlen ((char *) start_elem->opr.name) > 1)
2666
16.4k
      || (end_elem->type == COLL_SYM
2667
16.4k
          && strlen ((char *) end_elem->opr.name) > 1)))
2668
0
    return REG_ECOLLATE;
2669
2670
16.4k
  unsigned int
2671
16.4k
    start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2672
16.4k
    : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2673
0
       : 0)),
2674
16.4k
    end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2675
16.4k
        : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2676
0
     : 0));
2677
16.4k
  wint_t
2678
16.4k
    start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2679
16.4k
    ? parse_byte (start_ch, dfa) : start_elem->opr.wch),
2680
16.4k
    end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2681
16.4k
        ? parse_byte (end_ch, dfa) : end_elem->opr.wch);
2682
2683
16.4k
  if (start_wc == WEOF || end_wc == WEOF)
2684
0
    return REG_ECOLLATE;
2685
16.4k
  else if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2686
16.4k
                             && start_wc > end_wc))
2687
10
    return REG_ERANGE;
2688
2689
  /* Got valid collation sequence values, add them as a new entry.
2690
     However, for !_LIBC we have no collation elements: if the
2691
     character set is single byte, the single byte character set
2692
     that we build below suffices.  parse_bracket_exp passes
2693
     no MBCSET if dfa->mb_cur_max == 1.  */
2694
16.4k
  if (dfa->mb_cur_max > 1)
2695
0
    {
2696
      /* Check the space of the arrays.  */
2697
0
      if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2698
0
        {
2699
          /* There is not enough space, need realloc.  */
2700
0
          wchar_t *new_array_start, *new_array_end;
2701
0
          Idx new_nranges;
2702
2703
          /* +1 in case of mbcset->nranges is 0.  */
2704
0
          new_nranges = 2 * mbcset->nranges + 1;
2705
          /* Use realloc since mbcset->range_starts and mbcset->range_ends
2706
             are NULL if *range_alloc == 0.  */
2707
0
          new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2708
0
                                        new_nranges);
2709
0
          new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2710
0
                                      new_nranges);
2711
2712
0
          if (__glibc_unlikely (new_array_start == NULL
2713
0
                                || new_array_end == NULL))
2714
0
            {
2715
0
              re_free (new_array_start);
2716
0
              re_free (new_array_end);
2717
0
              return REG_ESPACE;
2718
0
            }
2719
2720
0
          mbcset->range_starts = new_array_start;
2721
0
          mbcset->range_ends = new_array_end;
2722
0
          *range_alloc = new_nranges;
2723
0
        }
2724
2725
0
      mbcset->range_starts[mbcset->nranges] = start_wc;
2726
0
      mbcset->range_ends[mbcset->nranges++] = end_wc;
2727
0
    }
2728
2729
  /* Build the table for single byte characters.  */
2730
4.22M
  for (wchar_t wc = 0; wc < SBC_MAX; ++wc)
2731
4.21M
    {
2732
4.21M
      if (start_wc <= wc && wc <= end_wc)
2733
92.3k
        bitset_set (sbcset, wc);
2734
4.21M
    }
2735
2736
16.4k
  return REG_NOERROR;
2737
16.4k
}
2738
#endif /* not _LIBC */
2739
2740
#ifndef _LIBC
2741
/* Helper function for parse_bracket_exp only used in case of NOT _LIBC.
2742
   Build the collating element which is represented by NAME.
2743
   The result are written to MBCSET and SBCSET.
2744
   COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2745
   pointer argument since we may update it.  */
2746
2747
static reg_errcode_t
2748
build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2749
      Idx *coll_sym_alloc, const unsigned char *name,
2750
      uint_fast32_t nrules, int_fast32_t table_size,
2751
      const void *symb_table, const unsigned char *extra)
2752
83
{
2753
83
  size_t name_len = strlen ((const char *) name);
2754
83
  if (__glibc_unlikely (name_len != 1))
2755
0
    return REG_ECOLLATE;
2756
83
  else
2757
83
    {
2758
83
      bitset_set (sbcset, name[0]);
2759
83
      return REG_NOERROR;
2760
83
    }
2761
83
}
2762
#endif /* not _LIBC */
2763
2764
#ifdef _LIBC
2765
/* Local function for parse_bracket_exp used in _LIBC environment.
2766
   Seek the collating symbol entry corresponding to NAME.
2767
   Return the index of the symbol in the SYMB_TABLE,
2768
   or -1 if not found.  */
2769
2770
static __always_inline int32_t
2771
seek_collating_symbol_entry (const unsigned char *name, size_t name_len,
2772
           const int32_t *symb_table,
2773
           int_fast32_t table_size,
2774
           const unsigned char *extra)
2775
{
2776
  int_fast32_t elem;
2777
2778
  for (elem = 0; elem < table_size; elem++)
2779
    if (symb_table[2 * elem] != 0)
2780
      {
2781
  int32_t idx = symb_table[2 * elem + 1];
2782
  /* Skip the name of collating element name.  */
2783
  idx += 1 + extra[idx];
2784
  if (/* Compare the length of the name.  */
2785
      name_len == extra[idx]
2786
      /* Compare the name.  */
2787
      && memcmp (name, &extra[idx + 1], name_len) == 0)
2788
    /* Yep, this is the entry.  */
2789
    return elem;
2790
      }
2791
  return -1;
2792
}
2793
2794
/* Local function for parse_bracket_exp used in _LIBC environment.
2795
   Look up the collation sequence value of BR_ELEM.
2796
   Return the value if succeeded, UINT_MAX otherwise.  */
2797
2798
static __always_inline unsigned int
2799
lookup_collation_sequence_value (bracket_elem_t *br_elem, uint32_t nrules,
2800
         const unsigned char *collseqmb,
2801
         const char *collseqwc,
2802
         int_fast32_t table_size,
2803
         const int32_t *symb_table,
2804
         const unsigned char *extra)
2805
{
2806
  if (br_elem->type == SB_CHAR)
2807
    {
2808
      /* if (MB_CUR_MAX == 1) */
2809
      if (nrules == 0)
2810
  return collseqmb[br_elem->opr.ch];
2811
      else
2812
  {
2813
    wint_t wc = __btowc (br_elem->opr.ch);
2814
    return __collseq_table_lookup (collseqwc, wc);
2815
  }
2816
    }
2817
  else if (br_elem->type == MB_CHAR)
2818
    {
2819
      if (nrules != 0)
2820
  return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2821
    }
2822
  else if (br_elem->type == COLL_SYM)
2823
    {
2824
      size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2825
      if (nrules != 0)
2826
  {
2827
    int32_t elem, idx;
2828
    elem = seek_collating_symbol_entry (br_elem->opr.name,
2829
                sym_name_len,
2830
                symb_table, table_size,
2831
                extra);
2832
    if (elem != -1)
2833
      {
2834
        /* We found the entry.  */
2835
        idx = symb_table[2 * elem + 1];
2836
        /* Skip the name of collating element name.  */
2837
        idx += 1 + extra[idx];
2838
        /* Skip the byte sequence of the collating element.  */
2839
        idx += 1 + extra[idx];
2840
        /* Adjust for the alignment.  */
2841
        idx = (idx + 3) & ~3;
2842
        /* Skip the multibyte collation sequence value.  */
2843
        idx += sizeof (unsigned int);
2844
        /* Skip the wide char sequence of the collating element.  */
2845
        idx += sizeof (unsigned int) *
2846
    (1 + *(unsigned int *) (extra + idx));
2847
        /* Return the collation sequence value.  */
2848
        return *(unsigned int *) (extra + idx);
2849
      }
2850
    else if (sym_name_len == 1)
2851
      {
2852
        /* No valid character.  Match it as a single byte
2853
     character.  */
2854
        return collseqmb[br_elem->opr.name[0]];
2855
      }
2856
  }
2857
      else if (sym_name_len == 1)
2858
  return collseqmb[br_elem->opr.name[0]];
2859
    }
2860
  return UINT_MAX;
2861
}
2862
2863
/* Local function for parse_bracket_exp used in _LIBC environment.
2864
   Build the range expression which starts from START_ELEM, and ends
2865
   at END_ELEM.  The result are written to MBCSET and SBCSET.
2866
   RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2867
   mbcset->range_ends, is a pointer argument since we may
2868
   update it.  */
2869
2870
static __always_inline reg_errcode_t
2871
build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
2872
     bracket_elem_t *start_elem, bracket_elem_t *end_elem,
2873
     re_dfa_t *dfa, reg_syntax_t syntax, uint32_t nrules,
2874
     const unsigned char *collseqmb, const char *collseqwc,
2875
     int_fast32_t table_size, const int32_t *symb_table,
2876
     const unsigned char *extra)
2877
{
2878
  unsigned int ch;
2879
  uint32_t start_collseq;
2880
  uint32_t end_collseq;
2881
2882
  /* Equivalence Classes and Character Classes can't be a range
2883
     start/end.  */
2884
  if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2885
                        || start_elem->type == CHAR_CLASS
2886
                        || end_elem->type == EQUIV_CLASS
2887
                        || end_elem->type == CHAR_CLASS))
2888
    return REG_ERANGE;
2889
2890
  /* FIXME: Implement rational ranges here, too.  */
2891
  start_collseq = lookup_collation_sequence_value (start_elem, nrules, collseqmb, collseqwc,
2892
               table_size, symb_table, extra);
2893
  end_collseq = lookup_collation_sequence_value (end_elem, nrules, collseqmb, collseqwc,
2894
             table_size, symb_table, extra);
2895
  /* Check start/end collation sequence values.  */
2896
  if (__glibc_unlikely (start_collseq == UINT_MAX
2897
                        || end_collseq == UINT_MAX))
2898
    return REG_ECOLLATE;
2899
  if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2900
                        && start_collseq > end_collseq))
2901
    return REG_ERANGE;
2902
2903
  /* Got valid collation sequence values, add them as a new entry.
2904
     However, if we have no collation elements, and the character set
2905
     is single byte, the single byte character set that we
2906
     build below suffices. */
2907
  if (nrules > 0 || dfa->mb_cur_max > 1)
2908
    {
2909
      /* Check the space of the arrays.  */
2910
      if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2911
  {
2912
    /* There is not enough space, need realloc.  */
2913
    uint32_t *new_array_start;
2914
    uint32_t *new_array_end;
2915
    int new_nranges;
2916
2917
    /* +1 in case of mbcset->nranges is 0.  */
2918
    new_nranges = 2 * mbcset->nranges + 1;
2919
    new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2920
          new_nranges);
2921
    new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2922
              new_nranges);
2923
2924
          if (__glibc_unlikely (new_array_start == NULL
2925
                                || new_array_end == NULL))
2926
      return REG_ESPACE;
2927
2928
    mbcset->range_starts = new_array_start;
2929
    mbcset->range_ends = new_array_end;
2930
    *range_alloc = new_nranges;
2931
  }
2932
2933
      mbcset->range_starts[mbcset->nranges] = start_collseq;
2934
      mbcset->range_ends[mbcset->nranges++] = end_collseq;
2935
    }
2936
2937
  /* Build the table for single byte characters.  */
2938
  for (ch = 0; ch < SBC_MAX; ch++)
2939
    {
2940
      uint32_t ch_collseq;
2941
      /* if (MB_CUR_MAX == 1) */
2942
      if (nrules == 0)
2943
  ch_collseq = collseqmb[ch];
2944
      else
2945
  ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2946
      if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2947
  bitset_set (sbcset, ch);
2948
    }
2949
  return REG_NOERROR;
2950
}
2951
2952
/* Local function for parse_bracket_exp used in _LIBC environment.
2953
   Build the collating element which is represented by NAME.
2954
   The result are written to MBCSET and SBCSET.
2955
   COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2956
   pointer argument since we may update it.  */
2957
2958
static __always_inline reg_errcode_t
2959
build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2960
      Idx *coll_sym_alloc, const unsigned char *name,
2961
      uint_fast32_t nrules, int_fast32_t table_size,
2962
      const int32_t *symb_table, const unsigned char *extra)
2963
{
2964
  int32_t elem, idx;
2965
  size_t name_len = strlen ((const char *) name);
2966
  if (nrules != 0)
2967
    {
2968
      elem = seek_collating_symbol_entry (name, name_len, symb_table,
2969
            table_size, extra);
2970
      if (elem != -1)
2971
  {
2972
    /* We found the entry.  */
2973
    idx = symb_table[2 * elem + 1];
2974
    /* Skip the name of collating element name.  */
2975
    idx += 1 + extra[idx];
2976
  }
2977
      else if (name_len == 1)
2978
  {
2979
    /* No valid character, treat it as a normal
2980
       character.  */
2981
    bitset_set (sbcset, name[0]);
2982
    return REG_NOERROR;
2983
  }
2984
      else
2985
  return REG_ECOLLATE;
2986
2987
      /* Got valid collation sequence, add it as a new entry.  */
2988
      /* Check the space of the arrays.  */
2989
      if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms))
2990
  {
2991
    /* Not enough, realloc it.  */
2992
    /* +1 in case of mbcset->ncoll_syms is 0.  */
2993
    int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2994
    /* Use realloc since mbcset->coll_syms is NULL
2995
       if *alloc == 0.  */
2996
    int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2997
                 new_coll_sym_alloc);
2998
          if (__glibc_unlikely (new_coll_syms == NULL))
2999
      return REG_ESPACE;
3000
    mbcset->coll_syms = new_coll_syms;
3001
    *coll_sym_alloc = new_coll_sym_alloc;
3002
  }
3003
      mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3004
      return REG_NOERROR;
3005
    }
3006
  else
3007
    {
3008
      if (__glibc_unlikely (name_len != 1))
3009
  return REG_ECOLLATE;
3010
      else
3011
  {
3012
    bitset_set (sbcset, name[0]);
3013
    return REG_NOERROR;
3014
  }
3015
    }
3016
}
3017
#endif /* _LIBC */
3018
3019
/* This function parse bracket expression like "[abc]", "[a-c]",
3020
   "[[.a-a.]]" etc.  */
3021
3022
static bin_tree_t *
3023
parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
3024
       reg_syntax_t syntax, reg_errcode_t *err)
3025
17.0k
{
3026
17.0k
  const unsigned char *collseqmb = NULL;
3027
17.0k
  const char *collseqwc = NULL;
3028
17.0k
  uint_fast32_t nrules = 0;
3029
17.0k
  int_fast32_t table_size = 0;
3030
17.0k
  const void *symb_table = NULL;
3031
17.0k
  const unsigned char *extra = NULL;
3032
3033
17.0k
  re_token_t br_token;
3034
17.0k
  re_bitset_ptr_t sbcset;
3035
17.0k
  re_charset_t *mbcset;
3036
17.0k
  Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3037
17.0k
  Idx equiv_class_alloc = 0, char_class_alloc = 0;
3038
17.0k
  bool non_match = false;
3039
17.0k
  bin_tree_t *work_tree;
3040
17.0k
  int token_len;
3041
17.0k
  bool first_round = true;
3042
#ifdef _LIBC
3043
  collseqmb = (const unsigned char *)
3044
    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3045
  nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3046
  if (nrules)
3047
    {
3048
      /*
3049
      if (MB_CUR_MAX > 1)
3050
      */
3051
      collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3052
      table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3053
      symb_table = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_TABLEMB);
3054
      extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3055
               _NL_COLLATE_SYMB_EXTRAMB);
3056
    }
3057
#endif
3058
17.0k
  sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3059
17.0k
  mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3060
17.0k
  if (__glibc_unlikely (sbcset == NULL || mbcset == NULL))
3061
0
    {
3062
0
      re_free (sbcset);
3063
0
      re_free (mbcset);
3064
0
      *err = REG_ESPACE;
3065
0
      return NULL;
3066
0
    }
3067
3068
17.0k
  token_len = peek_token_bracket (token, regexp, syntax);
3069
17.0k
  if (__glibc_unlikely (token->type == END_OF_RE))
3070
0
    {
3071
0
      *err = REG_BADPAT;
3072
0
      goto parse_bracket_exp_free_return;
3073
0
    }
3074
17.0k
  if (token->type == OP_NON_MATCH_LIST)
3075
12.2k
    {
3076
12.2k
      mbcset->non_match = 1;
3077
12.2k
      non_match = true;
3078
12.2k
      if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3079
0
  bitset_set (sbcset, '\n');
3080
12.2k
      re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3081
12.2k
      token_len = peek_token_bracket (token, regexp, syntax);
3082
12.2k
      if (__glibc_unlikely (token->type == END_OF_RE))
3083
0
  {
3084
0
    *err = REG_BADPAT;
3085
0
    goto parse_bracket_exp_free_return;
3086
0
  }
3087
12.2k
    }
3088
3089
  /* We treat the first ']' as a normal character.  */
3090
17.0k
  if (token->type == OP_CLOSE_BRACKET)
3091
3.67k
    token->type = CHARACTER;
3092
3093
62.2M
  while (1)
3094
62.2M
    {
3095
62.2M
      bracket_elem_t start_elem, end_elem;
3096
62.2M
      unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3097
62.2M
      unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3098
62.2M
      reg_errcode_t ret;
3099
62.2M
      int token_len2 = 0;
3100
62.2M
      bool is_range_exp = false;
3101
62.2M
      re_token_t token2;
3102
3103
62.2M
      start_elem.opr.name = start_name_buf;
3104
62.2M
      start_elem.type = COLL_SYM;
3105
62.2M
      ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3106
62.2M
           syntax, first_round);
3107
62.2M
      if (__glibc_unlikely (ret != REG_NOERROR))
3108
0
  {
3109
0
    *err = ret;
3110
0
    goto parse_bracket_exp_free_return;
3111
0
  }
3112
62.2M
      first_round = false;
3113
3114
      /* Get information about the next token.  We need it in any case.  */
3115
62.2M
      token_len = peek_token_bracket (token, regexp, syntax);
3116
3117
      /* Do not check for ranges if we know they are not allowed.  */
3118
62.2M
      if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3119
62.2M
  {
3120
62.2M
    if (__glibc_unlikely (token->type == END_OF_RE))
3121
0
      {
3122
0
        *err = REG_EBRACK;
3123
0
        goto parse_bracket_exp_free_return;
3124
0
      }
3125
62.2M
    if (token->type == OP_CHARSET_RANGE)
3126
16.4k
      {
3127
16.4k
        re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3128
16.4k
        token_len2 = peek_token_bracket (&token2, regexp, syntax);
3129
16.4k
        if (__glibc_unlikely (token2.type == END_OF_RE))
3130
0
    {
3131
0
      *err = REG_EBRACK;
3132
0
      goto parse_bracket_exp_free_return;
3133
0
    }
3134
16.4k
        if (token2.type == OP_CLOSE_BRACKET)
3135
0
    {
3136
      /* We treat the last '-' as a normal character.  */
3137
0
      re_string_skip_bytes (regexp, -token_len);
3138
0
      token->type = CHARACTER;
3139
0
    }
3140
16.4k
        else
3141
16.4k
    is_range_exp = true;
3142
16.4k
      }
3143
62.2M
  }
3144
3145
62.2M
      if (is_range_exp == true)
3146
16.4k
  {
3147
16.4k
    end_elem.opr.name = end_name_buf;
3148
16.4k
    end_elem.type = COLL_SYM;
3149
16.4k
    ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3150
16.4k
               dfa, syntax, true);
3151
16.4k
    if (__glibc_unlikely (ret != REG_NOERROR))
3152
0
      {
3153
0
        *err = ret;
3154
0
        goto parse_bracket_exp_free_return;
3155
0
      }
3156
3157
16.4k
    token_len = peek_token_bracket (token, regexp, syntax);
3158
3159
16.4k
    *err = build_range_exp (sbcset, mbcset, &range_alloc,
3160
16.4k
          &start_elem, &end_elem,
3161
16.4k
          dfa, syntax, nrules, collseqmb, collseqwc,
3162
16.4k
          table_size, symb_table, extra);
3163
16.4k
    if (__glibc_unlikely (*err != REG_NOERROR))
3164
10
      goto parse_bracket_exp_free_return;
3165
16.4k
  }
3166
62.1M
      else
3167
62.1M
  {
3168
62.1M
    switch (start_elem.type)
3169
62.1M
      {
3170
62.1M
      case SB_CHAR:
3171
62.1M
        bitset_set (sbcset, start_elem.opr.ch);
3172
62.1M
        break;
3173
0
      case MB_CHAR:
3174
        /* Check whether the array has enough space.  */
3175
0
        if (__glibc_unlikely (mbchar_alloc == mbcset->nmbchars))
3176
0
    {
3177
0
      wchar_t *new_mbchars;
3178
      /* Not enough, realloc it.  */
3179
      /* +1 in case of mbcset->nmbchars is 0.  */
3180
0
      mbchar_alloc = 2 * mbcset->nmbchars + 1;
3181
      /* Use realloc since array is NULL if *alloc == 0.  */
3182
0
      new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3183
0
              mbchar_alloc);
3184
0
      if (__glibc_unlikely (new_mbchars == NULL))
3185
0
        goto parse_bracket_exp_espace;
3186
0
      mbcset->mbchars = new_mbchars;
3187
0
    }
3188
0
        mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3189
0
        break;
3190
0
      case EQUIV_CLASS:
3191
0
        *err = build_equiv_class (sbcset,
3192
0
          mbcset, &equiv_class_alloc,
3193
0
          start_elem.opr.name);
3194
0
        if (__glibc_unlikely (*err != REG_NOERROR))
3195
0
    goto parse_bracket_exp_free_return;
3196
0
        break;
3197
83
      case COLL_SYM:
3198
83
        *err = build_collating_symbol (sbcset,
3199
83
               mbcset, &coll_sym_alloc,
3200
83
               start_elem.opr.name,
3201
83
               nrules, table_size, symb_table, extra);
3202
83
        if (__glibc_unlikely (*err != REG_NOERROR))
3203
0
    goto parse_bracket_exp_free_return;
3204
83
        break;
3205
83
      case CHAR_CLASS:
3206
0
        *err = build_charclass (regexp->trans, sbcset,
3207
0
              mbcset, &char_class_alloc,
3208
0
              (const char *) start_elem.opr.name,
3209
0
              syntax);
3210
0
        if (__glibc_unlikely (*err != REG_NOERROR))
3211
0
         goto parse_bracket_exp_free_return;
3212
0
        break;
3213
0
      default:
3214
0
        DEBUG_ASSERT (false);
3215
0
        break;
3216
62.1M
      }
3217
62.1M
  }
3218
62.2M
      if (__glibc_unlikely (token->type == END_OF_RE))
3219
0
  {
3220
0
    *err = REG_EBRACK;
3221
0
    goto parse_bracket_exp_free_return;
3222
0
  }
3223
62.2M
      if (token->type == OP_CLOSE_BRACKET)
3224
17.0k
  break;
3225
62.2M
    }
3226
3227
17.0k
  re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3228
3229
  /* If it is non-matching list.  */
3230
17.0k
  if (non_match)
3231
12.2k
    bitset_not (sbcset);
3232
3233
  /* Ensure only single byte characters are set.  */
3234
17.0k
  if (dfa->mb_cur_max > 1)
3235
0
    bitset_mask (sbcset, dfa->sb_char);
3236
3237
17.0k
  if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3238
17.0k
      || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3239
0
                 || mbcset->non_match)))
3240
0
    {
3241
0
      bin_tree_t *mbc_tree;
3242
0
      int sbc_idx;
3243
      /* Build a tree for complex bracket.  */
3244
0
      dfa->has_mb_node = 1;
3245
0
      br_token.type = COMPLEX_BRACKET;
3246
0
      br_token.opr.mbcset = mbcset;
3247
0
      mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3248
0
      if (__glibc_unlikely (mbc_tree == NULL))
3249
0
  goto parse_bracket_exp_espace;
3250
0
      for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3251
0
  if (sbcset[sbc_idx])
3252
0
    break;
3253
      /* If there are no bits set in sbcset, there is no point
3254
   of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3255
0
      if (sbc_idx < BITSET_WORDS)
3256
0
  {
3257
    /* Build a tree for simple bracket.  */
3258
0
    br_token.type = SIMPLE_BRACKET;
3259
0
    br_token.opr.sbcset = sbcset;
3260
0
    work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3261
0
    if (__glibc_unlikely (work_tree == NULL))
3262
0
      goto parse_bracket_exp_espace;
3263
3264
    /* Then join them by ALT node.  */
3265
0
    work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3266
0
    if (__glibc_unlikely (work_tree == NULL))
3267
0
      goto parse_bracket_exp_espace;
3268
0
  }
3269
0
      else
3270
0
  {
3271
0
    re_free (sbcset);
3272
0
    work_tree = mbc_tree;
3273
0
  }
3274
0
    }
3275
17.0k
  else
3276
17.0k
    {
3277
17.0k
      free_charset (mbcset);
3278
      /* Build a tree for simple bracket.  */
3279
17.0k
      br_token.type = SIMPLE_BRACKET;
3280
17.0k
      br_token.opr.sbcset = sbcset;
3281
17.0k
      work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3282
17.0k
      if (__glibc_unlikely (work_tree == NULL))
3283
0
  goto parse_bracket_exp_espace;
3284
17.0k
    }
3285
17.0k
  return work_tree;
3286
3287
0
 parse_bracket_exp_espace:
3288
0
  *err = REG_ESPACE;
3289
10
 parse_bracket_exp_free_return:
3290
10
  re_free (sbcset);
3291
10
  free_charset (mbcset);
3292
10
  return NULL;
3293
0
}
3294
3295
/* Parse an element in the bracket expression.  */
3296
3297
static reg_errcode_t
3298
parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3299
           re_token_t *token, int token_len, re_dfa_t *dfa,
3300
           reg_syntax_t syntax, bool accept_hyphen)
3301
62.2M
{
3302
62.2M
  int cur_char_size;
3303
62.2M
  cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3304
62.2M
  if (cur_char_size > 1)
3305
0
    {
3306
0
      elem->type = MB_CHAR;
3307
0
      elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3308
0
      re_string_skip_bytes (regexp, cur_char_size);
3309
0
      return REG_NOERROR;
3310
0
    }
3311
62.2M
  re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3312
62.2M
  if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3313
62.2M
      || token->type == OP_OPEN_EQUIV_CLASS)
3314
83
    return parse_bracket_symbol (elem, regexp, token);
3315
62.2M
  if (__glibc_unlikely (token->type == OP_CHARSET_RANGE) && !accept_hyphen)
3316
0
    {
3317
      /* A '-' must only appear as anything but a range indicator before
3318
   the closing bracket.  Everything else is an error.  */
3319
0
      re_token_t token2;
3320
0
      (void) peek_token_bracket (&token2, regexp, syntax);
3321
0
      if (token2.type != OP_CLOSE_BRACKET)
3322
  /* The actual error value is not standardized since this whole
3323
     case is undefined.  But ERANGE makes good sense.  */
3324
0
  return REG_ERANGE;
3325
0
    }
3326
62.2M
  elem->type = SB_CHAR;
3327
62.2M
  elem->opr.ch = token->opr.c;
3328
62.2M
  return REG_NOERROR;
3329
62.2M
}
3330
3331
/* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3332
   such as [:<character_class>:], [.<collating_element>.], and
3333
   [=<equivalent_class>=].  */
3334
3335
static reg_errcode_t
3336
parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3337
          re_token_t *token)
3338
83
{
3339
83
  unsigned char ch, delim = token->opr.c;
3340
83
  int i = 0;
3341
83
  if (re_string_eoi(regexp))
3342
0
    return REG_EBRACK;
3343
83
  for (;; ++i)
3344
166
    {
3345
166
      if (i >= BRACKET_NAME_BUF_SIZE)
3346
0
  return REG_EBRACK;
3347
166
      if (token->type == OP_OPEN_CHAR_CLASS)
3348
0
  ch = re_string_fetch_byte_case (regexp);
3349
166
      else
3350
166
  ch = re_string_fetch_byte (regexp);
3351
166
      if (re_string_eoi(regexp))
3352
0
  return REG_EBRACK;
3353
166
      if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3354
83
  break;
3355
83
      elem->opr.name[i] = ch;
3356
83
    }
3357
83
  re_string_skip_bytes (regexp, 1);
3358
83
  elem->opr.name[i] = '\0';
3359
83
  switch (token->type)
3360
83
    {
3361
83
    case OP_OPEN_COLL_ELEM:
3362
83
      elem->type = COLL_SYM;
3363
83
      break;
3364
0
    case OP_OPEN_EQUIV_CLASS:
3365
0
      elem->type = EQUIV_CLASS;
3366
0
      break;
3367
0
    case OP_OPEN_CHAR_CLASS:
3368
0
      elem->type = CHAR_CLASS;
3369
0
      break;
3370
0
    default:
3371
0
      break;
3372
83
    }
3373
83
  return REG_NOERROR;
3374
83
}
3375
3376
  /* Helper function for parse_bracket_exp.
3377
     Build the equivalence class which is represented by NAME.
3378
     The result are written to MBCSET and SBCSET.
3379
     EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3380
     is a pointer argument since we may update it.  */
3381
3382
static reg_errcode_t
3383
build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3384
       Idx *equiv_class_alloc, const unsigned char *name)
3385
0
{
3386
#ifdef _LIBC
3387
  uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3388
  if (nrules != 0)
3389
    {
3390
      const int32_t *table, *indirect;
3391
      const unsigned char *weights, *extra, *cp;
3392
      unsigned char char_buf[2];
3393
      int32_t idx1, idx2;
3394
      unsigned int ch;
3395
      size_t len;
3396
      /* Calculate the index for equivalence class.  */
3397
      cp = name;
3398
      table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3399
      weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3400
                 _NL_COLLATE_WEIGHTMB);
3401
      extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3402
               _NL_COLLATE_EXTRAMB);
3403
      indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3404
            _NL_COLLATE_INDIRECTMB);
3405
      idx1 = findidx (table, indirect, extra, &cp, -1);
3406
      if (__glibc_unlikely (idx1 == 0 || *cp != '\0'))
3407
  /* This isn't a valid character.  */
3408
  return REG_ECOLLATE;
3409
3410
      /* Build single byte matching table for this equivalence class.  */
3411
      len = weights[idx1 & 0xffffff];
3412
      for (ch = 0; ch < SBC_MAX; ++ch)
3413
  {
3414
    char_buf[0] = ch;
3415
    cp = char_buf;
3416
    idx2 = findidx (table, indirect, extra, &cp, 1);
3417
/*
3418
    idx2 = table[ch];
3419
*/
3420
    if (idx2 == 0)
3421
      /* This isn't a valid character.  */
3422
      continue;
3423
    /* Compare only if the length matches and the collation rule
3424
       index is the same.  */
3425
    if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)
3426
        && memcmp (weights + (idx1 & 0xffffff) + 1,
3427
       weights + (idx2 & 0xffffff) + 1, len) == 0)
3428
      bitset_set (sbcset, ch);
3429
  }
3430
      /* Check whether the array has enough space.  */
3431
      if (__glibc_unlikely (*equiv_class_alloc == mbcset->nequiv_classes))
3432
  {
3433
    /* Not enough, realloc it.  */
3434
    /* +1 in case of mbcset->nequiv_classes is 0.  */
3435
    Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3436
    /* Use realloc since the array is NULL if *alloc == 0.  */
3437
    int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3438
               int32_t,
3439
               new_equiv_class_alloc);
3440
    if (__glibc_unlikely (new_equiv_classes == NULL))
3441
      return REG_ESPACE;
3442
    mbcset->equiv_classes = new_equiv_classes;
3443
    *equiv_class_alloc = new_equiv_class_alloc;
3444
  }
3445
      mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3446
    }
3447
  else
3448
#endif /* _LIBC */
3449
0
    {
3450
0
      if (__glibc_unlikely (strlen ((const char *) name) != 1))
3451
0
  return REG_ECOLLATE;
3452
0
      bitset_set (sbcset, *name);
3453
0
    }
3454
0
  return REG_NOERROR;
3455
0
}
3456
3457
  /* Helper function for parse_bracket_exp.
3458
     Build the character class which is represented by NAME.
3459
     The result are written to MBCSET and SBCSET.
3460
     CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3461
     is a pointer argument since we may update it.  */
3462
3463
static reg_errcode_t
3464
build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3465
     re_charset_t *mbcset, Idx *char_class_alloc,
3466
     const char *class_name, reg_syntax_t syntax)
3467
0
{
3468
0
  int i;
3469
0
  const char *name = class_name;
3470
3471
  /* In case of REG_ICASE "upper" and "lower" match the both of
3472
     upper and lower cases.  */
3473
0
  if ((syntax & RE_ICASE)
3474
0
      && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3475
0
    name = "alpha";
3476
3477
  /* Check the space of the arrays.  */
3478
0
  if (__glibc_unlikely (*char_class_alloc == mbcset->nchar_classes))
3479
0
    {
3480
      /* Not enough, realloc it.  */
3481
      /* +1 in case of mbcset->nchar_classes is 0.  */
3482
0
      Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3483
      /* Use realloc since array is NULL if *alloc == 0.  */
3484
0
      wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3485
0
                 new_char_class_alloc);
3486
0
      if (__glibc_unlikely (new_char_classes == NULL))
3487
0
  return REG_ESPACE;
3488
0
      mbcset->char_classes = new_char_classes;
3489
0
      *char_class_alloc = new_char_class_alloc;
3490
0
    }
3491
0
  mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3492
3493
0
#define BUILD_CHARCLASS_LOOP(ctype_func)  \
3494
0
  do {           \
3495
0
    if (__glibc_unlikely (trans != NULL))     \
3496
0
      {           \
3497
0
  for (i = 0; i < SBC_MAX; ++i)   \
3498
0
    if (ctype_func (i))     \
3499
0
      bitset_set (sbcset, trans[i]); \
3500
0
      }           \
3501
0
    else          \
3502
0
      {           \
3503
0
  for (i = 0; i < SBC_MAX; ++i)   \
3504
0
    if (ctype_func (i))     \
3505
0
      bitset_set (sbcset, i);   \
3506
0
      }           \
3507
0
  } while (0)
3508
3509
0
  if (strcmp (name, "alnum") == 0)
3510
0
    BUILD_CHARCLASS_LOOP (isalnum);
3511
0
  else if (strcmp (name, "cntrl") == 0)
3512
0
    BUILD_CHARCLASS_LOOP (iscntrl);
3513
0
  else if (strcmp (name, "lower") == 0)
3514
0
    BUILD_CHARCLASS_LOOP (islower);
3515
0
  else if (strcmp (name, "space") == 0)
3516
0
    BUILD_CHARCLASS_LOOP (isspace);
3517
0
  else if (strcmp (name, "alpha") == 0)
3518
0
    BUILD_CHARCLASS_LOOP (isalpha);
3519
0
  else if (strcmp (name, "digit") == 0)
3520
0
    BUILD_CHARCLASS_LOOP (isdigit);
3521
0
  else if (strcmp (name, "print") == 0)
3522
0
    BUILD_CHARCLASS_LOOP (isprint);
3523
0
  else if (strcmp (name, "upper") == 0)
3524
0
    BUILD_CHARCLASS_LOOP (isupper);
3525
0
  else if (strcmp (name, "blank") == 0)
3526
0
    BUILD_CHARCLASS_LOOP (isblank);
3527
0
  else if (strcmp (name, "graph") == 0)
3528
0
    BUILD_CHARCLASS_LOOP (isgraph);
3529
0
  else if (strcmp (name, "punct") == 0)
3530
0
    BUILD_CHARCLASS_LOOP (ispunct);
3531
0
  else if (strcmp (name, "xdigit") == 0)
3532
0
    BUILD_CHARCLASS_LOOP (isxdigit);
3533
0
  else
3534
0
    return REG_ECTYPE;
3535
3536
0
  return REG_NOERROR;
3537
0
}
3538
3539
static bin_tree_t *
3540
build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3541
        const char *class_name,
3542
        const char *extra, bool non_match,
3543
        reg_errcode_t *err)
3544
0
{
3545
0
  re_bitset_ptr_t sbcset;
3546
0
  re_charset_t *mbcset;
3547
0
  Idx alloc = 0;
3548
0
  reg_errcode_t ret;
3549
0
  bin_tree_t *tree;
3550
3551
0
  sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3552
0
  if (__glibc_unlikely (sbcset == NULL))
3553
0
    {
3554
0
      *err = REG_ESPACE;
3555
0
      return NULL;
3556
0
    }
3557
0
  mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3558
0
  if (__glibc_unlikely (mbcset == NULL))
3559
0
    {
3560
0
      re_free (sbcset);
3561
0
      *err = REG_ESPACE;
3562
0
      return NULL;
3563
0
    }
3564
0
  mbcset->non_match = non_match;
3565
3566
  /* We don't care the syntax in this case.  */
3567
0
  ret = build_charclass (trans, sbcset, mbcset, &alloc, class_name, 0);
3568
3569
0
  if (__glibc_unlikely (ret != REG_NOERROR))
3570
0
    {
3571
0
      re_free (sbcset);
3572
0
      free_charset (mbcset);
3573
0
      *err = ret;
3574
0
      return NULL;
3575
0
    }
3576
  /* \w match '_' also.  */
3577
0
  for (; *extra; extra++)
3578
0
    bitset_set (sbcset, *extra);
3579
3580
  /* If it is non-matching list.  */
3581
0
  if (non_match)
3582
0
    bitset_not (sbcset);
3583
3584
  /* Ensure only single byte characters are set.  */
3585
0
  if (dfa->mb_cur_max > 1)
3586
0
    bitset_mask (sbcset, dfa->sb_char);
3587
3588
  /* Build a tree for simple bracket.  */
3589
0
  re_token_t br_token = { .type = SIMPLE_BRACKET, .opr.sbcset = sbcset };
3590
0
  tree = create_token_tree (dfa, NULL, NULL, &br_token);
3591
0
  if (__glibc_unlikely (tree == NULL))
3592
0
    goto build_word_op_espace;
3593
3594
0
  if (dfa->mb_cur_max > 1)
3595
0
    {
3596
0
      bin_tree_t *mbc_tree;
3597
      /* Build a tree for complex bracket.  */
3598
0
      br_token.type = COMPLEX_BRACKET;
3599
0
      br_token.opr.mbcset = mbcset;
3600
0
      dfa->has_mb_node = 1;
3601
0
      mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3602
0
      if (__glibc_unlikely (mbc_tree == NULL))
3603
0
  goto build_word_op_espace;
3604
      /* Then join them by ALT node.  */
3605
0
      tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3606
0
      if (__glibc_likely (mbc_tree != NULL))
3607
0
  return tree;
3608
0
    }
3609
0
  else
3610
0
    {
3611
0
      free_charset (mbcset);
3612
0
      return tree;
3613
0
    }
3614
3615
0
 build_word_op_espace:
3616
0
  re_free (sbcset);
3617
0
  free_charset (mbcset);
3618
0
  *err = REG_ESPACE;
3619
0
  return NULL;
3620
0
}
3621
3622
/* This is intended for the expressions like "a{1,3}".
3623
   Fetch a number from 'input', and return the number.
3624
   Return -1 if the number field is empty like "{,1}".
3625
   Return RE_DUP_MAX + 1 if the number field is too large.
3626
   Return -2 if an error occurred.  */
3627
3628
static Idx
3629
fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3630
0
{
3631
0
  Idx num = -1;
3632
0
  unsigned char c;
3633
0
  while (1)
3634
0
    {
3635
0
      fetch_token (token, input, syntax);
3636
0
      c = token->opr.c;
3637
0
      if (__glibc_unlikely (token->type == END_OF_RE))
3638
0
  return -2;
3639
0
      if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3640
0
  break;
3641
0
      num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3642
0
       ? -2
3643
0
       : num == -1
3644
0
       ? c - '0'
3645
0
       : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3646
0
    }
3647
0
  return num;
3648
0
}
3649

3650
static void
3651
free_charset (re_charset_t *cset)
3652
17.0k
{
3653
17.0k
  re_free (cset->mbchars);
3654
#ifdef _LIBC
3655
  re_free (cset->coll_syms);
3656
  re_free (cset->equiv_classes);
3657
#endif
3658
17.0k
  re_free (cset->range_starts);
3659
17.0k
  re_free (cset->range_ends);
3660
17.0k
  re_free (cset->char_classes);
3661
17.0k
  re_free (cset);
3662
17.0k
}
3663

3664
/* Functions for binary tree operation.  */
3665
3666
/* Create a tree node.  */
3667
3668
static bin_tree_t *
3669
create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3670
       re_token_type_t type)
3671
7.44M
{
3672
7.44M
  re_token_t t = { .type = type };
3673
7.44M
  return create_token_tree (dfa, left, right, &t);
3674
7.44M
}
3675
3676
static bin_tree_t *
3677
create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3678
       const re_token_t *token)
3679
16.0M
{
3680
16.0M
  bin_tree_t *tree;
3681
16.0M
  if (__glibc_unlikely (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE))
3682
1.07M
    {
3683
1.07M
      bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3684
3685
1.07M
      if (storage == NULL)
3686
0
  return NULL;
3687
1.07M
      storage->next = dfa->str_tree_storage;
3688
1.07M
      dfa->str_tree_storage = storage;
3689
1.07M
      dfa->str_tree_storage_idx = 0;
3690
1.07M
    }
3691
16.0M
  tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3692
3693
16.0M
  tree->parent = NULL;
3694
16.0M
  tree->left = left;
3695
16.0M
  tree->right = right;
3696
16.0M
  tree->token = *token;
3697
16.0M
  tree->token.duplicated = 0;
3698
16.0M
  tree->token.opt_subexp = 0;
3699
16.0M
  tree->first = NULL;
3700
16.0M
  tree->next = NULL;
3701
16.0M
  tree->node_idx = -1;
3702
3703
16.0M
  if (left != NULL)
3704
7.14M
    left->parent = tree;
3705
16.0M
  if (right != NULL)
3706
6.57M
    right->parent = tree;
3707
16.0M
  return tree;
3708
16.0M
}
3709
3710
/* Mark the tree SRC as an optional subexpression.
3711
   To be called from preorder or postorder.  */
3712
3713
static reg_errcode_t
3714
mark_opt_subexp (void *extra, bin_tree_t *node)
3715
2.34M
{
3716
2.34M
  Idx idx = (uintptr_t) extra;
3717
2.34M
  if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3718
79
    node->token.opt_subexp = 1;
3719
3720
2.34M
  return REG_NOERROR;
3721
2.34M
}
3722
3723
/* Free the allocated memory inside NODE. */
3724
3725
static void
3726
free_token (re_token_t *node)
3727
11.0M
{
3728
11.0M
  if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3729
0
    free_charset (node->opr.mbcset);
3730
11.0M
  else if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3731
11.0M
    re_free (node->opr.sbcset);
3732
11.0M
}
3733
3734
/* Worker function for tree walking.  Free the allocated memory inside NODE
3735
   and its children. */
3736
3737
static reg_errcode_t
3738
free_tree (void *extra, bin_tree_t *node)
3739
6.05M
{
3740
6.05M
  free_token (&node->token);
3741
6.05M
  return REG_NOERROR;
3742
6.05M
}
3743
3744
3745
/* Duplicate the node SRC, and return new node.  This is a preorder
3746
   visit similar to the one implemented by the generic visitor, but
3747
   we need more infrastructure to maintain two parallel trees --- so,
3748
   it's easier to duplicate.  */
3749
3750
static bin_tree_t *
3751
duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3752
2.73k
{
3753
2.73k
  const bin_tree_t *node;
3754
2.73k
  bin_tree_t *dup_root;
3755
2.73k
  bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3756
3757
2.73k
  for (node = root; ; )
3758
2.26M
    {
3759
      /* Create a new tree and link it back to the current parent.  */
3760
2.26M
      *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3761
2.26M
      if (*p_new == NULL)
3762
0
  return NULL;
3763
2.26M
      (*p_new)->parent = dup_node;
3764
2.26M
      (*p_new)->token.duplicated = 1;
3765
2.26M
      dup_node = *p_new;
3766
3767
      /* Go to the left node, or up and to the right.  */
3768
2.26M
      if (node->left)
3769
1.39M
  {
3770
1.39M
    node = node->left;
3771
1.39M
    p_new = &dup_node->left;
3772
1.39M
  }
3773
877k
      else
3774
877k
  {
3775
877k
    const bin_tree_t *prev = NULL;
3776
3.14M
    while (node->right == prev || node->right == NULL)
3777
2.26M
      {
3778
2.26M
        prev = node;
3779
2.26M
        node = node->parent;
3780
2.26M
        dup_node = dup_node->parent;
3781
2.26M
        if (!node)
3782
2.73k
    return dup_root;
3783
2.26M
      }
3784
874k
    node = node->right;
3785
874k
    p_new = &dup_node->right;
3786
874k
  }
3787
2.26M
    }
3788
2.73k
}