Coverage Report

Created: 2023-03-26 07:33

/src/wget/lib/regcomp.c
Line
Count
Source (jump to first uncovered line)
1
/* Extended regular expression matching and search library.
2
   Copyright (C) 2002-2023 Free Software Foundation, Inc.
3
   This file is part of the GNU C Library.
4
   Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6
   The GNU C Library is free software; you can redistribute it and/or
7
   modify it under the terms of the GNU Lesser General Public
8
   License as published by the Free Software Foundation; either
9
   version 2.1 of the License, or (at your option) any later version.
10
11
   The GNU C Library is distributed in the hope that it will be useful,
12
   but WITHOUT ANY WARRANTY; without even the implied warranty of
13
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
   Lesser General Public License for more details.
15
16
   You should have received a copy of the GNU Lesser General Public
17
   License along with the GNU C Library; if not, see
18
   <https://www.gnu.org/licenses/>.  */
19
20
#ifdef _LIBC
21
# include <locale/weight.h>
22
#endif
23
24
static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
25
            size_t length, reg_syntax_t syntax);
26
static void re_compile_fastmap_iter (regex_t *bufp,
27
             const re_dfastate_t *init_state,
28
             char *fastmap);
29
static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
30
static void free_charset (re_charset_t *cset);
31
static void free_workarea_compile (regex_t *preg);
32
static reg_errcode_t create_initial_state (re_dfa_t *dfa);
33
static void optimize_utf8 (re_dfa_t *dfa);
34
static reg_errcode_t analyze (regex_t *preg);
35
static reg_errcode_t preorder (bin_tree_t *root,
36
             reg_errcode_t (fn (void *, bin_tree_t *)),
37
             void *extra);
38
static reg_errcode_t postorder (bin_tree_t *root,
39
        reg_errcode_t (fn (void *, bin_tree_t *)),
40
        void *extra);
41
static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
42
static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
43
static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
44
         bin_tree_t *node);
45
static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
46
static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
47
static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
48
static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
49
static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
50
           unsigned int constraint);
51
static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
52
static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
53
           Idx node, bool root);
54
static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
55
static Idx fetch_number (re_string_t *input, re_token_t *token,
56
       reg_syntax_t syntax);
57
static int peek_token (re_token_t *token, re_string_t *input,
58
      reg_syntax_t syntax);
59
static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
60
        reg_syntax_t syntax, reg_errcode_t *err);
61
static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
62
          re_token_t *token, reg_syntax_t syntax,
63
          Idx nest, reg_errcode_t *err);
64
static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
65
         re_token_t *token, reg_syntax_t syntax,
66
         Idx nest, reg_errcode_t *err);
67
static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
68
             re_token_t *token, reg_syntax_t syntax,
69
             Idx nest, reg_errcode_t *err);
70
static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
71
          re_token_t *token, reg_syntax_t syntax,
72
          Idx nest, reg_errcode_t *err);
73
static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
74
         re_dfa_t *dfa, re_token_t *token,
75
         reg_syntax_t syntax, reg_errcode_t *err);
76
static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
77
              re_token_t *token, reg_syntax_t syntax,
78
              reg_errcode_t *err);
79
static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
80
              re_string_t *regexp,
81
              re_token_t *token, int token_len,
82
              re_dfa_t *dfa,
83
              reg_syntax_t syntax,
84
              bool accept_hyphen);
85
static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
86
            re_string_t *regexp,
87
            re_token_t *token);
88
static reg_errcode_t build_equiv_class (bitset_t sbcset,
89
          re_charset_t *mbcset,
90
          Idx *equiv_class_alloc,
91
          const unsigned char *name);
92
static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
93
              bitset_t sbcset,
94
              re_charset_t *mbcset,
95
              Idx *char_class_alloc,
96
              const char *class_name,
97
              reg_syntax_t syntax);
98
static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
99
               RE_TRANSLATE_TYPE trans,
100
               const char *class_name,
101
               const char *extra,
102
               bool non_match, reg_errcode_t *err);
103
static bin_tree_t *create_tree (re_dfa_t *dfa,
104
        bin_tree_t *left, bin_tree_t *right,
105
        re_token_type_t type);
106
static bin_tree_t *create_token_tree (re_dfa_t *dfa,
107
              bin_tree_t *left, bin_tree_t *right,
108
              const re_token_t *token);
109
static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
110
static void free_token (re_token_t *node);
111
static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
112
static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
113

114
/* This table gives an error message for each of the error codes listed
115
   in regex.h.  Obviously the order here has to be same as there.
116
   POSIX doesn't require that we do anything for REG_NOERROR,
117
   but why not be nice?  */
118
119
static const char __re_error_msgid[] =
120
  {
121
#define REG_NOERROR_IDX 0
122
    gettext_noop ("Success")  /* REG_NOERROR */
123
    "\0"
124
#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
125
    gettext_noop ("No match") /* REG_NOMATCH */
126
    "\0"
127
#define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
128
    gettext_noop ("Invalid regular expression") /* REG_BADPAT */
129
    "\0"
130
#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
131
    gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
132
    "\0"
133
#define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
134
    gettext_noop ("Invalid character class name") /* REG_ECTYPE */
135
    "\0"
136
#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
137
    gettext_noop ("Trailing backslash") /* REG_EESCAPE */
138
    "\0"
139
#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
140
    gettext_noop ("Invalid back reference") /* REG_ESUBREG */
141
    "\0"
142
#define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
143
    gettext_noop ("Unmatched [, [^, [:, [., or [=") /* REG_EBRACK */
144
    "\0"
145
#define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
146
    gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
147
    "\0"
148
#define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
149
    gettext_noop ("Unmatched \\{") /* REG_EBRACE */
150
    "\0"
151
#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
152
    gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
153
    "\0"
154
#define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
155
    gettext_noop ("Invalid range end")  /* REG_ERANGE */
156
    "\0"
157
#define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
158
    gettext_noop ("Memory exhausted") /* REG_ESPACE */
159
    "\0"
160
#define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
161
    gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
162
    "\0"
163
#define REG_EEND_IDX  (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
164
    gettext_noop ("Premature end of regular expression") /* REG_EEND */
165
    "\0"
166
#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
167
    gettext_noop ("Regular expression too big") /* REG_ESIZE */
168
    "\0"
169
#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
170
    gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
171
  };
172
173
static const size_t __re_error_msgid_idx[] =
174
  {
175
    REG_NOERROR_IDX,
176
    REG_NOMATCH_IDX,
177
    REG_BADPAT_IDX,
178
    REG_ECOLLATE_IDX,
179
    REG_ECTYPE_IDX,
180
    REG_EESCAPE_IDX,
181
    REG_ESUBREG_IDX,
182
    REG_EBRACK_IDX,
183
    REG_EPAREN_IDX,
184
    REG_EBRACE_IDX,
185
    REG_BADBR_IDX,
186
    REG_ERANGE_IDX,
187
    REG_ESPACE_IDX,
188
    REG_BADRPT_IDX,
189
    REG_EEND_IDX,
190
    REG_ESIZE_IDX,
191
    REG_ERPAREN_IDX
192
  };
193

194
/* Entry points for GNU code.  */
195
196
/* re_compile_pattern is the GNU regular expression compiler: it
197
   compiles PATTERN (of length LENGTH) and puts the result in BUFP.
198
   Returns 0 if the pattern was valid, otherwise an error string.
199
200
   Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
201
   are set in BUFP on entry.  */
202
203
const char *
204
re_compile_pattern (const char *pattern, size_t length,
205
        struct re_pattern_buffer *bufp)
206
0
{
207
0
  reg_errcode_t ret;
208
209
  /* And GNU code determines whether or not to get register information
210
     by passing null for the REGS argument to re_match, etc., not by
211
     setting no_sub, unless RE_NO_SUB is set.  */
212
0
  bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
213
214
  /* Match anchors at newline.  */
215
0
  bufp->newline_anchor = 1;
216
217
0
  ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
218
219
0
  if (!ret)
220
0
    return NULL;
221
0
  return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
222
0
}
223
weak_alias (__re_compile_pattern, re_compile_pattern)
224
225
/* Set by 're_set_syntax' to the current regexp syntax to recognize.  Can
226
   also be assigned to arbitrarily: each pattern buffer stores its own
227
   syntax, so it can be changed between regex compilations.  */
228
/* This has no initializer because initialized variables in Emacs
229
   become read-only after dumping.  */
230
reg_syntax_t re_syntax_options;
231
232
233
/* Specify the precise syntax of regexps for compilation.  This provides
234
   for compatibility for various utilities which historically have
235
   different, incompatible syntaxes.
236
237
   The argument SYNTAX is a bit mask comprised of the various bits
238
   defined in regex.h.  We return the old syntax.  */
239
240
reg_syntax_t
241
re_set_syntax (reg_syntax_t syntax)
242
0
{
243
0
  reg_syntax_t ret = re_syntax_options;
244
245
0
  re_syntax_options = syntax;
246
0
  return ret;
247
0
}
248
weak_alias (__re_set_syntax, re_set_syntax)
249
250
int
251
re_compile_fastmap (struct re_pattern_buffer *bufp)
252
0
{
253
0
  re_dfa_t *dfa = bufp->buffer;
254
0
  char *fastmap = bufp->fastmap;
255
256
0
  memset (fastmap, '\0', sizeof (char) * SBC_MAX);
257
0
  re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
258
0
  if (dfa->init_state != dfa->init_state_word)
259
0
    re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
260
0
  if (dfa->init_state != dfa->init_state_nl)
261
0
    re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
262
0
  if (dfa->init_state != dfa->init_state_begbuf)
263
0
    re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
264
0
  bufp->fastmap_accurate = 1;
265
0
  return 0;
266
0
}
267
weak_alias (__re_compile_fastmap, re_compile_fastmap)
268
269
static __always_inline void
270
re_set_fastmap (char *fastmap, bool icase, int ch)
271
0
{
272
0
  fastmap[ch] = 1;
273
0
  if (icase)
274
0
    fastmap[tolower (ch)] = 1;
275
0
}
276
277
/* Helper function for re_compile_fastmap.
278
   Compile fastmap for the initial_state INIT_STATE.  */
279
280
static void
281
re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
282
       char *fastmap)
283
0
{
284
0
  re_dfa_t *dfa = bufp->buffer;
285
0
  Idx node_cnt;
286
0
  bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
287
0
  for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
288
0
    {
289
0
      Idx node = init_state->nodes.elems[node_cnt];
290
0
      re_token_type_t type = dfa->nodes[node].type;
291
292
0
      if (type == CHARACTER)
293
0
  {
294
0
    re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
295
0
    if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
296
0
      {
297
0
        unsigned char buf[MB_LEN_MAX];
298
0
        unsigned char *p;
299
0
        wchar_t wc;
300
0
        mbstate_t state;
301
302
0
        p = buf;
303
0
        *p++ = dfa->nodes[node].opr.c;
304
0
        while (++node < dfa->nodes_len
305
0
         && dfa->nodes[node].type == CHARACTER
306
0
         && dfa->nodes[node].mb_partial)
307
0
    *p++ = dfa->nodes[node].opr.c;
308
0
        memset (&state, '\0', sizeof (state));
309
0
        if (__mbrtowc (&wc, (const char *) buf, p - buf,
310
0
           &state) == p - buf
311
0
      && (__wcrtomb ((char *) buf, __towlower (wc), &state)
312
0
          != (size_t) -1))
313
0
    re_set_fastmap (fastmap, false, buf[0]);
314
0
      }
315
0
  }
316
0
      else if (type == SIMPLE_BRACKET)
317
0
  {
318
0
    int i, ch;
319
0
    for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
320
0
      {
321
0
        int j;
322
0
        bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
323
0
        for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
324
0
    if (w & ((bitset_word_t) 1 << j))
325
0
      re_set_fastmap (fastmap, icase, ch);
326
0
      }
327
0
  }
328
0
      else if (type == COMPLEX_BRACKET)
329
0
  {
330
0
    re_charset_t *cset = dfa->nodes[node].opr.mbcset;
331
0
    Idx i;
332
333
#ifdef _LIBC
334
    /* See if we have to try all bytes which start multiple collation
335
       elements.
336
       e.g. In da_DK, we want to catch 'a' since "aa" is a valid
337
      collation element, and don't catch 'b' since 'b' is
338
      the only collation element which starts from 'b' (and
339
      it is caught by SIMPLE_BRACKET).  */
340
        if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
341
      && (cset->ncoll_syms || cset->nranges))
342
    {
343
      const int32_t *table = (const int32_t *)
344
        _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
345
      for (i = 0; i < SBC_MAX; ++i)
346
        if (table[i] < 0)
347
          re_set_fastmap (fastmap, icase, i);
348
    }
349
#endif /* _LIBC */
350
351
    /* See if we have to start the match at all multibyte characters,
352
       i.e. where we would not find an invalid sequence.  This only
353
       applies to multibyte character sets; for single byte character
354
       sets, the SIMPLE_BRACKET again suffices.  */
355
0
    if (dfa->mb_cur_max > 1
356
0
        && (cset->nchar_classes || cset->non_match || cset->nranges
357
#ifdef _LIBC
358
      || cset->nequiv_classes
359
#endif /* _LIBC */
360
0
     ))
361
0
      {
362
0
        unsigned char c = 0;
363
0
        do
364
0
    {
365
0
      mbstate_t mbs;
366
0
      memset (&mbs, 0, sizeof (mbs));
367
0
      if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
368
0
        re_set_fastmap (fastmap, false, (int) c);
369
0
    }
370
0
        while (++c != 0);
371
0
      }
372
373
0
    else
374
0
      {
375
        /* ... Else catch all bytes which can start the mbchars.  */
376
0
        for (i = 0; i < cset->nmbchars; ++i)
377
0
    {
378
0
      char buf[256];
379
0
      mbstate_t state;
380
0
      memset (&state, '\0', sizeof (state));
381
0
      if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
382
0
        re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
383
0
      if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
384
0
        {
385
0
          if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
386
0
        != (size_t) -1)
387
0
      re_set_fastmap (fastmap, false, *(unsigned char *) buf);
388
0
        }
389
0
    }
390
0
      }
391
0
  }
392
0
      else if (type == OP_PERIOD || type == OP_UTF8_PERIOD || type == END_OF_RE)
393
0
  {
394
0
    memset (fastmap, '\1', sizeof (char) * SBC_MAX);
395
0
    if (type == END_OF_RE)
396
0
      bufp->can_be_null = 1;
397
0
    return;
398
0
  }
399
0
    }
400
0
}
401

402
/* Entry point for POSIX code.  */
403
/* regcomp takes a regular expression as a string and compiles it.
404
405
   PREG is a regex_t *.  We do not expect any fields to be initialized,
406
   since POSIX says we shouldn't.  Thus, we set
407
408
     'buffer' to the compiled pattern;
409
     'used' to the length of the compiled pattern;
410
     'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
411
       REG_EXTENDED bit in CFLAGS is set; otherwise, to
412
       RE_SYNTAX_POSIX_BASIC;
413
     'newline_anchor' to REG_NEWLINE being set in CFLAGS;
414
     'fastmap' to an allocated space for the fastmap;
415
     'fastmap_accurate' to zero;
416
     're_nsub' to the number of subexpressions in PATTERN.
417
418
   PATTERN is the address of the pattern string.
419
420
   CFLAGS is a series of bits which affect compilation.
421
422
     If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
423
     use POSIX basic syntax.
424
425
     If REG_NEWLINE is set, then . and [^...] don't match newline.
426
     Also, regexec will try a match beginning after every newline.
427
428
     If REG_ICASE is set, then we considers upper- and lowercase
429
     versions of letters to be equivalent when matching.
430
431
     If REG_NOSUB is set, then when PREG is passed to regexec, that
432
     routine will report only success or failure, and nothing about the
433
     registers.
434
435
   It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
436
   the return codes and their meanings.)  */
437
438
int
439
regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
440
0
{
441
0
  reg_errcode_t ret;
442
0
  reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
443
0
       : RE_SYNTAX_POSIX_BASIC);
444
445
0
  preg->buffer = NULL;
446
0
  preg->allocated = 0;
447
0
  preg->used = 0;
448
449
  /* Try to allocate space for the fastmap.  */
450
0
  preg->fastmap = re_malloc (char, SBC_MAX);
451
0
  if (__glibc_unlikely (preg->fastmap == NULL))
452
0
    return REG_ESPACE;
453
454
0
  syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
455
456
  /* If REG_NEWLINE is set, newlines are treated differently.  */
457
0
  if (cflags & REG_NEWLINE)
458
0
    { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
459
0
      syntax &= ~RE_DOT_NEWLINE;
460
0
      syntax |= RE_HAT_LISTS_NOT_NEWLINE;
461
      /* It also changes the matching behavior.  */
462
0
      preg->newline_anchor = 1;
463
0
    }
464
0
  else
465
0
    preg->newline_anchor = 0;
466
0
  preg->no_sub = !!(cflags & REG_NOSUB);
467
0
  preg->translate = NULL;
468
469
0
  ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
470
471
  /* POSIX doesn't distinguish between an unmatched open-group and an
472
     unmatched close-group: both are REG_EPAREN.  */
473
0
  if (ret == REG_ERPAREN)
474
0
    ret = REG_EPAREN;
475
476
  /* We have already checked preg->fastmap != NULL.  */
477
0
  if (__glibc_likely (ret == REG_NOERROR))
478
    /* Compute the fastmap now, since regexec cannot modify the pattern
479
       buffer.  This function never fails in this implementation.  */
480
0
    (void) re_compile_fastmap (preg);
481
0
  else
482
0
    {
483
      /* Some error occurred while compiling the expression.  */
484
0
      re_free (preg->fastmap);
485
0
      preg->fastmap = NULL;
486
0
    }
487
488
0
  return (int) ret;
489
0
}
490
libc_hidden_def (__regcomp)
491
weak_alias (__regcomp, regcomp)
492
493
/* Returns a message corresponding to an error code, ERRCODE, returned
494
   from either regcomp or regexec.   We don't use PREG here.  */
495
496
size_t
497
regerror (int errcode, const regex_t *__restrict preg, char *__restrict errbuf,
498
    size_t errbuf_size)
499
0
{
500
0
  const char *msg;
501
0
  size_t msg_size;
502
0
  int nerrcodes = sizeof __re_error_msgid_idx / sizeof __re_error_msgid_idx[0];
503
504
0
  if (__glibc_unlikely (errcode < 0 || errcode >= nerrcodes))
505
    /* Only error codes returned by the rest of the code should be passed
506
       to this routine.  If we are given anything else, or if other regex
507
       code generates an invalid error code, then the program has a bug.
508
       Dump core so we can fix it.  */
509
0
    abort ();
510
511
0
  msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
512
513
0
  msg_size = strlen (msg) + 1; /* Includes the null.  */
514
515
0
  if (__glibc_likely (errbuf_size != 0))
516
0
    {
517
0
      size_t cpy_size = msg_size;
518
0
      if (__glibc_unlikely (msg_size > errbuf_size))
519
0
  {
520
0
    cpy_size = errbuf_size - 1;
521
0
    errbuf[cpy_size] = '\0';
522
0
  }
523
0
      memcpy (errbuf, msg, cpy_size);
524
0
    }
525
526
0
  return msg_size;
527
0
}
528
weak_alias (__regerror, regerror)
529
530
531
/* This static array is used for the map to single-byte characters when
532
   UTF-8 is used.  Otherwise we would allocate memory just to initialize
533
   it the same all the time.  UTF-8 is the preferred encoding so this is
534
   a worthwhile optimization.  */
535
static const bitset_t utf8_sb_map =
536
{
537
  /* Set the first 128 bits.  */
538
#if (defined __GNUC__ || __clang_major__ >= 4) && !defined __STRICT_ANSI__
539
  [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
540
#else
541
# if 4 * BITSET_WORD_BITS < ASCII_CHARS
542
#  error "bitset_word_t is narrower than 32 bits"
543
# elif 3 * BITSET_WORD_BITS < ASCII_CHARS
544
  BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
545
# elif 2 * BITSET_WORD_BITS < ASCII_CHARS
546
  BITSET_WORD_MAX, BITSET_WORD_MAX,
547
# elif 1 * BITSET_WORD_BITS < ASCII_CHARS
548
  BITSET_WORD_MAX,
549
# endif
550
  (BITSET_WORD_MAX
551
   >> (SBC_MAX % BITSET_WORD_BITS == 0
552
       ? 0
553
       : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
554
#endif
555
};
556
557
558
static void
559
free_dfa_content (re_dfa_t *dfa)
560
0
{
561
0
  Idx i, j;
562
563
0
  if (dfa->nodes)
564
0
    for (i = 0; i < dfa->nodes_len; ++i)
565
0
      free_token (dfa->nodes + i);
566
0
  re_free (dfa->nexts);
567
0
  for (i = 0; i < dfa->nodes_len; ++i)
568
0
    {
569
0
      if (dfa->eclosures != NULL)
570
0
  re_node_set_free (dfa->eclosures + i);
571
0
      if (dfa->inveclosures != NULL)
572
0
  re_node_set_free (dfa->inveclosures + i);
573
0
      if (dfa->edests != NULL)
574
0
  re_node_set_free (dfa->edests + i);
575
0
    }
576
0
  re_free (dfa->edests);
577
0
  re_free (dfa->eclosures);
578
0
  re_free (dfa->inveclosures);
579
0
  re_free (dfa->nodes);
580
581
0
  if (dfa->state_table)
582
0
    for (i = 0; i <= dfa->state_hash_mask; ++i)
583
0
      {
584
0
  struct re_state_table_entry *entry = dfa->state_table + i;
585
0
  for (j = 0; j < entry->num; ++j)
586
0
    {
587
0
      re_dfastate_t *state = entry->array[j];
588
0
      free_state (state);
589
0
    }
590
0
  re_free (entry->array);
591
0
      }
592
0
  re_free (dfa->state_table);
593
0
  if (dfa->sb_char != utf8_sb_map)
594
0
    re_free (dfa->sb_char);
595
0
  re_free (dfa->subexp_map);
596
#ifdef DEBUG
597
  re_free (dfa->re_str);
598
#endif
599
600
0
  re_free (dfa);
601
0
}
602
603
604
/* Free dynamically allocated space used by PREG.  */
605
606
void
607
regfree (regex_t *preg)
608
0
{
609
0
  re_dfa_t *dfa = preg->buffer;
610
0
  if (__glibc_likely (dfa != NULL))
611
0
    {
612
0
      lock_fini (dfa->lock);
613
0
      free_dfa_content (dfa);
614
0
    }
615
0
  preg->buffer = NULL;
616
0
  preg->allocated = 0;
617
618
0
  re_free (preg->fastmap);
619
0
  preg->fastmap = NULL;
620
621
0
  re_free (preg->translate);
622
0
  preg->translate = NULL;
623
0
}
624
libc_hidden_def (__regfree)
625
weak_alias (__regfree, regfree)
626

627
/* Entry points compatible with 4.2 BSD regex library.  We don't define
628
   them unless specifically requested.  */
629
630
#if defined _REGEX_RE_COMP || defined _LIBC
631
632
/* BSD has one and only one pattern buffer.  */
633
static struct re_pattern_buffer re_comp_buf;
634
635
char *
636
# ifdef _LIBC
637
/* Make these definitions weak in libc, so POSIX programs can redefine
638
   these names if they don't use our functions, and still use
639
   regcomp/regexec above without link errors.  */
640
weak_function
641
# endif
642
re_comp (const char *s)
643
{
644
  reg_errcode_t ret;
645
  char *fastmap;
646
647
  if (!s)
648
    {
649
      if (!re_comp_buf.buffer)
650
  return gettext ("No previous regular expression");
651
      return 0;
652
    }
653
654
  if (re_comp_buf.buffer)
655
    {
656
      fastmap = re_comp_buf.fastmap;
657
      re_comp_buf.fastmap = NULL;
658
      __regfree (&re_comp_buf);
659
      memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
660
      re_comp_buf.fastmap = fastmap;
661
    }
662
663
  if (re_comp_buf.fastmap == NULL)
664
    {
665
      re_comp_buf.fastmap = re_malloc (char, SBC_MAX);
666
      if (re_comp_buf.fastmap == NULL)
667
  return (char *) gettext (__re_error_msgid
668
         + __re_error_msgid_idx[(int) REG_ESPACE]);
669
    }
670
671
  /* Since 're_exec' always passes NULL for the 'regs' argument, we
672
     don't need to initialize the pattern buffer fields which affect it.  */
673
674
  /* Match anchors at newlines.  */
675
  re_comp_buf.newline_anchor = 1;
676
677
  ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
678
679
  if (!ret)
680
    return NULL;
681
682
  /* Yes, we're discarding 'const' here if !HAVE_LIBINTL.  */
683
  return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
684
}
685
686
#ifdef _LIBC
687
libc_freeres_fn (free_mem)
688
{
689
  __regfree (&re_comp_buf);
690
}
691
#endif
692
693
#endif /* _REGEX_RE_COMP */
694

695
/* Internal entry point.
696
   Compile the regular expression PATTERN, whose length is LENGTH.
697
   SYNTAX indicate regular expression's syntax.  */
698
699
static reg_errcode_t
700
re_compile_internal (regex_t *preg, const char * pattern, size_t length,
701
         reg_syntax_t syntax)
702
0
{
703
0
  reg_errcode_t err = REG_NOERROR;
704
0
  re_dfa_t *dfa;
705
0
  re_string_t regexp;
706
707
  /* Initialize the pattern buffer.  */
708
0
  preg->fastmap_accurate = 0;
709
0
  preg->syntax = syntax;
710
0
  preg->not_bol = preg->not_eol = 0;
711
0
  preg->used = 0;
712
0
  preg->re_nsub = 0;
713
0
  preg->can_be_null = 0;
714
0
  preg->regs_allocated = REGS_UNALLOCATED;
715
716
  /* Initialize the dfa.  */
717
0
  dfa = preg->buffer;
718
0
  if (__glibc_unlikely (preg->allocated < sizeof (re_dfa_t)))
719
0
    {
720
      /* If zero allocated, but buffer is non-null, try to realloc
721
   enough space.  This loses if buffer's address is bogus, but
722
   that is the user's responsibility.  If ->buffer is NULL this
723
   is a simple allocation.  */
724
0
      dfa = re_realloc (preg->buffer, re_dfa_t, 1);
725
0
      if (dfa == NULL)
726
0
  return REG_ESPACE;
727
0
      preg->allocated = sizeof (re_dfa_t);
728
0
      preg->buffer = dfa;
729
0
    }
730
0
  preg->used = sizeof (re_dfa_t);
731
732
0
  err = init_dfa (dfa, length);
733
0
  if (__glibc_unlikely (err == REG_NOERROR && lock_init (dfa->lock) != 0))
734
0
    err = REG_ESPACE;
735
0
  if (__glibc_unlikely (err != REG_NOERROR))
736
0
    {
737
0
      free_dfa_content (dfa);
738
0
      preg->buffer = NULL;
739
0
      preg->allocated = 0;
740
0
      return err;
741
0
    }
742
#ifdef DEBUG
743
  /* Note: length+1 will not overflow since it is checked in init_dfa.  */
744
  dfa->re_str = re_malloc (char, length + 1);
745
  strncpy (dfa->re_str, pattern, length + 1);
746
#endif
747
748
0
  err = re_string_construct (&regexp, pattern, length, preg->translate,
749
0
           (syntax & RE_ICASE) != 0, dfa);
750
0
  if (__glibc_unlikely (err != REG_NOERROR))
751
0
    {
752
0
    re_compile_internal_free_return:
753
0
      free_workarea_compile (preg);
754
0
      re_string_destruct (&regexp);
755
0
      lock_fini (dfa->lock);
756
0
      free_dfa_content (dfa);
757
0
      preg->buffer = NULL;
758
0
      preg->allocated = 0;
759
0
      return err;
760
0
    }
761
762
  /* Parse the regular expression, and build a structure tree.  */
763
0
  preg->re_nsub = 0;
764
0
  dfa->str_tree = parse (&regexp, preg, syntax, &err);
765
0
  if (__glibc_unlikely (dfa->str_tree == NULL))
766
0
    goto re_compile_internal_free_return;
767
768
  /* Analyze the tree and create the nfa.  */
769
0
  err = analyze (preg);
770
0
  if (__glibc_unlikely (err != REG_NOERROR))
771
0
    goto re_compile_internal_free_return;
772
773
  /* If possible, do searching in single byte encoding to speed things up.  */
774
0
  if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
775
0
    optimize_utf8 (dfa);
776
777
  /* Then create the initial state of the dfa.  */
778
0
  err = create_initial_state (dfa);
779
780
  /* Release work areas.  */
781
0
  free_workarea_compile (preg);
782
0
  re_string_destruct (&regexp);
783
784
0
  if (__glibc_unlikely (err != REG_NOERROR))
785
0
    {
786
0
      lock_fini (dfa->lock);
787
0
      free_dfa_content (dfa);
788
0
      preg->buffer = NULL;
789
0
      preg->allocated = 0;
790
0
    }
791
792
0
  return err;
793
0
}
794
795
/* Initialize DFA.  We use the length of the regular expression PAT_LEN
796
   as the initial length of some arrays.  */
797
798
static reg_errcode_t
799
init_dfa (re_dfa_t *dfa, size_t pat_len)
800
0
{
801
0
  __re_size_t table_size;
802
0
#ifndef _LIBC
803
0
  const char *codeset_name;
804
0
#endif
805
0
  size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
806
0
  size_t max_object_size =
807
0
    MAX (sizeof (struct re_state_table_entry),
808
0
   MAX (sizeof (re_token_t),
809
0
        MAX (sizeof (re_node_set),
810
0
       MAX (sizeof (regmatch_t),
811
0
      max_i18n_object_size))));
812
813
0
  memset (dfa, '\0', sizeof (re_dfa_t));
814
815
  /* Force allocation of str_tree_storage the first time.  */
816
0
  dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
817
818
  /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
819
     calculation below, and for similar doubling calculations
820
     elsewhere.  And it's <= rather than <, because some of the
821
     doubling calculations add 1 afterwards.  */
822
0
  if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2
823
0
      <= pat_len))
824
0
    return REG_ESPACE;
825
826
0
  dfa->nodes_alloc = pat_len + 1;
827
0
  dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
828
829
  /*  table_size = 2 ^ ceil(log pat_len) */
830
0
  for (table_size = 1; ; table_size <<= 1)
831
0
    if (table_size > pat_len)
832
0
      break;
833
834
0
  dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
835
0
  dfa->state_hash_mask = table_size - 1;
836
837
0
  dfa->mb_cur_max = MB_CUR_MAX;
838
#ifdef _LIBC
839
  if (dfa->mb_cur_max == 6
840
      && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
841
    dfa->is_utf8 = 1;
842
  dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
843
           != 0);
844
#else
845
0
  codeset_name = nl_langinfo (CODESET);
846
0
  if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
847
0
      && (codeset_name[1] == 'T' || codeset_name[1] == 't')
848
0
      && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
849
0
      && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
850
0
    dfa->is_utf8 = 1;
851
852
  /* We check exhaustively in the loop below if this charset is a
853
     superset of ASCII.  */
854
0
  dfa->map_notascii = 0;
855
0
#endif
856
857
0
  if (dfa->mb_cur_max > 1)
858
0
    {
859
0
      if (dfa->is_utf8)
860
0
  dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
861
0
      else
862
0
  {
863
0
    int i, j, ch;
864
865
0
    dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
866
0
    if (__glibc_unlikely (dfa->sb_char == NULL))
867
0
      return REG_ESPACE;
868
869
    /* Set the bits corresponding to single byte chars.  */
870
0
    for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
871
0
      for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
872
0
        {
873
0
    wint_t wch = __btowc (ch);
874
0
    if (wch != WEOF)
875
0
      dfa->sb_char[i] |= (bitset_word_t) 1 << j;
876
0
#ifndef _LIBC
877
0
    if (isascii (ch) && wch != ch)
878
0
      dfa->map_notascii = 1;
879
0
#endif
880
0
        }
881
0
  }
882
0
    }
883
884
0
  if (__glibc_unlikely (dfa->nodes == NULL || dfa->state_table == NULL))
885
0
    return REG_ESPACE;
886
0
  return REG_NOERROR;
887
0
}
888
889
/* Initialize WORD_CHAR table, which indicate which character is
890
   "word".  In this case "word" means that it is the word construction
891
   character used by some operators like "\<", "\>", etc.  */
892
893
static void
894
init_word_char (re_dfa_t *dfa)
895
0
{
896
0
  int i = 0;
897
0
  int j;
898
0
  int ch = 0;
899
0
  dfa->word_ops_used = 1;
900
0
  if (__glibc_likely (dfa->map_notascii == 0))
901
0
    {
902
0
      bitset_word_t bits0 = 0x00000000;
903
0
      bitset_word_t bits1 = 0x03ff0000;
904
0
      bitset_word_t bits2 = 0x87fffffe;
905
0
      bitset_word_t bits3 = 0x07fffffe;
906
0
      if (BITSET_WORD_BITS == 64)
907
0
  {
908
    /* Pacify gcc -Woverflow on 32-bit platformns.  */
909
0
    dfa->word_char[0] = bits1 << 31 << 1 | bits0;
910
0
    dfa->word_char[1] = bits3 << 31 << 1 | bits2;
911
0
    i = 2;
912
0
  }
913
0
      else if (BITSET_WORD_BITS == 32)
914
0
  {
915
0
    dfa->word_char[0] = bits0;
916
0
    dfa->word_char[1] = bits1;
917
0
    dfa->word_char[2] = bits2;
918
0
    dfa->word_char[3] = bits3;
919
0
    i = 4;
920
0
  }
921
0
      else
922
0
        goto general_case;
923
0
      ch = 128;
924
925
0
      if (__glibc_likely (dfa->is_utf8))
926
0
  {
927
0
    memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
928
0
    return;
929
0
  }
930
0
    }
931
932
0
 general_case:
933
0
  for (; i < BITSET_WORDS; ++i)
934
0
    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
935
0
      if (isalnum (ch) || ch == '_')
936
0
  dfa->word_char[i] |= (bitset_word_t) 1 << j;
937
0
}
938
939
/* Free the work area which are only used while compiling.  */
940
941
static void
942
free_workarea_compile (regex_t *preg)
943
0
{
944
0
  re_dfa_t *dfa = preg->buffer;
945
0
  bin_tree_storage_t *storage, *next;
946
0
  for (storage = dfa->str_tree_storage; storage; storage = next)
947
0
    {
948
0
      next = storage->next;
949
0
      re_free (storage);
950
0
    }
951
0
  dfa->str_tree_storage = NULL;
952
0
  dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
953
0
  dfa->str_tree = NULL;
954
0
  re_free (dfa->org_indices);
955
0
  dfa->org_indices = NULL;
956
0
}
957
958
/* Create initial states for all contexts.  */
959
960
static reg_errcode_t
961
create_initial_state (re_dfa_t *dfa)
962
0
{
963
0
  Idx first, i;
964
0
  reg_errcode_t err;
965
0
  re_node_set init_nodes;
966
967
  /* Initial states have the epsilon closure of the node which is
968
     the first node of the regular expression.  */
969
0
  first = dfa->str_tree->first->node_idx;
970
0
  dfa->init_node = first;
971
0
  err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
972
0
  if (__glibc_unlikely (err != REG_NOERROR))
973
0
    return err;
974
975
  /* The back-references which are in initial states can epsilon transit,
976
     since in this case all of the subexpressions can be null.
977
     Then we add epsilon closures of the nodes which are the next nodes of
978
     the back-references.  */
979
0
  if (dfa->nbackref > 0)
980
0
    for (i = 0; i < init_nodes.nelem; ++i)
981
0
      {
982
0
  Idx node_idx = init_nodes.elems[i];
983
0
  re_token_type_t type = dfa->nodes[node_idx].type;
984
985
0
  Idx clexp_idx;
986
0
  if (type != OP_BACK_REF)
987
0
    continue;
988
0
  for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
989
0
    {
990
0
      re_token_t *clexp_node;
991
0
      clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
992
0
      if (clexp_node->type == OP_CLOSE_SUBEXP
993
0
    && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
994
0
        break;
995
0
    }
996
0
  if (clexp_idx == init_nodes.nelem)
997
0
    continue;
998
999
0
  if (type == OP_BACK_REF)
1000
0
    {
1001
0
      Idx dest_idx = dfa->edests[node_idx].elems[0];
1002
0
      if (!re_node_set_contains (&init_nodes, dest_idx))
1003
0
        {
1004
0
    reg_errcode_t merge_err
1005
0
                  = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1006
0
    if (merge_err != REG_NOERROR)
1007
0
      return merge_err;
1008
0
    i = 0;
1009
0
        }
1010
0
    }
1011
0
      }
1012
1013
  /* It must be the first time to invoke acquire_state.  */
1014
0
  dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1015
  /* We don't check ERR here, since the initial state must not be NULL.  */
1016
0
  if (__glibc_unlikely (dfa->init_state == NULL))
1017
0
    return err;
1018
0
  if (dfa->init_state->has_constraint)
1019
0
    {
1020
0
      dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1021
0
                   CONTEXT_WORD);
1022
0
      dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1023
0
                 CONTEXT_NEWLINE);
1024
0
      dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1025
0
               &init_nodes,
1026
0
               CONTEXT_NEWLINE
1027
0
               | CONTEXT_BEGBUF);
1028
0
      if (__glibc_unlikely (dfa->init_state_word == NULL
1029
0
          || dfa->init_state_nl == NULL
1030
0
          || dfa->init_state_begbuf == NULL))
1031
0
  return err;
1032
0
    }
1033
0
  else
1034
0
    dfa->init_state_word = dfa->init_state_nl
1035
0
      = dfa->init_state_begbuf = dfa->init_state;
1036
1037
0
  re_node_set_free (&init_nodes);
1038
0
  return REG_NOERROR;
1039
0
}
1040

1041
/* If it is possible to do searching in single byte encoding instead of UTF-8
1042
   to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1043
   DFA nodes where needed.  */
1044
1045
static void
1046
optimize_utf8 (re_dfa_t *dfa)
1047
0
{
1048
0
  Idx node;
1049
0
  int i;
1050
0
  bool mb_chars = false;
1051
0
  bool has_period = false;
1052
1053
0
  for (node = 0; node < dfa->nodes_len; ++node)
1054
0
    switch (dfa->nodes[node].type)
1055
0
      {
1056
0
      case CHARACTER:
1057
0
  if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1058
0
    mb_chars = true;
1059
0
  break;
1060
0
      case ANCHOR:
1061
0
  switch (dfa->nodes[node].opr.ctx_type)
1062
0
    {
1063
0
    case LINE_FIRST:
1064
0
    case LINE_LAST:
1065
0
    case BUF_FIRST:
1066
0
    case BUF_LAST:
1067
0
      break;
1068
0
    default:
1069
      /* Word anchors etc. cannot be handled.  It's okay to test
1070
         opr.ctx_type since constraints (for all DFA nodes) are
1071
         created by ORing one or more opr.ctx_type values.  */
1072
0
      return;
1073
0
    }
1074
0
  break;
1075
0
      case OP_PERIOD:
1076
0
  has_period = true;
1077
0
  break;
1078
0
      case OP_BACK_REF:
1079
0
      case OP_ALT:
1080
0
      case END_OF_RE:
1081
0
      case OP_DUP_ASTERISK:
1082
0
      case OP_OPEN_SUBEXP:
1083
0
      case OP_CLOSE_SUBEXP:
1084
0
  break;
1085
0
      case COMPLEX_BRACKET:
1086
0
  return;
1087
0
      case SIMPLE_BRACKET:
1088
  /* Just double check.  */
1089
0
  {
1090
0
    int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1091
0
      ? 0
1092
0
      : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1093
0
    for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1094
0
      {
1095
0
        if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1096
0
    return;
1097
0
        rshift = 0;
1098
0
      }
1099
0
  }
1100
0
  break;
1101
0
      default:
1102
0
  abort ();
1103
0
      }
1104
1105
0
  if (mb_chars || has_period)
1106
0
    for (node = 0; node < dfa->nodes_len; ++node)
1107
0
      {
1108
0
  if (dfa->nodes[node].type == CHARACTER
1109
0
      && dfa->nodes[node].opr.c >= ASCII_CHARS)
1110
0
    dfa->nodes[node].mb_partial = 0;
1111
0
  else if (dfa->nodes[node].type == OP_PERIOD)
1112
0
    dfa->nodes[node].type = OP_UTF8_PERIOD;
1113
0
      }
1114
1115
  /* The search can be in single byte locale.  */
1116
0
  dfa->mb_cur_max = 1;
1117
0
  dfa->is_utf8 = 0;
1118
0
  dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1119
0
}
1120

1121
/* Analyze the structure tree, and calculate "first", "next", "edest",
1122
   "eclosure", and "inveclosure".  */
1123
1124
static reg_errcode_t
1125
analyze (regex_t *preg)
1126
0
{
1127
0
  re_dfa_t *dfa = preg->buffer;
1128
0
  reg_errcode_t ret;
1129
1130
  /* Allocate arrays.  */
1131
0
  dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1132
0
  dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1133
0
  dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1134
0
  dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1135
0
  if (__glibc_unlikely (dfa->nexts == NULL || dfa->org_indices == NULL
1136
0
      || dfa->edests == NULL || dfa->eclosures == NULL))
1137
0
    return REG_ESPACE;
1138
1139
0
  dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1140
0
  if (dfa->subexp_map != NULL)
1141
0
    {
1142
0
      Idx i;
1143
0
      for (i = 0; i < preg->re_nsub; i++)
1144
0
  dfa->subexp_map[i] = i;
1145
0
      preorder (dfa->str_tree, optimize_subexps, dfa);
1146
0
      for (i = 0; i < preg->re_nsub; i++)
1147
0
  if (dfa->subexp_map[i] != i)
1148
0
    break;
1149
0
      if (i == preg->re_nsub)
1150
0
  {
1151
0
    re_free (dfa->subexp_map);
1152
0
    dfa->subexp_map = NULL;
1153
0
  }
1154
0
    }
1155
1156
0
  ret = postorder (dfa->str_tree, lower_subexps, preg);
1157
0
  if (__glibc_unlikely (ret != REG_NOERROR))
1158
0
    return ret;
1159
0
  ret = postorder (dfa->str_tree, calc_first, dfa);
1160
0
  if (__glibc_unlikely (ret != REG_NOERROR))
1161
0
    return ret;
1162
0
  preorder (dfa->str_tree, calc_next, dfa);
1163
0
  ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1164
0
  if (__glibc_unlikely (ret != REG_NOERROR))
1165
0
    return ret;
1166
0
  ret = calc_eclosure (dfa);
1167
0
  if (__glibc_unlikely (ret != REG_NOERROR))
1168
0
    return ret;
1169
1170
  /* We only need this during the prune_impossible_nodes pass in regexec.c;
1171
     skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1172
0
  if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1173
0
      || dfa->nbackref)
1174
0
    {
1175
0
      dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1176
0
      if (__glibc_unlikely (dfa->inveclosures == NULL))
1177
0
  return REG_ESPACE;
1178
0
      ret = calc_inveclosure (dfa);
1179
0
    }
1180
1181
0
  return ret;
1182
0
}
1183
1184
/* Our parse trees are very unbalanced, so we cannot use a stack to
1185
   implement parse tree visits.  Instead, we use parent pointers and
1186
   some hairy code in these two functions.  */
1187
static reg_errcode_t
1188
postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1189
     void *extra)
1190
0
{
1191
0
  bin_tree_t *node, *prev;
1192
1193
0
  for (node = root; ; )
1194
0
    {
1195
      /* Descend down the tree, preferably to the left (or to the right
1196
   if that's the only child).  */
1197
0
      while (node->left || node->right)
1198
0
  if (node->left)
1199
0
    node = node->left;
1200
0
  else
1201
0
    node = node->right;
1202
1203
0
      do
1204
0
  {
1205
0
    reg_errcode_t err = fn (extra, node);
1206
0
    if (__glibc_unlikely (err != REG_NOERROR))
1207
0
      return err;
1208
0
    if (node->parent == NULL)
1209
0
      return REG_NOERROR;
1210
0
    prev = node;
1211
0
    node = node->parent;
1212
0
  }
1213
      /* Go up while we have a node that is reached from the right.  */
1214
0
      while (node->right == prev || node->right == NULL);
1215
0
      node = node->right;
1216
0
    }
1217
0
}
1218
1219
static reg_errcode_t
1220
preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1221
    void *extra)
1222
0
{
1223
0
  bin_tree_t *node;
1224
1225
0
  for (node = root; ; )
1226
0
    {
1227
0
      reg_errcode_t err = fn (extra, node);
1228
0
      if (__glibc_unlikely (err != REG_NOERROR))
1229
0
  return err;
1230
1231
      /* Go to the left node, or up and to the right.  */
1232
0
      if (node->left)
1233
0
  node = node->left;
1234
0
      else
1235
0
  {
1236
0
    bin_tree_t *prev = NULL;
1237
0
    while (node->right == prev || node->right == NULL)
1238
0
      {
1239
0
        prev = node;
1240
0
        node = node->parent;
1241
0
        if (!node)
1242
0
    return REG_NOERROR;
1243
0
      }
1244
0
    node = node->right;
1245
0
  }
1246
0
    }
1247
0
}
1248
1249
/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1250
   re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1251
   backreferences as well.  Requires a preorder visit.  */
1252
static reg_errcode_t
1253
optimize_subexps (void *extra, bin_tree_t *node)
1254
0
{
1255
0
  re_dfa_t *dfa = (re_dfa_t *) extra;
1256
1257
0
  if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1258
0
    {
1259
0
      int idx = node->token.opr.idx;
1260
0
      node->token.opr.idx = dfa->subexp_map[idx];
1261
0
      dfa->used_bkref_map |= 1 << node->token.opr.idx;
1262
0
    }
1263
1264
0
  else if (node->token.type == SUBEXP
1265
0
     && node->left && node->left->token.type == SUBEXP)
1266
0
    {
1267
0
      Idx other_idx = node->left->token.opr.idx;
1268
1269
0
      node->left = node->left->left;
1270
0
      if (node->left)
1271
0
  node->left->parent = node;
1272
1273
0
      dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1274
0
      if (other_idx < BITSET_WORD_BITS)
1275
0
  dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1276
0
    }
1277
1278
0
  return REG_NOERROR;
1279
0
}
1280
1281
/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1282
   of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1283
static reg_errcode_t
1284
lower_subexps (void *extra, bin_tree_t *node)
1285
0
{
1286
0
  regex_t *preg = (regex_t *) extra;
1287
0
  reg_errcode_t err = REG_NOERROR;
1288
1289
0
  if (node->left && node->left->token.type == SUBEXP)
1290
0
    {
1291
0
      node->left = lower_subexp (&err, preg, node->left);
1292
0
      if (node->left)
1293
0
  node->left->parent = node;
1294
0
    }
1295
0
  if (node->right && node->right->token.type == SUBEXP)
1296
0
    {
1297
0
      node->right = lower_subexp (&err, preg, node->right);
1298
0
      if (node->right)
1299
0
  node->right->parent = node;
1300
0
    }
1301
1302
0
  return err;
1303
0
}
1304
1305
static bin_tree_t *
1306
lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1307
0
{
1308
0
  re_dfa_t *dfa = preg->buffer;
1309
0
  bin_tree_t *body = node->left;
1310
0
  bin_tree_t *op, *cls, *tree1, *tree;
1311
1312
0
  if (preg->no_sub
1313
      /* We do not optimize empty subexpressions, because otherwise we may
1314
   have bad CONCAT nodes with NULL children.  This is obviously not
1315
   very common, so we do not lose much.  An example that triggers
1316
   this case is the sed "script" /\(\)/x.  */
1317
0
      && node->left != NULL
1318
0
      && (node->token.opr.idx >= BITSET_WORD_BITS
1319
0
    || !(dfa->used_bkref_map
1320
0
         & ((bitset_word_t) 1 << node->token.opr.idx))))
1321
0
    return node->left;
1322
1323
  /* Convert the SUBEXP node to the concatenation of an
1324
     OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1325
0
  op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1326
0
  cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1327
0
  tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1328
0
  tree = create_tree (dfa, op, tree1, CONCAT);
1329
0
  if (__glibc_unlikely (tree == NULL || tree1 == NULL
1330
0
      || op == NULL || cls == NULL))
1331
0
    {
1332
0
      *err = REG_ESPACE;
1333
0
      return NULL;
1334
0
    }
1335
1336
0
  op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1337
0
  op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1338
0
  return tree;
1339
0
}
1340
1341
/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1342
   nodes.  Requires a postorder visit.  */
1343
static reg_errcode_t
1344
calc_first (void *extra, bin_tree_t *node)
1345
0
{
1346
0
  re_dfa_t *dfa = (re_dfa_t *) extra;
1347
0
  if (node->token.type == CONCAT)
1348
0
    {
1349
0
      node->first = node->left->first;
1350
0
      node->node_idx = node->left->node_idx;
1351
0
    }
1352
0
  else
1353
0
    {
1354
0
      node->first = node;
1355
0
      node->node_idx = re_dfa_add_node (dfa, node->token);
1356
0
      if (__glibc_unlikely (node->node_idx == -1))
1357
0
  return REG_ESPACE;
1358
0
      if (node->token.type == ANCHOR)
1359
0
  dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1360
0
    }
1361
0
  return REG_NOERROR;
1362
0
}
1363
1364
/* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1365
static reg_errcode_t
1366
calc_next (void *extra, bin_tree_t *node)
1367
0
{
1368
0
  switch (node->token.type)
1369
0
    {
1370
0
    case OP_DUP_ASTERISK:
1371
0
      node->left->next = node;
1372
0
      break;
1373
0
    case CONCAT:
1374
0
      node->left->next = node->right->first;
1375
0
      node->right->next = node->next;
1376
0
      break;
1377
0
    default:
1378
0
      if (node->left)
1379
0
  node->left->next = node->next;
1380
0
      if (node->right)
1381
0
  node->right->next = node->next;
1382
0
      break;
1383
0
    }
1384
0
  return REG_NOERROR;
1385
0
}
1386
1387
/* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1388
static reg_errcode_t
1389
link_nfa_nodes (void *extra, bin_tree_t *node)
1390
0
{
1391
0
  re_dfa_t *dfa = (re_dfa_t *) extra;
1392
0
  Idx idx = node->node_idx;
1393
0
  reg_errcode_t err = REG_NOERROR;
1394
1395
0
  switch (node->token.type)
1396
0
    {
1397
0
    case CONCAT:
1398
0
      break;
1399
1400
0
    case END_OF_RE:
1401
0
      DEBUG_ASSERT (node->next == NULL);
1402
0
      break;
1403
1404
0
    case OP_DUP_ASTERISK:
1405
0
    case OP_ALT:
1406
0
      {
1407
0
  Idx left, right;
1408
0
  dfa->has_plural_match = 1;
1409
0
  if (node->left != NULL)
1410
0
    left = node->left->first->node_idx;
1411
0
  else
1412
0
    left = node->next->node_idx;
1413
0
  if (node->right != NULL)
1414
0
    right = node->right->first->node_idx;
1415
0
  else
1416
0
    right = node->next->node_idx;
1417
0
  DEBUG_ASSERT (left > -1);
1418
0
  DEBUG_ASSERT (right > -1);
1419
0
  err = re_node_set_init_2 (dfa->edests + idx, left, right);
1420
0
      }
1421
0
      break;
1422
1423
0
    case ANCHOR:
1424
0
    case OP_OPEN_SUBEXP:
1425
0
    case OP_CLOSE_SUBEXP:
1426
0
      err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1427
0
      break;
1428
1429
0
    case OP_BACK_REF:
1430
0
      dfa->nexts[idx] = node->next->node_idx;
1431
0
      if (node->token.type == OP_BACK_REF)
1432
0
  err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1433
0
      break;
1434
1435
0
    default:
1436
0
      DEBUG_ASSERT (!IS_EPSILON_NODE (node->token.type));
1437
0
      dfa->nexts[idx] = node->next->node_idx;
1438
0
      break;
1439
0
    }
1440
1441
0
  return err;
1442
0
}
1443
1444
/* Duplicate the epsilon closure of the node ROOT_NODE.
1445
   Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1446
   to their own constraint.  */
1447
1448
static reg_errcode_t
1449
duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1450
      Idx root_node, unsigned int init_constraint)
1451
0
{
1452
0
  Idx org_node, clone_node;
1453
0
  bool ok;
1454
0
  unsigned int constraint = init_constraint;
1455
0
  for (org_node = top_org_node, clone_node = top_clone_node;;)
1456
0
    {
1457
0
      Idx org_dest, clone_dest;
1458
0
      if (dfa->nodes[org_node].type == OP_BACK_REF)
1459
0
  {
1460
    /* If the back reference epsilon-transit, its destination must
1461
       also have the constraint.  Then duplicate the epsilon closure
1462
       of the destination of the back reference, and store it in
1463
       edests of the back reference.  */
1464
0
    org_dest = dfa->nexts[org_node];
1465
0
    re_node_set_empty (dfa->edests + clone_node);
1466
0
    clone_dest = duplicate_node (dfa, org_dest, constraint);
1467
0
    if (__glibc_unlikely (clone_dest == -1))
1468
0
      return REG_ESPACE;
1469
0
    dfa->nexts[clone_node] = dfa->nexts[org_node];
1470
0
    ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1471
0
    if (__glibc_unlikely (! ok))
1472
0
      return REG_ESPACE;
1473
0
  }
1474
0
      else if (dfa->edests[org_node].nelem == 0)
1475
0
  {
1476
    /* In case of the node can't epsilon-transit, don't duplicate the
1477
       destination and store the original destination as the
1478
       destination of the node.  */
1479
0
    dfa->nexts[clone_node] = dfa->nexts[org_node];
1480
0
    break;
1481
0
  }
1482
0
      else if (dfa->edests[org_node].nelem == 1)
1483
0
  {
1484
    /* In case of the node can epsilon-transit, and it has only one
1485
       destination.  */
1486
0
    org_dest = dfa->edests[org_node].elems[0];
1487
0
    re_node_set_empty (dfa->edests + clone_node);
1488
    /* If the node is root_node itself, it means the epsilon closure
1489
       has a loop.  Then tie it to the destination of the root_node.  */
1490
0
    if (org_node == root_node && clone_node != org_node)
1491
0
      {
1492
0
        ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1493
0
        if (__glibc_unlikely (! ok))
1494
0
          return REG_ESPACE;
1495
0
        break;
1496
0
      }
1497
    /* In case the node has another constraint, append it.  */
1498
0
    constraint |= dfa->nodes[org_node].constraint;
1499
0
    clone_dest = duplicate_node (dfa, org_dest, constraint);
1500
0
    if (__glibc_unlikely (clone_dest == -1))
1501
0
      return REG_ESPACE;
1502
0
    ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1503
0
    if (__glibc_unlikely (! ok))
1504
0
      return REG_ESPACE;
1505
0
  }
1506
0
      else /* dfa->edests[org_node].nelem == 2 */
1507
0
  {
1508
    /* In case of the node can epsilon-transit, and it has two
1509
       destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1510
0
    org_dest = dfa->edests[org_node].elems[0];
1511
0
    re_node_set_empty (dfa->edests + clone_node);
1512
    /* Search for a duplicated node which satisfies the constraint.  */
1513
0
    clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1514
0
    if (clone_dest == -1)
1515
0
      {
1516
        /* There is no such duplicated node, create a new one.  */
1517
0
        reg_errcode_t err;
1518
0
        clone_dest = duplicate_node (dfa, org_dest, constraint);
1519
0
        if (__glibc_unlikely (clone_dest == -1))
1520
0
    return REG_ESPACE;
1521
0
        ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1522
0
        if (__glibc_unlikely (! ok))
1523
0
    return REG_ESPACE;
1524
0
        err = duplicate_node_closure (dfa, org_dest, clone_dest,
1525
0
              root_node, constraint);
1526
0
        if (__glibc_unlikely (err != REG_NOERROR))
1527
0
    return err;
1528
0
      }
1529
0
    else
1530
0
      {
1531
        /* There is a duplicated node which satisfies the constraint,
1532
     use it to avoid infinite loop.  */
1533
0
        ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1534
0
        if (__glibc_unlikely (! ok))
1535
0
    return REG_ESPACE;
1536
0
      }
1537
1538
0
    org_dest = dfa->edests[org_node].elems[1];
1539
0
    clone_dest = duplicate_node (dfa, org_dest, constraint);
1540
0
    if (__glibc_unlikely (clone_dest == -1))
1541
0
      return REG_ESPACE;
1542
0
    ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1543
0
    if (__glibc_unlikely (! ok))
1544
0
      return REG_ESPACE;
1545
0
  }
1546
0
      org_node = org_dest;
1547
0
      clone_node = clone_dest;
1548
0
    }
1549
0
  return REG_NOERROR;
1550
0
}
1551
1552
/* Search for a node which is duplicated from the node ORG_NODE, and
1553
   satisfies the constraint CONSTRAINT.  */
1554
1555
static Idx
1556
search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1557
      unsigned int constraint)
1558
0
{
1559
0
  Idx idx;
1560
0
  for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1561
0
    {
1562
0
      if (org_node == dfa->org_indices[idx]
1563
0
    && constraint == dfa->nodes[idx].constraint)
1564
0
  return idx; /* Found.  */
1565
0
    }
1566
0
  return -1; /* Not found.  */
1567
0
}
1568
1569
/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1570
   Return the index of the new node, or -1 if insufficient storage is
1571
   available.  */
1572
1573
static Idx
1574
duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1575
0
{
1576
0
  Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1577
0
  if (__glibc_likely (dup_idx != -1))
1578
0
    {
1579
0
      dfa->nodes[dup_idx].constraint = constraint;
1580
0
      dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1581
0
      dfa->nodes[dup_idx].duplicated = 1;
1582
1583
      /* Store the index of the original node.  */
1584
0
      dfa->org_indices[dup_idx] = org_idx;
1585
0
    }
1586
0
  return dup_idx;
1587
0
}
1588
1589
static reg_errcode_t
1590
calc_inveclosure (re_dfa_t *dfa)
1591
0
{
1592
0
  Idx src, idx;
1593
0
  bool ok;
1594
0
  for (idx = 0; idx < dfa->nodes_len; ++idx)
1595
0
    re_node_set_init_empty (dfa->inveclosures + idx);
1596
1597
0
  for (src = 0; src < dfa->nodes_len; ++src)
1598
0
    {
1599
0
      Idx *elems = dfa->eclosures[src].elems;
1600
0
      for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1601
0
  {
1602
0
    ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1603
0
    if (__glibc_unlikely (! ok))
1604
0
      return REG_ESPACE;
1605
0
  }
1606
0
    }
1607
1608
0
  return REG_NOERROR;
1609
0
}
1610
1611
/* Calculate "eclosure" for all the node in DFA.  */
1612
1613
static reg_errcode_t
1614
calc_eclosure (re_dfa_t *dfa)
1615
0
{
1616
0
  Idx node_idx;
1617
0
  bool incomplete;
1618
0
  DEBUG_ASSERT (dfa->nodes_len > 0);
1619
0
  incomplete = false;
1620
  /* For each nodes, calculate epsilon closure.  */
1621
0
  for (node_idx = 0; ; ++node_idx)
1622
0
    {
1623
0
      reg_errcode_t err;
1624
0
      re_node_set eclosure_elem;
1625
0
      if (node_idx == dfa->nodes_len)
1626
0
  {
1627
0
    if (!incomplete)
1628
0
      break;
1629
0
    incomplete = false;
1630
0
    node_idx = 0;
1631
0
  }
1632
1633
0
      DEBUG_ASSERT (dfa->eclosures[node_idx].nelem != -1);
1634
1635
      /* If we have already calculated, skip it.  */
1636
0
      if (dfa->eclosures[node_idx].nelem != 0)
1637
0
  continue;
1638
      /* Calculate epsilon closure of 'node_idx'.  */
1639
0
      err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1640
0
      if (__glibc_unlikely (err != REG_NOERROR))
1641
0
  return err;
1642
1643
0
      if (dfa->eclosures[node_idx].nelem == 0)
1644
0
  {
1645
0
    incomplete = true;
1646
0
    re_node_set_free (&eclosure_elem);
1647
0
  }
1648
0
    }
1649
0
  return REG_NOERROR;
1650
0
}
1651
1652
/* Calculate epsilon closure of NODE.  */
1653
1654
static reg_errcode_t
1655
calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1656
0
{
1657
0
  reg_errcode_t err;
1658
0
  Idx i;
1659
0
  re_node_set eclosure;
1660
0
  bool incomplete = false;
1661
0
  err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1662
0
  if (__glibc_unlikely (err != REG_NOERROR))
1663
0
    return err;
1664
1665
  /* An epsilon closure includes itself.  */
1666
0
  eclosure.elems[eclosure.nelem++] = node;
1667
1668
  /* This indicates that we are calculating this node now.
1669
     We reference this value to avoid infinite loop.  */
1670
0
  dfa->eclosures[node].nelem = -1;
1671
1672
  /* If the current node has constraints, duplicate all nodes
1673
     since they must inherit the constraints.  */
1674
0
  if (dfa->nodes[node].constraint
1675
0
      && dfa->edests[node].nelem
1676
0
      && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1677
0
    {
1678
0
      err = duplicate_node_closure (dfa, node, node, node,
1679
0
            dfa->nodes[node].constraint);
1680
0
      if (__glibc_unlikely (err != REG_NOERROR))
1681
0
  return err;
1682
0
    }
1683
1684
  /* Expand each epsilon destination nodes.  */
1685
0
  if (IS_EPSILON_NODE(dfa->nodes[node].type))
1686
0
    for (i = 0; i < dfa->edests[node].nelem; ++i)
1687
0
      {
1688
0
  re_node_set eclosure_elem;
1689
0
  Idx edest = dfa->edests[node].elems[i];
1690
  /* If calculating the epsilon closure of 'edest' is in progress,
1691
     return intermediate result.  */
1692
0
  if (dfa->eclosures[edest].nelem == -1)
1693
0
    {
1694
0
      incomplete = true;
1695
0
      continue;
1696
0
    }
1697
  /* If we haven't calculated the epsilon closure of 'edest' yet,
1698
     calculate now. Otherwise use calculated epsilon closure.  */
1699
0
  if (dfa->eclosures[edest].nelem == 0)
1700
0
    {
1701
0
      err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1702
0
      if (__glibc_unlikely (err != REG_NOERROR))
1703
0
        return err;
1704
0
    }
1705
0
  else
1706
0
    eclosure_elem = dfa->eclosures[edest];
1707
  /* Merge the epsilon closure of 'edest'.  */
1708
0
  err = re_node_set_merge (&eclosure, &eclosure_elem);
1709
0
  if (__glibc_unlikely (err != REG_NOERROR))
1710
0
    return err;
1711
  /* If the epsilon closure of 'edest' is incomplete,
1712
     the epsilon closure of this node is also incomplete.  */
1713
0
  if (dfa->eclosures[edest].nelem == 0)
1714
0
    {
1715
0
      incomplete = true;
1716
0
      re_node_set_free (&eclosure_elem);
1717
0
    }
1718
0
      }
1719
1720
0
  if (incomplete && !root)
1721
0
    dfa->eclosures[node].nelem = 0;
1722
0
  else
1723
0
    dfa->eclosures[node] = eclosure;
1724
0
  *new_set = eclosure;
1725
0
  return REG_NOERROR;
1726
0
}
1727

1728
/* Functions for token which are used in the parser.  */
1729
1730
/* Fetch a token from INPUT.
1731
   We must not use this function inside bracket expressions.  */
1732
1733
static void
1734
fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1735
0
{
1736
0
  re_string_skip_bytes (input, peek_token (result, input, syntax));
1737
0
}
1738
1739
/* Peek a token from INPUT, and return the length of the token.
1740
   We must not use this function inside bracket expressions.  */
1741
1742
static int
1743
peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1744
0
{
1745
0
  unsigned char c;
1746
1747
0
  if (re_string_eoi (input))
1748
0
    {
1749
0
      token->type = END_OF_RE;
1750
0
      return 0;
1751
0
    }
1752
1753
0
  c = re_string_peek_byte (input, 0);
1754
0
  token->opr.c = c;
1755
1756
0
  token->word_char = 0;
1757
0
  token->mb_partial = 0;
1758
0
  if (input->mb_cur_max > 1
1759
0
      && !re_string_first_byte (input, re_string_cur_idx (input)))
1760
0
    {
1761
0
      token->type = CHARACTER;
1762
0
      token->mb_partial = 1;
1763
0
      return 1;
1764
0
    }
1765
0
  if (c == '\\')
1766
0
    {
1767
0
      unsigned char c2;
1768
0
      if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1769
0
  {
1770
0
    token->type = BACK_SLASH;
1771
0
    return 1;
1772
0
  }
1773
1774
0
      c2 = re_string_peek_byte_case (input, 1);
1775
0
      token->opr.c = c2;
1776
0
      token->type = CHARACTER;
1777
0
      if (input->mb_cur_max > 1)
1778
0
  {
1779
0
    wint_t wc = re_string_wchar_at (input,
1780
0
            re_string_cur_idx (input) + 1);
1781
0
    token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1782
0
  }
1783
0
      else
1784
0
  token->word_char = IS_WORD_CHAR (c2) != 0;
1785
1786
0
      switch (c2)
1787
0
  {
1788
0
  case '|':
1789
0
    if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1790
0
      token->type = OP_ALT;
1791
0
    break;
1792
0
  case '1': case '2': case '3': case '4': case '5':
1793
0
  case '6': case '7': case '8': case '9':
1794
0
    if (!(syntax & RE_NO_BK_REFS))
1795
0
      {
1796
0
        token->type = OP_BACK_REF;
1797
0
        token->opr.idx = c2 - '1';
1798
0
      }
1799
0
    break;
1800
0
  case '<':
1801
0
    if (!(syntax & RE_NO_GNU_OPS))
1802
0
      {
1803
0
        token->type = ANCHOR;
1804
0
        token->opr.ctx_type = WORD_FIRST;
1805
0
      }
1806
0
    break;
1807
0
  case '>':
1808
0
    if (!(syntax & RE_NO_GNU_OPS))
1809
0
      {
1810
0
        token->type = ANCHOR;
1811
0
        token->opr.ctx_type = WORD_LAST;
1812
0
      }
1813
0
    break;
1814
0
  case 'b':
1815
0
    if (!(syntax & RE_NO_GNU_OPS))
1816
0
      {
1817
0
        token->type = ANCHOR;
1818
0
        token->opr.ctx_type = WORD_DELIM;
1819
0
      }
1820
0
    break;
1821
0
  case 'B':
1822
0
    if (!(syntax & RE_NO_GNU_OPS))
1823
0
      {
1824
0
        token->type = ANCHOR;
1825
0
        token->opr.ctx_type = NOT_WORD_DELIM;
1826
0
      }
1827
0
    break;
1828
0
  case 'w':
1829
0
    if (!(syntax & RE_NO_GNU_OPS))
1830
0
      token->type = OP_WORD;
1831
0
    break;
1832
0
  case 'W':
1833
0
    if (!(syntax & RE_NO_GNU_OPS))
1834
0
      token->type = OP_NOTWORD;
1835
0
    break;
1836
0
  case 's':
1837
0
    if (!(syntax & RE_NO_GNU_OPS))
1838
0
      token->type = OP_SPACE;
1839
0
    break;
1840
0
  case 'S':
1841
0
    if (!(syntax & RE_NO_GNU_OPS))
1842
0
      token->type = OP_NOTSPACE;
1843
0
    break;
1844
0
  case '`':
1845
0
    if (!(syntax & RE_NO_GNU_OPS))
1846
0
      {
1847
0
        token->type = ANCHOR;
1848
0
        token->opr.ctx_type = BUF_FIRST;
1849
0
      }
1850
0
    break;
1851
0
  case '\'':
1852
0
    if (!(syntax & RE_NO_GNU_OPS))
1853
0
      {
1854
0
        token->type = ANCHOR;
1855
0
        token->opr.ctx_type = BUF_LAST;
1856
0
      }
1857
0
    break;
1858
0
  case '(':
1859
0
    if (!(syntax & RE_NO_BK_PARENS))
1860
0
      token->type = OP_OPEN_SUBEXP;
1861
0
    break;
1862
0
  case ')':
1863
0
    if (!(syntax & RE_NO_BK_PARENS))
1864
0
      token->type = OP_CLOSE_SUBEXP;
1865
0
    break;
1866
0
  case '+':
1867
0
    if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1868
0
      token->type = OP_DUP_PLUS;
1869
0
    break;
1870
0
  case '?':
1871
0
    if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1872
0
      token->type = OP_DUP_QUESTION;
1873
0
    break;
1874
0
  case '{':
1875
0
    if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1876
0
      token->type = OP_OPEN_DUP_NUM;
1877
0
    break;
1878
0
  case '}':
1879
0
    if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1880
0
      token->type = OP_CLOSE_DUP_NUM;
1881
0
    break;
1882
0
  default:
1883
0
    break;
1884
0
  }
1885
0
      return 2;
1886
0
    }
1887
1888
0
  token->type = CHARACTER;
1889
0
  if (input->mb_cur_max > 1)
1890
0
    {
1891
0
      wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1892
0
      token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1893
0
    }
1894
0
  else
1895
0
    token->word_char = IS_WORD_CHAR (token->opr.c);
1896
1897
0
  switch (c)
1898
0
    {
1899
0
    case '\n':
1900
0
      if (syntax & RE_NEWLINE_ALT)
1901
0
  token->type = OP_ALT;
1902
0
      break;
1903
0
    case '|':
1904
0
      if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1905
0
  token->type = OP_ALT;
1906
0
      break;
1907
0
    case '*':
1908
0
      token->type = OP_DUP_ASTERISK;
1909
0
      break;
1910
0
    case '+':
1911
0
      if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1912
0
  token->type = OP_DUP_PLUS;
1913
0
      break;
1914
0
    case '?':
1915
0
      if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1916
0
  token->type = OP_DUP_QUESTION;
1917
0
      break;
1918
0
    case '{':
1919
0
      if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1920
0
  token->type = OP_OPEN_DUP_NUM;
1921
0
      break;
1922
0
    case '}':
1923
0
      if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1924
0
  token->type = OP_CLOSE_DUP_NUM;
1925
0
      break;
1926
0
    case '(':
1927
0
      if (syntax & RE_NO_BK_PARENS)
1928
0
  token->type = OP_OPEN_SUBEXP;
1929
0
      break;
1930
0
    case ')':
1931
0
      if (syntax & RE_NO_BK_PARENS)
1932
0
  token->type = OP_CLOSE_SUBEXP;
1933
0
      break;
1934
0
    case '[':
1935
0
      token->type = OP_OPEN_BRACKET;
1936
0
      break;
1937
0
    case '.':
1938
0
      token->type = OP_PERIOD;
1939
0
      break;
1940
0
    case '^':
1941
0
      if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE))
1942
0
    && re_string_cur_idx (input) != 0)
1943
0
  {
1944
0
    char prev = re_string_peek_byte (input, -1);
1945
0
    if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1946
0
      break;
1947
0
  }
1948
0
      token->type = ANCHOR;
1949
0
      token->opr.ctx_type = LINE_FIRST;
1950
0
      break;
1951
0
    case '$':
1952
0
      if (!(syntax & RE_CONTEXT_INDEP_ANCHORS)
1953
0
    && re_string_cur_idx (input) + 1 != re_string_length (input))
1954
0
  {
1955
0
    re_token_t next;
1956
0
    re_string_skip_bytes (input, 1);
1957
0
    peek_token (&next, input, syntax);
1958
0
    re_string_skip_bytes (input, -1);
1959
0
    if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1960
0
      break;
1961
0
  }
1962
0
      token->type = ANCHOR;
1963
0
      token->opr.ctx_type = LINE_LAST;
1964
0
      break;
1965
0
    default:
1966
0
      break;
1967
0
    }
1968
0
  return 1;
1969
0
}
1970
1971
/* Peek a token from INPUT, and return the length of the token.
1972
   We must not use this function out of bracket expressions.  */
1973
1974
static int
1975
peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1976
0
{
1977
0
  unsigned char c;
1978
0
  if (re_string_eoi (input))
1979
0
    {
1980
0
      token->type = END_OF_RE;
1981
0
      return 0;
1982
0
    }
1983
0
  c = re_string_peek_byte (input, 0);
1984
0
  token->opr.c = c;
1985
1986
0
  if (input->mb_cur_max > 1
1987
0
      && !re_string_first_byte (input, re_string_cur_idx (input)))
1988
0
    {
1989
0
      token->type = CHARACTER;
1990
0
      return 1;
1991
0
    }
1992
1993
0
  if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
1994
0
      && re_string_cur_idx (input) + 1 < re_string_length (input))
1995
0
    {
1996
      /* In this case, '\' escape a character.  */
1997
0
      unsigned char c2;
1998
0
      re_string_skip_bytes (input, 1);
1999
0
      c2 = re_string_peek_byte (input, 0);
2000
0
      token->opr.c = c2;
2001
0
      token->type = CHARACTER;
2002
0
      return 1;
2003
0
    }
2004
0
  if (c == '[') /* '[' is a special char in a bracket exps.  */
2005
0
    {
2006
0
      unsigned char c2;
2007
0
      int token_len;
2008
0
      if (re_string_cur_idx (input) + 1 < re_string_length (input))
2009
0
  c2 = re_string_peek_byte (input, 1);
2010
0
      else
2011
0
  c2 = 0;
2012
0
      token->opr.c = c2;
2013
0
      token_len = 2;
2014
0
      switch (c2)
2015
0
  {
2016
0
  case '.':
2017
0
    token->type = OP_OPEN_COLL_ELEM;
2018
0
    break;
2019
2020
0
  case '=':
2021
0
    token->type = OP_OPEN_EQUIV_CLASS;
2022
0
    break;
2023
2024
0
  case ':':
2025
0
    if (syntax & RE_CHAR_CLASSES)
2026
0
      {
2027
0
        token->type = OP_OPEN_CHAR_CLASS;
2028
0
        break;
2029
0
      }
2030
0
    FALLTHROUGH;
2031
0
  default:
2032
0
    token->type = CHARACTER;
2033
0
    token->opr.c = c;
2034
0
    token_len = 1;
2035
0
    break;
2036
0
  }
2037
0
      return token_len;
2038
0
    }
2039
0
  switch (c)
2040
0
    {
2041
0
    case ']':
2042
0
      token->type = OP_CLOSE_BRACKET;
2043
0
      break;
2044
0
    case '^':
2045
0
      token->type = OP_NON_MATCH_LIST;
2046
0
      break;
2047
0
    case '-':
2048
      /* In V7 Unix grep and Unix awk and mawk, [...---...]
2049
         (3 adjacent minus signs) stands for a single minus sign.
2050
         Support that without breaking anything else.  */
2051
0
      if (! (re_string_cur_idx (input) + 2 < re_string_length (input)
2052
0
             && re_string_peek_byte (input, 1) == '-'
2053
0
             && re_string_peek_byte (input, 2) == '-'))
2054
0
        {
2055
0
          token->type = OP_CHARSET_RANGE;
2056
0
          break;
2057
0
        }
2058
0
      re_string_skip_bytes (input, 2);
2059
0
      FALLTHROUGH;
2060
0
    default:
2061
0
      token->type = CHARACTER;
2062
0
    }
2063
0
  return 1;
2064
0
}
2065

2066
/* Functions for parser.  */
2067
2068
/* Entry point of the parser.
2069
   Parse the regular expression REGEXP and return the structure tree.
2070
   If an error occurs, ERR is set by error code, and return NULL.
2071
   This function build the following tree, from regular expression <reg_exp>:
2072
     CAT
2073
     / \
2074
    /   \
2075
   <reg_exp>  EOR
2076
2077
   CAT means concatenation.
2078
   EOR means end of regular expression.  */
2079
2080
static bin_tree_t *
2081
parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2082
       reg_errcode_t *err)
2083
0
{
2084
0
  re_dfa_t *dfa = preg->buffer;
2085
0
  bin_tree_t *tree, *eor, *root;
2086
0
  re_token_t current_token;
2087
0
  dfa->syntax = syntax;
2088
0
  fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2089
0
  tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2090
0
  if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2091
0
    return NULL;
2092
0
  eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2093
0
  if (tree != NULL)
2094
0
    root = create_tree (dfa, tree, eor, CONCAT);
2095
0
  else
2096
0
    root = eor;
2097
0
  if (__glibc_unlikely (eor == NULL || root == NULL))
2098
0
    {
2099
0
      *err = REG_ESPACE;
2100
0
      return NULL;
2101
0
    }
2102
0
  return root;
2103
0
}
2104
2105
/* This function build the following tree, from regular expression
2106
   <branch1>|<branch2>:
2107
     ALT
2108
     / \
2109
    /   \
2110
   <branch1> <branch2>
2111
2112
   ALT means alternative, which represents the operator '|'.  */
2113
2114
static bin_tree_t *
2115
parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2116
         reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2117
0
{
2118
0
  re_dfa_t *dfa = preg->buffer;
2119
0
  bin_tree_t *tree, *branch = NULL;
2120
0
  bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
2121
0
  tree = parse_branch (regexp, preg, token, syntax, nest, err);
2122
0
  if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2123
0
    return NULL;
2124
2125
0
  while (token->type == OP_ALT)
2126
0
    {
2127
0
      fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2128
0
      if (token->type != OP_ALT && token->type != END_OF_RE
2129
0
    && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2130
0
  {
2131
0
    bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
2132
0
    dfa->completed_bkref_map = initial_bkref_map;
2133
0
    branch = parse_branch (regexp, preg, token, syntax, nest, err);
2134
0
    if (__glibc_unlikely (*err != REG_NOERROR && branch == NULL))
2135
0
      {
2136
0
        if (tree != NULL)
2137
0
    postorder (tree, free_tree, NULL);
2138
0
        return NULL;
2139
0
      }
2140
0
    dfa->completed_bkref_map |= accumulated_bkref_map;
2141
0
  }
2142
0
      else
2143
0
  branch = NULL;
2144
0
      tree = create_tree (dfa, tree, branch, OP_ALT);
2145
0
      if (__glibc_unlikely (tree == NULL))
2146
0
  {
2147
0
    *err = REG_ESPACE;
2148
0
    return NULL;
2149
0
  }
2150
0
    }
2151
0
  return tree;
2152
0
}
2153
2154
/* This function build the following tree, from regular expression
2155
   <exp1><exp2>:
2156
  CAT
2157
  / \
2158
       /   \
2159
   <exp1> <exp2>
2160
2161
   CAT means concatenation.  */
2162
2163
static bin_tree_t *
2164
parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2165
        reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2166
0
{
2167
0
  bin_tree_t *tree, *expr;
2168
0
  re_dfa_t *dfa = preg->buffer;
2169
0
  tree = parse_expression (regexp, preg, token, syntax, nest, err);
2170
0
  if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2171
0
    return NULL;
2172
2173
0
  while (token->type != OP_ALT && token->type != END_OF_RE
2174
0
   && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2175
0
    {
2176
0
      expr = parse_expression (regexp, preg, token, syntax, nest, err);
2177
0
      if (__glibc_unlikely (*err != REG_NOERROR && expr == NULL))
2178
0
  {
2179
0
    if (tree != NULL)
2180
0
      postorder (tree, free_tree, NULL);
2181
0
    return NULL;
2182
0
  }
2183
0
      if (tree != NULL && expr != NULL)
2184
0
  {
2185
0
    bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2186
0
    if (newtree == NULL)
2187
0
      {
2188
0
        postorder (expr, free_tree, NULL);
2189
0
        postorder (tree, free_tree, NULL);
2190
0
        *err = REG_ESPACE;
2191
0
        return NULL;
2192
0
      }
2193
0
    tree = newtree;
2194
0
  }
2195
0
      else if (tree == NULL)
2196
0
  tree = expr;
2197
      /* Otherwise expr == NULL, we don't need to create new tree.  */
2198
0
    }
2199
0
  return tree;
2200
0
}
2201
2202
/* This function build the following tree, from regular expression a*:
2203
   *
2204
   |
2205
   a
2206
*/
2207
2208
static bin_tree_t *
2209
parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2210
      reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2211
0
{
2212
0
  re_dfa_t *dfa = preg->buffer;
2213
0
  bin_tree_t *tree;
2214
0
  switch (token->type)
2215
0
    {
2216
0
    case CHARACTER:
2217
0
      tree = create_token_tree (dfa, NULL, NULL, token);
2218
0
      if (__glibc_unlikely (tree == NULL))
2219
0
  {
2220
0
    *err = REG_ESPACE;
2221
0
    return NULL;
2222
0
  }
2223
0
      if (dfa->mb_cur_max > 1)
2224
0
  {
2225
0
    while (!re_string_eoi (regexp)
2226
0
     && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2227
0
      {
2228
0
        bin_tree_t *mbc_remain;
2229
0
        fetch_token (token, regexp, syntax);
2230
0
        mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2231
0
        tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2232
0
        if (__glibc_unlikely (mbc_remain == NULL || tree == NULL))
2233
0
    {
2234
0
      *err = REG_ESPACE;
2235
0
      return NULL;
2236
0
    }
2237
0
      }
2238
0
  }
2239
0
      break;
2240
2241
0
    case OP_OPEN_SUBEXP:
2242
0
      tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2243
0
      if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2244
0
  return NULL;
2245
0
      break;
2246
2247
0
    case OP_OPEN_BRACKET:
2248
0
      tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2249
0
      if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2250
0
  return NULL;
2251
0
      break;
2252
2253
0
    case OP_BACK_REF:
2254
0
      if (!__glibc_likely (dfa->completed_bkref_map & (1 << token->opr.idx)))
2255
0
  {
2256
0
    *err = REG_ESUBREG;
2257
0
    return NULL;
2258
0
  }
2259
0
      dfa->used_bkref_map |= 1 << token->opr.idx;
2260
0
      tree = create_token_tree (dfa, NULL, NULL, token);
2261
0
      if (__glibc_unlikely (tree == NULL))
2262
0
  {
2263
0
    *err = REG_ESPACE;
2264
0
    return NULL;
2265
0
  }
2266
0
      ++dfa->nbackref;
2267
0
      dfa->has_mb_node = 1;
2268
0
      break;
2269
2270
0
    case OP_OPEN_DUP_NUM:
2271
0
      if (syntax & RE_CONTEXT_INVALID_DUP)
2272
0
  {
2273
0
    *err = REG_BADRPT;
2274
0
    return NULL;
2275
0
  }
2276
0
      FALLTHROUGH;
2277
0
    case OP_DUP_ASTERISK:
2278
0
    case OP_DUP_PLUS:
2279
0
    case OP_DUP_QUESTION:
2280
0
      if (syntax & RE_CONTEXT_INVALID_OPS)
2281
0
  {
2282
0
    *err = REG_BADRPT;
2283
0
    return NULL;
2284
0
  }
2285
0
      else if (syntax & RE_CONTEXT_INDEP_OPS)
2286
0
  {
2287
0
    fetch_token (token, regexp, syntax);
2288
0
    return parse_expression (regexp, preg, token, syntax, nest, err);
2289
0
  }
2290
0
      FALLTHROUGH;
2291
0
    case OP_CLOSE_SUBEXP:
2292
0
      if ((token->type == OP_CLOSE_SUBEXP)
2293
0
    && !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2294
0
  {
2295
0
    *err = REG_ERPAREN;
2296
0
    return NULL;
2297
0
  }
2298
0
      FALLTHROUGH;
2299
0
    case OP_CLOSE_DUP_NUM:
2300
      /* We treat it as a normal character.  */
2301
2302
      /* Then we can these characters as normal characters.  */
2303
0
      token->type = CHARACTER;
2304
      /* mb_partial and word_char bits should be initialized already
2305
   by peek_token.  */
2306
0
      tree = create_token_tree (dfa, NULL, NULL, token);
2307
0
      if (__glibc_unlikely (tree == NULL))
2308
0
  {
2309
0
    *err = REG_ESPACE;
2310
0
    return NULL;
2311
0
  }
2312
0
      break;
2313
2314
0
    case ANCHOR:
2315
0
      if ((token->opr.ctx_type
2316
0
     & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2317
0
    && dfa->word_ops_used == 0)
2318
0
  init_word_char (dfa);
2319
0
      if (token->opr.ctx_type == WORD_DELIM
2320
0
    || token->opr.ctx_type == NOT_WORD_DELIM)
2321
0
  {
2322
0
    bin_tree_t *tree_first, *tree_last;
2323
0
    if (token->opr.ctx_type == WORD_DELIM)
2324
0
      {
2325
0
        token->opr.ctx_type = WORD_FIRST;
2326
0
        tree_first = create_token_tree (dfa, NULL, NULL, token);
2327
0
        token->opr.ctx_type = WORD_LAST;
2328
0
      }
2329
0
    else
2330
0
      {
2331
0
        token->opr.ctx_type = INSIDE_WORD;
2332
0
        tree_first = create_token_tree (dfa, NULL, NULL, token);
2333
0
        token->opr.ctx_type = INSIDE_NOTWORD;
2334
0
      }
2335
0
    tree_last = create_token_tree (dfa, NULL, NULL, token);
2336
0
    tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2337
0
    if (__glibc_unlikely (tree_first == NULL || tree_last == NULL
2338
0
        || tree == NULL))
2339
0
      {
2340
0
        *err = REG_ESPACE;
2341
0
        return NULL;
2342
0
      }
2343
0
  }
2344
0
      else
2345
0
  {
2346
0
    tree = create_token_tree (dfa, NULL, NULL, token);
2347
0
    if (__glibc_unlikely (tree == NULL))
2348
0
      {
2349
0
        *err = REG_ESPACE;
2350
0
        return NULL;
2351
0
      }
2352
0
  }
2353
      /* We must return here, since ANCHORs can't be followed
2354
   by repetition operators.
2355
   eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2356
       it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2357
0
      fetch_token (token, regexp, syntax);
2358
0
      return tree;
2359
2360
0
    case OP_PERIOD:
2361
0
      tree = create_token_tree (dfa, NULL, NULL, token);
2362
0
      if (__glibc_unlikely (tree == NULL))
2363
0
  {
2364
0
    *err = REG_ESPACE;
2365
0
    return NULL;
2366
0
  }
2367
0
      if (dfa->mb_cur_max > 1)
2368
0
  dfa->has_mb_node = 1;
2369
0
      break;
2370
2371
0
    case OP_WORD:
2372
0
    case OP_NOTWORD:
2373
0
      tree = build_charclass_op (dfa, regexp->trans,
2374
0
         "alnum",
2375
0
         "_",
2376
0
         token->type == OP_NOTWORD, err);
2377
0
      if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2378
0
  return NULL;
2379
0
      break;
2380
2381
0
    case OP_SPACE:
2382
0
    case OP_NOTSPACE:
2383
0
      tree = build_charclass_op (dfa, regexp->trans,
2384
0
         "space",
2385
0
         "",
2386
0
         token->type == OP_NOTSPACE, err);
2387
0
      if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2388
0
  return NULL;
2389
0
      break;
2390
2391
0
    case OP_ALT:
2392
0
    case END_OF_RE:
2393
0
      return NULL;
2394
2395
0
    case BACK_SLASH:
2396
0
      *err = REG_EESCAPE;
2397
0
      return NULL;
2398
2399
0
    default:
2400
      /* Must not happen?  */
2401
0
      DEBUG_ASSERT (false);
2402
0
      return NULL;
2403
0
    }
2404
0
  fetch_token (token, regexp, syntax);
2405
2406
0
  while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2407
0
   || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2408
0
    {
2409
0
      bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
2410
0
             syntax, err);
2411
0
      if (__glibc_unlikely (*err != REG_NOERROR && dup_tree == NULL))
2412
0
  {
2413
0
    if (tree != NULL)
2414
0
      postorder (tree, free_tree, NULL);
2415
0
    return NULL;
2416
0
  }
2417
0
      tree = dup_tree;
2418
      /* In BRE consecutive duplications are not allowed.  */
2419
0
      if ((syntax & RE_CONTEXT_INVALID_DUP)
2420
0
    && (token->type == OP_DUP_ASTERISK
2421
0
        || token->type == OP_OPEN_DUP_NUM))
2422
0
  {
2423
0
    if (tree != NULL)
2424
0
      postorder (tree, free_tree, NULL);
2425
0
    *err = REG_BADRPT;
2426
0
    return NULL;
2427
0
  }
2428
0
    }
2429
2430
0
  return tree;
2431
0
}
2432
2433
/* This function build the following tree, from regular expression
2434
   (<reg_exp>):
2435
   SUBEXP
2436
      |
2437
  <reg_exp>
2438
*/
2439
2440
static bin_tree_t *
2441
parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2442
         reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2443
0
{
2444
0
  re_dfa_t *dfa = preg->buffer;
2445
0
  bin_tree_t *tree;
2446
0
  size_t cur_nsub;
2447
0
  cur_nsub = preg->re_nsub++;
2448
2449
0
  fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2450
2451
  /* The subexpression may be a null string.  */
2452
0
  if (token->type == OP_CLOSE_SUBEXP)
2453
0
    tree = NULL;
2454
0
  else
2455
0
    {
2456
0
      tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2457
0
      if (__glibc_unlikely (*err == REG_NOERROR
2458
0
          && token->type != OP_CLOSE_SUBEXP))
2459
0
  {
2460
0
    if (tree != NULL)
2461
0
      postorder (tree, free_tree, NULL);
2462
0
    *err = REG_EPAREN;
2463
0
  }
2464
0
      if (__glibc_unlikely (*err != REG_NOERROR))
2465
0
  return NULL;
2466
0
    }
2467
2468
0
  if (cur_nsub <= '9' - '1')
2469
0
    dfa->completed_bkref_map |= 1 << cur_nsub;
2470
2471
0
  tree = create_tree (dfa, tree, NULL, SUBEXP);
2472
0
  if (__glibc_unlikely (tree == NULL))
2473
0
    {
2474
0
      *err = REG_ESPACE;
2475
0
      return NULL;
2476
0
    }
2477
0
  tree->token.opr.idx = cur_nsub;
2478
0
  return tree;
2479
0
}
2480
2481
/* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2482
2483
static bin_tree_t *
2484
parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2485
        re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2486
0
{
2487
0
  bin_tree_t *tree = NULL, *old_tree = NULL;
2488
0
  Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2489
0
  re_token_t start_token = *token;
2490
2491
0
  if (token->type == OP_OPEN_DUP_NUM)
2492
0
    {
2493
0
      end = 0;
2494
0
      start = fetch_number (regexp, token, syntax);
2495
0
      if (start == -1)
2496
0
  {
2497
0
    if (token->type == CHARACTER && token->opr.c == ',')
2498
0
      start = 0; /* We treat "{,m}" as "{0,m}".  */
2499
0
    else
2500
0
      {
2501
0
        *err = REG_BADBR; /* <re>{} is invalid.  */
2502
0
        return NULL;
2503
0
      }
2504
0
  }
2505
0
      if (__glibc_likely (start != -2))
2506
0
  {
2507
    /* We treat "{n}" as "{n,n}".  */
2508
0
    end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2509
0
     : ((token->type == CHARACTER && token->opr.c == ',')
2510
0
        ? fetch_number (regexp, token, syntax) : -2));
2511
0
  }
2512
0
      if (__glibc_unlikely (start == -2 || end == -2))
2513
0
  {
2514
    /* Invalid sequence.  */
2515
0
    if (__glibc_unlikely (!(syntax & RE_INVALID_INTERVAL_ORD)))
2516
0
      {
2517
0
        if (token->type == END_OF_RE)
2518
0
    *err = REG_EBRACE;
2519
0
        else
2520
0
    *err = REG_BADBR;
2521
2522
0
        return NULL;
2523
0
      }
2524
2525
    /* If the syntax bit is set, rollback.  */
2526
0
    re_string_set_index (regexp, start_idx);
2527
0
    *token = start_token;
2528
0
    token->type = CHARACTER;
2529
    /* mb_partial and word_char bits should be already initialized by
2530
       peek_token.  */
2531
0
    return elem;
2532
0
  }
2533
2534
0
      if (__glibc_unlikely ((end != -1 && start > end)
2535
0
          || token->type != OP_CLOSE_DUP_NUM))
2536
0
  {
2537
    /* First number greater than second.  */
2538
0
    *err = REG_BADBR;
2539
0
    return NULL;
2540
0
  }
2541
2542
0
      if (__glibc_unlikely (RE_DUP_MAX < (end == -1 ? start : end)))
2543
0
  {
2544
0
    *err = REG_ESIZE;
2545
0
    return NULL;
2546
0
  }
2547
0
    }
2548
0
  else
2549
0
    {
2550
0
      start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2551
0
      end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2552
0
    }
2553
2554
0
  fetch_token (token, regexp, syntax);
2555
2556
0
  if (__glibc_unlikely (elem == NULL))
2557
0
    return NULL;
2558
0
  if (__glibc_unlikely (start == 0 && end == 0))
2559
0
    {
2560
0
      postorder (elem, free_tree, NULL);
2561
0
      return NULL;
2562
0
    }
2563
2564
  /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2565
0
  if (__glibc_unlikely (start > 0))
2566
0
    {
2567
0
      tree = elem;
2568
0
      for (i = 2; i <= start; ++i)
2569
0
  {
2570
0
    elem = duplicate_tree (elem, dfa);
2571
0
    tree = create_tree (dfa, tree, elem, CONCAT);
2572
0
    if (__glibc_unlikely (elem == NULL || tree == NULL))
2573
0
      goto parse_dup_op_espace;
2574
0
  }
2575
2576
0
      if (start == end)
2577
0
  return tree;
2578
2579
      /* Duplicate ELEM before it is marked optional.  */
2580
0
      elem = duplicate_tree (elem, dfa);
2581
0
      if (__glibc_unlikely (elem == NULL))
2582
0
        goto parse_dup_op_espace;
2583
0
      old_tree = tree;
2584
0
    }
2585
0
  else
2586
0
    old_tree = NULL;
2587
2588
0
  if (elem->token.type == SUBEXP)
2589
0
    {
2590
0
      uintptr_t subidx = elem->token.opr.idx;
2591
0
      postorder (elem, mark_opt_subexp, (void *) subidx);
2592
0
    }
2593
2594
0
  tree = create_tree (dfa, elem, NULL,
2595
0
          (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2596
0
  if (__glibc_unlikely (tree == NULL))
2597
0
    goto parse_dup_op_espace;
2598
2599
  /* This loop is actually executed only when end != -1,
2600
     to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2601
     already created the start+1-th copy.  */
2602
0
  if (TYPE_SIGNED (Idx) || end != -1)
2603
0
    for (i = start + 2; i <= end; ++i)
2604
0
      {
2605
0
  elem = duplicate_tree (elem, dfa);
2606
0
  tree = create_tree (dfa, tree, elem, CONCAT);
2607
0
  if (__glibc_unlikely (elem == NULL || tree == NULL))
2608
0
    goto parse_dup_op_espace;
2609
2610
0
  tree = create_tree (dfa, tree, NULL, OP_ALT);
2611
0
  if (__glibc_unlikely (tree == NULL))
2612
0
    goto parse_dup_op_espace;
2613
0
      }
2614
2615
0
  if (old_tree)
2616
0
    tree = create_tree (dfa, old_tree, tree, CONCAT);
2617
2618
0
  return tree;
2619
2620
0
 parse_dup_op_espace:
2621
0
  *err = REG_ESPACE;
2622
0
  return NULL;
2623
0
}
2624
2625
/* Size of the names for collating symbol/equivalence_class/character_class.
2626
   I'm not sure, but maybe enough.  */
2627
0
#define BRACKET_NAME_BUF_SIZE 32
2628
2629
#ifndef _LIBC
2630
2631
/* Convert the byte B to the corresponding wide character.  In a
2632
   unibyte locale, treat B as itself.  In a multibyte locale, return
2633
   WEOF if B is an encoding error.  */
2634
static wint_t
2635
parse_byte (unsigned char b, re_dfa_t const *dfa)
2636
0
{
2637
0
  return dfa->mb_cur_max > 1 ? __btowc (b) : b;
2638
0
}
2639
2640
/* Local function for parse_bracket_exp used in _LIBC environment.
2641
   Build the range expression which starts from START_ELEM, and ends
2642
   at END_ELEM.  The result are written to MBCSET and SBCSET.
2643
   RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2644
   mbcset->range_ends, is a pointer argument since we may
2645
   update it.  */
2646
2647
static reg_errcode_t
2648
build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
2649
     bracket_elem_t *start_elem, bracket_elem_t *end_elem,
2650
     re_dfa_t *dfa, reg_syntax_t syntax, uint_fast32_t nrules,
2651
     const unsigned char *collseqmb, const char *collseqwc,
2652
     int_fast32_t table_size, const void *symb_table,
2653
     const unsigned char *extra)
2654
0
{
2655
  /* Equivalence Classes and Character Classes can't be a range start/end.  */
2656
0
  if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2657
0
      || start_elem->type == CHAR_CLASS
2658
0
      || end_elem->type == EQUIV_CLASS
2659
0
      || end_elem->type == CHAR_CLASS))
2660
0
    return REG_ERANGE;
2661
2662
  /* We can handle no multi character collating elements without libc
2663
     support.  */
2664
0
  if (__glibc_unlikely ((start_elem->type == COLL_SYM
2665
0
       && strlen ((char *) start_elem->opr.name) > 1)
2666
0
      || (end_elem->type == COLL_SYM
2667
0
          && strlen ((char *) end_elem->opr.name) > 1)))
2668
0
    return REG_ECOLLATE;
2669
2670
0
  unsigned int
2671
0
    start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2672
0
    : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2673
0
       : 0)),
2674
0
    end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2675
0
        : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2676
0
     : 0));
2677
0
  wint_t
2678
0
    start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2679
0
    ? parse_byte (start_ch, dfa) : start_elem->opr.wch),
2680
0
    end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2681
0
        ? parse_byte (end_ch, dfa) : end_elem->opr.wch);
2682
2683
0
  if (start_wc == WEOF || end_wc == WEOF)
2684
0
    return REG_ECOLLATE;
2685
0
  else if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2686
0
                             && start_wc > end_wc))
2687
0
    return REG_ERANGE;
2688
2689
  /* Got valid collation sequence values, add them as a new entry.
2690
     However, for !_LIBC we have no collation elements: if the
2691
     character set is single byte, the single byte character set
2692
     that we build below suffices.  parse_bracket_exp passes
2693
     no MBCSET if dfa->mb_cur_max == 1.  */
2694
0
  if (dfa->mb_cur_max > 1)
2695
0
    {
2696
      /* Check the space of the arrays.  */
2697
0
      if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2698
0
        {
2699
          /* There is not enough space, need realloc.  */
2700
0
          wchar_t *new_array_start, *new_array_end;
2701
0
          Idx new_nranges;
2702
2703
          /* +1 in case of mbcset->nranges is 0.  */
2704
0
          new_nranges = 2 * mbcset->nranges + 1;
2705
          /* Use realloc since mbcset->range_starts and mbcset->range_ends
2706
             are NULL if *range_alloc == 0.  */
2707
0
          new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2708
0
                                        new_nranges);
2709
0
          new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2710
0
                                      new_nranges);
2711
2712
0
          if (__glibc_unlikely (new_array_start == NULL
2713
0
                                || new_array_end == NULL))
2714
0
            {
2715
0
              re_free (new_array_start);
2716
0
              re_free (new_array_end);
2717
0
              return REG_ESPACE;
2718
0
            }
2719
2720
0
          mbcset->range_starts = new_array_start;
2721
0
          mbcset->range_ends = new_array_end;
2722
0
          *range_alloc = new_nranges;
2723
0
        }
2724
2725
0
      mbcset->range_starts[mbcset->nranges] = start_wc;
2726
0
      mbcset->range_ends[mbcset->nranges++] = end_wc;
2727
0
    }
2728
2729
  /* Build the table for single byte characters.  */
2730
0
  for (wchar_t wc = 0; wc < SBC_MAX; ++wc)
2731
0
    {
2732
0
      if (start_wc <= wc && wc <= end_wc)
2733
0
        bitset_set (sbcset, wc);
2734
0
    }
2735
2736
0
  return REG_NOERROR;
2737
0
}
2738
#endif /* not _LIBC */
2739
2740
#ifndef _LIBC
2741
/* Helper function for parse_bracket_exp only used in case of NOT _LIBC.
2742
   Build the collating element which is represented by NAME.
2743
   The result are written to MBCSET and SBCSET.
2744
   COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2745
   pointer argument since we may update it.  */
2746
2747
static reg_errcode_t
2748
build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2749
      Idx *coll_sym_alloc, const unsigned char *name,
2750
      uint_fast32_t nrules, int_fast32_t table_size,
2751
      const void *symb_table, const unsigned char *extra)
2752
0
{
2753
0
  size_t name_len = strlen ((const char *) name);
2754
0
  if (__glibc_unlikely (name_len != 1))
2755
0
    return REG_ECOLLATE;
2756
0
  else
2757
0
    {
2758
0
      bitset_set (sbcset, name[0]);
2759
0
      return REG_NOERROR;
2760
0
    }
2761
0
}
2762
#endif /* not _LIBC */
2763
2764
#ifdef _LIBC
2765
/* Local function for parse_bracket_exp used in _LIBC environment.
2766
   Seek the collating symbol entry corresponding to NAME.
2767
   Return the index of the symbol in the SYMB_TABLE,
2768
   or -1 if not found.  */
2769
2770
static __always_inline int32_t
2771
seek_collating_symbol_entry (const unsigned char *name, size_t name_len,
2772
           const int32_t *symb_table,
2773
           int_fast32_t table_size,
2774
           const unsigned char *extra)
2775
{
2776
  int_fast32_t elem;
2777
2778
  for (elem = 0; elem < table_size; elem++)
2779
    if (symb_table[2 * elem] != 0)
2780
      {
2781
  int32_t idx = symb_table[2 * elem + 1];
2782
  /* Skip the name of collating element name.  */
2783
  idx += 1 + extra[idx];
2784
  if (/* Compare the length of the name.  */
2785
      name_len == extra[idx]
2786
      /* Compare the name.  */
2787
      && memcmp (name, &extra[idx + 1], name_len) == 0)
2788
    /* Yep, this is the entry.  */
2789
    return elem;
2790
      }
2791
  return -1;
2792
}
2793
2794
/* Local function for parse_bracket_exp used in _LIBC environment.
2795
   Look up the collation sequence value of BR_ELEM.
2796
   Return the value if succeeded, UINT_MAX otherwise.  */
2797
2798
static __always_inline unsigned int
2799
lookup_collation_sequence_value (bracket_elem_t *br_elem, uint32_t nrules,
2800
         const unsigned char *collseqmb,
2801
         const char *collseqwc,
2802
         int_fast32_t table_size,
2803
         const int32_t *symb_table,
2804
         const unsigned char *extra)
2805
{
2806
  if (br_elem->type == SB_CHAR)
2807
    {
2808
      /* if (MB_CUR_MAX == 1) */
2809
      if (nrules == 0)
2810
  return collseqmb[br_elem->opr.ch];
2811
      else
2812
  {
2813
    wint_t wc = __btowc (br_elem->opr.ch);
2814
    return __collseq_table_lookup (collseqwc, wc);
2815
  }
2816
    }
2817
  else if (br_elem->type == MB_CHAR)
2818
    {
2819
      if (nrules != 0)
2820
  return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2821
    }
2822
  else if (br_elem->type == COLL_SYM)
2823
    {
2824
      size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2825
      if (nrules != 0)
2826
  {
2827
    int32_t elem, idx;
2828
    elem = seek_collating_symbol_entry (br_elem->opr.name,
2829
                sym_name_len,
2830
                symb_table, table_size,
2831
                extra);
2832
    if (elem != -1)
2833
      {
2834
        /* We found the entry.  */
2835
        idx = symb_table[2 * elem + 1];
2836
        /* Skip the name of collating element name.  */
2837
        idx += 1 + extra[idx];
2838
        /* Skip the byte sequence of the collating element.  */
2839
        idx += 1 + extra[idx];
2840
        /* Adjust for the alignment.  */
2841
        idx = (idx + 3) & ~3;
2842
        /* Skip the multibyte collation sequence value.  */
2843
        idx += sizeof (unsigned int);
2844
        /* Skip the wide char sequence of the collating element.  */
2845
        idx += sizeof (unsigned int) *
2846
    (1 + *(unsigned int *) (extra + idx));
2847
        /* Return the collation sequence value.  */
2848
        return *(unsigned int *) (extra + idx);
2849
      }
2850
    else if (sym_name_len == 1)
2851
      {
2852
        /* No valid character.  Match it as a single byte
2853
     character.  */
2854
        return collseqmb[br_elem->opr.name[0]];
2855
      }
2856
  }
2857
      else if (sym_name_len == 1)
2858
  return collseqmb[br_elem->opr.name[0]];
2859
    }
2860
  return UINT_MAX;
2861
}
2862
2863
/* Local function for parse_bracket_exp used in _LIBC environment.
2864
   Build the range expression which starts from START_ELEM, and ends
2865
   at END_ELEM.  The result are written to MBCSET and SBCSET.
2866
   RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2867
   mbcset->range_ends, is a pointer argument since we may
2868
   update it.  */
2869
2870
static __always_inline reg_errcode_t
2871
build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
2872
     bracket_elem_t *start_elem, bracket_elem_t *end_elem,
2873
     re_dfa_t *dfa, reg_syntax_t syntax, uint32_t nrules,
2874
     const unsigned char *collseqmb, const char *collseqwc,
2875
     int_fast32_t table_size, const int32_t *symb_table,
2876
     const unsigned char *extra)
2877
{
2878
  unsigned int ch;
2879
  uint32_t start_collseq;
2880
  uint32_t end_collseq;
2881
2882
  /* Equivalence Classes and Character Classes can't be a range
2883
     start/end.  */
2884
  if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2885
                        || start_elem->type == CHAR_CLASS
2886
                        || end_elem->type == EQUIV_CLASS
2887
                        || end_elem->type == CHAR_CLASS))
2888
    return REG_ERANGE;
2889
2890
  /* FIXME: Implement rational ranges here, too.  */
2891
  start_collseq = lookup_collation_sequence_value (start_elem, nrules, collseqmb, collseqwc,
2892
               table_size, symb_table, extra);
2893
  end_collseq = lookup_collation_sequence_value (end_elem, nrules, collseqmb, collseqwc,
2894
             table_size, symb_table, extra);
2895
  /* Check start/end collation sequence values.  */
2896
  if (__glibc_unlikely (start_collseq == UINT_MAX
2897
                        || end_collseq == UINT_MAX))
2898
    return REG_ECOLLATE;
2899
  if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2900
                        && start_collseq > end_collseq))
2901
    return REG_ERANGE;
2902
2903
  /* Got valid collation sequence values, add them as a new entry.
2904
     However, if we have no collation elements, and the character set
2905
     is single byte, the single byte character set that we
2906
     build below suffices. */
2907
  if (nrules > 0 || dfa->mb_cur_max > 1)
2908
    {
2909
      /* Check the space of the arrays.  */
2910
      if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2911
  {
2912
    /* There is not enough space, need realloc.  */
2913
    uint32_t *new_array_start;
2914
    uint32_t *new_array_end;
2915
    int new_nranges;
2916
2917
    /* +1 in case of mbcset->nranges is 0.  */
2918
    new_nranges = 2 * mbcset->nranges + 1;
2919
    new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2920
          new_nranges);
2921
    new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2922
              new_nranges);
2923
2924
          if (__glibc_unlikely (new_array_start == NULL
2925
                                || new_array_end == NULL))
2926
      return REG_ESPACE;
2927
2928
    mbcset->range_starts = new_array_start;
2929
    mbcset->range_ends = new_array_end;
2930
    *range_alloc = new_nranges;
2931
  }
2932
2933
      mbcset->range_starts[mbcset->nranges] = start_collseq;
2934
      mbcset->range_ends[mbcset->nranges++] = end_collseq;
2935
    }
2936
2937
  /* Build the table for single byte characters.  */
2938
  for (ch = 0; ch < SBC_MAX; ch++)
2939
    {
2940
      uint32_t ch_collseq;
2941
      /* if (MB_CUR_MAX == 1) */
2942
      if (nrules == 0)
2943
  ch_collseq = collseqmb[ch];
2944
      else
2945
  ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2946
      if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2947
  bitset_set (sbcset, ch);
2948
    }
2949
  return REG_NOERROR;
2950
}
2951
2952
/* Local function for parse_bracket_exp used in _LIBC environment.
2953
   Build the collating element which is represented by NAME.
2954
   The result are written to MBCSET and SBCSET.
2955
   COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2956
   pointer argument since we may update it.  */
2957
2958
static __always_inline reg_errcode_t
2959
build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2960
      Idx *coll_sym_alloc, const unsigned char *name,
2961
      uint_fast32_t nrules, int_fast32_t table_size,
2962
      const int32_t *symb_table, const unsigned char *extra)
2963
{
2964
  int32_t elem, idx;
2965
  size_t name_len = strlen ((const char *) name);
2966
  if (nrules != 0)
2967
    {
2968
      elem = seek_collating_symbol_entry (name, name_len, symb_table,
2969
            table_size, extra);
2970
      if (elem != -1)
2971
  {
2972
    /* We found the entry.  */
2973
    idx = symb_table[2 * elem + 1];
2974
    /* Skip the name of collating element name.  */
2975
    idx += 1 + extra[idx];
2976
  }
2977
      else if (name_len == 1)
2978
  {
2979
    /* No valid character, treat it as a normal
2980
       character.  */
2981
    bitset_set (sbcset, name[0]);
2982
    return REG_NOERROR;
2983
  }
2984
      else
2985
  return REG_ECOLLATE;
2986
2987
      /* Got valid collation sequence, add it as a new entry.  */
2988
      /* Check the space of the arrays.  */
2989
      if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms))
2990
  {
2991
    /* Not enough, realloc it.  */
2992
    /* +1 in case of mbcset->ncoll_syms is 0.  */
2993
    int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2994
    /* Use realloc since mbcset->coll_syms is NULL
2995
       if *alloc == 0.  */
2996
    int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2997
                 new_coll_sym_alloc);
2998
          if (__glibc_unlikely (new_coll_syms == NULL))
2999
      return REG_ESPACE;
3000
    mbcset->coll_syms = new_coll_syms;
3001
    *coll_sym_alloc = new_coll_sym_alloc;
3002
  }
3003
      mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3004
      return REG_NOERROR;
3005
    }
3006
  else
3007
    {
3008
      if (__glibc_unlikely (name_len != 1))
3009
  return REG_ECOLLATE;
3010
      else
3011
  {
3012
    bitset_set (sbcset, name[0]);
3013
    return REG_NOERROR;
3014
  }
3015
    }
3016
}
3017
#endif /* _LIBC */
3018
3019
/* This function parse bracket expression like "[abc]", "[a-c]",
3020
   "[[.a-a.]]" etc.  */
3021
3022
static bin_tree_t *
3023
parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
3024
       reg_syntax_t syntax, reg_errcode_t *err)
3025
0
{
3026
0
  const unsigned char *collseqmb = NULL;
3027
0
  const char *collseqwc = NULL;
3028
0
  uint_fast32_t nrules = 0;
3029
0
  int_fast32_t table_size = 0;
3030
0
  const void *symb_table = NULL;
3031
0
  const unsigned char *extra = NULL;
3032
3033
0
  re_token_t br_token;
3034
0
  re_bitset_ptr_t sbcset;
3035
0
  re_charset_t *mbcset;
3036
0
  Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3037
0
  Idx equiv_class_alloc = 0, char_class_alloc = 0;
3038
0
  bool non_match = false;
3039
0
  bin_tree_t *work_tree;
3040
0
  int token_len;
3041
0
  bool first_round = true;
3042
#ifdef _LIBC
3043
  collseqmb = (const unsigned char *)
3044
    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3045
  nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3046
  if (nrules)
3047
    {
3048
      /*
3049
      if (MB_CUR_MAX > 1)
3050
      */
3051
      collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3052
      table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3053
      symb_table = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_TABLEMB);
3054
      extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3055
               _NL_COLLATE_SYMB_EXTRAMB);
3056
    }
3057
#endif
3058
0
  sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3059
0
  mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3060
0
  if (__glibc_unlikely (sbcset == NULL || mbcset == NULL))
3061
0
    {
3062
0
      re_free (sbcset);
3063
0
      re_free (mbcset);
3064
0
      *err = REG_ESPACE;
3065
0
      return NULL;
3066
0
    }
3067
3068
0
  token_len = peek_token_bracket (token, regexp, syntax);
3069
0
  if (__glibc_unlikely (token->type == END_OF_RE))
3070
0
    {
3071
0
      *err = REG_BADPAT;
3072
0
      goto parse_bracket_exp_free_return;
3073
0
    }
3074
0
  if (token->type == OP_NON_MATCH_LIST)
3075
0
    {
3076
0
      mbcset->non_match = 1;
3077
0
      non_match = true;
3078
0
      if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3079
0
  bitset_set (sbcset, '\n');
3080
0
      re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3081
0
      token_len = peek_token_bracket (token, regexp, syntax);
3082
0
      if (__glibc_unlikely (token->type == END_OF_RE))
3083
0
  {
3084
0
    *err = REG_BADPAT;
3085
0
    goto parse_bracket_exp_free_return;
3086
0
  }
3087
0
    }
3088
3089
  /* We treat the first ']' as a normal character.  */
3090
0
  if (token->type == OP_CLOSE_BRACKET)
3091
0
    token->type = CHARACTER;
3092
3093
0
  while (1)
3094
0
    {
3095
0
      bracket_elem_t start_elem, end_elem;
3096
0
      unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3097
0
      unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3098
0
      reg_errcode_t ret;
3099
0
      int token_len2 = 0;
3100
0
      bool is_range_exp = false;
3101
0
      re_token_t token2;
3102
3103
0
      start_elem.opr.name = start_name_buf;
3104
0
      start_elem.type = COLL_SYM;
3105
0
      ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3106
0
           syntax, first_round);
3107
0
      if (__glibc_unlikely (ret != REG_NOERROR))
3108
0
  {
3109
0
    *err = ret;
3110
0
    goto parse_bracket_exp_free_return;
3111
0
  }
3112
0
      first_round = false;
3113
3114
      /* Get information about the next token.  We need it in any case.  */
3115
0
      token_len = peek_token_bracket (token, regexp, syntax);
3116
3117
      /* Do not check for ranges if we know they are not allowed.  */
3118
0
      if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3119
0
  {
3120
0
    if (__glibc_unlikely (token->type == END_OF_RE))
3121
0
      {
3122
0
        *err = REG_EBRACK;
3123
0
        goto parse_bracket_exp_free_return;
3124
0
      }
3125
0
    if (token->type == OP_CHARSET_RANGE)
3126
0
      {
3127
0
        re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3128
0
        token_len2 = peek_token_bracket (&token2, regexp, syntax);
3129
0
        if (__glibc_unlikely (token2.type == END_OF_RE))
3130
0
    {
3131
0
      *err = REG_EBRACK;
3132
0
      goto parse_bracket_exp_free_return;
3133
0
    }
3134
0
        if (token2.type == OP_CLOSE_BRACKET)
3135
0
    {
3136
      /* We treat the last '-' as a normal character.  */
3137
0
      re_string_skip_bytes (regexp, -token_len);
3138
0
      token->type = CHARACTER;
3139
0
    }
3140
0
        else
3141
0
    is_range_exp = true;
3142
0
      }
3143
0
  }
3144
3145
0
      if (is_range_exp == true)
3146
0
  {
3147
0
    end_elem.opr.name = end_name_buf;
3148
0
    end_elem.type = COLL_SYM;
3149
0
    ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3150
0
               dfa, syntax, true);
3151
0
    if (__glibc_unlikely (ret != REG_NOERROR))
3152
0
      {
3153
0
        *err = ret;
3154
0
        goto parse_bracket_exp_free_return;
3155
0
      }
3156
3157
0
    token_len = peek_token_bracket (token, regexp, syntax);
3158
3159
0
    *err = build_range_exp (sbcset, mbcset, &range_alloc,
3160
0
          &start_elem, &end_elem,
3161
0
          dfa, syntax, nrules, collseqmb, collseqwc,
3162
0
          table_size, symb_table, extra);
3163
0
    if (__glibc_unlikely (*err != REG_NOERROR))
3164
0
      goto parse_bracket_exp_free_return;
3165
0
  }
3166
0
      else
3167
0
  {
3168
0
    switch (start_elem.type)
3169
0
      {
3170
0
      case SB_CHAR:
3171
0
        bitset_set (sbcset, start_elem.opr.ch);
3172
0
        break;
3173
0
      case MB_CHAR:
3174
        /* Check whether the array has enough space.  */
3175
0
        if (__glibc_unlikely (mbchar_alloc == mbcset->nmbchars))
3176
0
    {
3177
0
      wchar_t *new_mbchars;
3178
      /* Not enough, realloc it.  */
3179
      /* +1 in case of mbcset->nmbchars is 0.  */
3180
0
      mbchar_alloc = 2 * mbcset->nmbchars + 1;
3181
      /* Use realloc since array is NULL if *alloc == 0.  */
3182
0
      new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3183
0
              mbchar_alloc);
3184
0
      if (__glibc_unlikely (new_mbchars == NULL))
3185
0
        goto parse_bracket_exp_espace;
3186
0
      mbcset->mbchars = new_mbchars;
3187
0
    }
3188
0
        mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3189
0
        break;
3190
0
      case EQUIV_CLASS:
3191
0
        *err = build_equiv_class (sbcset,
3192
0
          mbcset, &equiv_class_alloc,
3193
0
          start_elem.opr.name);
3194
0
        if (__glibc_unlikely (*err != REG_NOERROR))
3195
0
    goto parse_bracket_exp_free_return;
3196
0
        break;
3197
0
      case COLL_SYM:
3198
0
        *err = build_collating_symbol (sbcset,
3199
0
               mbcset, &coll_sym_alloc,
3200
0
               start_elem.opr.name,
3201
0
               nrules, table_size, symb_table, extra);
3202
0
        if (__glibc_unlikely (*err != REG_NOERROR))
3203
0
    goto parse_bracket_exp_free_return;
3204
0
        break;
3205
0
      case CHAR_CLASS:
3206
0
        *err = build_charclass (regexp->trans, sbcset,
3207
0
              mbcset, &char_class_alloc,
3208
0
              (const char *) start_elem.opr.name,
3209
0
              syntax);
3210
0
        if (__glibc_unlikely (*err != REG_NOERROR))
3211
0
         goto parse_bracket_exp_free_return;
3212
0
        break;
3213
0
      default:
3214
0
        DEBUG_ASSERT (false);
3215
0
        break;
3216
0
      }
3217
0
  }
3218
0
      if (__glibc_unlikely (token->type == END_OF_RE))
3219
0
  {
3220
0
    *err = REG_EBRACK;
3221
0
    goto parse_bracket_exp_free_return;
3222
0
  }
3223
0
      if (token->type == OP_CLOSE_BRACKET)
3224
0
  break;
3225
0
    }
3226
3227
0
  re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3228
3229
  /* If it is non-matching list.  */
3230
0
  if (non_match)
3231
0
    bitset_not (sbcset);
3232
3233
  /* Ensure only single byte characters are set.  */
3234
0
  if (dfa->mb_cur_max > 1)
3235
0
    bitset_mask (sbcset, dfa->sb_char);
3236
3237
0
  if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3238
0
      || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3239
0
                 || mbcset->non_match)))
3240
0
    {
3241
0
      bin_tree_t *mbc_tree;
3242
0
      int sbc_idx;
3243
      /* Build a tree for complex bracket.  */
3244
0
      dfa->has_mb_node = 1;
3245
0
      br_token.type = COMPLEX_BRACKET;
3246
0
      br_token.opr.mbcset = mbcset;
3247
0
      mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3248
0
      if (__glibc_unlikely (mbc_tree == NULL))
3249
0
  goto parse_bracket_exp_espace;
3250
0
      for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3251
0
  if (sbcset[sbc_idx])
3252
0
    break;
3253
      /* If there are no bits set in sbcset, there is no point
3254
   of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3255
0
      if (sbc_idx < BITSET_WORDS)
3256
0
  {
3257
    /* Build a tree for simple bracket.  */
3258
0
    br_token.type = SIMPLE_BRACKET;
3259
0
    br_token.opr.sbcset = sbcset;
3260
0
    work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3261
0
    if (__glibc_unlikely (work_tree == NULL))
3262
0
      goto parse_bracket_exp_espace;
3263
3264
    /* Then join them by ALT node.  */
3265
0
    work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3266
0
    if (__glibc_unlikely (work_tree == NULL))
3267
0
      goto parse_bracket_exp_espace;
3268
0
  }
3269
0
      else
3270
0
  {
3271
0
    re_free (sbcset);
3272
0
    work_tree = mbc_tree;
3273
0
  }
3274
0
    }
3275
0
  else
3276
0
    {
3277
0
      free_charset (mbcset);
3278
      /* Build a tree for simple bracket.  */
3279
0
      br_token.type = SIMPLE_BRACKET;
3280
0
      br_token.opr.sbcset = sbcset;
3281
0
      work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3282
0
      if (__glibc_unlikely (work_tree == NULL))
3283
0
  goto parse_bracket_exp_espace;
3284
0
    }
3285
0
  return work_tree;
3286
3287
0
 parse_bracket_exp_espace:
3288
0
  *err = REG_ESPACE;
3289
0
 parse_bracket_exp_free_return:
3290
0
  re_free (sbcset);
3291
0
  free_charset (mbcset);
3292
0
  return NULL;
3293
0
}
3294
3295
/* Parse an element in the bracket expression.  */
3296
3297
static reg_errcode_t
3298
parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3299
           re_token_t *token, int token_len, re_dfa_t *dfa,
3300
           reg_syntax_t syntax, bool accept_hyphen)
3301
0
{
3302
0
  int cur_char_size;
3303
0
  cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3304
0
  if (cur_char_size > 1)
3305
0
    {
3306
0
      elem->type = MB_CHAR;
3307
0
      elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3308
0
      re_string_skip_bytes (regexp, cur_char_size);
3309
0
      return REG_NOERROR;
3310
0
    }
3311
0
  re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3312
0
  if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3313
0
      || token->type == OP_OPEN_EQUIV_CLASS)
3314
0
    return parse_bracket_symbol (elem, regexp, token);
3315
0
  if (__glibc_unlikely (token->type == OP_CHARSET_RANGE) && !accept_hyphen)
3316
0
    {
3317
      /* A '-' must only appear as anything but a range indicator before
3318
   the closing bracket.  Everything else is an error.  */
3319
0
      re_token_t token2;
3320
0
      (void) peek_token_bracket (&token2, regexp, syntax);
3321
0
      if (token2.type != OP_CLOSE_BRACKET)
3322
  /* The actual error value is not standardized since this whole
3323
     case is undefined.  But ERANGE makes good sense.  */
3324
0
  return REG_ERANGE;
3325
0
    }
3326
0
  elem->type = SB_CHAR;
3327
0
  elem->opr.ch = token->opr.c;
3328
0
  return REG_NOERROR;
3329
0
}
3330
3331
/* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3332
   such as [:<character_class>:], [.<collating_element>.], and
3333
   [=<equivalent_class>=].  */
3334
3335
static reg_errcode_t
3336
parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3337
          re_token_t *token)
3338
0
{
3339
0
  unsigned char ch, delim = token->opr.c;
3340
0
  int i = 0;
3341
0
  if (re_string_eoi(regexp))
3342
0
    return REG_EBRACK;
3343
0
  for (;; ++i)
3344
0
    {
3345
0
      if (i >= BRACKET_NAME_BUF_SIZE)
3346
0
  return REG_EBRACK;
3347
0
      if (token->type == OP_OPEN_CHAR_CLASS)
3348
0
  ch = re_string_fetch_byte_case (regexp);
3349
0
      else
3350
0
  ch = re_string_fetch_byte (regexp);
3351
0
      if (re_string_eoi(regexp))
3352
0
  return REG_EBRACK;
3353
0
      if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3354
0
  break;
3355
0
      elem->opr.name[i] = ch;
3356
0
    }
3357
0
  re_string_skip_bytes (regexp, 1);
3358
0
  elem->opr.name[i] = '\0';
3359
0
  switch (token->type)
3360
0
    {
3361
0
    case OP_OPEN_COLL_ELEM:
3362
0
      elem->type = COLL_SYM;
3363
0
      break;
3364
0
    case OP_OPEN_EQUIV_CLASS:
3365
0
      elem->type = EQUIV_CLASS;
3366
0
      break;
3367
0
    case OP_OPEN_CHAR_CLASS:
3368
0
      elem->type = CHAR_CLASS;
3369
0
      break;
3370
0
    default:
3371
0
      break;
3372
0
    }
3373
0
  return REG_NOERROR;
3374
0
}
3375
3376
  /* Helper function for parse_bracket_exp.
3377
     Build the equivalence class which is represented by NAME.
3378
     The result are written to MBCSET and SBCSET.
3379
     EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3380
     is a pointer argument since we may update it.  */
3381
3382
static reg_errcode_t
3383
build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3384
       Idx *equiv_class_alloc, const unsigned char *name)
3385
0
{
3386
#ifdef _LIBC
3387
  uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3388
  if (nrules != 0)
3389
    {
3390
      const int32_t *table, *indirect;
3391
      const unsigned char *weights, *extra, *cp;
3392
      unsigned char char_buf[2];
3393
      int32_t idx1, idx2;
3394
      unsigned int ch;
3395
      size_t len;
3396
      /* Calculate the index for equivalence class.  */
3397
      cp = name;
3398
      table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3399
      weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3400
                 _NL_COLLATE_WEIGHTMB);
3401
      extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3402
               _NL_COLLATE_EXTRAMB);
3403
      indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3404
            _NL_COLLATE_INDIRECTMB);
3405
      idx1 = findidx (table, indirect, extra, &cp, -1);
3406
      if (__glibc_unlikely (idx1 == 0 || *cp != '\0'))
3407
  /* This isn't a valid character.  */
3408
  return REG_ECOLLATE;
3409
3410
      /* Build single byte matching table for this equivalence class.  */
3411
      len = weights[idx1 & 0xffffff];
3412
      for (ch = 0; ch < SBC_MAX; ++ch)
3413
  {
3414
    char_buf[0] = ch;
3415
    cp = char_buf;
3416
    idx2 = findidx (table, indirect, extra, &cp, 1);
3417
/*
3418
    idx2 = table[ch];
3419
*/
3420
    if (idx2 == 0)
3421
      /* This isn't a valid character.  */
3422
      continue;
3423
    /* Compare only if the length matches and the collation rule
3424
       index is the same.  */
3425
    if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)
3426
        && memcmp (weights + (idx1 & 0xffffff) + 1,
3427
       weights + (idx2 & 0xffffff) + 1, len) == 0)
3428
      bitset_set (sbcset, ch);
3429
  }
3430
      /* Check whether the array has enough space.  */
3431
      if (__glibc_unlikely (*equiv_class_alloc == mbcset->nequiv_classes))
3432
  {
3433
    /* Not enough, realloc it.  */
3434
    /* +1 in case of mbcset->nequiv_classes is 0.  */
3435
    Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3436
    /* Use realloc since the array is NULL if *alloc == 0.  */
3437
    int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3438
               int32_t,
3439
               new_equiv_class_alloc);
3440
    if (__glibc_unlikely (new_equiv_classes == NULL))
3441
      return REG_ESPACE;
3442
    mbcset->equiv_classes = new_equiv_classes;
3443
    *equiv_class_alloc = new_equiv_class_alloc;
3444
  }
3445
      mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3446
    }
3447
  else
3448
#endif /* _LIBC */
3449
0
    {
3450
0
      if (__glibc_unlikely (strlen ((const char *) name) != 1))
3451
0
  return REG_ECOLLATE;
3452
0
      bitset_set (sbcset, *name);
3453
0
    }
3454
0
  return REG_NOERROR;
3455
0
}
3456
3457
  /* Helper function for parse_bracket_exp.
3458
     Build the character class which is represented by NAME.
3459
     The result are written to MBCSET and SBCSET.
3460
     CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3461
     is a pointer argument since we may update it.  */
3462
3463
static reg_errcode_t
3464
build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3465
     re_charset_t *mbcset, Idx *char_class_alloc,
3466
     const char *class_name, reg_syntax_t syntax)
3467
0
{
3468
0
  int i;
3469
0
  const char *name = class_name;
3470
3471
  /* In case of REG_ICASE "upper" and "lower" match the both of
3472
     upper and lower cases.  */
3473
0
  if ((syntax & RE_ICASE)
3474
0
      && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3475
0
    name = "alpha";
3476
3477
  /* Check the space of the arrays.  */
3478
0
  if (__glibc_unlikely (*char_class_alloc == mbcset->nchar_classes))
3479
0
    {
3480
      /* Not enough, realloc it.  */
3481
      /* +1 in case of mbcset->nchar_classes is 0.  */
3482
0
      Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3483
      /* Use realloc since array is NULL if *alloc == 0.  */
3484
0
      wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3485
0
                 new_char_class_alloc);
3486
0
      if (__glibc_unlikely (new_char_classes == NULL))
3487
0
  return REG_ESPACE;
3488
0
      mbcset->char_classes = new_char_classes;
3489
0
      *char_class_alloc = new_char_class_alloc;
3490
0
    }
3491
0
  mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3492
3493
0
#define BUILD_CHARCLASS_LOOP(ctype_func)  \
3494
0
  do {           \
3495
0
    if (__glibc_unlikely (trans != NULL))     \
3496
0
      {           \
3497
0
  for (i = 0; i < SBC_MAX; ++i)   \
3498
0
    if (ctype_func (i))     \
3499
0
      bitset_set (sbcset, trans[i]); \
3500
0
      }           \
3501
0
    else          \
3502
0
      {           \
3503
0
  for (i = 0; i < SBC_MAX; ++i)   \
3504
0
    if (ctype_func (i))     \
3505
0
      bitset_set (sbcset, i);   \
3506
0
      }           \
3507
0
  } while (0)
3508
3509
0
  if (strcmp (name, "alnum") == 0)
3510
0
    BUILD_CHARCLASS_LOOP (isalnum);
3511
0
  else if (strcmp (name, "cntrl") == 0)
3512
0
    BUILD_CHARCLASS_LOOP (iscntrl);
3513
0
  else if (strcmp (name, "lower") == 0)
3514
0
    BUILD_CHARCLASS_LOOP (islower);
3515
0
  else if (strcmp (name, "space") == 0)
3516
0
    BUILD_CHARCLASS_LOOP (isspace);
3517
0
  else if (strcmp (name, "alpha") == 0)
3518
0
    BUILD_CHARCLASS_LOOP (isalpha);
3519
0
  else if (strcmp (name, "digit") == 0)
3520
0
    BUILD_CHARCLASS_LOOP (isdigit);
3521
0
  else if (strcmp (name, "print") == 0)
3522
0
    BUILD_CHARCLASS_LOOP (isprint);
3523
0
  else if (strcmp (name, "upper") == 0)
3524
0
    BUILD_CHARCLASS_LOOP (isupper);
3525
0
  else if (strcmp (name, "blank") == 0)
3526
0
    BUILD_CHARCLASS_LOOP (isblank);
3527
0
  else if (strcmp (name, "graph") == 0)
3528
0
    BUILD_CHARCLASS_LOOP (isgraph);
3529
0
  else if (strcmp (name, "punct") == 0)
3530
0
    BUILD_CHARCLASS_LOOP (ispunct);
3531
0
  else if (strcmp (name, "xdigit") == 0)
3532
0
    BUILD_CHARCLASS_LOOP (isxdigit);
3533
0
  else
3534
0
    return REG_ECTYPE;
3535
3536
0
  return REG_NOERROR;
3537
0
}
3538
3539
static bin_tree_t *
3540
build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3541
        const char *class_name,
3542
        const char *extra, bool non_match,
3543
        reg_errcode_t *err)
3544
0
{
3545
0
  re_bitset_ptr_t sbcset;
3546
0
  re_charset_t *mbcset;
3547
0
  Idx alloc = 0;
3548
0
  reg_errcode_t ret;
3549
0
  bin_tree_t *tree;
3550
3551
0
  sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3552
0
  if (__glibc_unlikely (sbcset == NULL))
3553
0
    {
3554
0
      *err = REG_ESPACE;
3555
0
      return NULL;
3556
0
    }
3557
0
  mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3558
0
  if (__glibc_unlikely (mbcset == NULL))
3559
0
    {
3560
0
      re_free (sbcset);
3561
0
      *err = REG_ESPACE;
3562
0
      return NULL;
3563
0
    }
3564
0
  mbcset->non_match = non_match;
3565
3566
  /* We don't care the syntax in this case.  */
3567
0
  ret = build_charclass (trans, sbcset, mbcset, &alloc, class_name, 0);
3568
3569
0
  if (__glibc_unlikely (ret != REG_NOERROR))
3570
0
    {
3571
0
      re_free (sbcset);
3572
0
      free_charset (mbcset);
3573
0
      *err = ret;
3574
0
      return NULL;
3575
0
    }
3576
  /* \w match '_' also.  */
3577
0
  for (; *extra; extra++)
3578
0
    bitset_set (sbcset, *extra);
3579
3580
  /* If it is non-matching list.  */
3581
0
  if (non_match)
3582
0
    bitset_not (sbcset);
3583
3584
  /* Ensure only single byte characters are set.  */
3585
0
  if (dfa->mb_cur_max > 1)
3586
0
    bitset_mask (sbcset, dfa->sb_char);
3587
3588
  /* Build a tree for simple bracket.  */
3589
0
  re_token_t br_token = { .type = SIMPLE_BRACKET, .opr.sbcset = sbcset };
3590
0
  tree = create_token_tree (dfa, NULL, NULL, &br_token);
3591
0
  if (__glibc_unlikely (tree == NULL))
3592
0
    goto build_word_op_espace;
3593
3594
0
  if (dfa->mb_cur_max > 1)
3595
0
    {
3596
0
      bin_tree_t *mbc_tree;
3597
      /* Build a tree for complex bracket.  */
3598
0
      br_token.type = COMPLEX_BRACKET;
3599
0
      br_token.opr.mbcset = mbcset;
3600
0
      dfa->has_mb_node = 1;
3601
0
      mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3602
0
      if (__glibc_unlikely (mbc_tree == NULL))
3603
0
  goto build_word_op_espace;
3604
      /* Then join them by ALT node.  */
3605
0
      tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3606
0
      if (__glibc_likely (mbc_tree != NULL))
3607
0
  return tree;
3608
0
    }
3609
0
  else
3610
0
    {
3611
0
      free_charset (mbcset);
3612
0
      return tree;
3613
0
    }
3614
3615
0
 build_word_op_espace:
3616
0
  re_free (sbcset);
3617
0
  free_charset (mbcset);
3618
0
  *err = REG_ESPACE;
3619
0
  return NULL;
3620
0
}
3621
3622
/* This is intended for the expressions like "a{1,3}".
3623
   Fetch a number from 'input', and return the number.
3624
   Return -1 if the number field is empty like "{,1}".
3625
   Return RE_DUP_MAX + 1 if the number field is too large.
3626
   Return -2 if an error occurred.  */
3627
3628
static Idx
3629
fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3630
0
{
3631
0
  Idx num = -1;
3632
0
  unsigned char c;
3633
0
  while (1)
3634
0
    {
3635
0
      fetch_token (token, input, syntax);
3636
0
      c = token->opr.c;
3637
0
      if (__glibc_unlikely (token->type == END_OF_RE))
3638
0
  return -2;
3639
0
      if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3640
0
  break;
3641
0
      num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3642
0
       ? -2
3643
0
       : num == -1
3644
0
       ? c - '0'
3645
0
       : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3646
0
    }
3647
0
  return num;
3648
0
}
3649

3650
static void
3651
free_charset (re_charset_t *cset)
3652
0
{
3653
0
  re_free (cset->mbchars);
3654
#ifdef _LIBC
3655
  re_free (cset->coll_syms);
3656
  re_free (cset->equiv_classes);
3657
#endif
3658
0
  re_free (cset->range_starts);
3659
0
  re_free (cset->range_ends);
3660
0
  re_free (cset->char_classes);
3661
0
  re_free (cset);
3662
0
}
3663

3664
/* Functions for binary tree operation.  */
3665
3666
/* Create a tree node.  */
3667
3668
static bin_tree_t *
3669
create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3670
       re_token_type_t type)
3671
0
{
3672
0
  re_token_t t = { .type = type };
3673
0
  return create_token_tree (dfa, left, right, &t);
3674
0
}
3675
3676
static bin_tree_t *
3677
create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3678
       const re_token_t *token)
3679
0
{
3680
0
  bin_tree_t *tree;
3681
0
  if (__glibc_unlikely (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE))
3682
0
    {
3683
0
      bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3684
3685
0
      if (storage == NULL)
3686
0
  return NULL;
3687
0
      storage->next = dfa->str_tree_storage;
3688
0
      dfa->str_tree_storage = storage;
3689
0
      dfa->str_tree_storage_idx = 0;
3690
0
    }
3691
0
  tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3692
3693
0
  tree->parent = NULL;
3694
0
  tree->left = left;
3695
0
  tree->right = right;
3696
0
  tree->token = *token;
3697
0
  tree->token.duplicated = 0;
3698
0
  tree->token.opt_subexp = 0;
3699
0
  tree->first = NULL;
3700
0
  tree->next = NULL;
3701
0
  tree->node_idx = -1;
3702
3703
0
  if (left != NULL)
3704
0
    left->parent = tree;
3705
0
  if (right != NULL)
3706
0
    right->parent = tree;
3707
0
  return tree;
3708
0
}
3709
3710
/* Mark the tree SRC as an optional subexpression.
3711
   To be called from preorder or postorder.  */
3712
3713
static reg_errcode_t
3714
mark_opt_subexp (void *extra, bin_tree_t *node)
3715
0
{
3716
0
  Idx idx = (uintptr_t) extra;
3717
0
  if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3718
0
    node->token.opt_subexp = 1;
3719
3720
0
  return REG_NOERROR;
3721
0
}
3722
3723
/* Free the allocated memory inside NODE. */
3724
3725
static void
3726
free_token (re_token_t *node)
3727
0
{
3728
0
  if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3729
0
    free_charset (node->opr.mbcset);
3730
0
  else if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3731
0
    re_free (node->opr.sbcset);
3732
0
}
3733
3734
/* Worker function for tree walking.  Free the allocated memory inside NODE
3735
   and its children. */
3736
3737
static reg_errcode_t
3738
free_tree (void *extra, bin_tree_t *node)
3739
0
{
3740
0
  free_token (&node->token);
3741
0
  return REG_NOERROR;
3742
0
}
3743
3744
3745
/* Duplicate the node SRC, and return new node.  This is a preorder
3746
   visit similar to the one implemented by the generic visitor, but
3747
   we need more infrastructure to maintain two parallel trees --- so,
3748
   it's easier to duplicate.  */
3749
3750
static bin_tree_t *
3751
duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3752
0
{
3753
0
  const bin_tree_t *node;
3754
0
  bin_tree_t *dup_root;
3755
0
  bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3756
3757
0
  for (node = root; ; )
3758
0
    {
3759
      /* Create a new tree and link it back to the current parent.  */
3760
0
      *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3761
0
      if (*p_new == NULL)
3762
0
  return NULL;
3763
0
      (*p_new)->parent = dup_node;
3764
0
      (*p_new)->token.duplicated = 1;
3765
0
      dup_node = *p_new;
3766
3767
      /* Go to the left node, or up and to the right.  */
3768
0
      if (node->left)
3769
0
  {
3770
0
    node = node->left;
3771
0
    p_new = &dup_node->left;
3772
0
  }
3773
0
      else
3774
0
  {
3775
0
    const bin_tree_t *prev = NULL;
3776
0
    while (node->right == prev || node->right == NULL)
3777
0
      {
3778
0
        prev = node;
3779
0
        node = node->parent;
3780
0
        dup_node = dup_node->parent;
3781
0
        if (!node)
3782
0
    return dup_root;
3783
0
      }
3784
0
    node = node->right;
3785
0
    p_new = &dup_node->right;
3786
0
  }
3787
0
    }
3788
0
}