Coverage Report

Created: 2026-06-30 07:27

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/wget/lib/regcomp.c
Line
Count
Source
1
/* Extended regular expression matching and search library.
2
   Copyright (C) 2002-2026 Free Software Foundation, Inc.
3
   This file is part of the GNU C Library.
4
   Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6
   The GNU C Library is free software; you can redistribute it and/or
7
   modify it under the terms of the GNU Lesser General Public
8
   License as published by the Free Software Foundation; either
9
   version 2.1 of the License, or (at your option) any later version.
10
11
   The GNU C Library is distributed in the hope that it will be useful,
12
   but WITHOUT ANY WARRANTY; without even the implied warranty of
13
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
   Lesser General Public License for more details.
15
16
   You should have received a copy of the GNU Lesser General Public
17
   License along with the GNU C Library; if not, see
18
   <https://www.gnu.org/licenses/>.  */
19
20
#ifdef _LIBC
21
# include <locale/weight.h>
22
#endif
23
24
/* The localeinfo-related code fixes glibc bug 20381.
25
   Someday this fix should be merged into glibc.  */
26
#if !defined _LIBC && !defined _REGEX_AVOID_UCHAR_H
27
# include "localeinfo.h"
28
#endif
29
30
static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
31
            size_t length, reg_syntax_t syntax);
32
static void re_compile_fastmap_iter (regex_t *bufp,
33
             const re_dfastate_t *init_state,
34
             char *fastmap);
35
static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
36
static void free_charset (re_charset_t *cset);
37
static void free_workarea_compile (regex_t *preg);
38
static reg_errcode_t create_initial_state (re_dfa_t *dfa);
39
static void optimize_utf8 (re_dfa_t *dfa);
40
static reg_errcode_t analyze (regex_t *preg);
41
static reg_errcode_t preorder (bin_tree_t *root,
42
             reg_errcode_t (fn (void *, bin_tree_t *)),
43
             void *extra);
44
static reg_errcode_t postorder (bin_tree_t *root,
45
        reg_errcode_t (fn (void *, bin_tree_t *)),
46
        void *extra);
47
static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
48
static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
49
static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
50
         bin_tree_t *node);
51
static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
52
static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
53
static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
54
static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
55
static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
56
           unsigned int constraint);
57
static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
58
static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
59
           Idx node, bool root);
60
static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
61
static Idx fetch_number (re_string_t *input, re_token_t *token,
62
       reg_syntax_t syntax);
63
static int peek_token (re_token_t *token, re_string_t *input,
64
      reg_syntax_t syntax);
65
static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
66
        reg_syntax_t syntax, reg_errcode_t *err);
67
static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
68
          re_token_t *token, reg_syntax_t syntax,
69
          Idx nest, reg_errcode_t *err);
70
static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
71
         re_token_t *token, reg_syntax_t syntax,
72
         Idx nest, reg_errcode_t *err);
73
static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
74
             re_token_t *token, reg_syntax_t syntax,
75
             Idx nest, reg_errcode_t *err);
76
static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
77
          re_token_t *token, reg_syntax_t syntax,
78
          Idx nest, reg_errcode_t *err);
79
static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
80
         re_dfa_t *dfa, re_token_t *token,
81
         reg_syntax_t syntax, reg_errcode_t *err);
82
static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
83
              re_token_t *token, reg_syntax_t syntax,
84
              reg_errcode_t *err);
85
static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
86
              re_string_t *regexp,
87
              re_token_t *token, int token_len,
88
              re_dfa_t *dfa,
89
              reg_syntax_t syntax,
90
              bool accept_hyphen);
91
static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
92
            re_string_t *regexp,
93
            re_token_t *token);
94
static reg_errcode_t build_equiv_class (bitset_t sbcset,
95
          re_charset_t *mbcset,
96
          Idx *equiv_class_alloc,
97
          const unsigned char *name);
98
static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
99
              bitset_t sbcset,
100
              re_charset_t *mbcset,
101
              Idx *char_class_alloc,
102
              const char *class_name,
103
              reg_syntax_t syntax);
104
static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
105
               RE_TRANSLATE_TYPE trans,
106
               const char *class_name,
107
               const char *extra,
108
               bool non_match, reg_errcode_t *err);
109
static bin_tree_t *create_tree (re_dfa_t *dfa,
110
        bin_tree_t *left, bin_tree_t *right,
111
        re_token_type_t type);
112
static bin_tree_t *create_token_tree (re_dfa_t *dfa,
113
              bin_tree_t *left, bin_tree_t *right,
114
              const re_token_t *token);
115
static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
116
static void free_token (re_token_t *node);
117
static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
118
static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
119

120
/* This table gives an error message for each of the error codes listed
121
   in regex.h.  Obviously the order here has to be same as there.
122
   POSIX doesn't require that we do anything for REG_NOERROR,
123
   but why not be nice?  */
124
125
static const char __re_error_msgid[] =
126
  {
127
#define REG_NOERROR_IDX 0
128
    gettext_noop ("Success")  /* REG_NOERROR */
129
    "\0"
130
#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
131
    gettext_noop ("No match") /* REG_NOMATCH */
132
    "\0"
133
#define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
134
    gettext_noop ("Invalid regular expression") /* REG_BADPAT */
135
    "\0"
136
#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
137
    gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
138
    "\0"
139
#define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
140
    gettext_noop ("Invalid character class name") /* REG_ECTYPE */
141
    "\0"
142
#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
143
    gettext_noop ("Trailing backslash") /* REG_EESCAPE */
144
    "\0"
145
#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
146
    gettext_noop ("Invalid back reference") /* REG_ESUBREG */
147
    "\0"
148
#define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
149
    gettext_noop ("Unmatched [, [^, [:, [., or [=") /* REG_EBRACK */
150
    "\0"
151
#define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
152
    gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
153
    "\0"
154
#define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
155
    gettext_noop ("Unmatched \\{") /* REG_EBRACE */
156
    "\0"
157
#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
158
    gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
159
    "\0"
160
#define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
161
    gettext_noop ("Invalid range end")  /* REG_ERANGE */
162
    "\0"
163
#define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
164
    gettext_noop ("Memory exhausted") /* REG_ESPACE */
165
    "\0"
166
#define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
167
    gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
168
    "\0"
169
#define REG_EEND_IDX  (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
170
    gettext_noop ("Premature end of regular expression") /* REG_EEND */
171
    "\0"
172
#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
173
    gettext_noop ("Regular expression too big") /* REG_ESIZE */
174
    "\0"
175
#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
176
    gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
177
  };
178
179
static const size_t __re_error_msgid_idx[] =
180
  {
181
    REG_NOERROR_IDX,
182
    REG_NOMATCH_IDX,
183
    REG_BADPAT_IDX,
184
    REG_ECOLLATE_IDX,
185
    REG_ECTYPE_IDX,
186
    REG_EESCAPE_IDX,
187
    REG_ESUBREG_IDX,
188
    REG_EBRACK_IDX,
189
    REG_EPAREN_IDX,
190
    REG_EBRACE_IDX,
191
    REG_BADBR_IDX,
192
    REG_ERANGE_IDX,
193
    REG_ESPACE_IDX,
194
    REG_BADRPT_IDX,
195
    REG_EEND_IDX,
196
    REG_ESIZE_IDX,
197
    REG_ERPAREN_IDX
198
  };
199

200
/* Entry points for GNU code.  */
201
202
/* re_compile_pattern is the GNU regular expression compiler: it
203
   compiles PATTERN (of length LENGTH) and puts the result in BUFP.
204
   Returns 0 if the pattern was valid, otherwise an error string.
205
206
   Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
207
   are set in BUFP on entry.  */
208
209
const char *
210
re_compile_pattern (const char *pattern, size_t length,
211
        struct re_pattern_buffer *bufp)
212
0
{
213
0
  reg_errcode_t ret;
214
215
  /* And GNU code determines whether or not to get register information
216
     by passing null for the REGS argument to re_match, etc., not by
217
     setting no_sub, unless RE_NO_SUB is set.  */
218
0
  bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
219
220
  /* Match anchors at newline.  */
221
0
  bufp->newline_anchor = 1;
222
223
0
  ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
224
225
0
  if (!ret)
226
0
    return NULL;
227
0
  return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
228
0
}
229
weak_alias (__re_compile_pattern, re_compile_pattern)
230
231
/* Set by 're_set_syntax' to the current regexp syntax to recognize.  Can
232
   also be assigned to arbitrarily: each pattern buffer stores its own
233
   syntax, so it can be changed between regex compilations.  */
234
/* This has no initializer because initialized variables in Emacs
235
   become read-only after dumping.  */
236
reg_syntax_t re_syntax_options;
237
238
239
/* Specify the precise syntax of regexps for compilation.  This provides
240
   for compatibility for various utilities which historically have
241
   different, incompatible syntaxes.
242
243
   The argument SYNTAX is a bit mask comprised of the various bits
244
   defined in regex.h.  We return the old syntax.  */
245
246
reg_syntax_t
247
re_set_syntax (reg_syntax_t syntax)
248
0
{
249
0
  reg_syntax_t ret = re_syntax_options;
250
251
0
  re_syntax_options = syntax;
252
0
  return ret;
253
0
}
254
weak_alias (__re_set_syntax, re_set_syntax)
255
256
int
257
re_compile_fastmap (struct re_pattern_buffer *bufp)
258
0
{
259
0
  re_dfa_t *dfa = bufp->buffer;
260
0
  char *fastmap = bufp->fastmap;
261
262
0
  memset (fastmap, '\0', sizeof (char) * SBC_MAX);
263
0
  re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
264
0
  if (dfa->init_state != dfa->init_state_word)
265
0
    re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
266
0
  if (dfa->init_state != dfa->init_state_nl)
267
0
    re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
268
0
  if (dfa->init_state != dfa->init_state_begbuf)
269
0
    re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
270
0
  bufp->fastmap_accurate = 1;
271
0
  return 0;
272
0
}
273
weak_alias (__re_compile_fastmap, re_compile_fastmap)
274
275
static __always_inline void
276
re_set_fastmap (char *fastmap, unsigned char ch)
277
0
{
278
0
  fastmap[ch] = 1;
279
0
}
280
281
/* Record in FASTMAP the initial byte of the representations of all
282
   characters that match WC ignoring case, other than WC itself.
283
   Use MBS as a scratch state.  */
284
285
static void
286
re_set_fastmap_icase (char *fastmap, wchar_t wc, mbstate_t *mbs)
287
0
{
288
#if defined _LIBC || defined _REGEX_AVOID_UCHAR_H
289
  wchar_t folded[1] = {__towlower (wc)};
290
  int nfolded = folded[0] != wc;
291
#else
292
0
  wchar_t folded[CASE_FOLDED_BUFSIZE];
293
0
  int nfolded = case_folded_counterparts (wc, folded);
294
0
#endif
295
0
  for (int i = 0; i < nfolded; i++)
296
0
    {
297
0
      char buf[MB_LEN_MAX];
298
0
      if (__wcrtomb (buf, folded[i], mbs) != (size_t) -1)
299
0
  re_set_fastmap (fastmap, buf[0]);
300
0
    }
301
0
}
302
303
/* Helper function for re_compile_fastmap.
304
   Compile fastmap for the initial_state INIT_STATE.  */
305
306
static void
307
re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
308
       char *fastmap)
309
0
{
310
0
  re_dfa_t *dfa = bufp->buffer;
311
0
  Idx node_cnt;
312
0
  for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
313
0
    {
314
0
      Idx node = init_state->nodes.elems[node_cnt];
315
0
      re_token_type_t type = dfa->nodes[node].type;
316
317
0
      if (type == CHARACTER)
318
0
  {
319
0
    re_set_fastmap (fastmap, dfa->nodes[node].opr.c);
320
0
    if (bufp->syntax & RE_ICASE)
321
0
      {
322
0
        unsigned char buf[MB_LEN_MAX];
323
0
        unsigned char *p;
324
0
        wchar_t wc;
325
0
        mbstate_t state;
326
327
0
        p = buf;
328
0
        *p++ = dfa->nodes[node].opr.c;
329
0
        while (++node < dfa->nodes_len
330
0
         && dfa->nodes[node].type == CHARACTER
331
0
         && dfa->nodes[node].mb_partial)
332
0
    *p++ = dfa->nodes[node].opr.c;
333
0
        memset (&state, '\0', sizeof (state));
334
0
        if (__mbrtowc (&wc, (const char *) buf, p - buf,
335
0
           &state) == p - buf)
336
0
    re_set_fastmap_icase (fastmap, wc, &state);
337
0
      }
338
0
  }
339
0
      else if (type == SIMPLE_BRACKET)
340
0
  {
341
0
    int i, ch;
342
0
    for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
343
0
      {
344
0
        int j;
345
0
        bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
346
0
        for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
347
0
    if (w & ((bitset_word_t) 1 << j))
348
0
      re_set_fastmap (fastmap, ch);
349
0
      }
350
0
  }
351
0
      else if (type == COMPLEX_BRACKET)
352
0
  {
353
0
    re_charset_t *cset = dfa->nodes[node].opr.mbcset;
354
0
    Idx i;
355
356
#ifdef _LIBC
357
    /* See if we have to try all bytes which start multiple collation
358
       elements.
359
       e.g. In da_DK, we want to catch 'a' since "aa" is a valid
360
      collation element, and don't catch 'b' since 'b' is
361
      the only collation element which starts from 'b' (and
362
      it is caught by SIMPLE_BRACKET).  */
363
        if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
364
      && (cset->ncoll_syms || cset->nranges))
365
    {
366
      const int32_t *table = (const int32_t *)
367
        _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
368
      for (i = 0; i < SBC_MAX; ++i)
369
        if (table[i] < 0)
370
          re_set_fastmap (fastmap, i);
371
    }
372
#endif /* _LIBC */
373
374
    /* See if we have to start the match at all multibyte characters,
375
       i.e. where we would not find an invalid sequence.  This only
376
       applies to multibyte character sets; for single byte character
377
       sets, the SIMPLE_BRACKET again suffices.  */
378
0
    if (dfa->mb_cur_max > 1
379
0
        && (cset->nchar_classes || cset->non_match || cset->nranges
380
#ifdef _LIBC
381
      || cset->nequiv_classes
382
#endif /* _LIBC */
383
0
     ))
384
0
      {
385
0
        unsigned char c = 0;
386
0
        do
387
0
    {
388
0
      mbstate_t mbs;
389
0
      memset (&mbs, 0, sizeof (mbs));
390
0
      if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
391
0
        re_set_fastmap (fastmap, c);
392
0
    }
393
0
        while (++c != 0);
394
0
      }
395
396
0
    else
397
0
      {
398
        /* ... Else catch all bytes which can start the mbchars.  */
399
0
        for (i = 0; i < cset->nmbchars; ++i)
400
0
    {
401
0
      char buf[MB_LEN_MAX];
402
0
      mbstate_t state;
403
0
      memset (&state, '\0', sizeof (state));
404
0
      if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
405
0
        re_set_fastmap (fastmap, buf[0]);
406
0
      if (bufp->syntax & RE_ICASE)
407
0
        re_set_fastmap_icase (fastmap, cset->mbchars[i], &state);
408
0
    }
409
0
      }
410
0
  }
411
0
      else if (type == OP_PERIOD || type == OP_UTF8_PERIOD || type == END_OF_RE)
412
0
  {
413
0
    memset (fastmap, '\1', sizeof (char) * SBC_MAX);
414
0
    if (type == END_OF_RE)
415
0
      bufp->can_be_null = 1;
416
0
    return;
417
0
  }
418
0
    }
419
0
}
420

421
/* Entry point for POSIX code.  */
422
/* regcomp takes a regular expression as a string and compiles it.
423
424
   PREG is a regex_t *.  We do not expect any fields to be initialized,
425
   since POSIX says we shouldn't.  Thus, we set
426
427
     'buffer' to the compiled pattern;
428
     'used' to the length of the compiled pattern;
429
     'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
430
       REG_EXTENDED bit in CFLAGS is set; otherwise, to
431
       RE_SYNTAX_POSIX_BASIC;
432
     'newline_anchor' to REG_NEWLINE being set in CFLAGS;
433
     'fastmap' to an allocated space for the fastmap;
434
     'fastmap_accurate' to zero;
435
     're_nsub' to the number of subexpressions in PATTERN.
436
437
   PATTERN is the address of the pattern string.
438
439
   CFLAGS is a series of bits which affect compilation.
440
441
     If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
442
     use POSIX basic syntax.
443
444
     If REG_NEWLINE is set, then . and [^...] don't match newline.
445
     Also, regexec will try a match beginning after every newline.
446
447
     If REG_ICASE is set, then we considers upper- and lowercase
448
     versions of letters to be equivalent when matching.
449
450
     If REG_NOSUB is set, then when PREG is passed to regexec, that
451
     routine will report only success or failure, and nothing about the
452
     registers.
453
454
   It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
455
   the return codes and their meanings.)  */
456
457
int
458
regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
459
0
{
460
0
  reg_errcode_t ret;
461
0
  reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
462
0
       : RE_SYNTAX_POSIX_BASIC);
463
464
0
  preg->buffer = NULL;
465
0
  preg->allocated = 0;
466
0
  preg->used = 0;
467
468
  /* Try to allocate space for the fastmap.  */
469
0
  preg->fastmap = re_malloc (char, SBC_MAX);
470
0
  if (__glibc_unlikely (preg->fastmap == NULL))
471
0
    return REG_ESPACE;
472
473
0
  syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
474
475
  /* If REG_NEWLINE is set, newlines are treated differently.  */
476
0
  if (cflags & REG_NEWLINE)
477
0
    { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
478
0
      syntax &= ~RE_DOT_NEWLINE;
479
0
      syntax |= RE_HAT_LISTS_NOT_NEWLINE;
480
      /* It also changes the matching behavior.  */
481
0
      preg->newline_anchor = 1;
482
0
    }
483
0
  else
484
0
    preg->newline_anchor = 0;
485
0
  preg->no_sub = !!(cflags & REG_NOSUB);
486
0
  preg->translate = NULL;
487
488
0
  ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
489
490
  /* POSIX doesn't distinguish between an unmatched open-group and an
491
     unmatched close-group: both are REG_EPAREN.  */
492
0
  if (ret == REG_ERPAREN)
493
0
    ret = REG_EPAREN;
494
495
  /* We have already checked preg->fastmap != NULL.  */
496
0
  if (__glibc_likely (ret == REG_NOERROR))
497
    /* Compute the fastmap now, since regexec cannot modify the pattern
498
       buffer.  This function never fails in this implementation.  */
499
0
    (void) re_compile_fastmap (preg);
500
0
  else
501
0
    {
502
      /* Some error occurred while compiling the expression.  */
503
0
      re_free (preg->fastmap);
504
0
      preg->fastmap = NULL;
505
0
    }
506
507
0
  return (int) ret;
508
0
}
509
libc_hidden_def (__regcomp)
510
weak_alias (__regcomp, regcomp)
511
512
/* Returns a message corresponding to an error code, ERRCODE, returned
513
   from either regcomp or regexec.   We don't use PREG here.  */
514
515
size_t
516
regerror (int errcode, const regex_t *__restrict preg, char *__restrict errbuf,
517
    size_t errbuf_size)
518
0
{
519
0
  const char *msg;
520
0
  size_t msg_size;
521
0
  int nerrcodes = countof (__re_error_msgid_idx);
522
523
0
  if (__glibc_unlikely (errcode < 0 || errcode >= nerrcodes))
524
    /* Only error codes returned by the rest of the code should be passed
525
       to this routine.  If we are given anything else, or if other regex
526
       code generates an invalid error code, then the program has a bug.
527
       Dump core so we can fix it.  */
528
0
    abort ();
529
530
0
  msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
531
532
0
  msg_size = strlen (msg) + 1; /* Includes the null.  */
533
534
0
  if (__glibc_likely (errbuf_size != 0))
535
0
    {
536
0
      size_t cpy_size = msg_size;
537
0
      if (__glibc_unlikely (msg_size > errbuf_size))
538
0
  {
539
0
    cpy_size = errbuf_size - 1;
540
0
    errbuf[cpy_size] = '\0';
541
0
  }
542
0
      memcpy (errbuf, msg, cpy_size);
543
0
    }
544
545
0
  return msg_size;
546
0
}
547
weak_alias (__regerror, regerror)
548
549
550
/* This static array is used for the map to single-byte characters when
551
   UTF-8 is used.  Otherwise we would allocate memory just to initialize
552
   it the same all the time.  UTF-8 is the preferred encoding so this is
553
   a worthwhile optimization.  */
554
static const bitset_t utf8_sb_map =
555
{
556
  /* Set the first 128 bits.  */
557
#if (defined __GNUC__ || __clang_major__ >= 4) && !defined __STRICT_ANSI__
558
  [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
559
#else
560
# if 4 * BITSET_WORD_BITS < ASCII_CHARS
561
#  error "bitset_word_t is narrower than 32 bits"
562
# elif 3 * BITSET_WORD_BITS < ASCII_CHARS
563
  BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
564
# elif 2 * BITSET_WORD_BITS < ASCII_CHARS
565
  BITSET_WORD_MAX, BITSET_WORD_MAX,
566
# elif 1 * BITSET_WORD_BITS < ASCII_CHARS
567
  BITSET_WORD_MAX,
568
# endif
569
  (BITSET_WORD_MAX
570
   >> (SBC_MAX % BITSET_WORD_BITS == 0
571
       ? 0
572
       : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
573
#endif
574
};
575
576
577
static void
578
free_dfa_content (re_dfa_t *dfa)
579
0
{
580
0
  Idx i, j;
581
582
0
  if (dfa->nodes)
583
0
    for (i = 0; i < dfa->nodes_len; ++i)
584
0
      free_token (dfa->nodes + i);
585
0
  re_free (dfa->nexts);
586
0
  for (i = 0; i < dfa->nodes_len; ++i)
587
0
    {
588
0
      if (dfa->eclosures != NULL)
589
0
  re_node_set_free (dfa->eclosures + i);
590
0
      if (dfa->inveclosures != NULL)
591
0
  re_node_set_free (dfa->inveclosures + i);
592
0
      if (dfa->edests != NULL)
593
0
  re_node_set_free (dfa->edests + i);
594
0
    }
595
0
  re_free (dfa->edests);
596
0
  re_free (dfa->eclosures);
597
0
  re_free (dfa->inveclosures);
598
0
  re_free (dfa->nodes);
599
600
0
  if (dfa->state_table)
601
0
    for (i = 0; i <= dfa->state_hash_mask; ++i)
602
0
      {
603
0
  struct re_state_table_entry *entry = dfa->state_table + i;
604
0
  for (j = 0; j < entry->num; ++j)
605
0
    {
606
0
      re_dfastate_t *state = entry->array[j];
607
0
      free_state (state);
608
0
    }
609
0
  re_free (entry->array);
610
0
      }
611
0
  re_free (dfa->state_table);
612
0
  if (dfa->sb_char != utf8_sb_map)
613
0
    re_free (dfa->sb_char);
614
0
  re_free (dfa->subexp_map);
615
#ifdef DEBUG
616
  re_free (dfa->re_str);
617
#endif
618
619
0
  re_free (dfa);
620
0
}
621
622
623
/* Free dynamically allocated space used by PREG.  */
624
625
void
626
regfree (regex_t *preg)
627
0
{
628
0
  re_dfa_t *dfa = preg->buffer;
629
0
  if (__glibc_likely (dfa != NULL))
630
0
    {
631
0
      lock_fini (dfa->lock);
632
0
      free_dfa_content (dfa);
633
0
    }
634
0
  preg->buffer = NULL;
635
0
  preg->allocated = 0;
636
637
0
  re_free (preg->fastmap);
638
0
  preg->fastmap = NULL;
639
640
0
  re_free (preg->translate);
641
0
  preg->translate = NULL;
642
0
}
643
libc_hidden_def (__regfree)
644
weak_alias (__regfree, regfree)
645

646
/* Entry points compatible with 4.2 BSD regex library.  We don't define
647
   them unless specifically requested.  */
648
649
#if defined _REGEX_RE_COMP || defined _LIBC
650
651
/* BSD has one and only one pattern buffer.  */
652
static struct re_pattern_buffer re_comp_buf;
653
654
char *
655
# ifdef _LIBC
656
/* Make these definitions weak in libc, so POSIX programs can redefine
657
   these names if they don't use our functions, and still use
658
   regcomp/regexec above without link errors.  */
659
weak_function
660
# endif
661
re_comp (const char *s)
662
{
663
  reg_errcode_t ret;
664
  char *fastmap;
665
666
  if (!s)
667
    {
668
      if (!re_comp_buf.buffer)
669
  return gettext ("No previous regular expression");
670
      return 0;
671
    }
672
673
  if (re_comp_buf.buffer)
674
    {
675
      fastmap = re_comp_buf.fastmap;
676
      re_comp_buf.fastmap = NULL;
677
      __regfree (&re_comp_buf);
678
      memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
679
      re_comp_buf.fastmap = fastmap;
680
    }
681
682
  if (re_comp_buf.fastmap == NULL)
683
    {
684
      re_comp_buf.fastmap = re_malloc (char, SBC_MAX);
685
      if (re_comp_buf.fastmap == NULL)
686
  return (char *) gettext (__re_error_msgid
687
         + __re_error_msgid_idx[(int) REG_ESPACE]);
688
    }
689
690
  /* Since 're_exec' always passes NULL for the 'regs' argument, we
691
     don't need to initialize the pattern buffer fields which affect it.  */
692
693
  /* Match anchors at newlines.  */
694
  re_comp_buf.newline_anchor = 1;
695
696
  ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
697
698
  if (!ret)
699
    return NULL;
700
701
  /* Yes, we're discarding 'const' here if !HAVE_LIBINTL.  */
702
  return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
703
}
704
705
#ifdef _LIBC
706
libc_freeres_fn (free_mem)
707
{
708
  __regfree (&re_comp_buf);
709
}
710
#endif
711
712
#endif /* _REGEX_RE_COMP */
713

714
/* Internal entry point.
715
   Compile the regular expression PATTERN, whose length is LENGTH.
716
   SYNTAX indicate regular expression's syntax.  */
717
718
static reg_errcode_t
719
re_compile_internal (regex_t *preg, const char * pattern, size_t length,
720
         reg_syntax_t syntax)
721
0
{
722
0
  reg_errcode_t err = REG_NOERROR;
723
0
  re_dfa_t *dfa;
724
0
  re_string_t regexp;
725
726
  /* Initialize the pattern buffer.  */
727
0
  preg->fastmap_accurate = 0;
728
0
  preg->syntax = syntax;
729
0
  preg->not_bol = preg->not_eol = 0;
730
0
  preg->used = 0;
731
0
  preg->re_nsub = 0;
732
0
  preg->can_be_null = 0;
733
0
  preg->regs_allocated = REGS_UNALLOCATED;
734
735
  /* Initialize the dfa.  */
736
0
  dfa = preg->buffer;
737
0
  if (__glibc_unlikely (preg->allocated < sizeof (re_dfa_t)))
738
0
    {
739
      /* If zero allocated, but buffer is non-null, try to realloc
740
   enough space.  This loses if buffer's address is bogus, but
741
   that is the user's responsibility.  If ->buffer is NULL this
742
   is a simple allocation.  */
743
0
      dfa = re_realloc (preg->buffer, re_dfa_t, 1);
744
0
      if (dfa == NULL)
745
0
  return REG_ESPACE;
746
0
      preg->allocated = sizeof (re_dfa_t);
747
0
      preg->buffer = dfa;
748
0
    }
749
0
  preg->used = sizeof (re_dfa_t);
750
751
0
  err = init_dfa (dfa, length);
752
0
  if (__glibc_unlikely (err == REG_NOERROR && lock_init (dfa->lock) != 0))
753
0
    err = REG_ESPACE;
754
0
  if (__glibc_unlikely (err != REG_NOERROR))
755
0
    {
756
0
      free_dfa_content (dfa);
757
0
      preg->buffer = NULL;
758
0
      preg->allocated = 0;
759
0
      return err;
760
0
    }
761
#ifdef DEBUG
762
  /* Note: length+1 will not overflow since it is checked in init_dfa.  */
763
  dfa->re_str = re_malloc (char, length + 1);
764
  strncpy (dfa->re_str, pattern, length + 1);
765
#endif
766
767
0
  err = re_string_construct (&regexp, pattern, length, preg->translate,
768
0
           (syntax & RE_ICASE) != 0, dfa);
769
0
  if (__glibc_unlikely (err != REG_NOERROR))
770
0
    {
771
0
    re_compile_internal_free_return:
772
0
      free_workarea_compile (preg);
773
0
      re_string_destruct (&regexp);
774
0
      lock_fini (dfa->lock);
775
0
      free_dfa_content (dfa);
776
0
      preg->buffer = NULL;
777
0
      preg->allocated = 0;
778
0
      return err;
779
0
    }
780
781
  /* Parse the regular expression, and build a structure tree.  */
782
0
  preg->re_nsub = 0;
783
0
  dfa->str_tree = parse (&regexp, preg, syntax, &err);
784
0
  if (__glibc_unlikely (dfa->str_tree == NULL))
785
0
    goto re_compile_internal_free_return;
786
787
  /* Analyze the tree and create the nfa.  */
788
0
  err = analyze (preg);
789
0
  if (__glibc_unlikely (err != REG_NOERROR))
790
0
    goto re_compile_internal_free_return;
791
792
  /* If possible, do searching in single byte encoding to speed things up.  */
793
0
  if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
794
0
    optimize_utf8 (dfa);
795
796
  /* Then create the initial state of the dfa.  */
797
0
  err = create_initial_state (dfa);
798
799
  /* Release work areas.  */
800
0
  free_workarea_compile (preg);
801
0
  re_string_destruct (&regexp);
802
803
0
  if (__glibc_unlikely (err != REG_NOERROR))
804
0
    {
805
0
      lock_fini (dfa->lock);
806
0
      free_dfa_content (dfa);
807
0
      preg->buffer = NULL;
808
0
      preg->allocated = 0;
809
0
    }
810
811
0
  return err;
812
0
}
813
814
/* Initialize DFA.  We use the length of the regular expression PAT_LEN
815
   as the initial length of some arrays.  */
816
817
static reg_errcode_t
818
init_dfa (re_dfa_t *dfa, size_t pat_len)
819
0
{
820
0
  __re_size_t table_size;
821
0
#ifndef _LIBC
822
0
  const char *codeset_name;
823
0
#endif
824
0
  size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
825
0
  size_t max_object_size =
826
0
    MAX (sizeof (struct re_state_table_entry),
827
0
   MAX (sizeof (re_token_t),
828
0
        MAX (sizeof (re_node_set),
829
0
       MAX (sizeof (regmatch_t),
830
0
      max_i18n_object_size))));
831
832
0
  memset (dfa, '\0', sizeof (re_dfa_t));
833
834
  /* Force allocation of str_tree_storage the first time.  */
835
0
  dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
836
837
  /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
838
     calculation below, and for similar doubling calculations
839
     elsewhere.  And it's <= rather than <, because some of the
840
     doubling calculations add 1 afterwards.  */
841
0
  if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2
842
0
      <= pat_len))
843
0
    return REG_ESPACE;
844
845
0
  dfa->nodes_alloc = pat_len + 1;
846
0
  dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
847
848
  /*  table_size = 2 ^ ceil(log pat_len) */
849
0
  for (table_size = 1; ; table_size <<= 1)
850
0
    if (table_size > pat_len)
851
0
      break;
852
853
0
  dfa->state_table = calloc (table_size, sizeof (struct re_state_table_entry));
854
0
  dfa->state_hash_mask = table_size - 1;
855
856
0
  dfa->mb_cur_max = MB_CUR_MAX;
857
#ifdef _LIBC
858
  if (dfa->mb_cur_max == 6
859
      && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
860
    dfa->is_utf8 = 1;
861
  dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
862
           != 0);
863
#else
864
0
  codeset_name = nl_langinfo (CODESET);
865
0
  if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
866
0
      && (codeset_name[1] == 'T' || codeset_name[1] == 't')
867
0
      && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
868
0
      && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
869
0
    dfa->is_utf8 = 1;
870
871
  /* We check exhaustively in the loop below if this charset is a
872
     superset of ASCII.  */
873
0
  dfa->map_notascii = 0;
874
0
#endif
875
876
0
  if (dfa->mb_cur_max > 1)
877
0
    {
878
0
      if (dfa->is_utf8)
879
0
  dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
880
0
      else
881
0
  {
882
0
    int i, j, ch;
883
884
0
    dfa->sb_char = (re_bitset_ptr_t) calloc (1, sizeof (bitset_t));
885
0
    if (__glibc_unlikely (dfa->sb_char == NULL))
886
0
      return REG_ESPACE;
887
888
    /* Set the bits corresponding to single byte chars.  */
889
0
    for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
890
0
      for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
891
0
        {
892
0
    wint_t wch = __btowc (ch);
893
0
    if (wch != WEOF)
894
0
      dfa->sb_char[i] |= (bitset_word_t) 1 << j;
895
0
#ifndef _LIBC
896
0
    if (isascii (ch) && wch != ch)
897
0
      dfa->map_notascii = 1;
898
0
#endif
899
0
        }
900
0
  }
901
0
    }
902
903
0
  if (__glibc_unlikely (dfa->nodes == NULL || dfa->state_table == NULL))
904
0
    return REG_ESPACE;
905
0
  return REG_NOERROR;
906
0
}
907
908
/* Initialize WORD_CHAR table, which indicate which character is
909
   "word".  In this case "word" means that it is the word construction
910
   character used by some operators like "\<", "\>", etc.  */
911
912
static void
913
init_word_char (re_dfa_t *dfa)
914
0
{
915
0
  int i = 0;
916
0
  int j;
917
0
  int ch = 0;
918
0
  dfa->word_ops_used = 1;
919
0
  if (__glibc_likely (dfa->map_notascii == 0))
920
0
    {
921
0
      bitset_word_t bits0 = 0x00000000;
922
0
      bitset_word_t bits1 = 0x03ff0000;
923
0
      bitset_word_t bits2 = 0x87fffffe;
924
0
      bitset_word_t bits3 = 0x07fffffe;
925
0
      if (BITSET_WORD_BITS == 64)
926
0
  {
927
    /* Pacify gcc -Woverflow on 32-bit platforms.  */
928
0
    dfa->word_char[0] = bits1 << 31 << 1 | bits0;
929
0
    dfa->word_char[1] = bits3 << 31 << 1 | bits2;
930
0
    i = 2;
931
0
  }
932
0
      else if (BITSET_WORD_BITS == 32)
933
0
  {
934
0
    dfa->word_char[0] = bits0;
935
0
    dfa->word_char[1] = bits1;
936
0
    dfa->word_char[2] = bits2;
937
0
    dfa->word_char[3] = bits3;
938
0
    i = 4;
939
0
  }
940
0
      else
941
0
        goto general_case;
942
0
      ch = 128;
943
944
0
      if (__glibc_likely (dfa->is_utf8))
945
0
  {
946
0
    memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
947
0
    return;
948
0
  }
949
0
    }
950
951
0
 general_case:
952
0
  for (; i < BITSET_WORDS; ++i)
953
0
    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
954
0
      if (isalnum (ch) || ch == '_')
955
0
  dfa->word_char[i] |= (bitset_word_t) 1 << j;
956
0
}
957
958
/* Free the work area which are only used while compiling.  */
959
960
static void
961
free_workarea_compile (regex_t *preg)
962
0
{
963
0
  re_dfa_t *dfa = preg->buffer;
964
0
  bin_tree_storage_t *storage, *next;
965
0
  for (storage = dfa->str_tree_storage; storage; storage = next)
966
0
    {
967
0
      next = storage->next;
968
0
      re_free (storage);
969
0
    }
970
0
  dfa->str_tree_storage = NULL;
971
0
  dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
972
0
  dfa->str_tree = NULL;
973
0
  re_free (dfa->org_indices);
974
0
  dfa->org_indices = NULL;
975
0
}
976
977
/* Create initial states for all contexts.  */
978
979
static reg_errcode_t
980
create_initial_state (re_dfa_t *dfa)
981
0
{
982
0
  Idx first, i;
983
0
  reg_errcode_t err;
984
0
  re_node_set init_nodes;
985
986
  /* Initial states have the epsilon closure of the node which is
987
     the first node of the regular expression.  */
988
0
  first = dfa->str_tree->first->node_idx;
989
0
  dfa->init_node = first;
990
0
  err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
991
0
  if (__glibc_unlikely (err != REG_NOERROR))
992
0
    return err;
993
994
  /* The back-references which are in initial states can epsilon transit,
995
     since in this case all of the subexpressions can be null.
996
     Then we add epsilon closures of the nodes which are the next nodes of
997
     the back-references.  */
998
0
  if (dfa->nbackref > 0)
999
0
    for (i = 0; i < init_nodes.nelem; ++i)
1000
0
      {
1001
0
  Idx node_idx = init_nodes.elems[i];
1002
0
  re_token_type_t type = dfa->nodes[node_idx].type;
1003
1004
0
  Idx clexp_idx;
1005
0
  if (type != OP_BACK_REF)
1006
0
    continue;
1007
0
  for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1008
0
    {
1009
0
      re_token_t *clexp_node;
1010
0
      clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1011
0
      if (clexp_node->type == OP_CLOSE_SUBEXP
1012
0
    && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1013
0
        break;
1014
0
    }
1015
0
  if (clexp_idx == init_nodes.nelem)
1016
0
    continue;
1017
1018
0
  if (type == OP_BACK_REF)
1019
0
    {
1020
0
      Idx dest_idx = dfa->edests[node_idx].elems[0];
1021
0
      if (!re_node_set_contains (&init_nodes, dest_idx))
1022
0
        {
1023
0
    err
1024
0
                  = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1025
0
    if (err != REG_NOERROR)
1026
0
      break;
1027
0
    i = 0;
1028
0
        }
1029
0
    }
1030
0
      }
1031
1032
  /* It must be the first time to invoke acquire_state.  */
1033
0
  dfa->init_state
1034
0
    = (err == REG_NOERROR
1035
0
       ? re_acquire_state_context (&err, dfa, &init_nodes, 0)
1036
0
       : NULL);
1037
0
  if (__glibc_unlikely (dfa->init_state == NULL))
1038
0
    {
1039
      /* Don't check ERR here, as the initial state must not be null.  */
1040
0
    }
1041
0
  else if (dfa->init_state->has_constraint)
1042
0
    {
1043
0
      dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1044
0
                   CONTEXT_WORD);
1045
0
      dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1046
0
                 CONTEXT_NEWLINE);
1047
0
      dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1048
0
               &init_nodes,
1049
0
               CONTEXT_NEWLINE
1050
0
               | CONTEXT_BEGBUF);
1051
0
    }
1052
0
  else
1053
0
    dfa->init_state_word = dfa->init_state_nl
1054
0
      = dfa->init_state_begbuf = dfa->init_state;
1055
1056
0
  re_node_set_free (&init_nodes);
1057
0
  return err;
1058
0
}
1059

1060
/* If it is possible to do searching in single byte encoding instead of UTF-8
1061
   to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1062
   DFA nodes where needed.  */
1063
1064
static void
1065
optimize_utf8 (re_dfa_t *dfa)
1066
0
{
1067
0
  Idx node;
1068
0
  int i;
1069
0
  bool mb_chars = false;
1070
0
  bool has_period = false;
1071
1072
0
  for (node = 0; node < dfa->nodes_len; ++node)
1073
0
    switch (dfa->nodes[node].type)
1074
0
      {
1075
0
      case CHARACTER:
1076
0
  if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1077
0
    mb_chars = true;
1078
0
  break;
1079
0
      case ANCHOR:
1080
0
  switch (dfa->nodes[node].opr.ctx_type)
1081
0
    {
1082
0
    case LINE_FIRST:
1083
0
    case LINE_LAST:
1084
0
    case BUF_FIRST:
1085
0
    case BUF_LAST:
1086
0
      break;
1087
0
    default:
1088
      /* Word anchors etc. cannot be handled.  It's okay to test
1089
         opr.ctx_type since constraints (for all DFA nodes) are
1090
         created by ORing one or more opr.ctx_type values.  */
1091
0
      return;
1092
0
    }
1093
0
  break;
1094
0
      case OP_PERIOD:
1095
0
  has_period = true;
1096
0
  break;
1097
0
      case OP_BACK_REF:
1098
0
      case OP_ALT:
1099
0
      case END_OF_RE:
1100
0
      case OP_DUP_ASTERISK:
1101
0
      case OP_OPEN_SUBEXP:
1102
0
      case OP_CLOSE_SUBEXP:
1103
0
  break;
1104
0
      case COMPLEX_BRACKET:
1105
0
  return;
1106
0
      case SIMPLE_BRACKET:
1107
  /* Just double check.  */
1108
0
  {
1109
0
    int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1110
0
      ? 0
1111
0
      : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1112
0
    for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1113
0
      {
1114
0
        if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1115
0
    return;
1116
0
        rshift = 0;
1117
0
      }
1118
0
  }
1119
0
  break;
1120
0
      default:
1121
0
  abort ();
1122
0
      }
1123
1124
0
  if (mb_chars || has_period)
1125
0
    for (node = 0; node < dfa->nodes_len; ++node)
1126
0
      {
1127
0
  if (dfa->nodes[node].type == CHARACTER
1128
0
      && dfa->nodes[node].opr.c >= ASCII_CHARS)
1129
0
    dfa->nodes[node].mb_partial = 0;
1130
0
  else if (dfa->nodes[node].type == OP_PERIOD)
1131
0
    dfa->nodes[node].type = OP_UTF8_PERIOD;
1132
0
      }
1133
1134
  /* The search can be in single byte locale.  */
1135
0
  dfa->mb_cur_max = 1;
1136
0
  dfa->is_utf8 = 0;
1137
0
  dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1138
0
}
1139

1140
/* Analyze the structure tree, and calculate "first", "next", "edest",
1141
   "eclosure", and "inveclosure".  */
1142
1143
static reg_errcode_t
1144
analyze (regex_t *preg)
1145
0
{
1146
0
  re_dfa_t *dfa = preg->buffer;
1147
0
  reg_errcode_t ret;
1148
1149
  /* Allocate arrays.  */
1150
0
  dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1151
0
  dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1152
0
  dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1153
0
  dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1154
0
  if (__glibc_unlikely (dfa->nexts == NULL || dfa->org_indices == NULL
1155
0
      || dfa->edests == NULL || dfa->eclosures == NULL))
1156
0
    return REG_ESPACE;
1157
1158
0
  dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1159
0
  if (dfa->subexp_map != NULL)
1160
0
    {
1161
0
      Idx i;
1162
0
      for (i = 0; i < preg->re_nsub; i++)
1163
0
  dfa->subexp_map[i] = i;
1164
0
      preorder (dfa->str_tree, optimize_subexps, dfa);
1165
0
      for (i = 0; i < preg->re_nsub; i++)
1166
0
  if (dfa->subexp_map[i] != i)
1167
0
    break;
1168
0
      if (i == preg->re_nsub)
1169
0
  {
1170
0
    re_free (dfa->subexp_map);
1171
0
    dfa->subexp_map = NULL;
1172
0
  }
1173
0
    }
1174
1175
0
  ret = postorder (dfa->str_tree, lower_subexps, preg);
1176
0
  if (__glibc_unlikely (ret != REG_NOERROR))
1177
0
    return ret;
1178
0
  ret = postorder (dfa->str_tree, calc_first, dfa);
1179
0
  if (__glibc_unlikely (ret != REG_NOERROR))
1180
0
    return ret;
1181
0
  preorder (dfa->str_tree, calc_next, dfa);
1182
0
  ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1183
0
  if (__glibc_unlikely (ret != REG_NOERROR))
1184
0
    return ret;
1185
0
  ret = calc_eclosure (dfa);
1186
0
  if (__glibc_unlikely (ret != REG_NOERROR))
1187
0
    return ret;
1188
1189
  /* We only need this during the prune_impossible_nodes pass in regexec.c;
1190
     skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1191
0
  if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1192
0
      || dfa->nbackref)
1193
0
    {
1194
0
      dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1195
0
      if (__glibc_unlikely (dfa->inveclosures == NULL))
1196
0
  return REG_ESPACE;
1197
0
      ret = calc_inveclosure (dfa);
1198
0
    }
1199
1200
0
  return ret;
1201
0
}
1202
1203
/* Our parse trees are very unbalanced, so we cannot use a stack to
1204
   implement parse tree visits.  Instead, we use parent pointers and
1205
   some hairy code in these two functions.  */
1206
static reg_errcode_t
1207
postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1208
     void *extra)
1209
0
{
1210
0
  bin_tree_t *node, *prev;
1211
1212
0
  for (node = root; ; )
1213
0
    {
1214
      /* Descend down the tree, preferably to the left (or to the right
1215
   if that's the only child).  */
1216
0
      while (node->left || node->right)
1217
0
  if (node->left)
1218
0
    node = node->left;
1219
0
  else
1220
0
    node = node->right;
1221
1222
0
      do
1223
0
  {
1224
0
    reg_errcode_t err = fn (extra, node);
1225
0
    if (__glibc_unlikely (err != REG_NOERROR))
1226
0
      return err;
1227
0
    if (node->parent == NULL)
1228
0
      return REG_NOERROR;
1229
0
    prev = node;
1230
0
    node = node->parent;
1231
0
  }
1232
      /* Go up while we have a node that is reached from the right.  */
1233
0
      while (node->right == prev || node->right == NULL);
1234
0
      node = node->right;
1235
0
    }
1236
0
}
1237
1238
static reg_errcode_t
1239
preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1240
    void *extra)
1241
0
{
1242
0
  bin_tree_t *node;
1243
1244
0
  for (node = root; ; )
1245
0
    {
1246
0
      reg_errcode_t err = fn (extra, node);
1247
0
      if (__glibc_unlikely (err != REG_NOERROR))
1248
0
  return err;
1249
1250
      /* Go to the left node, or up and to the right.  */
1251
0
      if (node->left)
1252
0
  node = node->left;
1253
0
      else
1254
0
  {
1255
0
    bin_tree_t *prev = NULL;
1256
0
    while (node->right == prev || node->right == NULL)
1257
0
      {
1258
0
        prev = node;
1259
0
        node = node->parent;
1260
0
        if (!node)
1261
0
    return REG_NOERROR;
1262
0
      }
1263
0
    node = node->right;
1264
0
  }
1265
0
    }
1266
0
}
1267
1268
/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1269
   re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1270
   backreferences as well.  Requires a preorder visit.  */
1271
static reg_errcode_t
1272
optimize_subexps (void *extra, bin_tree_t *node)
1273
0
{
1274
0
  re_dfa_t *dfa = (re_dfa_t *) extra;
1275
1276
0
  if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1277
0
    {
1278
0
      int idx = node->token.opr.idx;
1279
0
      node->token.opr.idx = dfa->subexp_map[idx];
1280
0
      dfa->used_bkref_map |= 1 << node->token.opr.idx;
1281
0
    }
1282
1283
0
  else if (node->token.type == SUBEXP
1284
0
     && node->left && node->left->token.type == SUBEXP)
1285
0
    {
1286
0
      Idx other_idx = node->left->token.opr.idx;
1287
1288
0
      node->left = node->left->left;
1289
0
      if (node->left)
1290
0
  node->left->parent = node;
1291
1292
0
      dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1293
0
      if (other_idx < BITSET_WORD_BITS)
1294
0
  dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1295
0
    }
1296
1297
0
  return REG_NOERROR;
1298
0
}
1299
1300
/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1301
   of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1302
static reg_errcode_t
1303
lower_subexps (void *extra, bin_tree_t *node)
1304
0
{
1305
0
  regex_t *preg = (regex_t *) extra;
1306
0
  reg_errcode_t err = REG_NOERROR;
1307
1308
0
  if (node->left && node->left->token.type == SUBEXP)
1309
0
    {
1310
0
      node->left = lower_subexp (&err, preg, node->left);
1311
0
      if (node->left)
1312
0
  node->left->parent = node;
1313
0
    }
1314
0
  if (node->right && node->right->token.type == SUBEXP)
1315
0
    {
1316
0
      node->right = lower_subexp (&err, preg, node->right);
1317
0
      if (node->right)
1318
0
  node->right->parent = node;
1319
0
    }
1320
1321
0
  return err;
1322
0
}
1323
1324
static bin_tree_t *
1325
lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1326
0
{
1327
0
  re_dfa_t *dfa = preg->buffer;
1328
0
  bin_tree_t *body = node->left;
1329
0
  bin_tree_t *op, *cls, *tree1, *tree;
1330
1331
0
  if (preg->no_sub
1332
      /* We do not optimize empty subexpressions, because otherwise we may
1333
   have bad CONCAT nodes with NULL children.  This is obviously not
1334
   very common, so we do not lose much.  An example that triggers
1335
   this case is the sed "script" /\(\)/x.  */
1336
0
      && node->left != NULL
1337
0
      && (node->token.opr.idx >= BITSET_WORD_BITS
1338
0
    || !(dfa->used_bkref_map
1339
0
         & ((bitset_word_t) 1 << node->token.opr.idx))))
1340
0
    return node->left;
1341
1342
  /* Convert the SUBEXP node to the concatenation of an
1343
     OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1344
0
  op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1345
0
  cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1346
0
  tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1347
0
  tree = create_tree (dfa, op, tree1, CONCAT);
1348
0
  if (__glibc_unlikely (tree == NULL || tree1 == NULL
1349
0
      || op == NULL || cls == NULL))
1350
0
    {
1351
0
      *err = REG_ESPACE;
1352
0
      return NULL;
1353
0
    }
1354
1355
0
  op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1356
0
  op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1357
0
  return tree;
1358
0
}
1359
1360
/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1361
   nodes.  Requires a postorder visit.  */
1362
static reg_errcode_t
1363
calc_first (void *extra, bin_tree_t *node)
1364
0
{
1365
0
  re_dfa_t *dfa = (re_dfa_t *) extra;
1366
0
  if (node->token.type == CONCAT)
1367
0
    {
1368
0
      node->first = node->left->first;
1369
0
      node->node_idx = node->left->node_idx;
1370
0
    }
1371
0
  else
1372
0
    {
1373
0
      node->first = node;
1374
0
      node->node_idx = re_dfa_add_node (dfa, node->token);
1375
0
      if (__glibc_unlikely (node->node_idx == -1))
1376
0
  return REG_ESPACE;
1377
0
      if (node->token.type == ANCHOR)
1378
0
  dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1379
0
    }
1380
0
  return REG_NOERROR;
1381
0
}
1382
1383
/* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1384
static reg_errcode_t
1385
calc_next (void *extra, bin_tree_t *node)
1386
0
{
1387
0
  switch (node->token.type)
1388
0
    {
1389
0
    case OP_DUP_ASTERISK:
1390
0
      node->left->next = node;
1391
0
      break;
1392
0
    case CONCAT:
1393
0
      node->left->next = node->right->first;
1394
0
      node->right->next = node->next;
1395
0
      break;
1396
0
    default:
1397
0
      if (node->left)
1398
0
  node->left->next = node->next;
1399
0
      if (node->right)
1400
0
  node->right->next = node->next;
1401
0
      break;
1402
0
    }
1403
0
  return REG_NOERROR;
1404
0
}
1405
1406
/* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1407
static reg_errcode_t
1408
link_nfa_nodes (void *extra, bin_tree_t *node)
1409
0
{
1410
0
  re_dfa_t *dfa = (re_dfa_t *) extra;
1411
0
  Idx idx = node->node_idx;
1412
0
  reg_errcode_t err = REG_NOERROR;
1413
1414
0
  switch (node->token.type)
1415
0
    {
1416
0
    case CONCAT:
1417
0
      break;
1418
1419
0
    case END_OF_RE:
1420
0
      DEBUG_ASSERT (node->next == NULL);
1421
0
      break;
1422
1423
0
    case OP_DUP_ASTERISK:
1424
0
    case OP_ALT:
1425
0
      {
1426
0
  Idx left, right;
1427
0
  dfa->has_plural_match = 1;
1428
0
  if (node->left != NULL)
1429
0
    left = node->left->first->node_idx;
1430
0
  else
1431
0
    left = node->next->node_idx;
1432
0
  if (node->right != NULL)
1433
0
    right = node->right->first->node_idx;
1434
0
  else
1435
0
    right = node->next->node_idx;
1436
0
  DEBUG_ASSERT (left > -1);
1437
0
  DEBUG_ASSERT (right > -1);
1438
0
  err = re_node_set_init_2 (dfa->edests + idx, left, right);
1439
0
      }
1440
0
      break;
1441
1442
0
    case ANCHOR:
1443
0
    case OP_OPEN_SUBEXP:
1444
0
    case OP_CLOSE_SUBEXP:
1445
0
      err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1446
0
      break;
1447
1448
0
    case OP_BACK_REF:
1449
0
      dfa->nexts[idx] = node->next->node_idx;
1450
0
      if (node->token.type == OP_BACK_REF)
1451
0
  err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1452
0
      break;
1453
1454
0
    default:
1455
0
      DEBUG_ASSERT (!IS_EPSILON_NODE (node->token.type));
1456
0
      dfa->nexts[idx] = node->next->node_idx;
1457
0
      break;
1458
0
    }
1459
1460
0
  return err;
1461
0
}
1462
1463
/* Duplicate the epsilon closure of the node ROOT_NODE.
1464
   Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1465
   to their own constraint.  */
1466
1467
static reg_errcode_t
1468
duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1469
      Idx root_node, unsigned int init_constraint)
1470
0
{
1471
0
  Idx org_node, clone_node;
1472
0
  bool ok;
1473
0
  unsigned int constraint = init_constraint;
1474
0
  for (org_node = top_org_node, clone_node = top_clone_node;;)
1475
0
    {
1476
0
      Idx org_dest, clone_dest;
1477
0
      if (dfa->nodes[org_node].type == OP_BACK_REF)
1478
0
  {
1479
    /* If the back reference epsilon-transit, its destination must
1480
       also have the constraint.  Then duplicate the epsilon closure
1481
       of the destination of the back reference, and store it in
1482
       edests of the back reference.  */
1483
0
    org_dest = dfa->nexts[org_node];
1484
0
    re_node_set_empty (dfa->edests + clone_node);
1485
0
    clone_dest = duplicate_node (dfa, org_dest, constraint);
1486
0
    if (__glibc_unlikely (clone_dest == -1))
1487
0
      return REG_ESPACE;
1488
0
    dfa->nexts[clone_node] = dfa->nexts[org_node];
1489
0
    ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1490
0
    if (__glibc_unlikely (! ok))
1491
0
      return REG_ESPACE;
1492
0
  }
1493
0
      else if (dfa->edests[org_node].nelem == 0)
1494
0
  {
1495
    /* In case of the node can't epsilon-transit, don't duplicate the
1496
       destination and store the original destination as the
1497
       destination of the node.  */
1498
0
    dfa->nexts[clone_node] = dfa->nexts[org_node];
1499
0
    break;
1500
0
  }
1501
0
      else if (dfa->edests[org_node].nelem == 1)
1502
0
  {
1503
    /* In case of the node can epsilon-transit, and it has only one
1504
       destination.  */
1505
0
    org_dest = dfa->edests[org_node].elems[0];
1506
0
    re_node_set_empty (dfa->edests + clone_node);
1507
    /* If the node is root_node itself, it means the epsilon closure
1508
       has a loop.  Then tie it to the destination of the root_node.  */
1509
0
    if (org_node == root_node && clone_node != org_node)
1510
0
      {
1511
0
        ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1512
0
        if (__glibc_unlikely (! ok))
1513
0
          return REG_ESPACE;
1514
0
        break;
1515
0
      }
1516
    /* In case the node has another constraint, append it.  */
1517
0
    constraint |= dfa->nodes[org_node].constraint;
1518
0
    clone_dest = duplicate_node (dfa, org_dest, constraint);
1519
0
    if (__glibc_unlikely (clone_dest == -1))
1520
0
      return REG_ESPACE;
1521
0
    ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1522
0
    if (__glibc_unlikely (! ok))
1523
0
      return REG_ESPACE;
1524
0
  }
1525
0
      else /* dfa->edests[org_node].nelem == 2 */
1526
0
  {
1527
    /* In case of the node can epsilon-transit, and it has two
1528
       destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1529
0
    org_dest = dfa->edests[org_node].elems[0];
1530
0
    re_node_set_empty (dfa->edests + clone_node);
1531
    /* Search for a duplicated node which satisfies the constraint.  */
1532
0
    clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1533
0
    if (clone_dest == -1)
1534
0
      {
1535
        /* There is no such duplicated node, create a new one.  */
1536
0
        reg_errcode_t err;
1537
0
        clone_dest = duplicate_node (dfa, org_dest, constraint);
1538
0
        if (__glibc_unlikely (clone_dest == -1))
1539
0
    return REG_ESPACE;
1540
0
        ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1541
0
        if (__glibc_unlikely (! ok))
1542
0
    return REG_ESPACE;
1543
0
        err = duplicate_node_closure (dfa, org_dest, clone_dest,
1544
0
              root_node, constraint);
1545
0
        if (__glibc_unlikely (err != REG_NOERROR))
1546
0
    return err;
1547
0
      }
1548
0
    else
1549
0
      {
1550
        /* There is a duplicated node which satisfies the constraint,
1551
     use it to avoid infinite loop.  */
1552
0
        ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1553
0
        if (__glibc_unlikely (! ok))
1554
0
    return REG_ESPACE;
1555
0
      }
1556
1557
0
    org_dest = dfa->edests[org_node].elems[1];
1558
0
    clone_dest = duplicate_node (dfa, org_dest, constraint);
1559
0
    if (__glibc_unlikely (clone_dest == -1))
1560
0
      return REG_ESPACE;
1561
0
    ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1562
0
    if (__glibc_unlikely (! ok))
1563
0
      return REG_ESPACE;
1564
0
  }
1565
0
      org_node = org_dest;
1566
0
      clone_node = clone_dest;
1567
0
    }
1568
0
  return REG_NOERROR;
1569
0
}
1570
1571
/* Search for a node which is duplicated from the node ORG_NODE, and
1572
   satisfies the constraint CONSTRAINT.  */
1573
1574
static Idx
1575
search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1576
      unsigned int constraint)
1577
0
{
1578
0
  Idx idx;
1579
0
  for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1580
0
    {
1581
0
      if (org_node == dfa->org_indices[idx]
1582
0
    && constraint == dfa->nodes[idx].constraint)
1583
0
  return idx; /* Found.  */
1584
0
    }
1585
0
  return -1; /* Not found.  */
1586
0
}
1587
1588
/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1589
   Return the index of the new node, or -1 if insufficient storage is
1590
   available.  */
1591
1592
static Idx
1593
duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1594
0
{
1595
0
  Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1596
0
  if (__glibc_likely (dup_idx != -1))
1597
0
    {
1598
0
      dfa->nodes[dup_idx].constraint = constraint;
1599
0
      dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1600
0
      dfa->nodes[dup_idx].duplicated = 1;
1601
1602
      /* Store the index of the original node.  */
1603
0
      dfa->org_indices[dup_idx] = org_idx;
1604
0
    }
1605
0
  return dup_idx;
1606
0
}
1607
1608
static reg_errcode_t
1609
calc_inveclosure (re_dfa_t *dfa)
1610
0
{
1611
0
  Idx src, idx;
1612
0
  bool ok;
1613
0
  for (idx = 0; idx < dfa->nodes_len; ++idx)
1614
0
    re_node_set_init_empty (dfa->inveclosures + idx);
1615
1616
0
  for (src = 0; src < dfa->nodes_len; ++src)
1617
0
    {
1618
0
      Idx *elems = dfa->eclosures[src].elems;
1619
0
      for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1620
0
  {
1621
0
    ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1622
0
    if (__glibc_unlikely (! ok))
1623
0
      return REG_ESPACE;
1624
0
  }
1625
0
    }
1626
1627
0
  return REG_NOERROR;
1628
0
}
1629
1630
/* Calculate "eclosure" for all the node in DFA.  */
1631
1632
static reg_errcode_t
1633
calc_eclosure (re_dfa_t *dfa)
1634
0
{
1635
0
  Idx node_idx;
1636
0
  bool incomplete;
1637
0
  DEBUG_ASSERT (dfa->nodes_len > 0);
1638
0
  incomplete = false;
1639
  /* For each nodes, calculate epsilon closure.  */
1640
0
  for (node_idx = 0; ; ++node_idx)
1641
0
    {
1642
0
      reg_errcode_t err;
1643
0
      re_node_set eclosure_elem;
1644
0
      if (node_idx == dfa->nodes_len)
1645
0
  {
1646
0
    if (!incomplete)
1647
0
      break;
1648
0
    incomplete = false;
1649
0
    node_idx = 0;
1650
0
  }
1651
1652
0
      DEBUG_ASSERT (dfa->eclosures[node_idx].nelem != -1);
1653
1654
      /* If we have already calculated, skip it.  */
1655
0
      if (dfa->eclosures[node_idx].nelem != 0)
1656
0
  continue;
1657
      /* Calculate epsilon closure of 'node_idx'.  */
1658
0
      err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1659
0
      if (__glibc_unlikely (err != REG_NOERROR))
1660
0
  return err;
1661
1662
0
      if (dfa->eclosures[node_idx].nelem == 0)
1663
0
  {
1664
0
    incomplete = true;
1665
0
    re_node_set_free (&eclosure_elem);
1666
0
  }
1667
0
    }
1668
0
  return REG_NOERROR;
1669
0
}
1670
1671
/* Calculate epsilon closure of NODE.  */
1672
1673
static reg_errcode_t
1674
calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1675
0
{
1676
0
  reg_errcode_t err;
1677
0
  Idx i;
1678
0
  re_node_set eclosure;
1679
0
  bool incomplete = false;
1680
0
  err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1681
0
  if (__glibc_unlikely (err != REG_NOERROR))
1682
0
    return err;
1683
1684
  /* An epsilon closure includes itself.  */
1685
0
  eclosure.elems[eclosure.nelem++] = node;
1686
1687
  /* This indicates that we are calculating this node now.
1688
     We reference this value to avoid infinite loop.  */
1689
0
  dfa->eclosures[node].nelem = -1;
1690
1691
  /* If the current node has constraints, duplicate all nodes
1692
     since they must inherit the constraints.  */
1693
0
  if (dfa->nodes[node].constraint
1694
0
      && dfa->edests[node].nelem
1695
0
      && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1696
0
    {
1697
0
      err = duplicate_node_closure (dfa, node, node, node,
1698
0
            dfa->nodes[node].constraint);
1699
0
    }
1700
1701
  /* Expand each epsilon destination nodes.  */
1702
0
  if (__glibc_likely (err == REG_NOERROR)
1703
0
      && IS_EPSILON_NODE (dfa->nodes[node].type))
1704
0
    for (i = 0; i < dfa->edests[node].nelem; ++i)
1705
0
      {
1706
0
  re_node_set eclosure_elem;
1707
0
  Idx edest = dfa->edests[node].elems[i];
1708
  /* If calculating the epsilon closure of 'edest' is in progress,
1709
     return intermediate result.  */
1710
0
  if (dfa->eclosures[edest].nelem == -1)
1711
0
    {
1712
0
      incomplete = true;
1713
0
      continue;
1714
0
    }
1715
  /* If we haven't calculated the epsilon closure of 'edest' yet,
1716
     calculate now. Otherwise use calculated epsilon closure.  */
1717
0
  if (dfa->eclosures[edest].nelem == 0)
1718
0
    {
1719
0
      err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1720
0
      if (__glibc_unlikely (err != REG_NOERROR))
1721
0
        break;
1722
0
    }
1723
0
  else
1724
0
    eclosure_elem = dfa->eclosures[edest];
1725
  /* Merge the epsilon closure of 'edest'.  */
1726
0
  err = re_node_set_merge (&eclosure, &eclosure_elem);
1727
0
  if (__glibc_unlikely (err != REG_NOERROR))
1728
0
    break;
1729
  /* If the epsilon closure of 'edest' is incomplete,
1730
     the epsilon closure of this node is also incomplete.  */
1731
0
  if (dfa->eclosures[edest].nelem == 0)
1732
0
    {
1733
0
      incomplete = true;
1734
0
      re_node_set_free (&eclosure_elem);
1735
0
    }
1736
0
      }
1737
1738
0
  if (err != REG_NOERROR)
1739
0
    re_node_set_free (&eclosure);
1740
0
  else
1741
0
    {
1742
0
      if (incomplete && !root)
1743
0
  dfa->eclosures[node].nelem = 0;
1744
0
      else
1745
0
  dfa->eclosures[node] = eclosure;
1746
0
      *new_set = eclosure;
1747
0
    }
1748
1749
0
  return err;
1750
0
}
1751

1752
/* Functions for token which are used in the parser.  */
1753
1754
/* Fetch a token from INPUT.
1755
   We must not use this function inside bracket expressions.  */
1756
1757
static void
1758
fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1759
0
{
1760
0
  re_string_skip_bytes (input, peek_token (result, input, syntax));
1761
0
}
1762
1763
/* Peek a token from INPUT, and return the length of the token.
1764
   We must not use this function inside bracket expressions.  */
1765
1766
static int
1767
peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1768
0
{
1769
0
  unsigned char c;
1770
1771
0
  if (re_string_eoi (input))
1772
0
    {
1773
0
      token->type = END_OF_RE;
1774
0
      return 0;
1775
0
    }
1776
1777
0
  c = re_string_peek_byte (input, 0);
1778
0
  token->opr.c = c;
1779
1780
0
  token->word_char = 0;
1781
0
  token->mb_partial = 0;
1782
0
  if (input->mb_cur_max > 1
1783
0
      && !re_string_first_byte (input, re_string_cur_idx (input)))
1784
0
    {
1785
0
      token->type = CHARACTER;
1786
0
      token->mb_partial = 1;
1787
0
      return 1;
1788
0
    }
1789
0
  if (c == '\\')
1790
0
    {
1791
0
      unsigned char c2;
1792
0
      if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1793
0
  {
1794
0
    token->type = BACK_SLASH;
1795
0
    return 1;
1796
0
  }
1797
1798
0
      c2 = re_string_peek_byte_case (input, 1);
1799
0
      token->opr.c = c2;
1800
0
      token->type = CHARACTER;
1801
0
      if (input->mb_cur_max > 1)
1802
0
  {
1803
0
    wint_t wc = re_string_wchar_at (input,
1804
0
            re_string_cur_idx (input) + 1);
1805
0
    token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1806
0
  }
1807
0
      else
1808
0
  token->word_char = IS_WORD_CHAR (c2) != 0;
1809
1810
0
      switch (c2)
1811
0
  {
1812
0
  case '|':
1813
0
    if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1814
0
      token->type = OP_ALT;
1815
0
    break;
1816
0
  case '1': case '2': case '3': case '4': case '5':
1817
0
  case '6': case '7': case '8': case '9':
1818
0
    if (!(syntax & RE_NO_BK_REFS))
1819
0
      {
1820
0
        token->type = OP_BACK_REF;
1821
0
        token->opr.idx = c2 - '1';
1822
0
      }
1823
0
    break;
1824
0
  case '<':
1825
0
    if (!(syntax & RE_NO_GNU_OPS))
1826
0
      {
1827
0
        token->type = ANCHOR;
1828
0
        token->opr.ctx_type = WORD_FIRST;
1829
0
      }
1830
0
    break;
1831
0
  case '>':
1832
0
    if (!(syntax & RE_NO_GNU_OPS))
1833
0
      {
1834
0
        token->type = ANCHOR;
1835
0
        token->opr.ctx_type = WORD_LAST;
1836
0
      }
1837
0
    break;
1838
0
  case 'b':
1839
0
    if (!(syntax & RE_NO_GNU_OPS))
1840
0
      {
1841
0
        token->type = ANCHOR;
1842
0
        token->opr.ctx_type = WORD_DELIM;
1843
0
      }
1844
0
    break;
1845
0
  case 'B':
1846
0
    if (!(syntax & RE_NO_GNU_OPS))
1847
0
      {
1848
0
        token->type = ANCHOR;
1849
0
        token->opr.ctx_type = NOT_WORD_DELIM;
1850
0
      }
1851
0
    break;
1852
0
  case 'w':
1853
0
    if (!(syntax & RE_NO_GNU_OPS))
1854
0
      token->type = OP_WORD;
1855
0
    break;
1856
0
  case 'W':
1857
0
    if (!(syntax & RE_NO_GNU_OPS))
1858
0
      token->type = OP_NOTWORD;
1859
0
    break;
1860
0
  case 's':
1861
0
    if (!(syntax & RE_NO_GNU_OPS))
1862
0
      token->type = OP_SPACE;
1863
0
    break;
1864
0
  case 'S':
1865
0
    if (!(syntax & RE_NO_GNU_OPS))
1866
0
      token->type = OP_NOTSPACE;
1867
0
    break;
1868
0
  case '`':
1869
0
    if (!(syntax & RE_NO_GNU_OPS))
1870
0
      {
1871
0
        token->type = ANCHOR;
1872
0
        token->opr.ctx_type = BUF_FIRST;
1873
0
      }
1874
0
    break;
1875
0
  case '\'':
1876
0
    if (!(syntax & RE_NO_GNU_OPS))
1877
0
      {
1878
0
        token->type = ANCHOR;
1879
0
        token->opr.ctx_type = BUF_LAST;
1880
0
      }
1881
0
    break;
1882
0
  case '(':
1883
0
    if (!(syntax & RE_NO_BK_PARENS))
1884
0
      token->type = OP_OPEN_SUBEXP;
1885
0
    break;
1886
0
  case ')':
1887
0
    if (!(syntax & RE_NO_BK_PARENS))
1888
0
      token->type = OP_CLOSE_SUBEXP;
1889
0
    break;
1890
0
  case '+':
1891
0
    if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1892
0
      token->type = OP_DUP_PLUS;
1893
0
    break;
1894
0
  case '?':
1895
0
    if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1896
0
      token->type = OP_DUP_QUESTION;
1897
0
    break;
1898
0
  case '{':
1899
0
    if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1900
0
      token->type = OP_OPEN_DUP_NUM;
1901
0
    break;
1902
0
  case '}':
1903
0
    if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1904
0
      token->type = OP_CLOSE_DUP_NUM;
1905
0
    break;
1906
0
  default:
1907
0
    break;
1908
0
  }
1909
0
      return 2;
1910
0
    }
1911
1912
0
  token->type = CHARACTER;
1913
0
  if (input->mb_cur_max > 1)
1914
0
    {
1915
0
      wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1916
0
      token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1917
0
    }
1918
0
  else
1919
0
    token->word_char = IS_WORD_CHAR (token->opr.c);
1920
1921
0
  switch (c)
1922
0
    {
1923
0
    case '\n':
1924
0
      if (syntax & RE_NEWLINE_ALT)
1925
0
  token->type = OP_ALT;
1926
0
      break;
1927
0
    case '|':
1928
0
      if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1929
0
  token->type = OP_ALT;
1930
0
      break;
1931
0
    case '*':
1932
0
      token->type = OP_DUP_ASTERISK;
1933
0
      break;
1934
0
    case '+':
1935
0
      if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1936
0
  token->type = OP_DUP_PLUS;
1937
0
      break;
1938
0
    case '?':
1939
0
      if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1940
0
  token->type = OP_DUP_QUESTION;
1941
0
      break;
1942
0
    case '{':
1943
0
      if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1944
0
  token->type = OP_OPEN_DUP_NUM;
1945
0
      break;
1946
0
    case '}':
1947
0
      if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1948
0
  token->type = OP_CLOSE_DUP_NUM;
1949
0
      break;
1950
0
    case '(':
1951
0
      if (syntax & RE_NO_BK_PARENS)
1952
0
  token->type = OP_OPEN_SUBEXP;
1953
0
      break;
1954
0
    case ')':
1955
0
      if (syntax & RE_NO_BK_PARENS)
1956
0
  token->type = OP_CLOSE_SUBEXP;
1957
0
      break;
1958
0
    case '[':
1959
0
      token->type = OP_OPEN_BRACKET;
1960
0
      break;
1961
0
    case '.':
1962
0
      token->type = OP_PERIOD;
1963
0
      break;
1964
0
    case '^':
1965
0
      if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE))
1966
0
    && re_string_cur_idx (input) != 0)
1967
0
  {
1968
0
    char prev = re_string_peek_byte (input, -1);
1969
0
    if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1970
0
      break;
1971
0
  }
1972
0
      token->type = ANCHOR;
1973
0
      token->opr.ctx_type = LINE_FIRST;
1974
0
      break;
1975
0
    case '$':
1976
0
      if (!(syntax & RE_CONTEXT_INDEP_ANCHORS)
1977
0
    && re_string_cur_idx (input) + 1 != re_string_length (input))
1978
0
  {
1979
0
    re_token_t next;
1980
0
    re_string_skip_bytes (input, 1);
1981
0
    peek_token (&next, input, syntax);
1982
0
    re_string_skip_bytes (input, -1);
1983
0
    if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1984
0
      break;
1985
0
  }
1986
0
      token->type = ANCHOR;
1987
0
      token->opr.ctx_type = LINE_LAST;
1988
0
      break;
1989
0
    default:
1990
0
      break;
1991
0
    }
1992
0
  return 1;
1993
0
}
1994
1995
/* Peek a token from INPUT, and return the length of the token.
1996
   We must not use this function out of bracket expressions.  */
1997
1998
static int
1999
peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2000
0
{
2001
0
  unsigned char c;
2002
0
  if (re_string_eoi (input))
2003
0
    {
2004
0
      token->type = END_OF_RE;
2005
0
      return 0;
2006
0
    }
2007
0
  c = re_string_peek_byte (input, 0);
2008
0
  token->opr.c = c;
2009
2010
0
  if (input->mb_cur_max > 1
2011
0
      && !re_string_first_byte (input, re_string_cur_idx (input)))
2012
0
    {
2013
0
      token->type = CHARACTER;
2014
0
      return 1;
2015
0
    }
2016
2017
0
  if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2018
0
      && re_string_cur_idx (input) + 1 < re_string_length (input))
2019
0
    {
2020
      /* In this case, '\' escape a character.  */
2021
0
      unsigned char c2;
2022
0
      re_string_skip_bytes (input, 1);
2023
0
      c2 = re_string_peek_byte (input, 0);
2024
0
      token->opr.c = c2;
2025
0
      token->type = CHARACTER;
2026
0
      return 1;
2027
0
    }
2028
0
  if (c == '[') /* '[' is a special char in a bracket exps.  */
2029
0
    {
2030
0
      unsigned char c2;
2031
0
      int token_len;
2032
0
      if (re_string_cur_idx (input) + 1 < re_string_length (input))
2033
0
  c2 = re_string_peek_byte (input, 1);
2034
0
      else
2035
0
  c2 = 0;
2036
0
      token->opr.c = c2;
2037
0
      token_len = 2;
2038
0
      switch (c2)
2039
0
  {
2040
0
  case '.':
2041
0
    token->type = OP_OPEN_COLL_ELEM;
2042
0
    break;
2043
2044
0
  case '=':
2045
0
    token->type = OP_OPEN_EQUIV_CLASS;
2046
0
    break;
2047
2048
0
  case ':':
2049
0
    if (syntax & RE_CHAR_CLASSES)
2050
0
      {
2051
0
        token->type = OP_OPEN_CHAR_CLASS;
2052
0
        break;
2053
0
      }
2054
0
    FALLTHROUGH;
2055
0
  default:
2056
0
    token->type = CHARACTER;
2057
0
    token->opr.c = c;
2058
0
    token_len = 1;
2059
0
    break;
2060
0
  }
2061
0
      return token_len;
2062
0
    }
2063
0
  switch (c)
2064
0
    {
2065
0
    case ']':
2066
0
      token->type = OP_CLOSE_BRACKET;
2067
0
      break;
2068
0
    case '^':
2069
0
      token->type = OP_NON_MATCH_LIST;
2070
0
      break;
2071
0
    case '-':
2072
      /* In V7 Unix grep and Unix awk and mawk, [...---...]
2073
         (3 adjacent minus signs) stands for a single minus sign.
2074
         Support that without breaking anything else.  */
2075
0
      if (! (re_string_cur_idx (input) + 2 < re_string_length (input)
2076
0
             && re_string_peek_byte (input, 1) == '-'
2077
0
             && re_string_peek_byte (input, 2) == '-'))
2078
0
        {
2079
0
          token->type = OP_CHARSET_RANGE;
2080
0
          break;
2081
0
        }
2082
0
      re_string_skip_bytes (input, 2);
2083
0
      FALLTHROUGH;
2084
0
    default:
2085
0
      token->type = CHARACTER;
2086
0
    }
2087
0
  return 1;
2088
0
}
2089

2090
/* Functions for parser.  */
2091
2092
/* Entry point of the parser.
2093
   Parse the regular expression REGEXP and return the structure tree.
2094
   If an error occurs, ERR is set by error code, and return NULL.
2095
   This function build the following tree, from regular expression <reg_exp>:
2096
     CAT
2097
     / \
2098
    /   \
2099
   <reg_exp>  EOR
2100
2101
   CAT means concatenation.
2102
   EOR means end of regular expression.  */
2103
2104
static bin_tree_t *
2105
parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2106
       reg_errcode_t *err)
2107
0
{
2108
0
  re_dfa_t *dfa = preg->buffer;
2109
0
  bin_tree_t *tree, *eor, *root;
2110
0
  re_token_t current_token;
2111
0
  dfa->syntax = syntax;
2112
0
  fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2113
0
  tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2114
0
  if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2115
0
    return NULL;
2116
0
  eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2117
0
  if (tree != NULL)
2118
0
    root = create_tree (dfa, tree, eor, CONCAT);
2119
0
  else
2120
0
    root = eor;
2121
0
  if (__glibc_unlikely (eor == NULL || root == NULL))
2122
0
    {
2123
0
      *err = REG_ESPACE;
2124
0
      return NULL;
2125
0
    }
2126
0
  return root;
2127
0
}
2128
2129
/* This function build the following tree, from regular expression
2130
   <branch1>|<branch2>:
2131
     ALT
2132
     / \
2133
    /   \
2134
   <branch1> <branch2>
2135
2136
   ALT means alternative, which represents the operator '|'.  */
2137
2138
static bin_tree_t *
2139
parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2140
         reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2141
0
{
2142
0
  re_dfa_t *dfa = preg->buffer;
2143
0
  bin_tree_t *tree, *branch = NULL;
2144
0
  bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
2145
0
  tree = parse_branch (regexp, preg, token, syntax, nest, err);
2146
0
  if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2147
0
    return NULL;
2148
2149
0
  while (token->type == OP_ALT)
2150
0
    {
2151
0
      fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2152
0
      if (token->type != OP_ALT && token->type != END_OF_RE
2153
0
    && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2154
0
  {
2155
0
    bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
2156
0
    dfa->completed_bkref_map = initial_bkref_map;
2157
0
    branch = parse_branch (regexp, preg, token, syntax, nest, err);
2158
0
    if (__glibc_unlikely (*err != REG_NOERROR && branch == NULL))
2159
0
      {
2160
0
        if (tree != NULL)
2161
0
    postorder (tree, free_tree, NULL);
2162
0
        return NULL;
2163
0
      }
2164
0
    dfa->completed_bkref_map |= accumulated_bkref_map;
2165
0
  }
2166
0
      else
2167
0
  branch = NULL;
2168
0
      tree = create_tree (dfa, tree, branch, OP_ALT);
2169
0
      if (__glibc_unlikely (tree == NULL))
2170
0
  {
2171
0
    *err = REG_ESPACE;
2172
0
    return NULL;
2173
0
  }
2174
0
    }
2175
0
  return tree;
2176
0
}
2177
2178
/* This function build the following tree, from regular expression
2179
   <exp1><exp2>:
2180
  CAT
2181
  / \
2182
       /   \
2183
   <exp1> <exp2>
2184
2185
   CAT means concatenation.  */
2186
2187
static bin_tree_t *
2188
parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2189
        reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2190
0
{
2191
0
  bin_tree_t *tree, *expr;
2192
0
  re_dfa_t *dfa = preg->buffer;
2193
0
  tree = parse_expression (regexp, preg, token, syntax, nest, err);
2194
0
  if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2195
0
    return NULL;
2196
2197
0
  while (token->type != OP_ALT && token->type != END_OF_RE
2198
0
   && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2199
0
    {
2200
0
      expr = parse_expression (regexp, preg, token, syntax, nest, err);
2201
0
      if (__glibc_unlikely (*err != REG_NOERROR && expr == NULL))
2202
0
  {
2203
0
    if (tree != NULL)
2204
0
      postorder (tree, free_tree, NULL);
2205
0
    return NULL;
2206
0
  }
2207
0
      if (tree != NULL && expr != NULL)
2208
0
  {
2209
0
    bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2210
0
    if (newtree == NULL)
2211
0
      {
2212
0
        postorder (expr, free_tree, NULL);
2213
0
        postorder (tree, free_tree, NULL);
2214
0
        *err = REG_ESPACE;
2215
0
        return NULL;
2216
0
      }
2217
0
    tree = newtree;
2218
0
  }
2219
0
      else if (tree == NULL)
2220
0
  tree = expr;
2221
      /* Otherwise expr == NULL, we don't need to create new tree.  */
2222
0
    }
2223
0
  return tree;
2224
0
}
2225
2226
/* This function build the following tree, from regular expression a*:
2227
   *
2228
   |
2229
   a
2230
*/
2231
2232
static bin_tree_t *
2233
parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2234
      reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2235
0
{
2236
0
  re_dfa_t *dfa = preg->buffer;
2237
0
  bin_tree_t *tree;
2238
0
  switch (token->type)
2239
0
    {
2240
0
    case CHARACTER:
2241
0
      tree = create_token_tree (dfa, NULL, NULL, token);
2242
0
      if (__glibc_unlikely (tree == NULL))
2243
0
  {
2244
0
    *err = REG_ESPACE;
2245
0
    return NULL;
2246
0
  }
2247
0
      if (dfa->mb_cur_max > 1)
2248
0
  {
2249
0
    while (!re_string_eoi (regexp)
2250
0
     && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2251
0
      {
2252
0
        bin_tree_t *mbc_remain;
2253
0
        fetch_token (token, regexp, syntax);
2254
0
        mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2255
0
        tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2256
0
        if (__glibc_unlikely (mbc_remain == NULL || tree == NULL))
2257
0
    {
2258
0
      *err = REG_ESPACE;
2259
0
      return NULL;
2260
0
    }
2261
0
      }
2262
0
  }
2263
0
      break;
2264
2265
0
    case OP_OPEN_SUBEXP:
2266
0
      tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2267
0
      if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2268
0
  return NULL;
2269
0
      break;
2270
2271
0
    case OP_OPEN_BRACKET:
2272
0
      tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2273
0
      if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2274
0
  return NULL;
2275
0
      break;
2276
2277
0
    case OP_BACK_REF:
2278
0
      if (!__glibc_likely (dfa->completed_bkref_map & (1 << token->opr.idx)))
2279
0
  {
2280
0
    *err = REG_ESUBREG;
2281
0
    return NULL;
2282
0
  }
2283
0
      dfa->used_bkref_map |= 1 << token->opr.idx;
2284
0
      tree = create_token_tree (dfa, NULL, NULL, token);
2285
0
      if (__glibc_unlikely (tree == NULL))
2286
0
  {
2287
0
    *err = REG_ESPACE;
2288
0
    return NULL;
2289
0
  }
2290
0
      ++dfa->nbackref;
2291
0
      dfa->has_mb_node = 1;
2292
0
      break;
2293
2294
0
    case OP_OPEN_DUP_NUM:
2295
0
      if (syntax & RE_CONTEXT_INVALID_DUP)
2296
0
  {
2297
0
    *err = REG_BADRPT;
2298
0
    return NULL;
2299
0
  }
2300
0
      FALLTHROUGH;
2301
0
    case OP_DUP_ASTERISK:
2302
0
    case OP_DUP_PLUS:
2303
0
    case OP_DUP_QUESTION:
2304
0
      if (syntax & RE_CONTEXT_INVALID_OPS)
2305
0
  {
2306
0
    *err = REG_BADRPT;
2307
0
    return NULL;
2308
0
  }
2309
0
      else if (syntax & RE_CONTEXT_INDEP_OPS)
2310
0
  {
2311
0
    fetch_token (token, regexp, syntax);
2312
0
    return parse_expression (regexp, preg, token, syntax, nest, err);
2313
0
  }
2314
0
      FALLTHROUGH;
2315
0
    case OP_CLOSE_SUBEXP:
2316
0
      if ((token->type == OP_CLOSE_SUBEXP)
2317
0
    && !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2318
0
  {
2319
0
    *err = REG_ERPAREN;
2320
0
    return NULL;
2321
0
  }
2322
0
      FALLTHROUGH;
2323
0
    case OP_CLOSE_DUP_NUM:
2324
      /* We treat it as a normal character.  */
2325
2326
      /* Then we can these characters as normal characters.  */
2327
0
      token->type = CHARACTER;
2328
      /* mb_partial and word_char bits should be initialized already
2329
   by peek_token.  */
2330
0
      tree = create_token_tree (dfa, NULL, NULL, token);
2331
0
      if (__glibc_unlikely (tree == NULL))
2332
0
  {
2333
0
    *err = REG_ESPACE;
2334
0
    return NULL;
2335
0
  }
2336
0
      break;
2337
2338
0
    case ANCHOR:
2339
0
      if ((token->opr.ctx_type
2340
0
     & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2341
0
    && dfa->word_ops_used == 0)
2342
0
  init_word_char (dfa);
2343
0
      if (token->opr.ctx_type == WORD_DELIM
2344
0
    || token->opr.ctx_type == NOT_WORD_DELIM)
2345
0
  {
2346
0
    bin_tree_t *tree_first, *tree_last;
2347
0
    if (token->opr.ctx_type == WORD_DELIM)
2348
0
      {
2349
0
        token->opr.ctx_type = WORD_FIRST;
2350
0
        tree_first = create_token_tree (dfa, NULL, NULL, token);
2351
0
        token->opr.ctx_type = WORD_LAST;
2352
0
      }
2353
0
    else
2354
0
      {
2355
0
        token->opr.ctx_type = INSIDE_WORD;
2356
0
        tree_first = create_token_tree (dfa, NULL, NULL, token);
2357
0
        token->opr.ctx_type = INSIDE_NOTWORD;
2358
0
      }
2359
0
    tree_last = create_token_tree (dfa, NULL, NULL, token);
2360
0
    tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2361
0
    if (__glibc_unlikely (tree_first == NULL || tree_last == NULL
2362
0
        || tree == NULL))
2363
0
      {
2364
0
        *err = REG_ESPACE;
2365
0
        return NULL;
2366
0
      }
2367
0
  }
2368
0
      else
2369
0
  {
2370
0
    tree = create_token_tree (dfa, NULL, NULL, token);
2371
0
    if (__glibc_unlikely (tree == NULL))
2372
0
      {
2373
0
        *err = REG_ESPACE;
2374
0
        return NULL;
2375
0
      }
2376
0
  }
2377
      /* We must return here, since ANCHORs can't be followed
2378
   by repetition operators.
2379
   eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2380
       it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2381
0
      fetch_token (token, regexp, syntax);
2382
0
      return tree;
2383
2384
0
    case OP_PERIOD:
2385
0
      tree = create_token_tree (dfa, NULL, NULL, token);
2386
0
      if (__glibc_unlikely (tree == NULL))
2387
0
  {
2388
0
    *err = REG_ESPACE;
2389
0
    return NULL;
2390
0
  }
2391
0
      if (dfa->mb_cur_max > 1)
2392
0
  dfa->has_mb_node = 1;
2393
0
      break;
2394
2395
0
    case OP_WORD:
2396
0
    case OP_NOTWORD:
2397
0
      tree = build_charclass_op (dfa, regexp->trans,
2398
0
         "alnum",
2399
0
         "_",
2400
0
         token->type == OP_NOTWORD, err);
2401
0
      if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2402
0
  return NULL;
2403
0
      break;
2404
2405
0
    case OP_SPACE:
2406
0
    case OP_NOTSPACE:
2407
0
      tree = build_charclass_op (dfa, regexp->trans,
2408
0
         "space",
2409
0
         "",
2410
0
         token->type == OP_NOTSPACE, err);
2411
0
      if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2412
0
  return NULL;
2413
0
      break;
2414
2415
0
    case OP_ALT:
2416
0
    case END_OF_RE:
2417
0
      return NULL;
2418
2419
0
    case BACK_SLASH:
2420
0
      *err = REG_EESCAPE;
2421
0
      return NULL;
2422
2423
0
    default:
2424
      /* Must not happen?  */
2425
0
      DEBUG_ASSERT (false);
2426
0
      return NULL;
2427
0
    }
2428
0
  fetch_token (token, regexp, syntax);
2429
2430
0
  while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2431
0
   || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2432
0
    {
2433
0
      bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
2434
0
             syntax, err);
2435
0
      if (__glibc_unlikely (*err != REG_NOERROR && dup_tree == NULL))
2436
0
  {
2437
0
    if (tree != NULL)
2438
0
      postorder (tree, free_tree, NULL);
2439
0
    return NULL;
2440
0
  }
2441
0
      tree = dup_tree;
2442
      /* In BRE consecutive duplications are not allowed.  */
2443
0
      if ((syntax & RE_CONTEXT_INVALID_DUP)
2444
0
    && (token->type == OP_DUP_ASTERISK
2445
0
        || token->type == OP_OPEN_DUP_NUM))
2446
0
  {
2447
0
    if (tree != NULL)
2448
0
      postorder (tree, free_tree, NULL);
2449
0
    *err = REG_BADRPT;
2450
0
    return NULL;
2451
0
  }
2452
0
    }
2453
2454
0
  return tree;
2455
0
}
2456
2457
/* This function build the following tree, from regular expression
2458
   (<reg_exp>):
2459
   SUBEXP
2460
      |
2461
  <reg_exp>
2462
*/
2463
2464
static bin_tree_t *
2465
parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2466
         reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2467
0
{
2468
0
  re_dfa_t *dfa = preg->buffer;
2469
0
  bin_tree_t *tree;
2470
0
  size_t cur_nsub;
2471
0
  cur_nsub = preg->re_nsub++;
2472
2473
0
  fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2474
2475
  /* The subexpression may be a null string.  */
2476
0
  if (token->type == OP_CLOSE_SUBEXP)
2477
0
    tree = NULL;
2478
0
  else
2479
0
    {
2480
0
      tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2481
0
      if (__glibc_unlikely (*err == REG_NOERROR
2482
0
          && token->type != OP_CLOSE_SUBEXP))
2483
0
  {
2484
0
    if (tree != NULL)
2485
0
      postorder (tree, free_tree, NULL);
2486
0
    *err = REG_EPAREN;
2487
0
  }
2488
0
      if (__glibc_unlikely (*err != REG_NOERROR))
2489
0
  return NULL;
2490
0
    }
2491
2492
0
  if (cur_nsub <= '9' - '1')
2493
0
    dfa->completed_bkref_map |= 1 << cur_nsub;
2494
2495
0
  tree = create_tree (dfa, tree, NULL, SUBEXP);
2496
0
  if (__glibc_unlikely (tree == NULL))
2497
0
    {
2498
0
      *err = REG_ESPACE;
2499
0
      return NULL;
2500
0
    }
2501
0
  tree->token.opr.idx = cur_nsub;
2502
0
  return tree;
2503
0
}
2504
2505
/* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2506
2507
static bin_tree_t *
2508
parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2509
        re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2510
0
{
2511
0
  bin_tree_t *tree = NULL, *old_tree = NULL;
2512
0
  Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2513
0
  re_token_t start_token = *token;
2514
2515
0
  if (token->type == OP_OPEN_DUP_NUM)
2516
0
    {
2517
0
      end = 0;
2518
0
      start = fetch_number (regexp, token, syntax);
2519
0
      if (start == -1)
2520
0
  {
2521
0
    if (token->type == CHARACTER && token->opr.c == ',')
2522
0
      start = 0; /* We treat "{,m}" as "{0,m}".  */
2523
0
    else
2524
0
      {
2525
0
        *err = REG_BADBR; /* <re>{} is invalid.  */
2526
0
        return NULL;
2527
0
      }
2528
0
  }
2529
0
      if (__glibc_likely (start != -2))
2530
0
  {
2531
    /* We treat "{n}" as "{n,n}".  */
2532
0
    end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2533
0
     : ((token->type == CHARACTER && token->opr.c == ',')
2534
0
        ? fetch_number (regexp, token, syntax) : -2));
2535
0
  }
2536
0
      if (__glibc_unlikely (start == -2 || end == -2))
2537
0
  {
2538
    /* Invalid sequence.  */
2539
0
    if (__glibc_unlikely (!(syntax & RE_INVALID_INTERVAL_ORD)))
2540
0
      {
2541
0
        if (token->type == END_OF_RE)
2542
0
    *err = REG_EBRACE;
2543
0
        else
2544
0
    *err = REG_BADBR;
2545
2546
0
        return NULL;
2547
0
      }
2548
2549
    /* If the syntax bit is set, rollback.  */
2550
0
    re_string_set_index (regexp, start_idx);
2551
0
    *token = start_token;
2552
0
    token->type = CHARACTER;
2553
    /* mb_partial and word_char bits should be already initialized by
2554
       peek_token.  */
2555
0
    return elem;
2556
0
  }
2557
2558
0
      if (__glibc_unlikely ((end != -1 && start > end)
2559
0
          || token->type != OP_CLOSE_DUP_NUM))
2560
0
  {
2561
    /* First number greater than second.  */
2562
0
    *err = REG_BADBR;
2563
0
    return NULL;
2564
0
  }
2565
2566
0
      if (__glibc_unlikely (RE_DUP_MAX < (end == -1 ? start : end)))
2567
0
  {
2568
0
    *err = REG_ESIZE;
2569
0
    return NULL;
2570
0
  }
2571
0
    }
2572
0
  else
2573
0
    {
2574
0
      start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2575
0
      end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2576
0
    }
2577
2578
0
  fetch_token (token, regexp, syntax);
2579
2580
0
  if (__glibc_unlikely (elem == NULL))
2581
0
    return NULL;
2582
0
  if (__glibc_unlikely (start == 0 && end == 0))
2583
0
    {
2584
0
      postorder (elem, free_tree, NULL);
2585
0
      return NULL;
2586
0
    }
2587
2588
  /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2589
0
  if (__glibc_unlikely (start > 0))
2590
0
    {
2591
0
      tree = elem;
2592
0
      for (i = 2; i <= start; ++i)
2593
0
  {
2594
0
    elem = duplicate_tree (elem, dfa);
2595
0
    tree = create_tree (dfa, tree, elem, CONCAT);
2596
0
    if (__glibc_unlikely (elem == NULL || tree == NULL))
2597
0
      goto parse_dup_op_espace;
2598
0
  }
2599
2600
0
      if (start == end)
2601
0
  return tree;
2602
2603
      /* Duplicate ELEM before it is marked optional.  */
2604
0
      elem = duplicate_tree (elem, dfa);
2605
0
      if (__glibc_unlikely (elem == NULL))
2606
0
        goto parse_dup_op_espace;
2607
0
      old_tree = tree;
2608
0
    }
2609
0
  else
2610
0
    old_tree = NULL;
2611
2612
0
  if (elem->token.type == SUBEXP)
2613
0
    {
2614
0
      uintptr_t subidx = elem->token.opr.idx;
2615
0
      postorder (elem, mark_opt_subexp, (void *) subidx);
2616
0
    }
2617
2618
0
  tree = create_tree (dfa, elem, NULL,
2619
0
          (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2620
0
  if (__glibc_unlikely (tree == NULL))
2621
0
    goto parse_dup_op_espace;
2622
2623
  /* This loop is actually executed only when end != -1,
2624
     to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2625
     already created the start+1-th copy.  */
2626
0
  if (TYPE_SIGNED (Idx) || end != -1)
2627
0
    for (i = start + 2; i <= end; ++i)
2628
0
      {
2629
0
  elem = duplicate_tree (elem, dfa);
2630
0
  tree = create_tree (dfa, tree, elem, CONCAT);
2631
0
  if (__glibc_unlikely (elem == NULL || tree == NULL))
2632
0
    goto parse_dup_op_espace;
2633
2634
0
  tree = create_tree (dfa, tree, NULL, OP_ALT);
2635
0
  if (__glibc_unlikely (tree == NULL))
2636
0
    goto parse_dup_op_espace;
2637
0
      }
2638
2639
0
  if (old_tree)
2640
0
    tree = create_tree (dfa, old_tree, tree, CONCAT);
2641
2642
0
  return tree;
2643
2644
0
 parse_dup_op_espace:
2645
0
  *err = REG_ESPACE;
2646
0
  return NULL;
2647
0
}
2648
2649
/* Size of the names for collating symbol/equivalence_class/character_class.
2650
   I'm not sure, but maybe enough.  */
2651
0
#define BRACKET_NAME_BUF_SIZE 32
2652
2653
#ifndef _LIBC
2654
2655
/* Convert the byte B to the corresponding wide character.  In a
2656
   unibyte locale, treat B as itself.  In a multibyte locale, return
2657
   WEOF if B is an encoding error.  */
2658
static wint_t
2659
parse_byte (unsigned char b, re_dfa_t const *dfa)
2660
0
{
2661
0
  return dfa->mb_cur_max > 1 ? __btowc (b) : b;
2662
0
}
2663
2664
/* Local function for parse_bracket_exp used in _LIBC environment.
2665
   Build the range expression which starts from START_ELEM, and ends
2666
   at END_ELEM.  The result are written to MBCSET and SBCSET.
2667
   RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2668
   mbcset->range_ends, is a pointer argument since we may
2669
   update it.  */
2670
2671
static reg_errcode_t
2672
build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
2673
     bracket_elem_t *start_elem, bracket_elem_t *end_elem,
2674
     re_dfa_t *dfa, reg_syntax_t syntax, uint_fast32_t nrules,
2675
     const unsigned char *collseqmb, const char *collseqwc,
2676
     int_fast32_t table_size, const void *symb_table,
2677
     const unsigned char *extra)
2678
0
{
2679
  /* Equivalence Classes and Character Classes can't be a range start/end.  */
2680
0
  if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2681
0
      || start_elem->type == CHAR_CLASS
2682
0
      || end_elem->type == EQUIV_CLASS
2683
0
      || end_elem->type == CHAR_CLASS))
2684
0
    return REG_ERANGE;
2685
2686
  /* We can handle no multi character collating elements without libc
2687
     support.  */
2688
0
  if (__glibc_unlikely ((start_elem->type == COLL_SYM
2689
0
       && strlen ((char *) start_elem->opr.name) > 1)
2690
0
      || (end_elem->type == COLL_SYM
2691
0
          && strlen ((char *) end_elem->opr.name) > 1)))
2692
0
    return REG_ECOLLATE;
2693
2694
0
  unsigned int
2695
0
    start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2696
0
    : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2697
0
       : 0)),
2698
0
    end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2699
0
        : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2700
0
     : 0));
2701
0
  wint_t
2702
0
    start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2703
0
    ? parse_byte (start_ch, dfa) : start_elem->opr.wch),
2704
0
    end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2705
0
        ? parse_byte (end_ch, dfa) : end_elem->opr.wch);
2706
2707
0
  if (start_wc == WEOF || end_wc == WEOF)
2708
0
    return REG_ECOLLATE;
2709
0
  else if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2710
0
                             && start_wc > end_wc))
2711
0
    return REG_ERANGE;
2712
2713
  /* Got valid collation sequence values, add them as a new entry.
2714
     However, for !_LIBC we have no collation elements: if the
2715
     character set is single byte, the single byte character set
2716
     that we build below suffices.  parse_bracket_exp passes
2717
     no MBCSET if dfa->mb_cur_max == 1.  */
2718
0
  if (dfa->mb_cur_max > 1)
2719
0
    {
2720
      /* Check the space of the arrays.  */
2721
0
      if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2722
0
        {
2723
          /* There is not enough space, need realloc.  */
2724
0
          wchar_t *new_array_start, *new_array_end;
2725
0
          Idx new_nranges;
2726
2727
          /* +1 in case of mbcset->nranges is 0.  */
2728
0
          new_nranges = 2 * mbcset->nranges + 1;
2729
          /* Use realloc since mbcset->range_starts and mbcset->range_ends
2730
             are NULL if *range_alloc == 0.  */
2731
0
          new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2732
0
                                        new_nranges);
2733
0
          new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2734
0
                                      new_nranges);
2735
2736
0
          if (__glibc_unlikely (new_array_start == NULL
2737
0
                                || new_array_end == NULL))
2738
0
            {
2739
0
              re_free (new_array_start);
2740
0
              re_free (new_array_end);
2741
0
              return REG_ESPACE;
2742
0
            }
2743
2744
0
          mbcset->range_starts = new_array_start;
2745
0
          mbcset->range_ends = new_array_end;
2746
0
          *range_alloc = new_nranges;
2747
0
        }
2748
2749
0
      mbcset->range_starts[mbcset->nranges] = start_wc;
2750
0
      mbcset->range_ends[mbcset->nranges++] = end_wc;
2751
0
    }
2752
2753
  /* Build the table for single byte characters.  */
2754
0
  for (wchar_t wc = 0; wc < SBC_MAX; ++wc)
2755
0
    {
2756
0
      if (start_wc <= wc && wc <= end_wc)
2757
0
        bitset_set (sbcset, wc);
2758
0
    }
2759
2760
0
  return REG_NOERROR;
2761
0
}
2762
#endif /* not _LIBC */
2763
2764
#ifndef _LIBC
2765
/* Helper function for parse_bracket_exp only used in case of NOT _LIBC.
2766
   Build the collating element which is represented by NAME.
2767
   The result are written to MBCSET and SBCSET.
2768
   COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2769
   pointer argument since we may update it.  */
2770
2771
static reg_errcode_t
2772
build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2773
      Idx *coll_sym_alloc, const unsigned char *name,
2774
      uint_fast32_t nrules, int_fast32_t table_size,
2775
      const void *symb_table, const unsigned char *extra)
2776
0
{
2777
0
  size_t name_len = strlen ((const char *) name);
2778
0
  if (__glibc_unlikely (name_len != 1))
2779
0
    return REG_ECOLLATE;
2780
0
  else
2781
0
    {
2782
0
      bitset_set (sbcset, name[0]);
2783
0
      return REG_NOERROR;
2784
0
    }
2785
0
}
2786
#endif /* not _LIBC */
2787
2788
#ifdef _LIBC
2789
/* Local function for parse_bracket_exp used in _LIBC environment.
2790
   Seek the collating symbol entry corresponding to NAME.
2791
   Return the index of the symbol in the SYMB_TABLE,
2792
   or -1 if not found.  */
2793
2794
static __always_inline int32_t
2795
seek_collating_symbol_entry (const unsigned char *name, size_t name_len,
2796
           const int32_t *symb_table,
2797
           int_fast32_t table_size,
2798
           const unsigned char *extra)
2799
{
2800
  int_fast32_t elem;
2801
2802
  for (elem = 0; elem < table_size; elem++)
2803
    if (symb_table[2 * elem] != 0)
2804
      {
2805
  int32_t idx = symb_table[2 * elem + 1];
2806
  /* Skip the name of collating element name.  */
2807
  idx += 1 + extra[idx];
2808
  if (/* Compare the length of the name.  */
2809
      name_len == extra[idx]
2810
      /* Compare the name.  */
2811
      && memcmp (name, &extra[idx + 1], name_len) == 0)
2812
    /* Yep, this is the entry.  */
2813
    return elem;
2814
      }
2815
  return -1;
2816
}
2817
2818
/* Local function for parse_bracket_exp used in _LIBC environment.
2819
   Look up the collation sequence value of BR_ELEM.
2820
   Return the value if succeeded, UINT_MAX otherwise.  */
2821
2822
static __always_inline unsigned int
2823
lookup_collation_sequence_value (bracket_elem_t *br_elem, uint32_t nrules,
2824
         const unsigned char *collseqmb,
2825
         const char *collseqwc,
2826
         int_fast32_t table_size,
2827
         const int32_t *symb_table,
2828
         const unsigned char *extra)
2829
{
2830
  if (br_elem->type == SB_CHAR)
2831
    {
2832
      /* if (MB_CUR_MAX == 1) */
2833
      if (nrules == 0)
2834
  return collseqmb[br_elem->opr.ch];
2835
      else
2836
  {
2837
    wint_t wc = __btowc (br_elem->opr.ch);
2838
    return __collseq_table_lookup (collseqwc, wc);
2839
  }
2840
    }
2841
  else if (br_elem->type == MB_CHAR)
2842
    {
2843
      if (nrules != 0)
2844
  return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2845
    }
2846
  else if (br_elem->type == COLL_SYM)
2847
    {
2848
      size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2849
      if (nrules != 0)
2850
  {
2851
    int32_t elem, idx;
2852
    elem = seek_collating_symbol_entry (br_elem->opr.name,
2853
                sym_name_len,
2854
                symb_table, table_size,
2855
                extra);
2856
    if (elem != -1)
2857
      {
2858
        /* We found the entry.  */
2859
        idx = symb_table[2 * elem + 1];
2860
        /* Skip the name of collating element name.  */
2861
        idx += 1 + extra[idx];
2862
        /* Skip the byte sequence of the collating element.  */
2863
        idx += 1 + extra[idx];
2864
        /* Adjust for the alignment.  */
2865
        idx = (idx + 3) & ~3;
2866
        /* Skip the multibyte collation sequence value.  */
2867
        idx += sizeof (unsigned int);
2868
        /* Skip the wide char sequence of the collating element.  */
2869
        idx += sizeof (unsigned int) *
2870
    (1 + *(unsigned int *) (extra + idx));
2871
        /* Return the collation sequence value.  */
2872
        return *(unsigned int *) (extra + idx);
2873
      }
2874
    else if (sym_name_len == 1)
2875
      {
2876
        /* No valid character.  Match it as a single byte
2877
     character.  */
2878
        return collseqmb[br_elem->opr.name[0]];
2879
      }
2880
  }
2881
      else if (sym_name_len == 1)
2882
  return collseqmb[br_elem->opr.name[0]];
2883
    }
2884
  return UINT_MAX;
2885
}
2886
2887
/* Local function for parse_bracket_exp used in _LIBC environment.
2888
   Build the range expression which starts from START_ELEM, and ends
2889
   at END_ELEM.  The result are written to MBCSET and SBCSET.
2890
   RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2891
   mbcset->range_ends, is a pointer argument since we may
2892
   update it.  */
2893
2894
static __always_inline reg_errcode_t
2895
build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
2896
     bracket_elem_t *start_elem, bracket_elem_t *end_elem,
2897
     re_dfa_t *dfa, reg_syntax_t syntax, uint32_t nrules,
2898
     const unsigned char *collseqmb, const char *collseqwc,
2899
     int_fast32_t table_size, const int32_t *symb_table,
2900
     const unsigned char *extra)
2901
{
2902
  unsigned int ch;
2903
  uint32_t start_collseq;
2904
  uint32_t end_collseq;
2905
2906
  /* Equivalence Classes and Character Classes can't be a range
2907
     start/end.  */
2908
  if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2909
                        || start_elem->type == CHAR_CLASS
2910
                        || end_elem->type == EQUIV_CLASS
2911
                        || end_elem->type == CHAR_CLASS))
2912
    return REG_ERANGE;
2913
2914
  /* FIXME: Implement rational ranges here, too.  */
2915
  start_collseq = lookup_collation_sequence_value (start_elem, nrules, collseqmb, collseqwc,
2916
               table_size, symb_table, extra);
2917
  end_collseq = lookup_collation_sequence_value (end_elem, nrules, collseqmb, collseqwc,
2918
             table_size, symb_table, extra);
2919
  /* Check start/end collation sequence values.  */
2920
  if (__glibc_unlikely (start_collseq == UINT_MAX
2921
                        || end_collseq == UINT_MAX))
2922
    return REG_ECOLLATE;
2923
  if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2924
                        && start_collseq > end_collseq))
2925
    return REG_ERANGE;
2926
2927
  /* Got valid collation sequence values, add them as a new entry.
2928
     However, if we have no collation elements, and the character set
2929
     is single byte, the single byte character set that we
2930
     build below suffices. */
2931
  if (nrules > 0 || dfa->mb_cur_max > 1)
2932
    {
2933
      /* Check the space of the arrays.  */
2934
      if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2935
  {
2936
    /* There is not enough space, need realloc.  */
2937
    uint32_t *new_array_start;
2938
    uint32_t *new_array_end;
2939
    int new_nranges;
2940
2941
    /* +1 in case of mbcset->nranges is 0.  */
2942
    new_nranges = 2 * mbcset->nranges + 1;
2943
    new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2944
          new_nranges);
2945
    new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2946
              new_nranges);
2947
2948
          if (__glibc_unlikely (new_array_start == NULL
2949
                                || new_array_end == NULL))
2950
      return REG_ESPACE;
2951
2952
    mbcset->range_starts = new_array_start;
2953
    mbcset->range_ends = new_array_end;
2954
    *range_alloc = new_nranges;
2955
  }
2956
2957
      mbcset->range_starts[mbcset->nranges] = start_collseq;
2958
      mbcset->range_ends[mbcset->nranges++] = end_collseq;
2959
    }
2960
2961
  /* Build the table for single byte characters.  */
2962
  for (ch = 0; ch < SBC_MAX; ch++)
2963
    {
2964
      uint32_t ch_collseq;
2965
      /* if (MB_CUR_MAX == 1) */
2966
      if (nrules == 0)
2967
  ch_collseq = collseqmb[ch];
2968
      else
2969
  ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2970
      if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2971
  bitset_set (sbcset, ch);
2972
    }
2973
  return REG_NOERROR;
2974
}
2975
2976
/* Local function for parse_bracket_exp used in _LIBC environment.
2977
   Build the collating element which is represented by NAME.
2978
   The result are written to MBCSET and SBCSET.
2979
   COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2980
   pointer argument since we may update it.  */
2981
2982
static __always_inline reg_errcode_t
2983
build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2984
      Idx *coll_sym_alloc, const unsigned char *name,
2985
      uint_fast32_t nrules, int_fast32_t table_size,
2986
      const int32_t *symb_table, const unsigned char *extra)
2987
{
2988
  int32_t elem, idx;
2989
  size_t name_len = strlen ((const char *) name);
2990
  if (nrules != 0)
2991
    {
2992
      elem = seek_collating_symbol_entry (name, name_len, symb_table,
2993
            table_size, extra);
2994
      if (elem != -1)
2995
  {
2996
    /* We found the entry.  */
2997
    idx = symb_table[2 * elem + 1];
2998
    /* Skip the name of collating element name.  */
2999
    idx += 1 + extra[idx];
3000
  }
3001
      else if (name_len == 1)
3002
  {
3003
    /* No valid character, treat it as a normal
3004
       character.  */
3005
    bitset_set (sbcset, name[0]);
3006
    return REG_NOERROR;
3007
  }
3008
      else
3009
  return REG_ECOLLATE;
3010
3011
      /* Got valid collation sequence, add it as a new entry.  */
3012
      /* Check the space of the arrays.  */
3013
      if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms))
3014
  {
3015
    /* Not enough, realloc it.  */
3016
    /* +1 in case of mbcset->ncoll_syms is 0.  */
3017
    int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3018
    /* Use realloc since mbcset->coll_syms is NULL
3019
       if *alloc == 0.  */
3020
    int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3021
                 new_coll_sym_alloc);
3022
          if (__glibc_unlikely (new_coll_syms == NULL))
3023
      return REG_ESPACE;
3024
    mbcset->coll_syms = new_coll_syms;
3025
    *coll_sym_alloc = new_coll_sym_alloc;
3026
  }
3027
      mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3028
      return REG_NOERROR;
3029
    }
3030
  else
3031
    {
3032
      if (__glibc_unlikely (name_len != 1))
3033
  return REG_ECOLLATE;
3034
      else
3035
  {
3036
    bitset_set (sbcset, name[0]);
3037
    return REG_NOERROR;
3038
  }
3039
    }
3040
}
3041
#endif /* _LIBC */
3042
3043
/* This function parse bracket expression like "[abc]", "[a-c]",
3044
   "[[.a-a.]]" etc.  */
3045
3046
static bin_tree_t *
3047
parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
3048
       reg_syntax_t syntax, reg_errcode_t *err)
3049
0
{
3050
0
  const unsigned char *collseqmb = NULL;
3051
0
  const char *collseqwc = NULL;
3052
0
  uint_fast32_t nrules = 0;
3053
0
  int_fast32_t table_size = 0;
3054
0
  const void *symb_table = NULL;
3055
0
  const unsigned char *extra = NULL;
3056
3057
0
  re_token_t br_token;
3058
0
  re_bitset_ptr_t sbcset;
3059
0
  re_charset_t *mbcset;
3060
0
  Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3061
0
  Idx equiv_class_alloc = 0, char_class_alloc = 0;
3062
0
  bool non_match = false;
3063
0
  bin_tree_t *work_tree;
3064
0
  int token_len;
3065
0
  bool first_round = true;
3066
#ifdef _LIBC
3067
  collseqmb = (const unsigned char *)
3068
    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3069
  nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3070
  if (nrules)
3071
    {
3072
      /*
3073
      if (MB_CUR_MAX > 1)
3074
      */
3075
      collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3076
      table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3077
      symb_table = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_TABLEMB);
3078
      extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3079
               _NL_COLLATE_SYMB_EXTRAMB);
3080
    }
3081
#endif
3082
0
  sbcset = (re_bitset_ptr_t) calloc (1, sizeof (bitset_t));
3083
0
  mbcset = (re_charset_t *) calloc (1, sizeof (re_charset_t));
3084
0
  if (__glibc_unlikely (sbcset == NULL || mbcset == NULL))
3085
0
    {
3086
0
      re_free (sbcset);
3087
0
      re_free (mbcset);
3088
0
      *err = REG_ESPACE;
3089
0
      return NULL;
3090
0
    }
3091
3092
0
  token_len = peek_token_bracket (token, regexp, syntax);
3093
0
  if (__glibc_unlikely (token->type == END_OF_RE))
3094
0
    {
3095
0
      *err = REG_BADPAT;
3096
0
      goto parse_bracket_exp_free_return;
3097
0
    }
3098
0
  if (token->type == OP_NON_MATCH_LIST)
3099
0
    {
3100
0
      mbcset->non_match = 1;
3101
0
      non_match = true;
3102
0
      if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3103
0
  bitset_set (sbcset, '\n');
3104
0
      re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3105
0
      token_len = peek_token_bracket (token, regexp, syntax);
3106
0
      if (__glibc_unlikely (token->type == END_OF_RE))
3107
0
  {
3108
0
    *err = REG_BADPAT;
3109
0
    goto parse_bracket_exp_free_return;
3110
0
  }
3111
0
    }
3112
3113
  /* We treat the first ']' as a normal character.  */
3114
0
  if (token->type == OP_CLOSE_BRACKET)
3115
0
    token->type = CHARACTER;
3116
3117
0
  while (1)
3118
0
    {
3119
0
      bracket_elem_t start_elem, end_elem;
3120
0
      unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3121
0
      unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3122
0
      reg_errcode_t ret;
3123
0
      int token_len2 = 0;
3124
0
      bool is_range_exp = false;
3125
0
      re_token_t token2;
3126
3127
0
      start_elem.opr.name = start_name_buf;
3128
0
      start_elem.type = COLL_SYM;
3129
0
      ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3130
0
           syntax, first_round);
3131
0
      if (__glibc_unlikely (ret != REG_NOERROR))
3132
0
  {
3133
0
    *err = ret;
3134
0
    goto parse_bracket_exp_free_return;
3135
0
  }
3136
0
      first_round = false;
3137
3138
      /* Get information about the next token.  We need it in any case.  */
3139
0
      token_len = peek_token_bracket (token, regexp, syntax);
3140
3141
      /* Do not check for ranges if we know they are not allowed.  */
3142
0
      if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3143
0
  {
3144
0
    if (__glibc_unlikely (token->type == END_OF_RE))
3145
0
      {
3146
0
        *err = REG_EBRACK;
3147
0
        goto parse_bracket_exp_free_return;
3148
0
      }
3149
0
    if (token->type == OP_CHARSET_RANGE)
3150
0
      {
3151
0
        re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3152
0
        token_len2 = peek_token_bracket (&token2, regexp, syntax);
3153
0
        if (__glibc_unlikely (token2.type == END_OF_RE))
3154
0
    {
3155
0
      *err = REG_EBRACK;
3156
0
      goto parse_bracket_exp_free_return;
3157
0
    }
3158
0
        if (token2.type == OP_CLOSE_BRACKET)
3159
0
    {
3160
      /* We treat the last '-' as a normal character.  */
3161
0
      re_string_skip_bytes (regexp, -token_len);
3162
0
      token->type = CHARACTER;
3163
0
    }
3164
0
        else
3165
0
    is_range_exp = true;
3166
0
      }
3167
0
  }
3168
3169
0
      if (is_range_exp == true)
3170
0
  {
3171
0
    end_elem.opr.name = end_name_buf;
3172
0
    end_elem.type = COLL_SYM;
3173
0
    ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3174
0
               dfa, syntax, true);
3175
0
    if (__glibc_unlikely (ret != REG_NOERROR))
3176
0
      {
3177
0
        *err = ret;
3178
0
        goto parse_bracket_exp_free_return;
3179
0
      }
3180
3181
0
    token_len = peek_token_bracket (token, regexp, syntax);
3182
3183
0
    *err = build_range_exp (sbcset, mbcset, &range_alloc,
3184
0
          &start_elem, &end_elem,
3185
0
          dfa, syntax, nrules, collseqmb, collseqwc,
3186
0
          table_size, symb_table, extra);
3187
0
    if (__glibc_unlikely (*err != REG_NOERROR))
3188
0
      goto parse_bracket_exp_free_return;
3189
0
  }
3190
0
      else
3191
0
  {
3192
0
    switch (start_elem.type)
3193
0
      {
3194
0
      case SB_CHAR:
3195
0
        bitset_set (sbcset, start_elem.opr.ch);
3196
0
        break;
3197
0
      case MB_CHAR:
3198
        /* Check whether the array has enough space.  */
3199
0
        if (__glibc_unlikely (mbchar_alloc == mbcset->nmbchars))
3200
0
    {
3201
0
      wchar_t *new_mbchars;
3202
      /* Not enough, realloc it.  */
3203
      /* +1 in case of mbcset->nmbchars is 0.  */
3204
0
      mbchar_alloc = 2 * mbcset->nmbchars + 1;
3205
      /* Use realloc since array is NULL if *alloc == 0.  */
3206
0
      new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3207
0
              mbchar_alloc);
3208
0
      if (__glibc_unlikely (new_mbchars == NULL))
3209
0
        goto parse_bracket_exp_espace;
3210
0
      mbcset->mbchars = new_mbchars;
3211
0
    }
3212
0
        mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3213
0
        break;
3214
0
      case EQUIV_CLASS:
3215
0
        *err = build_equiv_class (sbcset,
3216
0
          mbcset, &equiv_class_alloc,
3217
0
          start_elem.opr.name);
3218
0
        if (__glibc_unlikely (*err != REG_NOERROR))
3219
0
    goto parse_bracket_exp_free_return;
3220
0
        break;
3221
0
      case COLL_SYM:
3222
0
        *err = build_collating_symbol (sbcset,
3223
0
               mbcset, &coll_sym_alloc,
3224
0
               start_elem.opr.name,
3225
0
               nrules, table_size, symb_table, extra);
3226
0
        if (__glibc_unlikely (*err != REG_NOERROR))
3227
0
    goto parse_bracket_exp_free_return;
3228
0
        break;
3229
0
      case CHAR_CLASS:
3230
0
        *err = build_charclass (regexp->trans, sbcset,
3231
0
              mbcset, &char_class_alloc,
3232
0
              (const char *) start_elem.opr.name,
3233
0
              syntax);
3234
0
        if (__glibc_unlikely (*err != REG_NOERROR))
3235
0
         goto parse_bracket_exp_free_return;
3236
0
        break;
3237
0
      default:
3238
0
        DEBUG_ASSERT (false);
3239
0
        break;
3240
0
      }
3241
0
  }
3242
0
      if (__glibc_unlikely (token->type == END_OF_RE))
3243
0
  {
3244
0
    *err = REG_EBRACK;
3245
0
    goto parse_bracket_exp_free_return;
3246
0
  }
3247
0
      if (token->type == OP_CLOSE_BRACKET)
3248
0
  break;
3249
0
    }
3250
3251
0
  re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3252
3253
  /* If it is non-matching list.  */
3254
0
  if (non_match)
3255
0
    bitset_not (sbcset);
3256
3257
  /* Ensure only single byte characters are set.  */
3258
0
  if (dfa->mb_cur_max > 1)
3259
0
    bitset_mask (sbcset, dfa->sb_char);
3260
3261
0
  if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3262
0
      || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3263
0
                 || mbcset->non_match)))
3264
0
    {
3265
0
      bin_tree_t *mbc_tree;
3266
0
      int sbc_idx;
3267
      /* Build a tree for complex bracket.  */
3268
0
      dfa->has_mb_node = 1;
3269
0
      br_token.type = COMPLEX_BRACKET;
3270
0
      br_token.opr.mbcset = mbcset;
3271
0
      mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3272
0
      if (__glibc_unlikely (mbc_tree == NULL))
3273
0
  goto parse_bracket_exp_espace;
3274
0
      for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3275
0
  if (sbcset[sbc_idx])
3276
0
    break;
3277
      /* If there are no bits set in sbcset, there is no point
3278
   of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3279
0
      if (sbc_idx < BITSET_WORDS)
3280
0
  {
3281
    /* Build a tree for simple bracket.  */
3282
0
    br_token.type = SIMPLE_BRACKET;
3283
0
    br_token.opr.sbcset = sbcset;
3284
0
    work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3285
0
    if (__glibc_unlikely (work_tree == NULL))
3286
0
      goto parse_bracket_exp_espace;
3287
3288
    /* Then join them by ALT node.  */
3289
0
    work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3290
0
    if (__glibc_unlikely (work_tree == NULL))
3291
0
      goto parse_bracket_exp_espace;
3292
0
  }
3293
0
      else
3294
0
  {
3295
0
    re_free (sbcset);
3296
0
    work_tree = mbc_tree;
3297
0
  }
3298
0
    }
3299
0
  else
3300
0
    {
3301
0
      free_charset (mbcset);
3302
0
      mbcset = NULL;
3303
      /* Build a tree for simple bracket.  */
3304
0
      br_token.type = SIMPLE_BRACKET;
3305
0
      br_token.opr.sbcset = sbcset;
3306
0
      work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3307
0
      if (__glibc_unlikely (work_tree == NULL))
3308
0
  goto parse_bracket_exp_espace;
3309
0
    }
3310
0
  return work_tree;
3311
3312
0
 parse_bracket_exp_espace:
3313
0
  *err = REG_ESPACE;
3314
0
 parse_bracket_exp_free_return:
3315
0
  re_free (sbcset);
3316
0
  if (__glibc_likely (mbcset != NULL))
3317
0
    free_charset (mbcset);
3318
0
  return NULL;
3319
0
}
3320
3321
/* Parse an element in the bracket expression.  */
3322
3323
static reg_errcode_t
3324
parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3325
           re_token_t *token, int token_len, re_dfa_t *dfa,
3326
           reg_syntax_t syntax, bool accept_hyphen)
3327
0
{
3328
0
  int cur_char_size;
3329
0
  cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3330
0
  if (cur_char_size > 1)
3331
0
    {
3332
0
      elem->type = MB_CHAR;
3333
0
      elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3334
0
      re_string_skip_bytes (regexp, cur_char_size);
3335
0
      return REG_NOERROR;
3336
0
    }
3337
0
  re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3338
0
  if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3339
0
      || token->type == OP_OPEN_EQUIV_CLASS)
3340
0
    return parse_bracket_symbol (elem, regexp, token);
3341
0
  if (__glibc_unlikely (token->type == OP_CHARSET_RANGE) && !accept_hyphen)
3342
0
    {
3343
      /* A '-' must only appear as anything but a range indicator before
3344
   the closing bracket.  Everything else is an error.  */
3345
0
      re_token_t token2;
3346
0
      (void) peek_token_bracket (&token2, regexp, syntax);
3347
0
      if (token2.type != OP_CLOSE_BRACKET)
3348
  /* The actual error value is not standardized since this whole
3349
     case is undefined.  But ERANGE makes good sense.  */
3350
0
  return REG_ERANGE;
3351
0
    }
3352
0
  elem->type = SB_CHAR;
3353
0
  elem->opr.ch = token->opr.c;
3354
0
  return REG_NOERROR;
3355
0
}
3356
3357
/* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3358
   such as [:<character_class>:], [.<collating_element>.], and
3359
   [=<equivalent_class>=].  */
3360
3361
static reg_errcode_t
3362
parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3363
          re_token_t *token)
3364
0
{
3365
0
  unsigned char ch, delim = token->opr.c;
3366
0
  int i = 0;
3367
0
  if (re_string_eoi(regexp))
3368
0
    return REG_EBRACK;
3369
0
  for (;; ++i)
3370
0
    {
3371
0
      if (i >= BRACKET_NAME_BUF_SIZE)
3372
0
  return REG_EBRACK;
3373
0
      if (token->type == OP_OPEN_CHAR_CLASS)
3374
0
  ch = re_string_fetch_byte_case (regexp);
3375
0
      else
3376
0
  ch = re_string_fetch_byte (regexp);
3377
0
      if (re_string_eoi(regexp))
3378
0
  return REG_EBRACK;
3379
0
      if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3380
0
  break;
3381
0
      elem->opr.name[i] = ch;
3382
0
    }
3383
0
  re_string_skip_bytes (regexp, 1);
3384
0
  elem->opr.name[i] = '\0';
3385
0
  switch (token->type)
3386
0
    {
3387
0
    case OP_OPEN_COLL_ELEM:
3388
0
      elem->type = COLL_SYM;
3389
0
      break;
3390
0
    case OP_OPEN_EQUIV_CLASS:
3391
0
      elem->type = EQUIV_CLASS;
3392
0
      break;
3393
0
    case OP_OPEN_CHAR_CLASS:
3394
0
      elem->type = CHAR_CLASS;
3395
0
      break;
3396
0
    default:
3397
0
      break;
3398
0
    }
3399
0
  return REG_NOERROR;
3400
0
}
3401
3402
  /* Helper function for parse_bracket_exp.
3403
     Build the equivalence class which is represented by NAME.
3404
     The result are written to MBCSET and SBCSET.
3405
     EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3406
     is a pointer argument since we may update it.  */
3407
3408
static reg_errcode_t
3409
build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3410
       Idx *equiv_class_alloc, const unsigned char *name)
3411
0
{
3412
#ifdef _LIBC
3413
  uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3414
  if (nrules != 0)
3415
    {
3416
      const int32_t *table, *indirect;
3417
      const unsigned char *weights, *extra, *cp;
3418
      unsigned char char_buf[2];
3419
      int32_t idx1, idx2;
3420
      unsigned int ch;
3421
      size_t len;
3422
      /* Calculate the index for equivalence class.  */
3423
      cp = name;
3424
      table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3425
      weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3426
                 _NL_COLLATE_WEIGHTMB);
3427
      extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3428
               _NL_COLLATE_EXTRAMB);
3429
      indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3430
            _NL_COLLATE_INDIRECTMB);
3431
      idx1 = findidx (table, indirect, extra, &cp, -1);
3432
      if (__glibc_unlikely (idx1 == 0 || *cp != '\0'))
3433
  /* This isn't a valid character.  */
3434
  return REG_ECOLLATE;
3435
3436
      /* Build single byte matching table for this equivalence class.  */
3437
      len = weights[idx1 & 0xffffff];
3438
      for (ch = 0; ch < SBC_MAX; ++ch)
3439
  {
3440
    char_buf[0] = ch;
3441
    cp = char_buf;
3442
    idx2 = findidx (table, indirect, extra, &cp, 1);
3443
/*
3444
    idx2 = table[ch];
3445
*/
3446
    if (idx2 == 0)
3447
      /* This isn't a valid character.  */
3448
      continue;
3449
    /* Compare only if the length matches and the collation rule
3450
       index is the same.  */
3451
    if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)
3452
        && memcmp (weights + (idx1 & 0xffffff) + 1,
3453
       weights + (idx2 & 0xffffff) + 1, len) == 0)
3454
      bitset_set (sbcset, ch);
3455
  }
3456
      /* Check whether the array has enough space.  */
3457
      if (__glibc_unlikely (*equiv_class_alloc == mbcset->nequiv_classes))
3458
  {
3459
    /* Not enough, realloc it.  */
3460
    /* +1 in case of mbcset->nequiv_classes is 0.  */
3461
    Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3462
    /* Use realloc since the array is NULL if *alloc == 0.  */
3463
    int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3464
               int32_t,
3465
               new_equiv_class_alloc);
3466
    if (__glibc_unlikely (new_equiv_classes == NULL))
3467
      return REG_ESPACE;
3468
    mbcset->equiv_classes = new_equiv_classes;
3469
    *equiv_class_alloc = new_equiv_class_alloc;
3470
  }
3471
      mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3472
    }
3473
  else
3474
#endif /* _LIBC */
3475
0
    {
3476
0
      if (__glibc_unlikely (strlen ((const char *) name) != 1))
3477
0
  return REG_ECOLLATE;
3478
0
      bitset_set (sbcset, *name);
3479
0
    }
3480
0
  return REG_NOERROR;
3481
0
}
3482
3483
  /* Helper function for parse_bracket_exp.
3484
     Build the character class which is represented by NAME.
3485
     The result are written to MBCSET and SBCSET.
3486
     CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3487
     is a pointer argument since we may update it.  */
3488
3489
static reg_errcode_t
3490
build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3491
     re_charset_t *mbcset, Idx *char_class_alloc,
3492
     const char *class_name, reg_syntax_t syntax)
3493
0
{
3494
0
  int i;
3495
0
  const char *name = class_name;
3496
3497
  /* In case of REG_ICASE "upper" and "lower" match the both of
3498
     upper and lower cases.  */
3499
0
  if ((syntax & RE_ICASE)
3500
0
      && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3501
0
    name = "alpha";
3502
3503
  /* Check the space of the arrays.  */
3504
0
  if (__glibc_unlikely (*char_class_alloc == mbcset->nchar_classes))
3505
0
    {
3506
      /* Not enough, realloc it.  */
3507
      /* +1 in case of mbcset->nchar_classes is 0.  */
3508
0
      Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3509
      /* Use realloc since array is NULL if *alloc == 0.  */
3510
0
      wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3511
0
                 new_char_class_alloc);
3512
0
      if (__glibc_unlikely (new_char_classes == NULL))
3513
0
  return REG_ESPACE;
3514
0
      mbcset->char_classes = new_char_classes;
3515
0
      *char_class_alloc = new_char_class_alloc;
3516
0
    }
3517
0
  mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3518
3519
0
#define BUILD_CHARCLASS_LOOP(ctype_func)  \
3520
0
  do {           \
3521
0
    if (__glibc_unlikely (trans != NULL))     \
3522
0
      {           \
3523
0
  for (i = 0; i < SBC_MAX; ++i)   \
3524
0
    if (ctype_func (i))     \
3525
0
      bitset_set (sbcset, trans[i]); \
3526
0
      }           \
3527
0
    else          \
3528
0
      {           \
3529
0
  for (i = 0; i < SBC_MAX; ++i)   \
3530
0
    if (ctype_func (i))     \
3531
0
      bitset_set (sbcset, i);   \
3532
0
      }           \
3533
0
  } while (0)
3534
3535
0
  if (strcmp (name, "alnum") == 0)
3536
0
    BUILD_CHARCLASS_LOOP (isalnum);
3537
0
  else if (strcmp (name, "cntrl") == 0)
3538
0
    BUILD_CHARCLASS_LOOP (iscntrl);
3539
0
  else if (strcmp (name, "lower") == 0)
3540
0
    BUILD_CHARCLASS_LOOP (islower);
3541
0
  else if (strcmp (name, "space") == 0)
3542
0
    BUILD_CHARCLASS_LOOP (isspace);
3543
0
  else if (strcmp (name, "alpha") == 0)
3544
0
    BUILD_CHARCLASS_LOOP (isalpha);
3545
0
  else if (strcmp (name, "digit") == 0)
3546
0
    BUILD_CHARCLASS_LOOP (isdigit);
3547
0
  else if (strcmp (name, "print") == 0)
3548
0
    BUILD_CHARCLASS_LOOP (isprint);
3549
0
  else if (strcmp (name, "upper") == 0)
3550
0
    BUILD_CHARCLASS_LOOP (isupper);
3551
0
  else if (strcmp (name, "blank") == 0)
3552
0
    BUILD_CHARCLASS_LOOP (isblank);
3553
0
  else if (strcmp (name, "graph") == 0)
3554
0
    BUILD_CHARCLASS_LOOP (isgraph);
3555
0
  else if (strcmp (name, "punct") == 0)
3556
0
    BUILD_CHARCLASS_LOOP (ispunct);
3557
0
  else if (strcmp (name, "xdigit") == 0)
3558
0
    BUILD_CHARCLASS_LOOP (isxdigit);
3559
0
  else
3560
0
    return REG_ECTYPE;
3561
3562
0
  return REG_NOERROR;
3563
0
}
3564
3565
static bin_tree_t *
3566
build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3567
        const char *class_name,
3568
        const char *extra, bool non_match,
3569
        reg_errcode_t *err)
3570
0
{
3571
0
  re_bitset_ptr_t sbcset;
3572
0
  re_charset_t *mbcset;
3573
0
  Idx alloc = 0;
3574
0
  reg_errcode_t ret;
3575
0
  bin_tree_t *tree;
3576
3577
0
  sbcset = (re_bitset_ptr_t) calloc (1, sizeof (bitset_t));
3578
0
  if (__glibc_unlikely (sbcset == NULL))
3579
0
    {
3580
0
      *err = REG_ESPACE;
3581
0
      return NULL;
3582
0
    }
3583
0
  mbcset = (re_charset_t *) calloc (1, sizeof (re_charset_t));
3584
0
  if (__glibc_unlikely (mbcset == NULL))
3585
0
    {
3586
0
      re_free (sbcset);
3587
0
      *err = REG_ESPACE;
3588
0
      return NULL;
3589
0
    }
3590
0
  mbcset->non_match = non_match;
3591
3592
  /* We don't care the syntax in this case.  */
3593
0
  ret = build_charclass (trans, sbcset, mbcset, &alloc, class_name, 0);
3594
3595
0
  if (__glibc_unlikely (ret != REG_NOERROR))
3596
0
    {
3597
0
      re_free (sbcset);
3598
0
      free_charset (mbcset);
3599
0
      *err = ret;
3600
0
      return NULL;
3601
0
    }
3602
  /* \w match '_' also.  */
3603
0
  for (; *extra; extra++)
3604
0
    bitset_set (sbcset, *extra);
3605
3606
  /* If it is non-matching list.  */
3607
0
  if (non_match)
3608
0
    bitset_not (sbcset);
3609
3610
  /* Ensure only single byte characters are set.  */
3611
0
  if (dfa->mb_cur_max > 1)
3612
0
    bitset_mask (sbcset, dfa->sb_char);
3613
3614
  /* Build a tree for simple bracket.  */
3615
0
  re_token_t br_token = { .type = SIMPLE_BRACKET, .opr.sbcset = sbcset };
3616
0
  tree = create_token_tree (dfa, NULL, NULL, &br_token);
3617
0
  if (__glibc_unlikely (tree == NULL))
3618
0
    goto build_word_op_espace;
3619
3620
0
  if (dfa->mb_cur_max > 1)
3621
0
    {
3622
0
      bin_tree_t *mbc_tree;
3623
      /* Build a tree for complex bracket.  */
3624
0
      br_token.type = COMPLEX_BRACKET;
3625
0
      br_token.opr.mbcset = mbcset;
3626
0
      dfa->has_mb_node = 1;
3627
0
      mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3628
0
      if (__glibc_unlikely (mbc_tree == NULL))
3629
0
  goto build_word_op_espace;
3630
      /* Then join them by ALT node.  */
3631
0
      tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3632
0
      if (__glibc_likely (mbc_tree != NULL))
3633
0
  return tree;
3634
0
    }
3635
0
  else
3636
0
    {
3637
0
      free_charset (mbcset);
3638
0
      return tree;
3639
0
    }
3640
3641
0
 build_word_op_espace:
3642
0
  re_free (sbcset);
3643
0
  free_charset (mbcset);
3644
0
  *err = REG_ESPACE;
3645
0
  return NULL;
3646
0
}
3647
3648
/* This is intended for the expressions like "a{1,3}".
3649
   Fetch a number from 'input', and return the number.
3650
   Return -1 if the number field is empty like "{,1}".
3651
   Return RE_DUP_MAX + 1 if the number field is too large.
3652
   Return -2 if an error occurred.  */
3653
3654
static Idx
3655
fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3656
0
{
3657
0
  Idx num = -1;
3658
0
  unsigned char c;
3659
0
  while (1)
3660
0
    {
3661
0
      fetch_token (token, input, syntax);
3662
0
      c = token->opr.c;
3663
0
      if (__glibc_unlikely (token->type == END_OF_RE))
3664
0
  return -2;
3665
0
      if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3666
0
  break;
3667
0
      num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3668
0
       ? -2
3669
0
       : num == -1
3670
0
       ? c - '0'
3671
0
       : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3672
0
    }
3673
0
  return num;
3674
0
}
3675

3676
static void
3677
free_charset (re_charset_t *cset)
3678
0
{
3679
0
  re_free (cset->mbchars);
3680
#ifdef _LIBC
3681
  re_free (cset->coll_syms);
3682
  re_free (cset->equiv_classes);
3683
#endif
3684
0
  re_free (cset->range_starts);
3685
0
  re_free (cset->range_ends);
3686
0
  re_free (cset->char_classes);
3687
0
  re_free (cset);
3688
0
}
3689

3690
/* Functions for binary tree operation.  */
3691
3692
/* Create a tree node.  */
3693
3694
static bin_tree_t *
3695
create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3696
       re_token_type_t type)
3697
0
{
3698
0
  re_token_t t = { .type = type };
3699
0
  return create_token_tree (dfa, left, right, &t);
3700
0
}
3701
3702
static bin_tree_t *
3703
create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3704
       const re_token_t *token)
3705
0
{
3706
0
  bin_tree_t *tree;
3707
0
  if (__glibc_unlikely (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE))
3708
0
    {
3709
0
      bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3710
3711
0
      if (storage == NULL)
3712
0
  return NULL;
3713
0
      storage->next = dfa->str_tree_storage;
3714
0
      dfa->str_tree_storage = storage;
3715
0
      dfa->str_tree_storage_idx = 0;
3716
0
    }
3717
0
  tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3718
3719
0
  tree->parent = NULL;
3720
0
  tree->left = left;
3721
0
  tree->right = right;
3722
0
  tree->token = *token;
3723
0
  tree->token.duplicated = 0;
3724
0
  tree->token.opt_subexp = 0;
3725
0
  tree->first = NULL;
3726
0
  tree->next = NULL;
3727
0
  tree->node_idx = -1;
3728
3729
0
  if (left != NULL)
3730
0
    left->parent = tree;
3731
0
  if (right != NULL)
3732
0
    right->parent = tree;
3733
0
  return tree;
3734
0
}
3735
3736
/* Mark the tree SRC as an optional subexpression.
3737
   To be called from preorder or postorder.  */
3738
3739
static reg_errcode_t
3740
mark_opt_subexp (void *extra, bin_tree_t *node)
3741
0
{
3742
0
  Idx idx = (uintptr_t) extra;
3743
0
  if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3744
0
    node->token.opt_subexp = 1;
3745
3746
0
  return REG_NOERROR;
3747
0
}
3748
3749
/* Free the allocated memory inside NODE. */
3750
3751
static void
3752
free_token (re_token_t *node)
3753
0
{
3754
0
  if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3755
0
    free_charset (node->opr.mbcset);
3756
0
  else if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3757
0
    re_free (node->opr.sbcset);
3758
0
}
3759
3760
/* Worker function for tree walking.  Free the allocated memory inside NODE
3761
   and its children. */
3762
3763
static reg_errcode_t
3764
free_tree (void *extra, bin_tree_t *node)
3765
0
{
3766
0
  free_token (&node->token);
3767
0
  return REG_NOERROR;
3768
0
}
3769
3770
3771
/* Duplicate the node SRC, and return new node.  This is a preorder
3772
   visit similar to the one implemented by the generic visitor, but
3773
   we need more infrastructure to maintain two parallel trees --- so,
3774
   it's easier to duplicate.  */
3775
3776
static bin_tree_t *
3777
duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3778
0
{
3779
0
  const bin_tree_t *node;
3780
0
  bin_tree_t *dup_root;
3781
0
  bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3782
3783
0
  for (node = root; ; )
3784
0
    {
3785
      /* Create a new tree and link it back to the current parent.  */
3786
0
      *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3787
0
      if (*p_new == NULL)
3788
0
  return NULL;
3789
0
      (*p_new)->parent = dup_node;
3790
0
      (*p_new)->token.duplicated = 1;
3791
0
      dup_node = *p_new;
3792
3793
      /* Go to the left node, or up and to the right.  */
3794
0
      if (node->left)
3795
0
  {
3796
0
    node = node->left;
3797
0
    p_new = &dup_node->left;
3798
0
  }
3799
0
      else
3800
0
  {
3801
0
    const bin_tree_t *prev = NULL;
3802
0
    while (node->right == prev || node->right == NULL)
3803
0
      {
3804
0
        prev = node;
3805
0
        node = node->parent;
3806
0
        dup_node = dup_node->parent;
3807
0
        if (!node)
3808
0
    return dup_root;
3809
0
      }
3810
0
    node = node->right;
3811
0
    p_new = &dup_node->right;
3812
0
  }
3813
0
    }
3814
0
}