Coverage Report

Created: 2025-03-06 06:58

/src/wget/lib/regexec.c
Line
Count
Source (jump to first uncovered line)
1
/* Extended regular expression matching and search library.
2
   Copyright (C) 2002-2025 Free Software Foundation, Inc.
3
   This file is part of the GNU C Library.
4
   Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6
   The GNU C Library is free software; you can redistribute it and/or
7
   modify it under the terms of the GNU Lesser General Public
8
   License as published by the Free Software Foundation; either
9
   version 2.1 of the License, or (at your option) any later version.
10
11
   The GNU C Library is distributed in the hope that it will be useful,
12
   but WITHOUT ANY WARRANTY; without even the implied warranty of
13
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
   Lesser General Public License for more details.
15
16
   You should have received a copy of the GNU Lesser General Public
17
   License along with the GNU C Library; if not, see
18
   <https://www.gnu.org/licenses/>.  */
19
20
static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
21
             Idx n);
22
static void match_ctx_clean (re_match_context_t *mctx);
23
static void match_ctx_free (re_match_context_t *cache);
24
static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, Idx node,
25
            Idx str_idx, Idx from, Idx to);
26
static Idx search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx);
27
static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, Idx node,
28
             Idx str_idx);
29
static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
30
                Idx node, Idx str_idx);
31
static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
32
         re_dfastate_t **limited_sts, Idx last_node,
33
         Idx last_str_idx);
34
static reg_errcode_t re_search_internal (const regex_t *preg,
35
           const char *string, Idx length,
36
           Idx start, Idx last_start, Idx stop,
37
           size_t nmatch, regmatch_t pmatch[],
38
           int eflags);
39
static regoff_t re_search_2_stub (struct re_pattern_buffer *bufp,
40
          const char *string1, Idx length1,
41
          const char *string2, Idx length2,
42
          Idx start, regoff_t range,
43
          struct re_registers *regs,
44
          Idx stop, bool ret_len);
45
static regoff_t re_search_stub (struct re_pattern_buffer *bufp,
46
        const char *string, Idx length, Idx start,
47
        regoff_t range, Idx stop,
48
        struct re_registers *regs,
49
        bool ret_len);
50
static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
51
                              Idx nregs, int regs_allocated);
52
static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx);
53
static Idx check_matching (re_match_context_t *mctx, bool fl_longest_match,
54
         Idx *p_match_first);
55
static Idx check_halt_state_context (const re_match_context_t *mctx,
56
             const re_dfastate_t *state, Idx idx);
57
static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
58
       regmatch_t *prev_idx_match, Idx cur_node,
59
       Idx cur_idx, Idx nmatch);
60
static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
61
              Idx str_idx, Idx dest_node, Idx nregs,
62
              regmatch_t *regs, regmatch_t *prevregs,
63
              re_node_set *eps_via_nodes);
64
static reg_errcode_t set_regs (const regex_t *preg,
65
             const re_match_context_t *mctx,
66
             size_t nmatch, regmatch_t *pmatch,
67
             bool fl_backtrack);
68
static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs);
69
70
static int sift_states_iter_mb (const re_match_context_t *mctx,
71
        re_sift_context_t *sctx,
72
        Idx node_idx, Idx str_idx, Idx max_str_idx);
73
static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
74
             re_sift_context_t *sctx);
75
static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
76
            re_sift_context_t *sctx, Idx str_idx,
77
            re_node_set *cur_dest);
78
static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
79
                re_sift_context_t *sctx,
80
                Idx str_idx,
81
                re_node_set *dest_nodes);
82
static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
83
              re_node_set *dest_nodes,
84
              const re_node_set *candidates);
85
static bool check_dst_limits (const re_match_context_t *mctx,
86
            const re_node_set *limits,
87
            Idx dst_node, Idx dst_idx, Idx src_node,
88
            Idx src_idx);
89
static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
90
          int boundaries, Idx subexp_idx,
91
          Idx from_node, Idx bkref_idx);
92
static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
93
              Idx limit, Idx subexp_idx,
94
              Idx node, Idx str_idx,
95
              Idx bkref_idx);
96
static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
97
            re_node_set *dest_nodes,
98
            const re_node_set *candidates,
99
            re_node_set *limits,
100
            struct re_backref_cache_entry *bkref_ents,
101
            Idx str_idx);
102
static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
103
          re_sift_context_t *sctx,
104
          Idx str_idx, const re_node_set *candidates);
105
static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
106
          re_dfastate_t **dst,
107
          re_dfastate_t **src, Idx num);
108
static re_dfastate_t *find_recover_state (reg_errcode_t *err,
109
           re_match_context_t *mctx);
110
static re_dfastate_t *transit_state (reg_errcode_t *err,
111
             re_match_context_t *mctx,
112
             re_dfastate_t *state);
113
static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
114
              re_match_context_t *mctx,
115
              re_dfastate_t *next_state);
116
static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
117
            re_node_set *cur_nodes,
118
            Idx str_idx);
119
#if 0
120
static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
121
          re_match_context_t *mctx,
122
          re_dfastate_t *pstate);
123
#endif
124
static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
125
               re_dfastate_t *pstate);
126
static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
127
            const re_node_set *nodes);
128
static reg_errcode_t get_subexp (re_match_context_t *mctx,
129
         Idx bkref_node, Idx bkref_str_idx);
130
static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
131
             const re_sub_match_top_t *sub_top,
132
             re_sub_match_last_t *sub_last,
133
             Idx bkref_node, Idx bkref_str);
134
static Idx find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
135
           Idx subexp_idx, int type);
136
static reg_errcode_t check_arrival (re_match_context_t *mctx,
137
            state_array_t *path, Idx top_node,
138
            Idx top_str, Idx last_node, Idx last_str,
139
            int type);
140
static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
141
               Idx str_idx,
142
               re_node_set *cur_nodes,
143
               re_node_set *next_nodes);
144
static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
145
                 re_node_set *cur_nodes,
146
                 Idx ex_subexp, int type);
147
static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
148
               re_node_set *dst_nodes,
149
               Idx target, Idx ex_subexp,
150
               int type);
151
static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
152
           re_node_set *cur_nodes, Idx cur_str,
153
           Idx subexp_num, int type);
154
static bool build_trtable (const re_dfa_t *dfa, re_dfastate_t *state);
155
static int check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
156
            const re_string_t *input, Idx idx);
157
#ifdef _LIBC
158
static unsigned int find_collation_sequence_value (const unsigned char *mbs,
159
               size_t name_len);
160
#endif
161
static Idx group_nodes_into_DFAstates (const re_dfa_t *dfa,
162
               const re_dfastate_t *state,
163
               re_node_set *states_node,
164
               bitset_t *states_ch);
165
static bool check_node_accept (const re_match_context_t *mctx,
166
             const re_token_t *node, Idx idx);
167
static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len);
168

169
/* Entry point for POSIX code.  */
170
171
/* regexec searches for a given pattern, specified by PREG, in the
172
   string STRING.
173
174
   If NMATCH is zero or REG_NOSUB was set in the cflags argument to
175
   'regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
176
   least NMATCH elements, and we set them to the offsets of the
177
   corresponding matched substrings.
178
179
   EFLAGS specifies "execution flags" which affect matching: if
180
   REG_NOTBOL is set, then ^ does not match at the beginning of the
181
   string; if REG_NOTEOL is set, then $ does not match at the end.
182
183
   Return 0 if a match is found, REG_NOMATCH if not, REG_BADPAT if
184
   EFLAGS is invalid.  */
185
186
int
187
regexec (const regex_t *__restrict preg, const char *__restrict string,
188
   size_t nmatch, regmatch_t pmatch[_REGEX_NELTS (nmatch)], int eflags)
189
0
{
190
0
  reg_errcode_t err;
191
0
  Idx start, length;
192
0
  re_dfa_t *dfa = preg->buffer;
193
194
0
  if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
195
0
    return REG_BADPAT;
196
197
0
  if (eflags & REG_STARTEND)
198
0
    {
199
0
      start = pmatch[0].rm_so;
200
0
      length = pmatch[0].rm_eo;
201
0
    }
202
0
  else
203
0
    {
204
0
      start = 0;
205
0
      length = strlen (string);
206
0
    }
207
208
0
  lock_lock (dfa->lock);
209
0
  if (preg->no_sub)
210
0
    err = re_search_internal (preg, string, length, start, length,
211
0
            length, 0, NULL, eflags);
212
0
  else
213
0
    err = re_search_internal (preg, string, length, start, length,
214
0
            length, nmatch, pmatch, eflags);
215
0
  lock_unlock (dfa->lock);
216
0
  return err != REG_NOERROR;
217
0
}
218
219
#ifdef _LIBC
220
libc_hidden_def (__regexec)
221
222
# include <shlib-compat.h>
223
versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
224
225
# if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
226
__typeof__ (__regexec) __compat_regexec;
227
228
int
229
attribute_compat_text_section
230
__compat_regexec (const regex_t *__restrict preg,
231
      const char *__restrict string, size_t nmatch,
232
      regmatch_t pmatch[_REGEX_NELTS (nmatch)], int eflags)
233
{
234
  return regexec (preg, string, nmatch, pmatch,
235
      eflags & (REG_NOTBOL | REG_NOTEOL));
236
}
237
compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
238
# endif
239
#endif
240
241
/* Entry points for GNU code.  */
242
243
/* re_match, re_search, re_match_2, re_search_2
244
245
   The former two functions operate on STRING with length LENGTH,
246
   while the later two operate on concatenation of STRING1 and STRING2
247
   with lengths LENGTH1 and LENGTH2, respectively.
248
249
   re_match() matches the compiled pattern in BUFP against the string,
250
   starting at index START.
251
252
   re_search() first tries matching at index START, then it tries to match
253
   starting from index START + 1, and so on.  The last start position tried
254
   is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
255
   way as re_match().)
256
257
   The parameter STOP of re_{match,search}_2 specifies that no match exceeding
258
   the first STOP characters of the concatenation of the strings should be
259
   concerned.
260
261
   If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
262
   and all groups is stored in REGS.  (For the "_2" variants, the offsets are
263
   computed relative to the concatenation, not relative to the individual
264
   strings.)
265
266
   On success, re_match* functions return the length of the match, re_search*
267
   return the position of the start of the match.  They return -1 on
268
   match failure, -2 on error.  */
269
270
regoff_t
271
re_match (struct re_pattern_buffer *bufp, const char *string, Idx length,
272
    Idx start, struct re_registers *regs)
273
0
{
274
0
  return re_search_stub (bufp, string, length, start, 0, length, regs, true);
275
0
}
276
#ifdef _LIBC
277
weak_alias (__re_match, re_match)
278
#endif
279
280
regoff_t
281
re_search (struct re_pattern_buffer *bufp, const char *string, Idx length,
282
     Idx start, regoff_t range, struct re_registers *regs)
283
0
{
284
0
  return re_search_stub (bufp, string, length, start, range, length, regs,
285
0
       false);
286
0
}
287
#ifdef _LIBC
288
weak_alias (__re_search, re_search)
289
#endif
290
291
regoff_t
292
re_match_2 (struct re_pattern_buffer *bufp, const char *string1, Idx length1,
293
      const char *string2, Idx length2, Idx start,
294
      struct re_registers *regs, Idx stop)
295
0
{
296
0
  return re_search_2_stub (bufp, string1, length1, string2, length2,
297
0
         start, 0, regs, stop, true);
298
0
}
299
#ifdef _LIBC
300
weak_alias (__re_match_2, re_match_2)
301
#endif
302
303
regoff_t
304
re_search_2 (struct re_pattern_buffer *bufp, const char *string1, Idx length1,
305
       const char *string2, Idx length2, Idx start, regoff_t range,
306
       struct re_registers *regs, Idx stop)
307
0
{
308
0
  return re_search_2_stub (bufp, string1, length1, string2, length2,
309
0
         start, range, regs, stop, false);
310
0
}
311
#ifdef _LIBC
312
weak_alias (__re_search_2, re_search_2)
313
#endif
314
315
static regoff_t
316
re_search_2_stub (struct re_pattern_buffer *bufp, const char *string1,
317
      Idx length1, const char *string2, Idx length2, Idx start,
318
      regoff_t range, struct re_registers *regs,
319
      Idx stop, bool ret_len)
320
0
{
321
0
  const char *str;
322
0
  regoff_t rval;
323
0
  Idx len;
324
0
  char *s = NULL;
325
326
0
  if (__glibc_unlikely ((length1 < 0 || length2 < 0 || stop < 0
327
0
       || ckd_add (&len, length1, length2))))
328
0
    return -2;
329
330
  /* Concatenate the strings.  */
331
0
  if (length2 > 0)
332
0
    if (length1 > 0)
333
0
      {
334
0
  s = re_malloc (char, len);
335
336
0
  if (__glibc_unlikely (s == NULL))
337
0
    return -2;
338
#ifdef _LIBC
339
  memcpy (__mempcpy (s, string1, length1), string2, length2);
340
#else
341
0
  memcpy (s, string1, length1);
342
0
  memcpy (s + length1, string2, length2);
343
0
#endif
344
0
  str = s;
345
0
      }
346
0
    else
347
0
      str = string2;
348
0
  else
349
0
    str = string1;
350
351
0
  rval = re_search_stub (bufp, str, len, start, range, stop, regs,
352
0
       ret_len);
353
0
  re_free (s);
354
0
  return rval;
355
0
}
356
357
/* The parameters have the same meaning as those of re_search.
358
   Additional parameters:
359
   If RET_LEN is true the length of the match is returned (re_match style);
360
   otherwise the position of the match is returned.  */
361
362
static regoff_t
363
re_search_stub (struct re_pattern_buffer *bufp, const char *string, Idx length,
364
    Idx start, regoff_t range, Idx stop, struct re_registers *regs,
365
    bool ret_len)
366
0
{
367
0
  reg_errcode_t result;
368
0
  regmatch_t *pmatch;
369
0
  Idx nregs;
370
0
  regoff_t rval;
371
0
  int eflags = 0;
372
0
  re_dfa_t *dfa = bufp->buffer;
373
0
  Idx last_start = start + range;
374
375
  /* Check for out-of-range.  */
376
0
  if (__glibc_unlikely (start < 0 || start > length))
377
0
    return -1;
378
0
  if (__glibc_unlikely (length < last_start
379
0
      || (0 <= range && last_start < start)))
380
0
    last_start = length;
381
0
  else if (__glibc_unlikely (last_start < 0
382
0
           || (range < 0 && start <= last_start)))
383
0
    last_start = 0;
384
385
0
  lock_lock (dfa->lock);
386
387
0
  eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
388
0
  eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
389
390
  /* Compile fastmap if we haven't yet.  */
391
0
  if (start < last_start && bufp->fastmap != NULL && !bufp->fastmap_accurate)
392
0
    re_compile_fastmap (bufp);
393
394
0
  if (__glibc_unlikely (bufp->no_sub))
395
0
    regs = NULL;
396
397
  /* We need at least 1 register.  */
398
0
  if (regs == NULL)
399
0
    nregs = 1;
400
0
  else if (__glibc_unlikely (bufp->regs_allocated == REGS_FIXED
401
0
           && regs->num_regs <= bufp->re_nsub))
402
0
    {
403
0
      nregs = regs->num_regs;
404
0
      if (__glibc_unlikely (nregs < 1))
405
0
  {
406
    /* Nothing can be copied to regs.  */
407
0
    regs = NULL;
408
0
    nregs = 1;
409
0
  }
410
0
    }
411
0
  else
412
0
    nregs = bufp->re_nsub + 1;
413
0
  pmatch = re_malloc (regmatch_t, nregs);
414
0
  if (__glibc_unlikely (pmatch == NULL))
415
0
    {
416
0
      rval = -2;
417
0
      goto out;
418
0
    }
419
420
0
  result = re_search_internal (bufp, string, length, start, last_start, stop,
421
0
             nregs, pmatch, eflags);
422
423
0
  rval = 0;
424
425
  /* I hope we needn't fill their regs with -1's when no match was found.  */
426
0
  if (result != REG_NOERROR)
427
0
    rval = result == REG_NOMATCH ? -1 : -2;
428
0
  else if (regs != NULL)
429
0
    {
430
      /* If caller wants register contents data back, copy them.  */
431
0
      bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
432
0
             bufp->regs_allocated);
433
0
      if (__glibc_unlikely (bufp->regs_allocated == REGS_UNALLOCATED))
434
0
  rval = -2;
435
0
    }
436
437
0
  if (__glibc_likely (rval == 0))
438
0
    {
439
0
      if (ret_len)
440
0
  {
441
0
    DEBUG_ASSERT (pmatch[0].rm_so == start);
442
0
    rval = pmatch[0].rm_eo - start;
443
0
  }
444
0
      else
445
0
  rval = pmatch[0].rm_so;
446
0
    }
447
0
  re_free (pmatch);
448
0
 out:
449
0
  lock_unlock (dfa->lock);
450
0
  return rval;
451
0
}
452
453
static unsigned
454
re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, Idx nregs,
455
        int regs_allocated)
456
0
{
457
0
  int rval = REGS_REALLOCATE;
458
0
  Idx i;
459
0
  Idx need_regs = nregs + 1;
460
  /* We need one extra element beyond 'num_regs' for the '-1' marker GNU code
461
     uses.  */
462
463
  /* Have the register data arrays been allocated?  */
464
0
  if (regs_allocated == REGS_UNALLOCATED)
465
0
    { /* No.  So allocate them with malloc.  */
466
0
      regs->start = re_malloc (regoff_t, need_regs);
467
0
      if (__glibc_unlikely (regs->start == NULL))
468
0
  return REGS_UNALLOCATED;
469
0
      regs->end = re_malloc (regoff_t, need_regs);
470
0
      if (__glibc_unlikely (regs->end == NULL))
471
0
  {
472
0
    re_free (regs->start);
473
0
    return REGS_UNALLOCATED;
474
0
  }
475
0
      regs->num_regs = need_regs;
476
0
    }
477
0
  else if (regs_allocated == REGS_REALLOCATE)
478
0
    { /* Yes.  If we need more elements than were already
479
   allocated, reallocate them.  If we need fewer, just
480
   leave it alone.  */
481
0
      if (__glibc_unlikely (need_regs > regs->num_regs))
482
0
  {
483
0
    regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
484
0
    regoff_t *new_end;
485
0
    if (__glibc_unlikely (new_start == NULL))
486
0
      return REGS_UNALLOCATED;
487
0
    new_end = re_realloc (regs->end, regoff_t, need_regs);
488
0
    if (__glibc_unlikely (new_end == NULL))
489
0
      {
490
0
        re_free (new_start);
491
0
        return REGS_UNALLOCATED;
492
0
      }
493
0
    regs->start = new_start;
494
0
    regs->end = new_end;
495
0
    regs->num_regs = need_regs;
496
0
  }
497
0
    }
498
0
  else
499
0
    {
500
0
      DEBUG_ASSERT (regs_allocated == REGS_FIXED);
501
      /* This function may not be called with REGS_FIXED and nregs too big.  */
502
0
      DEBUG_ASSERT (nregs <= regs->num_regs);
503
0
      rval = REGS_FIXED;
504
0
    }
505
506
  /* Copy the regs.  */
507
0
  for (i = 0; i < nregs; ++i)
508
0
    {
509
0
      regs->start[i] = pmatch[i].rm_so;
510
0
      regs->end[i] = pmatch[i].rm_eo;
511
0
    }
512
0
  for ( ; i < regs->num_regs; ++i)
513
0
    regs->start[i] = regs->end[i] = -1;
514
515
0
  return rval;
516
0
}
517
518
/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
519
   ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
520
   this memory for recording register information.  STARTS and ENDS
521
   must be allocated using the malloc library routine, and must each
522
   be at least NUM_REGS * sizeof (regoff_t) bytes long.
523
524
   If NUM_REGS == 0, then subsequent matches should allocate their own
525
   register data.
526
527
   Unless this function is called, the first search or match using
528
   PATTERN_BUFFER will allocate its own register data, without
529
   freeing the old data.  */
530
531
void
532
re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
533
      __re_size_t num_regs, regoff_t *starts, regoff_t *ends)
534
0
{
535
0
  if (num_regs)
536
0
    {
537
0
      bufp->regs_allocated = REGS_REALLOCATE;
538
0
      regs->num_regs = num_regs;
539
0
      regs->start = starts;
540
0
      regs->end = ends;
541
0
    }
542
0
  else
543
0
    {
544
0
      bufp->regs_allocated = REGS_UNALLOCATED;
545
0
      regs->num_regs = 0;
546
0
      regs->start = regs->end = NULL;
547
0
    }
548
0
}
549
#ifdef _LIBC
550
weak_alias (__re_set_registers, re_set_registers)
551
#endif
552

553
/* Entry points compatible with 4.2 BSD regex library.  We don't define
554
   them unless specifically requested.  */
555
556
#if defined _REGEX_RE_COMP || defined _LIBC
557
int
558
# ifdef _LIBC
559
weak_function
560
# endif
561
re_exec (const char *s)
562
{
563
  return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
564
}
565
#endif /* _REGEX_RE_COMP */
566

567
/* Internal entry point.  */
568
569
/* Searches for a compiled pattern PREG in the string STRING, whose
570
   length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
571
   meaning as with regexec.  LAST_START is START + RANGE, where
572
   START and RANGE have the same meaning as with re_search.
573
   Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
574
   otherwise return the error code.
575
   Note: We assume front end functions already check ranges.
576
   (0 <= LAST_START && LAST_START <= LENGTH)  */
577
578
static reg_errcode_t
579
__attribute_warn_unused_result__
580
re_search_internal (const regex_t *preg, const char *string, Idx length,
581
        Idx start, Idx last_start, Idx stop, size_t nmatch,
582
        regmatch_t pmatch[], int eflags)
583
0
{
584
0
  reg_errcode_t err;
585
0
  const re_dfa_t *dfa = preg->buffer;
586
0
  Idx left_lim, right_lim;
587
0
  int incr;
588
0
  bool fl_longest_match;
589
0
  int match_kind;
590
0
  Idx match_first;
591
0
  Idx match_last = -1;
592
0
  Idx extra_nmatch;
593
0
  bool sb;
594
0
  int ch;
595
0
  re_match_context_t mctx = { .dfa = dfa };
596
0
  char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate
597
0
        && start != last_start && !preg->can_be_null)
598
0
       ? preg->fastmap : NULL);
599
0
  RE_TRANSLATE_TYPE t = preg->translate;
600
601
0
  extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
602
0
  nmatch -= extra_nmatch;
603
604
  /* Check if the DFA haven't been compiled.  */
605
0
  if (__glibc_unlikely (preg->used == 0 || dfa->init_state == NULL
606
0
      || dfa->init_state_word == NULL
607
0
      || dfa->init_state_nl == NULL
608
0
      || dfa->init_state_begbuf == NULL))
609
0
    return REG_NOMATCH;
610
611
  /* We assume front-end functions already check them.  */
612
0
  DEBUG_ASSERT (0 <= last_start && last_start <= length);
613
614
  /* If initial states with non-begbuf contexts have no elements,
615
     the regex must be anchored.  If preg->newline_anchor is set,
616
     we'll never use init_state_nl, so do not check it.  */
617
0
  if (dfa->init_state->nodes.nelem == 0
618
0
      && dfa->init_state_word->nodes.nelem == 0
619
0
      && (dfa->init_state_nl->nodes.nelem == 0
620
0
    || !preg->newline_anchor))
621
0
    {
622
0
      if (start != 0 && last_start != 0)
623
0
        return REG_NOMATCH;
624
0
      start = last_start = 0;
625
0
    }
626
627
  /* We must check the longest matching, if nmatch > 0.  */
628
0
  fl_longest_match = (nmatch != 0 || dfa->nbackref);
629
630
0
  err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
631
0
          preg->translate, (preg->syntax & RE_ICASE) != 0,
632
0
          dfa);
633
0
  if (__glibc_unlikely (err != REG_NOERROR))
634
0
    goto free_return;
635
0
  mctx.input.stop = stop;
636
0
  mctx.input.raw_stop = stop;
637
0
  mctx.input.newline_anchor = preg->newline_anchor;
638
639
0
  err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
640
0
  if (__glibc_unlikely (err != REG_NOERROR))
641
0
    goto free_return;
642
643
  /* We will log all the DFA states through which the dfa pass,
644
     if nmatch > 1, or this dfa has "multibyte node", which is a
645
     back-reference or a node which can accept multibyte character or
646
     multi character collating element.  */
647
0
  if (nmatch > 1 || dfa->has_mb_node)
648
0
    {
649
      /* Avoid overflow.  */
650
0
      if (__glibc_unlikely ((MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *))
651
0
           <= mctx.input.bufs_len)))
652
0
  {
653
0
    err = REG_ESPACE;
654
0
    goto free_return;
655
0
  }
656
657
0
      mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
658
0
      if (__glibc_unlikely (mctx.state_log == NULL))
659
0
  {
660
0
    err = REG_ESPACE;
661
0
    goto free_return;
662
0
  }
663
0
    }
664
665
0
  match_first = start;
666
0
  mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
667
0
         : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
668
669
  /* Check incrementally whether the input string matches.  */
670
0
  incr = (last_start < start) ? -1 : 1;
671
0
  left_lim = (last_start < start) ? last_start : start;
672
0
  right_lim = (last_start < start) ? start : last_start;
673
0
  sb = dfa->mb_cur_max == 1;
674
0
  match_kind =
675
0
    (fastmap
676
0
     ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
677
0
  | (start <= last_start ? 2 : 0)
678
0
  | (t != NULL ? 1 : 0))
679
0
     : 8);
680
681
0
  for (;; match_first += incr)
682
0
    {
683
0
      err = REG_NOMATCH;
684
0
      if (match_first < left_lim || right_lim < match_first)
685
0
  goto free_return;
686
687
      /* Advance as rapidly as possible through the string, until we
688
   find a plausible place to start matching.  This may be done
689
   with varying efficiency, so there are various possibilities:
690
   only the most common of them are specialized, in order to
691
   save on code size.  We use a switch statement for speed.  */
692
0
      switch (match_kind)
693
0
  {
694
0
  case 8:
695
    /* No fastmap.  */
696
0
    break;
697
698
0
  case 7:
699
    /* Fastmap with single-byte translation, match forward.  */
700
0
    while (__glibc_likely (match_first < right_lim)
701
0
     && !fastmap[t[(unsigned char) string[match_first]]])
702
0
      ++match_first;
703
0
    goto forward_match_found_start_or_reached_end;
704
705
0
  case 6:
706
    /* Fastmap without translation, match forward.  */
707
0
    while (__glibc_likely (match_first < right_lim)
708
0
     && !fastmap[(unsigned char) string[match_first]])
709
0
      ++match_first;
710
711
0
  forward_match_found_start_or_reached_end:
712
0
    if (__glibc_unlikely (match_first == right_lim))
713
0
      {
714
0
        ch = match_first >= length
715
0
           ? 0 : (unsigned char) string[match_first];
716
0
        if (!fastmap[t ? t[ch] : ch])
717
0
    goto free_return;
718
0
      }
719
0
    break;
720
721
0
  case 4:
722
0
  case 5:
723
    /* Fastmap without multi-byte translation, match backwards.  */
724
0
    while (match_first >= left_lim)
725
0
      {
726
0
        ch = match_first >= length
727
0
           ? 0 : (unsigned char) string[match_first];
728
0
        if (fastmap[t ? t[ch] : ch])
729
0
    break;
730
0
        --match_first;
731
0
      }
732
0
    if (match_first < left_lim)
733
0
      goto free_return;
734
0
    break;
735
736
0
  default:
737
    /* In this case, we can't determine easily the current byte,
738
       since it might be a component byte of a multibyte
739
       character.  Then we use the constructed buffer instead.  */
740
0
    for (;;)
741
0
      {
742
        /* If MATCH_FIRST is out of the valid range, reconstruct the
743
     buffers.  */
744
0
        __re_size_t offset = match_first - mctx.input.raw_mbs_idx;
745
0
        if (__glibc_unlikely (offset
746
0
            >= (__re_size_t) mctx.input.valid_raw_len))
747
0
    {
748
0
      err = re_string_reconstruct (&mctx.input, match_first,
749
0
                 eflags);
750
0
      if (__glibc_unlikely (err != REG_NOERROR))
751
0
        goto free_return;
752
753
0
      offset = match_first - mctx.input.raw_mbs_idx;
754
0
    }
755
        /* Use buffer byte if OFFSET is in buffer, otherwise '\0'.  */
756
0
        ch = (offset < mctx.input.valid_len
757
0
        ? re_string_byte_at (&mctx.input, offset) : 0);
758
0
        if (fastmap[ch])
759
0
    break;
760
0
        match_first += incr;
761
0
        if (match_first < left_lim || match_first > right_lim)
762
0
    {
763
0
      err = REG_NOMATCH;
764
0
      goto free_return;
765
0
    }
766
0
      }
767
0
    break;
768
0
  }
769
770
      /* Reconstruct the buffers so that the matcher can assume that
771
   the matching starts from the beginning of the buffer.  */
772
0
      err = re_string_reconstruct (&mctx.input, match_first, eflags);
773
0
      if (__glibc_unlikely (err != REG_NOERROR))
774
0
  goto free_return;
775
776
      /* Don't consider this char as a possible match start if it part,
777
         yet isn't the head, of a multibyte character.  */
778
0
      if (!sb && !re_string_first_byte (&mctx.input, 0))
779
0
  continue;
780
781
      /* It seems to be appropriate one, then use the matcher.  */
782
      /* We assume that the matching starts from 0.  */
783
0
      mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
784
0
      match_last = check_matching (&mctx, fl_longest_match,
785
0
           start <= last_start ? &match_first : NULL);
786
0
      if (match_last != -1)
787
0
  {
788
0
    if (__glibc_unlikely (match_last == -2))
789
0
      {
790
0
        err = REG_ESPACE;
791
0
        goto free_return;
792
0
      }
793
0
    else
794
0
      {
795
0
        mctx.match_last = match_last;
796
0
        if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
797
0
    {
798
0
      re_dfastate_t *pstate = mctx.state_log[match_last];
799
0
      mctx.last_node = check_halt_state_context (&mctx, pstate,
800
0
                   match_last);
801
0
    }
802
0
        if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
803
0
      || dfa->nbackref)
804
0
    {
805
0
      err = prune_impossible_nodes (&mctx);
806
0
      if (err == REG_NOERROR)
807
0
        break;
808
0
      if (__glibc_unlikely (err != REG_NOMATCH))
809
0
        goto free_return;
810
0
      match_last = -1;
811
0
    }
812
0
        else
813
0
    break; /* We found a match.  */
814
0
      }
815
0
  }
816
817
0
      match_ctx_clean (&mctx);
818
0
    }
819
820
0
  DEBUG_ASSERT (match_last != -1);
821
0
  DEBUG_ASSERT (err == REG_NOERROR);
822
823
  /* Set pmatch[] if we need.  */
824
0
  if (nmatch > 0)
825
0
    {
826
0
      Idx reg_idx;
827
828
      /* Initialize registers.  */
829
0
      for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
830
0
  pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
831
832
      /* Set the points where matching start/end.  */
833
0
      pmatch[0].rm_so = 0;
834
0
      pmatch[0].rm_eo = mctx.match_last;
835
      /* FIXME: This function should fail if mctx.match_last exceeds
836
   the maximum possible regoff_t value.  We need a new error
837
   code REG_OVERFLOW.  */
838
839
0
      if (!preg->no_sub && nmatch > 1)
840
0
  {
841
0
    err = set_regs (preg, &mctx, nmatch, pmatch,
842
0
        dfa->has_plural_match && dfa->nbackref > 0);
843
0
    if (__glibc_unlikely (err != REG_NOERROR))
844
0
      goto free_return;
845
0
  }
846
847
      /* At last, add the offset to each register, since we slid
848
   the buffers so that we could assume that the matching starts
849
   from 0.  */
850
0
      for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
851
0
  if (pmatch[reg_idx].rm_so != -1)
852
0
    {
853
0
      if (__glibc_unlikely (mctx.input.offsets_needed != 0))
854
0
        {
855
0
    pmatch[reg_idx].rm_so =
856
0
      (pmatch[reg_idx].rm_so == mctx.input.valid_len
857
0
       ? mctx.input.valid_raw_len
858
0
       : mctx.input.offsets[pmatch[reg_idx].rm_so]);
859
0
    pmatch[reg_idx].rm_eo =
860
0
      (pmatch[reg_idx].rm_eo == mctx.input.valid_len
861
0
       ? mctx.input.valid_raw_len
862
0
       : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
863
0
        }
864
0
      pmatch[reg_idx].rm_so += match_first;
865
0
      pmatch[reg_idx].rm_eo += match_first;
866
0
    }
867
0
      for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
868
0
  {
869
0
    pmatch[nmatch + reg_idx].rm_so = -1;
870
0
    pmatch[nmatch + reg_idx].rm_eo = -1;
871
0
  }
872
873
0
      if (dfa->subexp_map)
874
0
  for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
875
0
    if (dfa->subexp_map[reg_idx] != reg_idx)
876
0
      {
877
0
        pmatch[reg_idx + 1].rm_so
878
0
    = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
879
0
        pmatch[reg_idx + 1].rm_eo
880
0
    = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
881
0
      }
882
0
    }
883
884
0
 free_return:
885
0
  re_free (mctx.state_log);
886
0
  if (dfa->nbackref)
887
0
    match_ctx_free (&mctx);
888
0
  re_string_destruct (&mctx.input);
889
0
  return err;
890
0
}
891
892
static reg_errcode_t
893
__attribute_warn_unused_result__
894
prune_impossible_nodes (re_match_context_t *mctx)
895
0
{
896
0
  const re_dfa_t *const dfa = mctx->dfa;
897
0
  Idx halt_node, match_last;
898
0
  reg_errcode_t ret;
899
0
  re_dfastate_t **sifted_states;
900
0
  re_dfastate_t **lim_states = NULL;
901
0
  re_sift_context_t sctx;
902
0
  DEBUG_ASSERT (mctx->state_log != NULL);
903
0
  match_last = mctx->match_last;
904
0
  halt_node = mctx->last_node;
905
906
  /* Avoid overflow.  */
907
0
  if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *))
908
0
      <= match_last))
909
0
    return REG_ESPACE;
910
911
0
  sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
912
0
  if (__glibc_unlikely (sifted_states == NULL))
913
0
    {
914
0
      ret = REG_ESPACE;
915
0
      goto free_return;
916
0
    }
917
0
  if (dfa->nbackref)
918
0
    {
919
0
      lim_states = re_malloc (re_dfastate_t *, match_last + 1);
920
0
      if (__glibc_unlikely (lim_states == NULL))
921
0
  {
922
0
    ret = REG_ESPACE;
923
0
    goto free_return;
924
0
  }
925
0
      while (1)
926
0
  {
927
0
    memset (lim_states, '\0',
928
0
      sizeof (re_dfastate_t *) * (match_last + 1));
929
0
    sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
930
0
       match_last);
931
0
    ret = sift_states_backward (mctx, &sctx);
932
0
    re_node_set_free (&sctx.limits);
933
0
    if (__glibc_unlikely (ret != REG_NOERROR))
934
0
        goto free_return;
935
0
    if (sifted_states[0] != NULL || lim_states[0] != NULL)
936
0
      break;
937
0
    do
938
0
      {
939
0
        --match_last;
940
0
        if (match_last < 0)
941
0
    {
942
0
      ret = REG_NOMATCH;
943
0
      goto free_return;
944
0
    }
945
0
      } while (mctx->state_log[match_last] == NULL
946
0
         || !mctx->state_log[match_last]->halt);
947
0
    halt_node = check_halt_state_context (mctx,
948
0
            mctx->state_log[match_last],
949
0
            match_last);
950
0
  }
951
0
      ret = merge_state_array (dfa, sifted_states, lim_states,
952
0
             match_last + 1);
953
0
      re_free (lim_states);
954
0
      lim_states = NULL;
955
0
      if (__glibc_unlikely (ret != REG_NOERROR))
956
0
  goto free_return;
957
0
    }
958
0
  else
959
0
    {
960
0
      sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
961
0
      ret = sift_states_backward (mctx, &sctx);
962
0
      re_node_set_free (&sctx.limits);
963
0
      if (__glibc_unlikely (ret != REG_NOERROR))
964
0
  goto free_return;
965
0
      if (sifted_states[0] == NULL)
966
0
  {
967
0
    ret = REG_NOMATCH;
968
0
    goto free_return;
969
0
  }
970
0
    }
971
0
  re_free (mctx->state_log);
972
0
  mctx->state_log = sifted_states;
973
0
  sifted_states = NULL;
974
0
  mctx->last_node = halt_node;
975
0
  mctx->match_last = match_last;
976
0
  ret = REG_NOERROR;
977
0
 free_return:
978
0
  re_free (sifted_states);
979
0
  re_free (lim_states);
980
0
  return ret;
981
0
}
982
983
/* Acquire an initial state and return it.
984
   We must select appropriate initial state depending on the context,
985
   since initial states may have constraints like "\<", "^", etc..  */
986
987
static __always_inline re_dfastate_t *
988
acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
989
          Idx idx)
990
0
{
991
0
  const re_dfa_t *const dfa = mctx->dfa;
992
0
  if (dfa->init_state->has_constraint)
993
0
    {
994
0
      unsigned int context;
995
0
      context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
996
0
      if (IS_WORD_CONTEXT (context))
997
0
  return dfa->init_state_word;
998
0
      else if (IS_ORDINARY_CONTEXT (context))
999
0
  return dfa->init_state;
1000
0
      else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1001
0
  return dfa->init_state_begbuf;
1002
0
      else if (IS_NEWLINE_CONTEXT (context))
1003
0
  return dfa->init_state_nl;
1004
0
      else if (IS_BEGBUF_CONTEXT (context))
1005
0
  {
1006
    /* It is relatively rare case, then calculate on demand.  */
1007
0
    return re_acquire_state_context (err, dfa,
1008
0
             dfa->init_state->entrance_nodes,
1009
0
             context);
1010
0
  }
1011
0
      else
1012
  /* Must not happen?  */
1013
0
  return dfa->init_state;
1014
0
    }
1015
0
  else
1016
0
    return dfa->init_state;
1017
0
}
1018
1019
/* Check whether the regular expression match input string INPUT or not,
1020
   and return the index where the matching end.  Return -1 if
1021
   there is no match, and return -2 in case of an error.
1022
   FL_LONGEST_MATCH means we want the POSIX longest matching.
1023
   If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1024
   next place where we may want to try matching.
1025
   Note that the matcher assumes that the matching starts from the current
1026
   index of the buffer.  */
1027
1028
static Idx
1029
__attribute_warn_unused_result__
1030
check_matching (re_match_context_t *mctx, bool fl_longest_match,
1031
    Idx *p_match_first)
1032
0
{
1033
0
  const re_dfa_t *const dfa = mctx->dfa;
1034
0
  reg_errcode_t err;
1035
0
  Idx match = 0;
1036
0
  Idx match_last = -1;
1037
0
  Idx cur_str_idx = re_string_cur_idx (&mctx->input);
1038
0
  re_dfastate_t *cur_state;
1039
0
  bool at_init_state = p_match_first != NULL;
1040
0
  Idx next_start_idx = cur_str_idx;
1041
1042
0
  err = REG_NOERROR;
1043
0
  cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1044
  /* An initial state must not be NULL (invalid).  */
1045
0
  if (__glibc_unlikely (cur_state == NULL))
1046
0
    {
1047
0
      DEBUG_ASSERT (err == REG_ESPACE);
1048
0
      return -2;
1049
0
    }
1050
1051
0
  if (mctx->state_log != NULL)
1052
0
    {
1053
0
      mctx->state_log[cur_str_idx] = cur_state;
1054
1055
      /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1056
   later.  E.g. Processing back references.  */
1057
0
      if (__glibc_unlikely (dfa->nbackref))
1058
0
  {
1059
0
    at_init_state = false;
1060
0
    err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1061
0
    if (__glibc_unlikely (err != REG_NOERROR))
1062
0
      return err;
1063
1064
0
    if (cur_state->has_backref)
1065
0
      {
1066
0
        err = transit_state_bkref (mctx, &cur_state->nodes);
1067
0
        if (__glibc_unlikely (err != REG_NOERROR))
1068
0
    return err;
1069
0
      }
1070
0
  }
1071
0
    }
1072
1073
  /* If the RE accepts NULL string.  */
1074
0
  if (__glibc_unlikely (cur_state->halt))
1075
0
    {
1076
0
      if (!cur_state->has_constraint
1077
0
    || check_halt_state_context (mctx, cur_state, cur_str_idx))
1078
0
  {
1079
0
    if (!fl_longest_match)
1080
0
      return cur_str_idx;
1081
0
    else
1082
0
      {
1083
0
        match_last = cur_str_idx;
1084
0
        match = 1;
1085
0
      }
1086
0
  }
1087
0
    }
1088
1089
0
  while (!re_string_eoi (&mctx->input))
1090
0
    {
1091
0
      re_dfastate_t *old_state = cur_state;
1092
0
      Idx next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1093
1094
0
      if ((__glibc_unlikely (next_char_idx >= mctx->input.bufs_len)
1095
0
     && mctx->input.bufs_len < mctx->input.len)
1096
0
    || (__glibc_unlikely (next_char_idx >= mctx->input.valid_len)
1097
0
        && mctx->input.valid_len < mctx->input.len))
1098
0
  {
1099
0
    err = extend_buffers (mctx, next_char_idx + 1);
1100
0
    if (__glibc_unlikely (err != REG_NOERROR))
1101
0
      {
1102
0
        DEBUG_ASSERT (err == REG_ESPACE);
1103
0
        return -2;
1104
0
      }
1105
0
  }
1106
1107
0
      cur_state = transit_state (&err, mctx, cur_state);
1108
0
      if (mctx->state_log != NULL)
1109
0
  cur_state = merge_state_with_log (&err, mctx, cur_state);
1110
1111
0
      if (cur_state == NULL)
1112
0
  {
1113
    /* Reached the invalid state or an error.  Try to recover a valid
1114
       state using the state log, if available and if we have not
1115
       already found a valid (even if not the longest) match.  */
1116
0
    if (__glibc_unlikely (err != REG_NOERROR))
1117
0
      return -2;
1118
1119
0
    if (mctx->state_log == NULL
1120
0
        || (match && !fl_longest_match)
1121
0
        || (cur_state = find_recover_state (&err, mctx)) == NULL)
1122
0
      break;
1123
0
  }
1124
1125
0
      if (__glibc_unlikely (at_init_state))
1126
0
  {
1127
0
    if (old_state == cur_state)
1128
0
      next_start_idx = next_char_idx;
1129
0
    else
1130
0
      at_init_state = false;
1131
0
  }
1132
1133
0
      if (cur_state->halt)
1134
0
  {
1135
    /* Reached a halt state.
1136
       Check the halt state can satisfy the current context.  */
1137
0
    if (!cur_state->has_constraint
1138
0
        || check_halt_state_context (mctx, cur_state,
1139
0
             re_string_cur_idx (&mctx->input)))
1140
0
      {
1141
        /* We found an appropriate halt state.  */
1142
0
        match_last = re_string_cur_idx (&mctx->input);
1143
0
        match = 1;
1144
1145
        /* We found a match, do not modify match_first below.  */
1146
0
        p_match_first = NULL;
1147
0
        if (!fl_longest_match)
1148
0
    break;
1149
0
      }
1150
0
  }
1151
0
    }
1152
1153
0
  if (p_match_first)
1154
0
    *p_match_first += next_start_idx;
1155
1156
0
  return match_last;
1157
0
}
1158
1159
/* Check NODE match the current context.  */
1160
1161
static bool
1162
check_halt_node_context (const re_dfa_t *dfa, Idx node, unsigned int context)
1163
0
{
1164
0
  re_token_type_t type = dfa->nodes[node].type;
1165
0
  unsigned int constraint = dfa->nodes[node].constraint;
1166
0
  if (type != END_OF_RE)
1167
0
    return false;
1168
0
  if (!constraint)
1169
0
    return true;
1170
0
  if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1171
0
    return false;
1172
0
  return true;
1173
0
}
1174
1175
/* Check the halt state STATE match the current context.
1176
   Return 0 if not match, if the node, STATE has, is a halt node and
1177
   match the context, return the node.  */
1178
1179
static Idx
1180
check_halt_state_context (const re_match_context_t *mctx,
1181
        const re_dfastate_t *state, Idx idx)
1182
0
{
1183
0
  Idx i;
1184
0
  unsigned int context;
1185
0
  DEBUG_ASSERT (state->halt);
1186
0
  context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1187
0
  for (i = 0; i < state->nodes.nelem; ++i)
1188
0
    if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1189
0
      return state->nodes.elems[i];
1190
0
  return 0;
1191
0
}
1192
1193
/* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1194
   corresponding to the DFA).
1195
   Return the destination node, and update EPS_VIA_NODES;
1196
   return -1 on match failure, -2 on error.  */
1197
1198
static Idx
1199
proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs,
1200
       regmatch_t *prevregs,
1201
       Idx *pidx, Idx node, re_node_set *eps_via_nodes,
1202
       struct re_fail_stack_t *fs)
1203
0
{
1204
0
  const re_dfa_t *const dfa = mctx->dfa;
1205
0
  if (IS_EPSILON_NODE (dfa->nodes[node].type))
1206
0
    {
1207
0
      re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1208
0
      re_node_set *edests = &dfa->edests[node];
1209
1210
0
      if (! re_node_set_contains (eps_via_nodes, node))
1211
0
        {
1212
0
          bool ok = re_node_set_insert (eps_via_nodes, node);
1213
0
          if (__glibc_unlikely (! ok))
1214
0
            return -2;
1215
0
        }
1216
1217
      /* Pick a valid destination, or return -1 if none is found.  */
1218
0
      Idx dest_node = -1;
1219
0
      for (Idx i = 0; i < edests->nelem; i++)
1220
0
  {
1221
0
    Idx candidate = edests->elems[i];
1222
0
    if (!re_node_set_contains (cur_nodes, candidate))
1223
0
      continue;
1224
0
          if (dest_node == -1)
1225
0
      dest_node = candidate;
1226
1227
0
    else
1228
0
      {
1229
        /* In order to avoid infinite loop like "(a*)*", return the second
1230
     epsilon-transition if the first was already considered.  */
1231
0
        if (re_node_set_contains (eps_via_nodes, dest_node))
1232
0
    return candidate;
1233
1234
        /* Otherwise, push the second epsilon-transition on the fail stack.  */
1235
0
        else if (fs != NULL
1236
0
           && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1237
0
             prevregs, eps_via_nodes))
1238
0
    return -2;
1239
1240
        /* We know we are going to exit.  */
1241
0
        break;
1242
0
      }
1243
0
  }
1244
0
      return dest_node;
1245
0
    }
1246
0
  else
1247
0
    {
1248
0
      Idx naccepted = 0;
1249
0
      re_token_type_t type = dfa->nodes[node].type;
1250
1251
0
      if (dfa->nodes[node].accept_mb)
1252
0
  naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1253
0
      else if (type == OP_BACK_REF)
1254
0
  {
1255
0
    Idx subexp_idx = dfa->nodes[node].opr.idx + 1;
1256
0
    if (subexp_idx < nregs)
1257
0
      naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1258
0
    if (fs != NULL)
1259
0
      {
1260
0
        if (subexp_idx >= nregs
1261
0
      || regs[subexp_idx].rm_so == -1
1262
0
      || regs[subexp_idx].rm_eo == -1)
1263
0
    return -1;
1264
0
        else if (naccepted)
1265
0
    {
1266
0
      char *buf = (char *) re_string_get_buffer (&mctx->input);
1267
0
      if (mctx->input.valid_len - *pidx < naccepted
1268
0
          || (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1269
0
          naccepted)
1270
0
        != 0))
1271
0
        return -1;
1272
0
    }
1273
0
      }
1274
1275
0
    if (naccepted == 0)
1276
0
      {
1277
0
        Idx dest_node;
1278
0
        bool ok = re_node_set_insert (eps_via_nodes, node);
1279
0
        if (__glibc_unlikely (! ok))
1280
0
    return -2;
1281
0
        dest_node = dfa->edests[node].elems[0];
1282
0
        if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1283
0
          dest_node))
1284
0
    return dest_node;
1285
0
      }
1286
0
  }
1287
1288
0
      if (naccepted != 0
1289
0
    || check_node_accept (mctx, dfa->nodes + node, *pidx))
1290
0
  {
1291
0
    Idx dest_node = dfa->nexts[node];
1292
0
    *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1293
0
    if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1294
0
         || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1295
0
                 dest_node)))
1296
0
      return -1;
1297
0
    re_node_set_empty (eps_via_nodes);
1298
0
    return dest_node;
1299
0
  }
1300
0
    }
1301
0
  return -1;
1302
0
}
1303
1304
static reg_errcode_t
1305
__attribute_warn_unused_result__
1306
push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node,
1307
     Idx nregs, regmatch_t *regs, regmatch_t *prevregs,
1308
     re_node_set *eps_via_nodes)
1309
0
{
1310
0
  reg_errcode_t err;
1311
0
  Idx num = fs->num;
1312
0
  if (num == fs->alloc)
1313
0
    {
1314
0
      struct re_fail_stack_ent_t *new_array;
1315
0
      new_array = re_realloc (fs->stack, struct re_fail_stack_ent_t,
1316
0
                              fs->alloc * 2);
1317
0
      if (new_array == NULL)
1318
0
  return REG_ESPACE;
1319
0
      fs->alloc *= 2;
1320
0
      fs->stack = new_array;
1321
0
    }
1322
0
  fs->stack[num].idx = str_idx;
1323
0
  fs->stack[num].node = dest_node;
1324
0
  fs->stack[num].regs = re_malloc (regmatch_t, 2 * nregs);
1325
0
  if (fs->stack[num].regs == NULL)
1326
0
    return REG_ESPACE;
1327
0
  fs->num = num + 1;
1328
0
  memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1329
0
  memcpy (fs->stack[num].regs + nregs, prevregs, sizeof (regmatch_t) * nregs);
1330
0
  err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1331
0
  return err;
1332
0
}
1333
1334
static Idx
1335
pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx, Idx nregs,
1336
    regmatch_t *regs, regmatch_t *prevregs,
1337
    re_node_set *eps_via_nodes)
1338
0
{
1339
0
  if (fs == NULL || fs->num == 0)
1340
0
    return -1;
1341
0
  Idx num = --fs->num;
1342
0
  *pidx = fs->stack[num].idx;
1343
0
  memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1344
0
  memcpy (prevregs, fs->stack[num].regs + nregs, sizeof (regmatch_t) * nregs);
1345
0
  re_node_set_free (eps_via_nodes);
1346
0
  re_free (fs->stack[num].regs);
1347
0
  *eps_via_nodes = fs->stack[num].eps_via_nodes;
1348
0
  DEBUG_ASSERT (0 <= fs->stack[num].node);
1349
0
  return fs->stack[num].node;
1350
0
}
1351
1352
1353
#define DYNARRAY_STRUCT  regmatch_list
1354
#define DYNARRAY_ELEMENT regmatch_t
1355
#define DYNARRAY_PREFIX  regmatch_list_
1356
#include <malloc/dynarray-skeleton.c>
1357
1358
/* Set the positions where the subexpressions are starts/ends to registers
1359
   PMATCH.
1360
   Note: We assume that pmatch[0] is already set, and
1361
   pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1362
1363
static reg_errcode_t
1364
__attribute_warn_unused_result__
1365
set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1366
    regmatch_t *pmatch, bool fl_backtrack)
1367
0
{
1368
0
  const re_dfa_t *dfa = preg->buffer;
1369
0
  Idx idx, cur_node;
1370
0
  re_node_set eps_via_nodes;
1371
0
  struct re_fail_stack_t *fs;
1372
0
  struct re_fail_stack_t fs_body = { 0, 2, NULL };
1373
0
  struct regmatch_list prev_match;
1374
0
  regmatch_list_init (&prev_match);
1375
1376
0
  DEBUG_ASSERT (nmatch > 1);
1377
0
  DEBUG_ASSERT (mctx->state_log != NULL);
1378
0
  if (fl_backtrack)
1379
0
    {
1380
0
      fs = &fs_body;
1381
0
      fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1382
0
      if (fs->stack == NULL)
1383
0
  return REG_ESPACE;
1384
0
    }
1385
0
  else
1386
0
    fs = NULL;
1387
1388
0
  cur_node = dfa->init_node;
1389
0
  re_node_set_init_empty (&eps_via_nodes);
1390
1391
0
  if (!regmatch_list_resize (&prev_match, nmatch))
1392
0
    {
1393
0
      regmatch_list_free (&prev_match);
1394
0
      free_fail_stack_return (fs);
1395
0
      return REG_ESPACE;
1396
0
    }
1397
0
  regmatch_t *prev_idx_match = regmatch_list_begin (&prev_match);
1398
0
  memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1399
1400
0
  for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1401
0
    {
1402
0
      update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1403
1404
0
      if ((idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1405
0
    || (fs && re_node_set_contains (&eps_via_nodes, cur_node)))
1406
0
  {
1407
0
    Idx reg_idx;
1408
0
    cur_node = -1;
1409
0
    if (fs)
1410
0
      {
1411
0
        for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1412
0
    if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1413
0
      {
1414
0
        cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1415
0
                 prev_idx_match, &eps_via_nodes);
1416
0
        break;
1417
0
      }
1418
0
      }
1419
0
    if (cur_node < 0)
1420
0
      {
1421
0
        re_node_set_free (&eps_via_nodes);
1422
0
        regmatch_list_free (&prev_match);
1423
0
        return free_fail_stack_return (fs);
1424
0
      }
1425
0
  }
1426
1427
      /* Proceed to next node.  */
1428
0
      cur_node = proceed_next_node (mctx, nmatch, pmatch, prev_idx_match,
1429
0
            &idx, cur_node,
1430
0
            &eps_via_nodes, fs);
1431
1432
0
      if (__glibc_unlikely (cur_node < 0))
1433
0
  {
1434
0
    if (__glibc_unlikely (cur_node == -2))
1435
0
      {
1436
0
        re_node_set_free (&eps_via_nodes);
1437
0
        regmatch_list_free (&prev_match);
1438
0
        free_fail_stack_return (fs);
1439
0
        return REG_ESPACE;
1440
0
      }
1441
0
    cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1442
0
             prev_idx_match, &eps_via_nodes);
1443
0
    if (cur_node < 0)
1444
0
      {
1445
0
        re_node_set_free (&eps_via_nodes);
1446
0
        regmatch_list_free (&prev_match);
1447
0
        free_fail_stack_return (fs);
1448
0
        return REG_NOMATCH;
1449
0
      }
1450
0
  }
1451
0
    }
1452
0
  re_node_set_free (&eps_via_nodes);
1453
0
  regmatch_list_free (&prev_match);
1454
0
  return free_fail_stack_return (fs);
1455
0
}
1456
1457
static reg_errcode_t
1458
free_fail_stack_return (struct re_fail_stack_t *fs)
1459
0
{
1460
0
  if (fs)
1461
0
    {
1462
0
      Idx fs_idx;
1463
0
      for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1464
0
  {
1465
0
    re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1466
0
    re_free (fs->stack[fs_idx].regs);
1467
0
  }
1468
0
      re_free (fs->stack);
1469
0
    }
1470
0
  return REG_NOERROR;
1471
0
}
1472
1473
static void
1474
update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1475
       regmatch_t *prev_idx_match, Idx cur_node, Idx cur_idx, Idx nmatch)
1476
0
{
1477
0
  int type = dfa->nodes[cur_node].type;
1478
0
  if (type == OP_OPEN_SUBEXP)
1479
0
    {
1480
0
      Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1481
1482
      /* We are at the first node of this sub expression.  */
1483
0
      if (reg_num < nmatch)
1484
0
  {
1485
0
    pmatch[reg_num].rm_so = cur_idx;
1486
0
    pmatch[reg_num].rm_eo = -1;
1487
0
  }
1488
0
    }
1489
0
  else if (type == OP_CLOSE_SUBEXP)
1490
0
    {
1491
      /* We are at the last node of this sub expression.  */
1492
0
      Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1493
0
      if (reg_num < nmatch)
1494
0
  {
1495
0
    if (pmatch[reg_num].rm_so < cur_idx)
1496
0
      {
1497
0
        pmatch[reg_num].rm_eo = cur_idx;
1498
        /* This is a non-empty match or we are not inside an optional
1499
     subexpression.  Accept this right away.  */
1500
0
        memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1501
0
      }
1502
0
    else
1503
0
      {
1504
0
        if (dfa->nodes[cur_node].opt_subexp
1505
0
      && prev_idx_match[reg_num].rm_so != -1)
1506
    /* We transited through an empty match for an optional
1507
       subexpression, like (a?)*, and this is not the subexp's
1508
       first match.  Copy back the old content of the registers
1509
       so that matches of an inner subexpression are undone as
1510
       well, like in ((a?))*.  */
1511
0
    memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1512
0
        else
1513
    /* We completed a subexpression, but it may be part of
1514
       an optional one, so do not update PREV_IDX_MATCH.  */
1515
0
    pmatch[reg_num].rm_eo = cur_idx;
1516
0
      }
1517
0
  }
1518
0
    }
1519
0
}
1520
1521
/* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1522
   and sift the nodes in each states according to the following rules.
1523
   Updated state_log will be wrote to STATE_LOG.
1524
1525
   Rules: We throw away the Node 'a' in the STATE_LOG[STR_IDX] if...
1526
     1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1527
  If 'a' isn't the LAST_NODE and 'a' can't epsilon transit to
1528
  the LAST_NODE, we throw away the node 'a'.
1529
     2. When 0 <= STR_IDX < MATCH_LAST and 'a' accepts
1530
  string 's' and transit to 'b':
1531
  i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1532
     away the node 'a'.
1533
  ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1534
      thrown away, we throw away the node 'a'.
1535
     3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1536
  i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1537
     node 'a'.
1538
  ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1539
      we throw away the node 'a'.  */
1540
1541
#define STATE_NODE_CONTAINS(state,node) \
1542
0
  ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1543
1544
static reg_errcode_t
1545
sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1546
0
{
1547
0
  reg_errcode_t err;
1548
0
  int null_cnt = 0;
1549
0
  Idx str_idx = sctx->last_str_idx;
1550
0
  re_node_set cur_dest;
1551
1552
0
  DEBUG_ASSERT (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1553
1554
  /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1555
     transit to the last_node and the last_node itself.  */
1556
0
  err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1557
0
  if (__glibc_unlikely (err != REG_NOERROR))
1558
0
    return err;
1559
0
  err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1560
0
  if (__glibc_unlikely (err != REG_NOERROR))
1561
0
    goto free_return;
1562
1563
  /* Then check each states in the state_log.  */
1564
0
  while (str_idx > 0)
1565
0
    {
1566
      /* Update counters.  */
1567
0
      null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1568
0
      if (null_cnt > mctx->max_mb_elem_len)
1569
0
  {
1570
0
    memset (sctx->sifted_states, '\0',
1571
0
      sizeof (re_dfastate_t *) * str_idx);
1572
0
    re_node_set_free (&cur_dest);
1573
0
    return REG_NOERROR;
1574
0
  }
1575
0
      re_node_set_empty (&cur_dest);
1576
0
      --str_idx;
1577
1578
0
      if (mctx->state_log[str_idx])
1579
0
  {
1580
0
    err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1581
0
    if (__glibc_unlikely (err != REG_NOERROR))
1582
0
      goto free_return;
1583
0
  }
1584
1585
      /* Add all the nodes which satisfy the following conditions:
1586
   - It can epsilon transit to a node in CUR_DEST.
1587
   - It is in CUR_SRC.
1588
   And update state_log.  */
1589
0
      err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1590
0
      if (__glibc_unlikely (err != REG_NOERROR))
1591
0
  goto free_return;
1592
0
    }
1593
0
  err = REG_NOERROR;
1594
0
 free_return:
1595
0
  re_node_set_free (&cur_dest);
1596
0
  return err;
1597
0
}
1598
1599
static reg_errcode_t
1600
__attribute_warn_unused_result__
1601
build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1602
         Idx str_idx, re_node_set *cur_dest)
1603
0
{
1604
0
  const re_dfa_t *const dfa = mctx->dfa;
1605
0
  const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1606
0
  Idx i;
1607
1608
  /* Then build the next sifted state.
1609
     We build the next sifted state on 'cur_dest', and update
1610
     'sifted_states[str_idx]' with 'cur_dest'.
1611
     Note:
1612
     'cur_dest' is the sifted state from 'state_log[str_idx + 1]'.
1613
     'cur_src' points the node_set of the old 'state_log[str_idx]'
1614
     (with the epsilon nodes pre-filtered out).  */
1615
0
  for (i = 0; i < cur_src->nelem; i++)
1616
0
    {
1617
0
      Idx prev_node = cur_src->elems[i];
1618
0
      int naccepted = 0;
1619
0
      bool ok;
1620
0
      DEBUG_ASSERT (!IS_EPSILON_NODE (dfa->nodes[prev_node].type));
1621
1622
      /* If the node may accept "multi byte".  */
1623
0
      if (dfa->nodes[prev_node].accept_mb)
1624
0
  naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1625
0
           str_idx, sctx->last_str_idx);
1626
1627
      /* We don't check backreferences here.
1628
   See update_cur_sifted_state().  */
1629
0
      if (!naccepted
1630
0
    && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1631
0
    && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1632
0
          dfa->nexts[prev_node]))
1633
0
  naccepted = 1;
1634
1635
0
      if (naccepted == 0)
1636
0
  continue;
1637
1638
0
      if (sctx->limits.nelem)
1639
0
  {
1640
0
    Idx to_idx = str_idx + naccepted;
1641
0
    if (check_dst_limits (mctx, &sctx->limits,
1642
0
        dfa->nexts[prev_node], to_idx,
1643
0
        prev_node, str_idx))
1644
0
      continue;
1645
0
  }
1646
0
      ok = re_node_set_insert (cur_dest, prev_node);
1647
0
      if (__glibc_unlikely (! ok))
1648
0
  return REG_ESPACE;
1649
0
    }
1650
1651
0
  return REG_NOERROR;
1652
0
}
1653
1654
/* Helper functions.  */
1655
1656
static reg_errcode_t
1657
clean_state_log_if_needed (re_match_context_t *mctx, Idx next_state_log_idx)
1658
0
{
1659
0
  Idx top = mctx->state_log_top;
1660
1661
0
  if ((next_state_log_idx >= mctx->input.bufs_len
1662
0
       && mctx->input.bufs_len < mctx->input.len)
1663
0
      || (next_state_log_idx >= mctx->input.valid_len
1664
0
    && mctx->input.valid_len < mctx->input.len))
1665
0
    {
1666
0
      reg_errcode_t err;
1667
0
      err = extend_buffers (mctx, next_state_log_idx + 1);
1668
0
      if (__glibc_unlikely (err != REG_NOERROR))
1669
0
  return err;
1670
0
    }
1671
1672
0
  if (top < next_state_log_idx)
1673
0
    {
1674
0
      DEBUG_ASSERT (mctx->state_log != NULL);
1675
0
      memset (mctx->state_log + top + 1, '\0',
1676
0
        sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1677
0
      mctx->state_log_top = next_state_log_idx;
1678
0
    }
1679
0
  return REG_NOERROR;
1680
0
}
1681
1682
static reg_errcode_t
1683
merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1684
       re_dfastate_t **src, Idx num)
1685
0
{
1686
0
  Idx st_idx;
1687
0
  reg_errcode_t err;
1688
0
  for (st_idx = 0; st_idx < num; ++st_idx)
1689
0
    {
1690
0
      if (dst[st_idx] == NULL)
1691
0
  dst[st_idx] = src[st_idx];
1692
0
      else if (src[st_idx] != NULL)
1693
0
  {
1694
0
    re_node_set merged_set;
1695
0
    err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1696
0
          &src[st_idx]->nodes);
1697
0
    if (__glibc_unlikely (err != REG_NOERROR))
1698
0
      return err;
1699
0
    dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1700
0
    re_node_set_free (&merged_set);
1701
0
    if (__glibc_unlikely (err != REG_NOERROR))
1702
0
      return err;
1703
0
  }
1704
0
    }
1705
0
  return REG_NOERROR;
1706
0
}
1707
1708
static reg_errcode_t
1709
update_cur_sifted_state (const re_match_context_t *mctx,
1710
       re_sift_context_t *sctx, Idx str_idx,
1711
       re_node_set *dest_nodes)
1712
0
{
1713
0
  const re_dfa_t *const dfa = mctx->dfa;
1714
0
  reg_errcode_t err = REG_NOERROR;
1715
0
  const re_node_set *candidates;
1716
0
  candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1717
0
    : &mctx->state_log[str_idx]->nodes);
1718
1719
0
  if (dest_nodes->nelem == 0)
1720
0
    sctx->sifted_states[str_idx] = NULL;
1721
0
  else
1722
0
    {
1723
0
      if (candidates)
1724
0
  {
1725
    /* At first, add the nodes which can epsilon transit to a node in
1726
       DEST_NODE.  */
1727
0
    err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1728
0
    if (__glibc_unlikely (err != REG_NOERROR))
1729
0
      return err;
1730
1731
    /* Then, check the limitations in the current sift_context.  */
1732
0
    if (sctx->limits.nelem)
1733
0
      {
1734
0
        err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1735
0
           mctx->bkref_ents, str_idx);
1736
0
        if (__glibc_unlikely (err != REG_NOERROR))
1737
0
    return err;
1738
0
      }
1739
0
  }
1740
1741
0
      sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1742
0
      if (__glibc_unlikely (err != REG_NOERROR))
1743
0
  return err;
1744
0
    }
1745
1746
0
  if (candidates && mctx->state_log[str_idx]->has_backref)
1747
0
    {
1748
0
      err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1749
0
      if (__glibc_unlikely (err != REG_NOERROR))
1750
0
  return err;
1751
0
    }
1752
0
  return REG_NOERROR;
1753
0
}
1754
1755
static reg_errcode_t
1756
__attribute_warn_unused_result__
1757
add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1758
           const re_node_set *candidates)
1759
0
{
1760
0
  reg_errcode_t err = REG_NOERROR;
1761
0
  Idx i;
1762
1763
0
  re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1764
0
  if (__glibc_unlikely (err != REG_NOERROR))
1765
0
    return err;
1766
1767
0
  if (!state->inveclosure.alloc)
1768
0
    {
1769
0
      err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1770
0
      if (__glibc_unlikely (err != REG_NOERROR))
1771
0
  return REG_ESPACE;
1772
0
      for (i = 0; i < dest_nodes->nelem; i++)
1773
0
  {
1774
0
    err = re_node_set_merge (&state->inveclosure,
1775
0
           dfa->inveclosures + dest_nodes->elems[i]);
1776
0
    if (__glibc_unlikely (err != REG_NOERROR))
1777
0
      return REG_ESPACE;
1778
0
  }
1779
0
    }
1780
0
  return re_node_set_add_intersect (dest_nodes, candidates,
1781
0
            &state->inveclosure);
1782
0
}
1783
1784
static reg_errcode_t
1785
sub_epsilon_src_nodes (const re_dfa_t *dfa, Idx node, re_node_set *dest_nodes,
1786
           const re_node_set *candidates)
1787
0
{
1788
0
    Idx ecl_idx;
1789
0
    reg_errcode_t err;
1790
0
    re_node_set *inv_eclosure = dfa->inveclosures + node;
1791
0
    re_node_set except_nodes;
1792
0
    re_node_set_init_empty (&except_nodes);
1793
0
    for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1794
0
      {
1795
0
  Idx cur_node = inv_eclosure->elems[ecl_idx];
1796
0
  if (cur_node == node)
1797
0
    continue;
1798
0
  if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1799
0
    {
1800
0
      Idx edst1 = dfa->edests[cur_node].elems[0];
1801
0
      Idx edst2 = ((dfa->edests[cur_node].nelem > 1)
1802
0
       ? dfa->edests[cur_node].elems[1] : -1);
1803
0
      if ((!re_node_set_contains (inv_eclosure, edst1)
1804
0
     && re_node_set_contains (dest_nodes, edst1))
1805
0
    || (edst2 > 0
1806
0
        && !re_node_set_contains (inv_eclosure, edst2)
1807
0
        && re_node_set_contains (dest_nodes, edst2)))
1808
0
        {
1809
0
    err = re_node_set_add_intersect (&except_nodes, candidates,
1810
0
             dfa->inveclosures + cur_node);
1811
0
    if (__glibc_unlikely (err != REG_NOERROR))
1812
0
      {
1813
0
        re_node_set_free (&except_nodes);
1814
0
        return err;
1815
0
      }
1816
0
        }
1817
0
    }
1818
0
      }
1819
0
    for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1820
0
      {
1821
0
  Idx cur_node = inv_eclosure->elems[ecl_idx];
1822
0
  if (!re_node_set_contains (&except_nodes, cur_node))
1823
0
    {
1824
0
      Idx idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1825
0
      re_node_set_remove_at (dest_nodes, idx);
1826
0
    }
1827
0
      }
1828
0
    re_node_set_free (&except_nodes);
1829
0
    return REG_NOERROR;
1830
0
}
1831
1832
static bool
1833
check_dst_limits (const re_match_context_t *mctx, const re_node_set *limits,
1834
      Idx dst_node, Idx dst_idx, Idx src_node, Idx src_idx)
1835
0
{
1836
0
  const re_dfa_t *const dfa = mctx->dfa;
1837
0
  Idx lim_idx, src_pos, dst_pos;
1838
1839
0
  Idx dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1840
0
  Idx src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1841
0
  for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1842
0
    {
1843
0
      Idx subexp_idx;
1844
0
      struct re_backref_cache_entry *ent;
1845
0
      ent = mctx->bkref_ents + limits->elems[lim_idx];
1846
0
      subexp_idx = dfa->nodes[ent->node].opr.idx;
1847
1848
0
      dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1849
0
             subexp_idx, dst_node, dst_idx,
1850
0
             dst_bkref_idx);
1851
0
      src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1852
0
             subexp_idx, src_node, src_idx,
1853
0
             src_bkref_idx);
1854
1855
      /* In case of:
1856
   <src> <dst> ( <subexp> )
1857
   ( <subexp> ) <src> <dst>
1858
   ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1859
0
      if (src_pos == dst_pos)
1860
0
  continue; /* This is unrelated limitation.  */
1861
0
      else
1862
0
  return true;
1863
0
    }
1864
0
  return false;
1865
0
}
1866
1867
static int
1868
check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1869
           Idx subexp_idx, Idx from_node, Idx bkref_idx)
1870
0
{
1871
0
  const re_dfa_t *const dfa = mctx->dfa;
1872
0
  const re_node_set *eclosures = dfa->eclosures + from_node;
1873
0
  Idx node_idx;
1874
1875
  /* Else, we are on the boundary: examine the nodes on the epsilon
1876
     closure.  */
1877
0
  for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1878
0
    {
1879
0
      Idx node = eclosures->elems[node_idx];
1880
0
      switch (dfa->nodes[node].type)
1881
0
  {
1882
0
  case OP_BACK_REF:
1883
0
    if (bkref_idx != -1)
1884
0
      {
1885
0
        struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1886
0
        do
1887
0
    {
1888
0
      Idx dst;
1889
0
      int cpos;
1890
1891
0
      if (ent->node != node)
1892
0
        continue;
1893
1894
0
      if (subexp_idx < BITSET_WORD_BITS
1895
0
          && !(ent->eps_reachable_subexps_map
1896
0
         & ((bitset_word_t) 1 << subexp_idx)))
1897
0
        continue;
1898
1899
      /* Recurse trying to reach the OP_OPEN_SUBEXP and
1900
         OP_CLOSE_SUBEXP cases below.  But, if the
1901
         destination node is the same node as the source
1902
         node, don't recurse because it would cause an
1903
         infinite loop: a regex that exhibits this behavior
1904
         is ()\1*\1*  */
1905
0
      dst = dfa->edests[node].elems[0];
1906
0
      if (dst == from_node)
1907
0
        {
1908
0
          if (boundaries & 1)
1909
0
      return -1;
1910
0
          else /* if (boundaries & 2) */
1911
0
      return 0;
1912
0
        }
1913
1914
0
      cpos =
1915
0
        check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1916
0
             dst, bkref_idx);
1917
0
      if (cpos == -1 /* && (boundaries & 1) */)
1918
0
        return -1;
1919
0
      if (cpos == 0 && (boundaries & 2))
1920
0
        return 0;
1921
1922
0
      if (subexp_idx < BITSET_WORD_BITS)
1923
0
        ent->eps_reachable_subexps_map
1924
0
          &= ~((bitset_word_t) 1 << subexp_idx);
1925
0
    }
1926
0
        while (ent++->more);
1927
0
      }
1928
0
    break;
1929
1930
0
  case OP_OPEN_SUBEXP:
1931
0
    if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1932
0
      return -1;
1933
0
    break;
1934
1935
0
  case OP_CLOSE_SUBEXP:
1936
0
    if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1937
0
      return 0;
1938
0
    break;
1939
1940
0
  default:
1941
0
      break;
1942
0
  }
1943
0
    }
1944
1945
0
  return (boundaries & 2) ? 1 : 0;
1946
0
}
1947
1948
static int
1949
check_dst_limits_calc_pos (const re_match_context_t *mctx, Idx limit,
1950
         Idx subexp_idx, Idx from_node, Idx str_idx,
1951
         Idx bkref_idx)
1952
0
{
1953
0
  struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1954
0
  int boundaries;
1955
1956
  /* If we are outside the range of the subexpression, return -1 or 1.  */
1957
0
  if (str_idx < lim->subexp_from)
1958
0
    return -1;
1959
1960
0
  if (lim->subexp_to < str_idx)
1961
0
    return 1;
1962
1963
  /* If we are within the subexpression, return 0.  */
1964
0
  boundaries = (str_idx == lim->subexp_from);
1965
0
  boundaries |= (str_idx == lim->subexp_to) << 1;
1966
0
  if (boundaries == 0)
1967
0
    return 0;
1968
1969
  /* Else, examine epsilon closure.  */
1970
0
  return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1971
0
              from_node, bkref_idx);
1972
0
}
1973
1974
/* Check the limitations of sub expressions LIMITS, and remove the nodes
1975
   which are against limitations from DEST_NODES. */
1976
1977
static reg_errcode_t
1978
check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
1979
         const re_node_set *candidates, re_node_set *limits,
1980
         struct re_backref_cache_entry *bkref_ents, Idx str_idx)
1981
0
{
1982
0
  reg_errcode_t err;
1983
0
  Idx node_idx, lim_idx;
1984
1985
0
  for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1986
0
    {
1987
0
      Idx subexp_idx;
1988
0
      struct re_backref_cache_entry *ent;
1989
0
      ent = bkref_ents + limits->elems[lim_idx];
1990
1991
0
      if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
1992
0
  continue; /* This is unrelated limitation.  */
1993
1994
0
      subexp_idx = dfa->nodes[ent->node].opr.idx;
1995
0
      if (ent->subexp_to == str_idx)
1996
0
  {
1997
0
    Idx ops_node = -1;
1998
0
    Idx cls_node = -1;
1999
0
    for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2000
0
      {
2001
0
        Idx node = dest_nodes->elems[node_idx];
2002
0
        re_token_type_t type = dfa->nodes[node].type;
2003
0
        if (type == OP_OPEN_SUBEXP
2004
0
      && subexp_idx == dfa->nodes[node].opr.idx)
2005
0
    ops_node = node;
2006
0
        else if (type == OP_CLOSE_SUBEXP
2007
0
           && subexp_idx == dfa->nodes[node].opr.idx)
2008
0
    cls_node = node;
2009
0
      }
2010
2011
    /* Check the limitation of the open subexpression.  */
2012
    /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
2013
0
    if (ops_node >= 0)
2014
0
      {
2015
0
        err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2016
0
             candidates);
2017
0
        if (__glibc_unlikely (err != REG_NOERROR))
2018
0
    return err;
2019
0
      }
2020
2021
    /* Check the limitation of the close subexpression.  */
2022
0
    if (cls_node >= 0)
2023
0
      for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2024
0
        {
2025
0
    Idx node = dest_nodes->elems[node_idx];
2026
0
    if (!re_node_set_contains (dfa->inveclosures + node,
2027
0
             cls_node)
2028
0
        && !re_node_set_contains (dfa->eclosures + node,
2029
0
                cls_node))
2030
0
      {
2031
        /* It is against this limitation.
2032
           Remove it form the current sifted state.  */
2033
0
        err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2034
0
             candidates);
2035
0
        if (__glibc_unlikely (err != REG_NOERROR))
2036
0
          return err;
2037
0
        --node_idx;
2038
0
      }
2039
0
        }
2040
0
  }
2041
0
      else /* (ent->subexp_to != str_idx)  */
2042
0
  {
2043
0
    for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2044
0
      {
2045
0
        Idx node = dest_nodes->elems[node_idx];
2046
0
        re_token_type_t type = dfa->nodes[node].type;
2047
0
        if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2048
0
    {
2049
0
      if (subexp_idx != dfa->nodes[node].opr.idx)
2050
0
        continue;
2051
      /* It is against this limitation.
2052
         Remove it form the current sifted state.  */
2053
0
      err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2054
0
                 candidates);
2055
0
      if (__glibc_unlikely (err != REG_NOERROR))
2056
0
        return err;
2057
0
    }
2058
0
      }
2059
0
  }
2060
0
    }
2061
0
  return REG_NOERROR;
2062
0
}
2063
2064
static reg_errcode_t
2065
__attribute_warn_unused_result__
2066
sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2067
       Idx str_idx, const re_node_set *candidates)
2068
0
{
2069
0
  const re_dfa_t *const dfa = mctx->dfa;
2070
0
  reg_errcode_t err;
2071
0
  Idx node_idx, node;
2072
0
  re_sift_context_t local_sctx;
2073
0
  Idx first_idx = search_cur_bkref_entry (mctx, str_idx);
2074
2075
0
  if (first_idx == -1)
2076
0
    return REG_NOERROR;
2077
2078
0
  local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2079
2080
0
  for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2081
0
    {
2082
0
      Idx enabled_idx;
2083
0
      re_token_type_t type;
2084
0
      struct re_backref_cache_entry *entry;
2085
0
      node = candidates->elems[node_idx];
2086
0
      type = dfa->nodes[node].type;
2087
      /* Avoid infinite loop for the REs like "()\1+".  */
2088
0
      if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2089
0
  continue;
2090
0
      if (type != OP_BACK_REF)
2091
0
  continue;
2092
2093
0
      entry = mctx->bkref_ents + first_idx;
2094
0
      enabled_idx = first_idx;
2095
0
      do
2096
0
  {
2097
0
    Idx subexp_len;
2098
0
    Idx to_idx;
2099
0
    Idx dst_node;
2100
0
    bool ok;
2101
0
    re_dfastate_t *cur_state;
2102
2103
0
    if (entry->node != node)
2104
0
      continue;
2105
0
    subexp_len = entry->subexp_to - entry->subexp_from;
2106
0
    to_idx = str_idx + subexp_len;
2107
0
    dst_node = (subexp_len ? dfa->nexts[node]
2108
0
          : dfa->edests[node].elems[0]);
2109
2110
0
    if (to_idx > sctx->last_str_idx
2111
0
        || sctx->sifted_states[to_idx] == NULL
2112
0
        || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2113
0
        || check_dst_limits (mctx, &sctx->limits, node,
2114
0
           str_idx, dst_node, to_idx))
2115
0
      continue;
2116
2117
0
    if (local_sctx.sifted_states == NULL)
2118
0
      {
2119
0
        local_sctx = *sctx;
2120
0
        err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2121
0
        if (__glibc_unlikely (err != REG_NOERROR))
2122
0
    goto free_return;
2123
0
      }
2124
0
    local_sctx.last_node = node;
2125
0
    local_sctx.last_str_idx = str_idx;
2126
0
    ok = re_node_set_insert (&local_sctx.limits, enabled_idx);
2127
0
    if (__glibc_unlikely (! ok))
2128
0
      {
2129
0
        err = REG_ESPACE;
2130
0
        goto free_return;
2131
0
      }
2132
0
    cur_state = local_sctx.sifted_states[str_idx];
2133
0
    err = sift_states_backward (mctx, &local_sctx);
2134
0
    if (__glibc_unlikely (err != REG_NOERROR))
2135
0
      goto free_return;
2136
0
    if (sctx->limited_states != NULL)
2137
0
      {
2138
0
        err = merge_state_array (dfa, sctx->limited_states,
2139
0
               local_sctx.sifted_states,
2140
0
               str_idx + 1);
2141
0
        if (__glibc_unlikely (err != REG_NOERROR))
2142
0
    goto free_return;
2143
0
      }
2144
0
    local_sctx.sifted_states[str_idx] = cur_state;
2145
0
    re_node_set_remove (&local_sctx.limits, enabled_idx);
2146
2147
    /* mctx->bkref_ents may have changed, reload the pointer.  */
2148
0
    entry = mctx->bkref_ents + enabled_idx;
2149
0
  }
2150
0
      while (enabled_idx++, entry++->more);
2151
0
    }
2152
0
  err = REG_NOERROR;
2153
0
 free_return:
2154
0
  if (local_sctx.sifted_states != NULL)
2155
0
    {
2156
0
      re_node_set_free (&local_sctx.limits);
2157
0
    }
2158
2159
0
  return err;
2160
0
}
2161
2162
2163
static int
2164
sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2165
         Idx node_idx, Idx str_idx, Idx max_str_idx)
2166
0
{
2167
0
  const re_dfa_t *const dfa = mctx->dfa;
2168
0
  int naccepted;
2169
  /* Check the node can accept "multi byte".  */
2170
0
  naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2171
0
  if (naccepted > 0 && str_idx + naccepted <= max_str_idx
2172
0
      && !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2173
0
             dfa->nexts[node_idx]))
2174
    /* The node can't accept the "multi byte", or the
2175
       destination was already thrown away, then the node
2176
       couldn't accept the current input "multi byte".   */
2177
0
    naccepted = 0;
2178
  /* Otherwise, it is sure that the node could accept
2179
     'naccepted' bytes input.  */
2180
0
  return naccepted;
2181
0
}
2182

2183
/* Functions for state transition.  */
2184
2185
/* Return the next state to which the current state STATE will transit by
2186
   accepting the current input byte, and update STATE_LOG if necessary.
2187
   Return NULL on failure.
2188
   If STATE can accept a multibyte char/collating element/back reference
2189
   update the destination of STATE_LOG.  */
2190
2191
static re_dfastate_t *
2192
__attribute_warn_unused_result__
2193
transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2194
         re_dfastate_t *state)
2195
0
{
2196
0
  re_dfastate_t **trtable;
2197
0
  unsigned char ch;
2198
2199
  /* If the current state can accept multibyte.  */
2200
0
  if (__glibc_unlikely (state->accept_mb))
2201
0
    {
2202
0
      *err = transit_state_mb (mctx, state);
2203
0
      if (__glibc_unlikely (*err != REG_NOERROR))
2204
0
  return NULL;
2205
0
    }
2206
2207
  /* Then decide the next state with the single byte.  */
2208
#if 0
2209
  if (0)
2210
    /* don't use transition table  */
2211
    return transit_state_sb (err, mctx, state);
2212
#endif
2213
2214
  /* Use transition table  */
2215
0
  ch = re_string_fetch_byte (&mctx->input);
2216
0
  for (;;)
2217
0
    {
2218
0
      trtable = state->trtable;
2219
0
      if (__glibc_likely (trtable != NULL))
2220
0
  return trtable[ch];
2221
2222
0
      trtable = state->word_trtable;
2223
0
      if (__glibc_likely (trtable != NULL))
2224
0
  {
2225
0
    unsigned int context;
2226
0
    context
2227
0
      = re_string_context_at (&mctx->input,
2228
0
            re_string_cur_idx (&mctx->input) - 1,
2229
0
            mctx->eflags);
2230
0
    if (IS_WORD_CONTEXT (context))
2231
0
      return trtable[ch + SBC_MAX];
2232
0
    else
2233
0
      return trtable[ch];
2234
0
  }
2235
2236
0
      if (!build_trtable (mctx->dfa, state))
2237
0
  {
2238
0
    *err = REG_ESPACE;
2239
0
    return NULL;
2240
0
  }
2241
2242
      /* Retry, we now have a transition table.  */
2243
0
    }
2244
0
}
2245
2246
/* Update the state_log if we need */
2247
static re_dfastate_t *
2248
merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2249
          re_dfastate_t *next_state)
2250
0
{
2251
0
  const re_dfa_t *const dfa = mctx->dfa;
2252
0
  Idx cur_idx = re_string_cur_idx (&mctx->input);
2253
2254
0
  if (cur_idx > mctx->state_log_top)
2255
0
    {
2256
0
      mctx->state_log[cur_idx] = next_state;
2257
0
      mctx->state_log_top = cur_idx;
2258
0
    }
2259
0
  else if (mctx->state_log[cur_idx] == 0)
2260
0
    {
2261
0
      mctx->state_log[cur_idx] = next_state;
2262
0
    }
2263
0
  else
2264
0
    {
2265
0
      re_dfastate_t *pstate;
2266
0
      unsigned int context;
2267
0
      re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2268
      /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2269
   the destination of a multibyte char/collating element/
2270
   back reference.  Then the next state is the union set of
2271
   these destinations and the results of the transition table.  */
2272
0
      pstate = mctx->state_log[cur_idx];
2273
0
      log_nodes = pstate->entrance_nodes;
2274
0
      if (next_state != NULL)
2275
0
  {
2276
0
    table_nodes = next_state->entrance_nodes;
2277
0
    *err = re_node_set_init_union (&next_nodes, table_nodes,
2278
0
               log_nodes);
2279
0
    if (__glibc_unlikely (*err != REG_NOERROR))
2280
0
      return NULL;
2281
0
  }
2282
0
      else
2283
0
  next_nodes = *log_nodes;
2284
      /* Note: We already add the nodes of the initial state,
2285
   then we don't need to add them here.  */
2286
2287
0
      context = re_string_context_at (&mctx->input,
2288
0
              re_string_cur_idx (&mctx->input) - 1,
2289
0
              mctx->eflags);
2290
0
      next_state = mctx->state_log[cur_idx]
2291
0
  = re_acquire_state_context (err, dfa, &next_nodes, context);
2292
      /* We don't need to check errors here, since the return value of
2293
   this function is next_state and ERR is already set.  */
2294
2295
0
      if (table_nodes != NULL)
2296
0
  re_node_set_free (&next_nodes);
2297
0
    }
2298
2299
0
  if (__glibc_unlikely (dfa->nbackref) && next_state != NULL)
2300
0
    {
2301
      /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2302
   later.  We must check them here, since the back references in the
2303
   next state might use them.  */
2304
0
      *err = check_subexp_matching_top (mctx, &next_state->nodes,
2305
0
          cur_idx);
2306
0
      if (__glibc_unlikely (*err != REG_NOERROR))
2307
0
  return NULL;
2308
2309
      /* If the next state has back references.  */
2310
0
      if (next_state->has_backref)
2311
0
  {
2312
0
    *err = transit_state_bkref (mctx, &next_state->nodes);
2313
0
    if (__glibc_unlikely (*err != REG_NOERROR))
2314
0
      return NULL;
2315
0
    next_state = mctx->state_log[cur_idx];
2316
0
  }
2317
0
    }
2318
2319
0
  return next_state;
2320
0
}
2321
2322
/* Skip bytes in the input that correspond to part of a
2323
   multi-byte match, then look in the log for a state
2324
   from which to restart matching.  */
2325
static re_dfastate_t *
2326
find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2327
0
{
2328
0
  re_dfastate_t *cur_state;
2329
0
  do
2330
0
    {
2331
0
      Idx max = mctx->state_log_top;
2332
0
      Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2333
2334
0
      do
2335
0
  {
2336
0
    if (++cur_str_idx > max)
2337
0
      return NULL;
2338
0
    re_string_skip_bytes (&mctx->input, 1);
2339
0
  }
2340
0
      while (mctx->state_log[cur_str_idx] == NULL);
2341
2342
0
      cur_state = merge_state_with_log (err, mctx, NULL);
2343
0
    }
2344
0
  while (*err == REG_NOERROR && cur_state == NULL);
2345
0
  return cur_state;
2346
0
}
2347
2348
/* Helper functions for transit_state.  */
2349
2350
/* From the node set CUR_NODES, pick up the nodes whose types are
2351
   OP_OPEN_SUBEXP and which have corresponding back references in the regular
2352
   expression. And register them to use them later for evaluating the
2353
   corresponding back references.  */
2354
2355
static reg_errcode_t
2356
check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2357
         Idx str_idx)
2358
0
{
2359
0
  const re_dfa_t *const dfa = mctx->dfa;
2360
0
  Idx node_idx;
2361
0
  reg_errcode_t err;
2362
2363
  /* TODO: This isn't efficient.
2364
     Because there might be more than one nodes whose types are
2365
     OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2366
     nodes.
2367
     E.g. RE: (a){2}  */
2368
0
  for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2369
0
    {
2370
0
      Idx node = cur_nodes->elems[node_idx];
2371
0
      if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2372
0
    && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2373
0
    && (dfa->used_bkref_map
2374
0
        & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2375
0
  {
2376
0
    err = match_ctx_add_subtop (mctx, node, str_idx);
2377
0
    if (__glibc_unlikely (err != REG_NOERROR))
2378
0
      return err;
2379
0
  }
2380
0
    }
2381
0
  return REG_NOERROR;
2382
0
}
2383
2384
#if 0
2385
/* Return the next state to which the current state STATE will transit by
2386
   accepting the current input byte.  Return NULL on failure.  */
2387
2388
static re_dfastate_t *
2389
transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2390
      re_dfastate_t *state)
2391
{
2392
  const re_dfa_t *const dfa = mctx->dfa;
2393
  re_node_set next_nodes;
2394
  re_dfastate_t *next_state;
2395
  Idx node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2396
  unsigned int context;
2397
2398
  *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2399
  if (__glibc_unlikely (*err != REG_NOERROR))
2400
    return NULL;
2401
  for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2402
    {
2403
      Idx cur_node = state->nodes.elems[node_cnt];
2404
      if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2405
  {
2406
    *err = re_node_set_merge (&next_nodes,
2407
            dfa->eclosures + dfa->nexts[cur_node]);
2408
    if (__glibc_unlikely (*err != REG_NOERROR))
2409
      {
2410
        re_node_set_free (&next_nodes);
2411
        return NULL;
2412
      }
2413
  }
2414
    }
2415
  context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2416
  next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2417
  /* We don't need to check errors here, since the return value of
2418
     this function is next_state and ERR is already set.  */
2419
2420
  re_node_set_free (&next_nodes);
2421
  re_string_skip_bytes (&mctx->input, 1);
2422
  return next_state;
2423
}
2424
#endif
2425
2426
static reg_errcode_t
2427
transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2428
0
{
2429
0
  const re_dfa_t *const dfa = mctx->dfa;
2430
0
  reg_errcode_t err;
2431
0
  Idx i;
2432
2433
0
  for (i = 0; i < pstate->nodes.nelem; ++i)
2434
0
    {
2435
0
      re_node_set dest_nodes, *new_nodes;
2436
0
      Idx cur_node_idx = pstate->nodes.elems[i];
2437
0
      int naccepted;
2438
0
      Idx dest_idx;
2439
0
      unsigned int context;
2440
0
      re_dfastate_t *dest_state;
2441
2442
0
      if (!dfa->nodes[cur_node_idx].accept_mb)
2443
0
  continue;
2444
2445
0
      if (dfa->nodes[cur_node_idx].constraint)
2446
0
  {
2447
0
    context = re_string_context_at (&mctx->input,
2448
0
            re_string_cur_idx (&mctx->input),
2449
0
            mctx->eflags);
2450
0
    if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2451
0
             context))
2452
0
      continue;
2453
0
  }
2454
2455
      /* How many bytes the node can accept?  */
2456
0
      naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2457
0
             re_string_cur_idx (&mctx->input));
2458
0
      if (naccepted == 0)
2459
0
  continue;
2460
2461
      /* The node can accepts 'naccepted' bytes.  */
2462
0
      dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2463
0
      mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2464
0
             : mctx->max_mb_elem_len);
2465
0
      err = clean_state_log_if_needed (mctx, dest_idx);
2466
0
      if (__glibc_unlikely (err != REG_NOERROR))
2467
0
  return err;
2468
0
      DEBUG_ASSERT (dfa->nexts[cur_node_idx] != -1);
2469
0
      new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2470
2471
0
      dest_state = mctx->state_log[dest_idx];
2472
0
      if (dest_state == NULL)
2473
0
  dest_nodes = *new_nodes;
2474
0
      else
2475
0
  {
2476
0
    err = re_node_set_init_union (&dest_nodes,
2477
0
          dest_state->entrance_nodes, new_nodes);
2478
0
    if (__glibc_unlikely (err != REG_NOERROR))
2479
0
      return err;
2480
0
  }
2481
0
      context = re_string_context_at (&mctx->input, dest_idx - 1,
2482
0
              mctx->eflags);
2483
0
      mctx->state_log[dest_idx]
2484
0
  = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2485
0
      if (dest_state != NULL)
2486
0
  re_node_set_free (&dest_nodes);
2487
0
      if (__glibc_unlikely (mctx->state_log[dest_idx] == NULL
2488
0
          && err != REG_NOERROR))
2489
0
  return err;
2490
0
    }
2491
0
  return REG_NOERROR;
2492
0
}
2493
2494
static reg_errcode_t
2495
transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2496
0
{
2497
0
  const re_dfa_t *const dfa = mctx->dfa;
2498
0
  reg_errcode_t err;
2499
0
  Idx i;
2500
0
  Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2501
2502
0
  for (i = 0; i < nodes->nelem; ++i)
2503
0
    {
2504
0
      Idx dest_str_idx, prev_nelem, bkc_idx;
2505
0
      Idx node_idx = nodes->elems[i];
2506
0
      unsigned int context;
2507
0
      const re_token_t *node = dfa->nodes + node_idx;
2508
0
      re_node_set *new_dest_nodes;
2509
2510
      /* Check whether 'node' is a backreference or not.  */
2511
0
      if (node->type != OP_BACK_REF)
2512
0
  continue;
2513
2514
0
      if (node->constraint)
2515
0
  {
2516
0
    context = re_string_context_at (&mctx->input, cur_str_idx,
2517
0
            mctx->eflags);
2518
0
    if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2519
0
      continue;
2520
0
  }
2521
2522
      /* 'node' is a backreference.
2523
   Check the substring which the substring matched.  */
2524
0
      bkc_idx = mctx->nbkref_ents;
2525
0
      err = get_subexp (mctx, node_idx, cur_str_idx);
2526
0
      if (__glibc_unlikely (err != REG_NOERROR))
2527
0
  goto free_return;
2528
2529
      /* And add the epsilon closures (which is 'new_dest_nodes') of
2530
   the backreference to appropriate state_log.  */
2531
0
      DEBUG_ASSERT (dfa->nexts[node_idx] != -1);
2532
0
      for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2533
0
  {
2534
0
    Idx subexp_len;
2535
0
    re_dfastate_t *dest_state;
2536
0
    struct re_backref_cache_entry *bkref_ent;
2537
0
    bkref_ent = mctx->bkref_ents + bkc_idx;
2538
0
    if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2539
0
      continue;
2540
0
    subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2541
0
    new_dest_nodes = (subexp_len == 0
2542
0
          ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2543
0
          : dfa->eclosures + dfa->nexts[node_idx]);
2544
0
    dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2545
0
        - bkref_ent->subexp_from);
2546
0
    context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2547
0
            mctx->eflags);
2548
0
    dest_state = mctx->state_log[dest_str_idx];
2549
0
    prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2550
0
      : mctx->state_log[cur_str_idx]->nodes.nelem);
2551
    /* Add 'new_dest_node' to state_log.  */
2552
0
    if (dest_state == NULL)
2553
0
      {
2554
0
        mctx->state_log[dest_str_idx]
2555
0
    = re_acquire_state_context (&err, dfa, new_dest_nodes,
2556
0
              context);
2557
0
        if (__glibc_unlikely (mctx->state_log[dest_str_idx] == NULL
2558
0
            && err != REG_NOERROR))
2559
0
    goto free_return;
2560
0
      }
2561
0
    else
2562
0
      {
2563
0
        re_node_set dest_nodes;
2564
0
        err = re_node_set_init_union (&dest_nodes,
2565
0
              dest_state->entrance_nodes,
2566
0
              new_dest_nodes);
2567
0
        if (__glibc_unlikely (err != REG_NOERROR))
2568
0
    {
2569
0
      re_node_set_free (&dest_nodes);
2570
0
      goto free_return;
2571
0
    }
2572
0
        mctx->state_log[dest_str_idx]
2573
0
    = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2574
0
        re_node_set_free (&dest_nodes);
2575
0
        if (__glibc_unlikely (mctx->state_log[dest_str_idx] == NULL
2576
0
            && err != REG_NOERROR))
2577
0
    goto free_return;
2578
0
      }
2579
    /* We need to check recursively if the backreference can epsilon
2580
       transit.  */
2581
0
    if (subexp_len == 0
2582
0
        && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2583
0
      {
2584
0
        err = check_subexp_matching_top (mctx, new_dest_nodes,
2585
0
                 cur_str_idx);
2586
0
        if (__glibc_unlikely (err != REG_NOERROR))
2587
0
    goto free_return;
2588
0
        err = transit_state_bkref (mctx, new_dest_nodes);
2589
0
        if (__glibc_unlikely (err != REG_NOERROR))
2590
0
    goto free_return;
2591
0
      }
2592
0
  }
2593
0
    }
2594
0
  err = REG_NOERROR;
2595
0
 free_return:
2596
0
  return err;
2597
0
}
2598
2599
/* Enumerate all the candidates which the backreference BKREF_NODE can match
2600
   at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2601
   Note that we might collect inappropriate candidates here.
2602
   However, the cost of checking them strictly here is too high, then we
2603
   delay these checking for prune_impossible_nodes().  */
2604
2605
static reg_errcode_t
2606
__attribute_warn_unused_result__
2607
get_subexp (re_match_context_t *mctx, Idx bkref_node, Idx bkref_str_idx)
2608
0
{
2609
0
  const re_dfa_t *const dfa = mctx->dfa;
2610
0
  Idx subexp_num, sub_top_idx;
2611
0
  const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2612
  /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2613
0
  Idx cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2614
0
  if (cache_idx != -1)
2615
0
    {
2616
0
      const struct re_backref_cache_entry *entry
2617
0
  = mctx->bkref_ents + cache_idx;
2618
0
      do
2619
0
  if (entry->node == bkref_node)
2620
0
    return REG_NOERROR; /* We already checked it.  */
2621
0
      while (entry++->more);
2622
0
    }
2623
2624
0
  subexp_num = dfa->nodes[bkref_node].opr.idx;
2625
2626
  /* For each sub expression  */
2627
0
  for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2628
0
    {
2629
0
      reg_errcode_t err;
2630
0
      re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2631
0
      re_sub_match_last_t *sub_last;
2632
0
      Idx sub_last_idx, sl_str, bkref_str_off;
2633
2634
0
      if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2635
0
  continue; /* It isn't related.  */
2636
2637
0
      sl_str = sub_top->str_idx;
2638
0
      bkref_str_off = bkref_str_idx;
2639
      /* At first, check the last node of sub expressions we already
2640
   evaluated.  */
2641
0
      for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2642
0
  {
2643
0
    regoff_t sl_str_diff;
2644
0
    sub_last = sub_top->lasts[sub_last_idx];
2645
0
    sl_str_diff = sub_last->str_idx - sl_str;
2646
    /* The matched string by the sub expression match with the substring
2647
       at the back reference?  */
2648
0
    if (sl_str_diff > 0)
2649
0
      {
2650
0
        if (__glibc_unlikely (bkref_str_off + sl_str_diff
2651
0
            > mctx->input.valid_len))
2652
0
    {
2653
      /* Not enough chars for a successful match.  */
2654
0
      if (bkref_str_off + sl_str_diff > mctx->input.len)
2655
0
        break;
2656
2657
0
      err = clean_state_log_if_needed (mctx,
2658
0
               bkref_str_off
2659
0
               + sl_str_diff);
2660
0
      if (__glibc_unlikely (err != REG_NOERROR))
2661
0
        return err;
2662
0
      buf = (const char *) re_string_get_buffer (&mctx->input);
2663
0
    }
2664
0
        if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2665
    /* We don't need to search this sub expression any more.  */
2666
0
    break;
2667
0
      }
2668
0
    bkref_str_off += sl_str_diff;
2669
0
    sl_str += sl_str_diff;
2670
0
    err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2671
0
        bkref_str_idx);
2672
2673
    /* Reload buf, since the preceding call might have reallocated
2674
       the buffer.  */
2675
0
    buf = (const char *) re_string_get_buffer (&mctx->input);
2676
2677
0
    if (err == REG_NOMATCH)
2678
0
      continue;
2679
0
    if (__glibc_unlikely (err != REG_NOERROR))
2680
0
      return err;
2681
0
  }
2682
2683
0
      if (sub_last_idx < sub_top->nlasts)
2684
0
  continue;
2685
0
      if (sub_last_idx > 0)
2686
0
  ++sl_str;
2687
      /* Then, search for the other last nodes of the sub expression.  */
2688
0
      for (; sl_str <= bkref_str_idx; ++sl_str)
2689
0
  {
2690
0
    Idx cls_node;
2691
0
    regoff_t sl_str_off;
2692
0
    const re_node_set *nodes;
2693
0
    sl_str_off = sl_str - sub_top->str_idx;
2694
    /* The matched string by the sub expression match with the substring
2695
       at the back reference?  */
2696
0
    if (sl_str_off > 0)
2697
0
      {
2698
0
        if (__glibc_unlikely (bkref_str_off >= mctx->input.valid_len))
2699
0
    {
2700
      /* If we are at the end of the input, we cannot match.  */
2701
0
      if (bkref_str_off >= mctx->input.len)
2702
0
        break;
2703
2704
0
      err = extend_buffers (mctx, bkref_str_off + 1);
2705
0
      if (__glibc_unlikely (err != REG_NOERROR))
2706
0
        return err;
2707
2708
0
      buf = (const char *) re_string_get_buffer (&mctx->input);
2709
0
    }
2710
0
        if (buf [bkref_str_off++] != buf[sl_str - 1])
2711
0
    break; /* We don't need to search this sub expression
2712
        any more.  */
2713
0
      }
2714
0
    if (mctx->state_log[sl_str] == NULL)
2715
0
      continue;
2716
    /* Does this state have a ')' of the sub expression?  */
2717
0
    nodes = &mctx->state_log[sl_str]->nodes;
2718
0
    cls_node = find_subexp_node (dfa, nodes, subexp_num,
2719
0
               OP_CLOSE_SUBEXP);
2720
0
    if (cls_node == -1)
2721
0
      continue; /* No.  */
2722
0
    if (sub_top->path == NULL)
2723
0
      {
2724
0
        sub_top->path = calloc (sizeof (state_array_t),
2725
0
              sl_str - sub_top->str_idx + 1);
2726
0
        if (sub_top->path == NULL)
2727
0
    return REG_ESPACE;
2728
0
      }
2729
    /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2730
       in the current context?  */
2731
0
    err = check_arrival (mctx, sub_top->path, sub_top->node,
2732
0
             sub_top->str_idx, cls_node, sl_str,
2733
0
             OP_CLOSE_SUBEXP);
2734
0
    if (err == REG_NOMATCH)
2735
0
        continue;
2736
0
    if (__glibc_unlikely (err != REG_NOERROR))
2737
0
        return err;
2738
0
    sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2739
0
    if (__glibc_unlikely (sub_last == NULL))
2740
0
      return REG_ESPACE;
2741
0
    err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2742
0
        bkref_str_idx);
2743
0
    buf = (const char *) re_string_get_buffer (&mctx->input);
2744
0
    if (err == REG_NOMATCH)
2745
0
      continue;
2746
0
    if (__glibc_unlikely (err != REG_NOERROR))
2747
0
      return err;
2748
0
  }
2749
0
    }
2750
0
  return REG_NOERROR;
2751
0
}
2752
2753
/* Helper functions for get_subexp().  */
2754
2755
/* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2756
   If it can arrive, register the sub expression expressed with SUB_TOP
2757
   and SUB_LAST.  */
2758
2759
static reg_errcode_t
2760
get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2761
    re_sub_match_last_t *sub_last, Idx bkref_node, Idx bkref_str)
2762
0
{
2763
0
  reg_errcode_t err;
2764
0
  Idx to_idx;
2765
  /* Can the subexpression arrive the back reference?  */
2766
0
  err = check_arrival (mctx, &sub_last->path, sub_last->node,
2767
0
           sub_last->str_idx, bkref_node, bkref_str,
2768
0
           OP_OPEN_SUBEXP);
2769
0
  if (err != REG_NOERROR)
2770
0
    return err;
2771
0
  err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2772
0
           sub_last->str_idx);
2773
0
  if (__glibc_unlikely (err != REG_NOERROR))
2774
0
    return err;
2775
0
  to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2776
0
  return clean_state_log_if_needed (mctx, to_idx);
2777
0
}
2778
2779
/* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2780
   Search '(' if FL_OPEN, or search ')' otherwise.
2781
   TODO: This function isn't efficient...
2782
   Because there might be more than one nodes whose types are
2783
   OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2784
   nodes.
2785
   E.g. RE: (a){2}  */
2786
2787
static Idx
2788
find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2789
      Idx subexp_idx, int type)
2790
0
{
2791
0
  Idx cls_idx;
2792
0
  for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2793
0
    {
2794
0
      Idx cls_node = nodes->elems[cls_idx];
2795
0
      const re_token_t *node = dfa->nodes + cls_node;
2796
0
      if (node->type == type
2797
0
    && node->opr.idx == subexp_idx)
2798
0
  return cls_node;
2799
0
    }
2800
0
  return -1;
2801
0
}
2802
2803
/* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2804
   LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2805
   heavily reused.
2806
   Return REG_NOERROR if it can arrive, REG_NOMATCH if it cannot,
2807
   REG_ESPACE if memory is exhausted.  */
2808
2809
static reg_errcode_t
2810
__attribute_warn_unused_result__
2811
check_arrival (re_match_context_t *mctx, state_array_t *path, Idx top_node,
2812
         Idx top_str, Idx last_node, Idx last_str, int type)
2813
0
{
2814
0
  const re_dfa_t *const dfa = mctx->dfa;
2815
0
  reg_errcode_t err = REG_NOERROR;
2816
0
  Idx subexp_num, backup_cur_idx, str_idx, null_cnt;
2817
0
  re_dfastate_t *cur_state = NULL;
2818
0
  re_node_set *cur_nodes, next_nodes;
2819
0
  re_dfastate_t **backup_state_log;
2820
0
  unsigned int context;
2821
2822
0
  subexp_num = dfa->nodes[top_node].opr.idx;
2823
  /* Extend the buffer if we need.  */
2824
0
  if (__glibc_unlikely (path->alloc < last_str + mctx->max_mb_elem_len + 1))
2825
0
    {
2826
0
      re_dfastate_t **new_array;
2827
0
      Idx old_alloc = path->alloc;
2828
0
      Idx incr_alloc = last_str + mctx->max_mb_elem_len + 1;
2829
0
      Idx new_alloc;
2830
0
      if (__glibc_unlikely (IDX_MAX - old_alloc < incr_alloc))
2831
0
  return REG_ESPACE;
2832
0
      new_alloc = old_alloc + incr_alloc;
2833
0
      if (__glibc_unlikely (SIZE_MAX / sizeof (re_dfastate_t *) < new_alloc))
2834
0
  return REG_ESPACE;
2835
0
      new_array = re_realloc (path->array, re_dfastate_t *, new_alloc);
2836
0
      if (__glibc_unlikely (new_array == NULL))
2837
0
  return REG_ESPACE;
2838
0
      path->array = new_array;
2839
0
      path->alloc = new_alloc;
2840
0
      memset (new_array + old_alloc, '\0',
2841
0
        sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2842
0
    }
2843
2844
0
  str_idx = path->next_idx ? path->next_idx : top_str;
2845
2846
  /* Temporary modify MCTX.  */
2847
0
  backup_state_log = mctx->state_log;
2848
0
  backup_cur_idx = mctx->input.cur_idx;
2849
0
  mctx->state_log = path->array;
2850
0
  mctx->input.cur_idx = str_idx;
2851
2852
  /* Setup initial node set.  */
2853
0
  context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2854
0
  if (str_idx == top_str)
2855
0
    {
2856
0
      err = re_node_set_init_1 (&next_nodes, top_node);
2857
0
      if (__glibc_unlikely (err != REG_NOERROR))
2858
0
  return err;
2859
0
      err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2860
0
      if (__glibc_unlikely (err != REG_NOERROR))
2861
0
  {
2862
0
    re_node_set_free (&next_nodes);
2863
0
    return err;
2864
0
  }
2865
0
    }
2866
0
  else
2867
0
    {
2868
0
      cur_state = mctx->state_log[str_idx];
2869
0
      if (cur_state && cur_state->has_backref)
2870
0
  {
2871
0
    err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2872
0
    if (__glibc_unlikely (err != REG_NOERROR))
2873
0
      return err;
2874
0
  }
2875
0
      else
2876
0
  re_node_set_init_empty (&next_nodes);
2877
0
    }
2878
0
  if (str_idx == top_str || (cur_state && cur_state->has_backref))
2879
0
    {
2880
0
      if (next_nodes.nelem)
2881
0
  {
2882
0
    err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2883
0
            subexp_num, type);
2884
0
    if (__glibc_unlikely (err != REG_NOERROR))
2885
0
      {
2886
0
        re_node_set_free (&next_nodes);
2887
0
        return err;
2888
0
      }
2889
0
  }
2890
0
      cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2891
0
      if (__glibc_unlikely (cur_state == NULL && err != REG_NOERROR))
2892
0
  {
2893
0
    re_node_set_free (&next_nodes);
2894
0
    return err;
2895
0
  }
2896
0
      mctx->state_log[str_idx] = cur_state;
2897
0
    }
2898
2899
0
  for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2900
0
    {
2901
0
      re_node_set_empty (&next_nodes);
2902
0
      if (mctx->state_log[str_idx + 1])
2903
0
  {
2904
0
    err = re_node_set_merge (&next_nodes,
2905
0
           &mctx->state_log[str_idx + 1]->nodes);
2906
0
    if (__glibc_unlikely (err != REG_NOERROR))
2907
0
      {
2908
0
        re_node_set_free (&next_nodes);
2909
0
        return err;
2910
0
      }
2911
0
  }
2912
0
      if (cur_state)
2913
0
  {
2914
0
    err = check_arrival_add_next_nodes (mctx, str_idx,
2915
0
                &cur_state->non_eps_nodes,
2916
0
                &next_nodes);
2917
0
    if (__glibc_unlikely (err != REG_NOERROR))
2918
0
      {
2919
0
        re_node_set_free (&next_nodes);
2920
0
        return err;
2921
0
      }
2922
0
  }
2923
0
      ++str_idx;
2924
0
      if (next_nodes.nelem)
2925
0
  {
2926
0
    err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2927
0
    if (__glibc_unlikely (err != REG_NOERROR))
2928
0
      {
2929
0
        re_node_set_free (&next_nodes);
2930
0
        return err;
2931
0
      }
2932
0
    err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2933
0
            subexp_num, type);
2934
0
    if (__glibc_unlikely (err != REG_NOERROR))
2935
0
      {
2936
0
        re_node_set_free (&next_nodes);
2937
0
        return err;
2938
0
      }
2939
0
  }
2940
0
      context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2941
0
      cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2942
0
      if (__glibc_unlikely (cur_state == NULL && err != REG_NOERROR))
2943
0
  {
2944
0
    re_node_set_free (&next_nodes);
2945
0
    return err;
2946
0
  }
2947
0
      mctx->state_log[str_idx] = cur_state;
2948
0
      null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2949
0
    }
2950
0
  re_node_set_free (&next_nodes);
2951
0
  cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2952
0
         : &mctx->state_log[last_str]->nodes);
2953
0
  path->next_idx = str_idx;
2954
2955
  /* Fix MCTX.  */
2956
0
  mctx->state_log = backup_state_log;
2957
0
  mctx->input.cur_idx = backup_cur_idx;
2958
2959
  /* Then check the current node set has the node LAST_NODE.  */
2960
0
  if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
2961
0
    return REG_NOERROR;
2962
2963
0
  return REG_NOMATCH;
2964
0
}
2965
2966
/* Helper functions for check_arrival.  */
2967
2968
/* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2969
   to NEXT_NODES.
2970
   TODO: This function is similar to the functions transit_state*(),
2971
   however this function has many additional works.
2972
   Can't we unify them?  */
2973
2974
static reg_errcode_t
2975
__attribute_warn_unused_result__
2976
check_arrival_add_next_nodes (re_match_context_t *mctx, Idx str_idx,
2977
            re_node_set *cur_nodes, re_node_set *next_nodes)
2978
0
{
2979
0
  const re_dfa_t *const dfa = mctx->dfa;
2980
0
  bool ok;
2981
0
  Idx cur_idx;
2982
0
  reg_errcode_t err = REG_NOERROR;
2983
0
  re_node_set union_set;
2984
0
  re_node_set_init_empty (&union_set);
2985
0
  for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
2986
0
    {
2987
0
      int naccepted = 0;
2988
0
      Idx cur_node = cur_nodes->elems[cur_idx];
2989
0
      DEBUG_ASSERT (!IS_EPSILON_NODE (dfa->nodes[cur_node].type));
2990
2991
      /* If the node may accept "multi byte".  */
2992
0
      if (dfa->nodes[cur_node].accept_mb)
2993
0
  {
2994
0
    naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
2995
0
                 str_idx);
2996
0
    if (naccepted > 1)
2997
0
      {
2998
0
        re_dfastate_t *dest_state;
2999
0
        Idx next_node = dfa->nexts[cur_node];
3000
0
        Idx next_idx = str_idx + naccepted;
3001
0
        dest_state = mctx->state_log[next_idx];
3002
0
        re_node_set_empty (&union_set);
3003
0
        if (dest_state)
3004
0
    {
3005
0
      err = re_node_set_merge (&union_set, &dest_state->nodes);
3006
0
      if (__glibc_unlikely (err != REG_NOERROR))
3007
0
        {
3008
0
          re_node_set_free (&union_set);
3009
0
          return err;
3010
0
        }
3011
0
    }
3012
0
        ok = re_node_set_insert (&union_set, next_node);
3013
0
        if (__glibc_unlikely (! ok))
3014
0
    {
3015
0
      re_node_set_free (&union_set);
3016
0
      return REG_ESPACE;
3017
0
    }
3018
0
        mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3019
0
                  &union_set);
3020
0
        if (__glibc_unlikely (mctx->state_log[next_idx] == NULL
3021
0
            && err != REG_NOERROR))
3022
0
    {
3023
0
      re_node_set_free (&union_set);
3024
0
      return err;
3025
0
    }
3026
0
      }
3027
0
  }
3028
3029
0
      if (naccepted
3030
0
    || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3031
0
  {
3032
0
    ok = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3033
0
    if (__glibc_unlikely (! ok))
3034
0
      {
3035
0
        re_node_set_free (&union_set);
3036
0
        return REG_ESPACE;
3037
0
      }
3038
0
  }
3039
0
    }
3040
0
  re_node_set_free (&union_set);
3041
0
  return REG_NOERROR;
3042
0
}
3043
3044
/* For all the nodes in CUR_NODES, add the epsilon closures of them to
3045
   CUR_NODES, however exclude the nodes which are:
3046
    - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3047
    - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3048
*/
3049
3050
static reg_errcode_t
3051
check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3052
        Idx ex_subexp, int type)
3053
0
{
3054
0
  reg_errcode_t err;
3055
0
  Idx idx, outside_node;
3056
0
  re_node_set new_nodes;
3057
0
  DEBUG_ASSERT (cur_nodes->nelem);
3058
0
  err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3059
0
  if (__glibc_unlikely (err != REG_NOERROR))
3060
0
    return err;
3061
  /* Create a new node set NEW_NODES with the nodes which are epsilon
3062
     closures of the node in CUR_NODES.  */
3063
3064
0
  for (idx = 0; idx < cur_nodes->nelem; ++idx)
3065
0
    {
3066
0
      Idx cur_node = cur_nodes->elems[idx];
3067
0
      const re_node_set *eclosure = dfa->eclosures + cur_node;
3068
0
      outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3069
0
      if (outside_node == -1)
3070
0
  {
3071
    /* There are no problematic nodes, just merge them.  */
3072
0
    err = re_node_set_merge (&new_nodes, eclosure);
3073
0
    if (__glibc_unlikely (err != REG_NOERROR))
3074
0
      {
3075
0
        re_node_set_free (&new_nodes);
3076
0
        return err;
3077
0
      }
3078
0
  }
3079
0
      else
3080
0
  {
3081
    /* There are problematic nodes, re-calculate incrementally.  */
3082
0
    err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3083
0
                ex_subexp, type);
3084
0
    if (__glibc_unlikely (err != REG_NOERROR))
3085
0
      {
3086
0
        re_node_set_free (&new_nodes);
3087
0
        return err;
3088
0
      }
3089
0
  }
3090
0
    }
3091
0
  re_node_set_free (cur_nodes);
3092
0
  *cur_nodes = new_nodes;
3093
0
  return REG_NOERROR;
3094
0
}
3095
3096
/* Helper function for check_arrival_expand_ecl.
3097
   Check incrementally the epsilon closure of TARGET, and if it isn't
3098
   problematic append it to DST_NODES.  */
3099
3100
static reg_errcode_t
3101
__attribute_warn_unused_result__
3102
check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3103
            Idx target, Idx ex_subexp, int type)
3104
0
{
3105
0
  Idx cur_node;
3106
0
  for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3107
0
    {
3108
0
      bool ok;
3109
3110
0
      if (dfa->nodes[cur_node].type == type
3111
0
    && dfa->nodes[cur_node].opr.idx == ex_subexp)
3112
0
  {
3113
0
    if (type == OP_CLOSE_SUBEXP)
3114
0
      {
3115
0
        ok = re_node_set_insert (dst_nodes, cur_node);
3116
0
        if (__glibc_unlikely (! ok))
3117
0
    return REG_ESPACE;
3118
0
      }
3119
0
    break;
3120
0
  }
3121
0
      ok = re_node_set_insert (dst_nodes, cur_node);
3122
0
      if (__glibc_unlikely (! ok))
3123
0
  return REG_ESPACE;
3124
0
      if (dfa->edests[cur_node].nelem == 0)
3125
0
  break;
3126
0
      if (dfa->edests[cur_node].nelem == 2)
3127
0
  {
3128
0
    reg_errcode_t err;
3129
0
    err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3130
0
                dfa->edests[cur_node].elems[1],
3131
0
                ex_subexp, type);
3132
0
    if (__glibc_unlikely (err != REG_NOERROR))
3133
0
      return err;
3134
0
  }
3135
0
      cur_node = dfa->edests[cur_node].elems[0];
3136
0
    }
3137
0
  return REG_NOERROR;
3138
0
}
3139
3140
3141
/* For all the back references in the current state, calculate the
3142
   destination of the back references by the appropriate entry
3143
   in MCTX->BKREF_ENTS.  */
3144
3145
static reg_errcode_t
3146
__attribute_warn_unused_result__
3147
expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3148
        Idx cur_str, Idx subexp_num, int type)
3149
0
{
3150
0
  const re_dfa_t *const dfa = mctx->dfa;
3151
0
  reg_errcode_t err;
3152
0
  Idx cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3153
0
  struct re_backref_cache_entry *ent;
3154
3155
0
  if (cache_idx_start == -1)
3156
0
    return REG_NOERROR;
3157
3158
0
 restart:
3159
0
  ent = mctx->bkref_ents + cache_idx_start;
3160
0
  do
3161
0
    {
3162
0
      Idx to_idx, next_node;
3163
3164
      /* Is this entry ENT is appropriate?  */
3165
0
      if (!re_node_set_contains (cur_nodes, ent->node))
3166
0
  continue; /* No.  */
3167
3168
0
      to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3169
      /* Calculate the destination of the back reference, and append it
3170
   to MCTX->STATE_LOG.  */
3171
0
      if (to_idx == cur_str)
3172
0
  {
3173
    /* The backreference did epsilon transit, we must re-check all the
3174
       node in the current state.  */
3175
0
    re_node_set new_dests;
3176
0
    reg_errcode_t err2, err3;
3177
0
    next_node = dfa->edests[ent->node].elems[0];
3178
0
    if (re_node_set_contains (cur_nodes, next_node))
3179
0
      continue;
3180
0
    err = re_node_set_init_1 (&new_dests, next_node);
3181
0
    err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3182
0
    err3 = re_node_set_merge (cur_nodes, &new_dests);
3183
0
    re_node_set_free (&new_dests);
3184
0
    if (__glibc_unlikely (err != REG_NOERROR || err2 != REG_NOERROR
3185
0
        || err3 != REG_NOERROR))
3186
0
      {
3187
0
        err = (err != REG_NOERROR ? err
3188
0
         : (err2 != REG_NOERROR ? err2 : err3));
3189
0
        return err;
3190
0
      }
3191
    /* TODO: It is still inefficient...  */
3192
0
    goto restart;
3193
0
  }
3194
0
      else
3195
0
  {
3196
0
    re_node_set union_set;
3197
0
    next_node = dfa->nexts[ent->node];
3198
0
    if (mctx->state_log[to_idx])
3199
0
      {
3200
0
        bool ok;
3201
0
        if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3202
0
          next_node))
3203
0
    continue;
3204
0
        err = re_node_set_init_copy (&union_set,
3205
0
             &mctx->state_log[to_idx]->nodes);
3206
0
        ok = re_node_set_insert (&union_set, next_node);
3207
0
        if (__glibc_unlikely (err != REG_NOERROR || ! ok))
3208
0
    {
3209
0
      re_node_set_free (&union_set);
3210
0
      err = err != REG_NOERROR ? err : REG_ESPACE;
3211
0
      return err;
3212
0
    }
3213
0
      }
3214
0
    else
3215
0
      {
3216
0
        err = re_node_set_init_1 (&union_set, next_node);
3217
0
        if (__glibc_unlikely (err != REG_NOERROR))
3218
0
    return err;
3219
0
      }
3220
0
    mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3221
0
    re_node_set_free (&union_set);
3222
0
    if (__glibc_unlikely (mctx->state_log[to_idx] == NULL
3223
0
        && err != REG_NOERROR))
3224
0
      return err;
3225
0
  }
3226
0
    }
3227
0
  while (ent++->more);
3228
0
  return REG_NOERROR;
3229
0
}
3230
3231
/* Build transition table for the state.
3232
   Return true if successful.  */
3233
3234
static bool __attribute_noinline__
3235
build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3236
0
{
3237
0
  reg_errcode_t err;
3238
0
  Idx i, j;
3239
0
  int ch;
3240
0
  bool need_word_trtable = false;
3241
0
  bitset_word_t elem, mask;
3242
0
  Idx ndests; /* Number of the destination states from 'state'.  */
3243
0
  re_dfastate_t **trtable;
3244
0
  re_dfastate_t *dest_states[SBC_MAX];
3245
0
  re_dfastate_t *dest_states_word[SBC_MAX];
3246
0
  re_dfastate_t *dest_states_nl[SBC_MAX];
3247
0
  re_node_set follows;
3248
0
  bitset_t acceptable;
3249
3250
  /* We build DFA states which corresponds to the destination nodes
3251
     from 'state'.  'dests_node[i]' represents the nodes which i-th
3252
     destination state contains, and 'dests_ch[i]' represents the
3253
     characters which i-th destination state accepts.  */
3254
0
  re_node_set dests_node[SBC_MAX];
3255
0
  bitset_t dests_ch[SBC_MAX];
3256
3257
  /* Initialize transition table.  */
3258
0
  state->word_trtable = state->trtable = NULL;
3259
3260
  /* At first, group all nodes belonging to 'state' into several
3261
     destinations.  */
3262
0
  ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3263
0
  if (__glibc_unlikely (ndests <= 0))
3264
0
    {
3265
      /* Return false in case of an error, true otherwise.  */
3266
0
      if (ndests == 0)
3267
0
  {
3268
0
    state->trtable = (re_dfastate_t **)
3269
0
      calloc (sizeof (re_dfastate_t *), SBC_MAX);
3270
0
          if (__glibc_unlikely (state->trtable == NULL))
3271
0
            return false;
3272
0
    return true;
3273
0
  }
3274
0
      return false;
3275
0
    }
3276
3277
0
  err = re_node_set_alloc (&follows, ndests + 1);
3278
0
  if (__glibc_unlikely (err != REG_NOERROR))
3279
0
    {
3280
0
    out_free:
3281
0
      re_node_set_free (&follows);
3282
0
      for (i = 0; i < ndests; ++i)
3283
0
  re_node_set_free (dests_node + i);
3284
0
      return false;
3285
0
    }
3286
3287
0
  bitset_empty (acceptable);
3288
3289
  /* Then build the states for all destinations.  */
3290
0
  for (i = 0; i < ndests; ++i)
3291
0
    {
3292
0
      Idx next_node;
3293
0
      re_node_set_empty (&follows);
3294
      /* Merge the follows of this destination states.  */
3295
0
      for (j = 0; j < dests_node[i].nelem; ++j)
3296
0
  {
3297
0
    next_node = dfa->nexts[dests_node[i].elems[j]];
3298
0
    if (next_node != -1)
3299
0
      {
3300
0
        err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3301
0
        if (__glibc_unlikely (err != REG_NOERROR))
3302
0
    goto out_free;
3303
0
      }
3304
0
  }
3305
0
      dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3306
0
      if (__glibc_unlikely (dest_states[i] == NULL && err != REG_NOERROR))
3307
0
  goto out_free;
3308
      /* If the new state has context constraint,
3309
   build appropriate states for these contexts.  */
3310
0
      if (dest_states[i]->has_constraint)
3311
0
  {
3312
0
    dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3313
0
                CONTEXT_WORD);
3314
0
    if (__glibc_unlikely (dest_states_word[i] == NULL
3315
0
        && err != REG_NOERROR))
3316
0
      goto out_free;
3317
3318
0
    if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3319
0
      need_word_trtable = true;
3320
3321
0
    dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3322
0
              CONTEXT_NEWLINE);
3323
0
    if (__glibc_unlikely (dest_states_nl[i] == NULL && err != REG_NOERROR))
3324
0
      goto out_free;
3325
0
  }
3326
0
      else
3327
0
  {
3328
0
    dest_states_word[i] = dest_states[i];
3329
0
    dest_states_nl[i] = dest_states[i];
3330
0
  }
3331
0
      bitset_merge (acceptable, dests_ch[i]);
3332
0
    }
3333
3334
0
  if (!__glibc_unlikely (need_word_trtable))
3335
0
    {
3336
      /* We don't care about whether the following character is a word
3337
   character, or we are in a single-byte character set so we can
3338
   discern by looking at the character code: allocate a
3339
   256-entry transition table.  */
3340
0
      trtable = state->trtable =
3341
0
  (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3342
0
      if (__glibc_unlikely (trtable == NULL))
3343
0
  goto out_free;
3344
3345
      /* For all characters ch...:  */
3346
0
      for (i = 0; i < BITSET_WORDS; ++i)
3347
0
  for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3348
0
       elem;
3349
0
       mask <<= 1, elem >>= 1, ++ch)
3350
0
    if (__glibc_unlikely (elem & 1))
3351
0
      {
3352
        /* There must be exactly one destination which accepts
3353
     character ch.  See group_nodes_into_DFAstates.  */
3354
0
        for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3355
0
    ;
3356
3357
        /* j-th destination accepts the word character ch.  */
3358
0
        if (dfa->word_char[i] & mask)
3359
0
    trtable[ch] = dest_states_word[j];
3360
0
        else
3361
0
    trtable[ch] = dest_states[j];
3362
0
      }
3363
0
    }
3364
0
  else
3365
0
    {
3366
      /* We care about whether the following character is a word
3367
   character, and we are in a multi-byte character set: discern
3368
   by looking at the character code: build two 256-entry
3369
   transition tables, one starting at trtable[0] and one
3370
   starting at trtable[SBC_MAX].  */
3371
0
      trtable = state->word_trtable =
3372
0
  (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3373
0
      if (__glibc_unlikely (trtable == NULL))
3374
0
  goto out_free;
3375
3376
      /* For all characters ch...:  */
3377
0
      for (i = 0; i < BITSET_WORDS; ++i)
3378
0
  for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3379
0
       elem;
3380
0
       mask <<= 1, elem >>= 1, ++ch)
3381
0
    if (__glibc_unlikely (elem & 1))
3382
0
      {
3383
        /* There must be exactly one destination which accepts
3384
     character ch.  See group_nodes_into_DFAstates.  */
3385
0
        for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3386
0
    ;
3387
3388
        /* j-th destination accepts the word character ch.  */
3389
0
        trtable[ch] = dest_states[j];
3390
0
        trtable[ch + SBC_MAX] = dest_states_word[j];
3391
0
      }
3392
0
    }
3393
3394
  /* new line */
3395
0
  if (bitset_contain (acceptable, NEWLINE_CHAR))
3396
0
    {
3397
      /* The current state accepts newline character.  */
3398
0
      for (j = 0; j < ndests; ++j)
3399
0
  if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3400
0
    {
3401
      /* k-th destination accepts newline character.  */
3402
0
      trtable[NEWLINE_CHAR] = dest_states_nl[j];
3403
0
      if (need_word_trtable)
3404
0
        trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3405
      /* There must be only one destination which accepts
3406
         newline.  See group_nodes_into_DFAstates.  */
3407
0
      break;
3408
0
    }
3409
0
    }
3410
3411
0
  re_node_set_free (&follows);
3412
0
  for (i = 0; i < ndests; ++i)
3413
0
    re_node_set_free (dests_node + i);
3414
0
  return true;
3415
0
}
3416
3417
/* Group all nodes belonging to STATE into several destinations.
3418
   Then for all destinations, set the nodes belonging to the destination
3419
   to DESTS_NODE[i] and set the characters accepted by the destination
3420
   to DEST_CH[i].  Return the number of destinations if successful,
3421
   -1 on internal error.  */
3422
3423
static Idx
3424
group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3425
          re_node_set *dests_node, bitset_t *dests_ch)
3426
0
{
3427
0
  reg_errcode_t err;
3428
0
  bool ok;
3429
0
  Idx i, j, k;
3430
0
  Idx ndests; /* Number of the destinations from 'state'.  */
3431
0
  bitset_t accepts; /* Characters a node can accept.  */
3432
0
  const re_node_set *cur_nodes = &state->nodes;
3433
0
  bitset_empty (accepts);
3434
0
  ndests = 0;
3435
3436
  /* For all the nodes belonging to 'state',  */
3437
0
  for (i = 0; i < cur_nodes->nelem; ++i)
3438
0
    {
3439
0
      re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3440
0
      re_token_type_t type = node->type;
3441
0
      unsigned int constraint = node->constraint;
3442
3443
      /* Enumerate all single byte character this node can accept.  */
3444
0
      if (type == CHARACTER)
3445
0
  bitset_set (accepts, node->opr.c);
3446
0
      else if (type == SIMPLE_BRACKET)
3447
0
  {
3448
0
    bitset_merge (accepts, node->opr.sbcset);
3449
0
  }
3450
0
      else if (type == OP_PERIOD)
3451
0
  {
3452
0
    if (dfa->mb_cur_max > 1)
3453
0
      bitset_merge (accepts, dfa->sb_char);
3454
0
    else
3455
0
      bitset_set_all (accepts);
3456
0
    if (!(dfa->syntax & RE_DOT_NEWLINE))
3457
0
      bitset_clear (accepts, '\n');
3458
0
    if (dfa->syntax & RE_DOT_NOT_NULL)
3459
0
      bitset_clear (accepts, '\0');
3460
0
  }
3461
0
      else if (type == OP_UTF8_PERIOD)
3462
0
  {
3463
0
    if (ASCII_CHARS % BITSET_WORD_BITS == 0)
3464
0
      memset (accepts, -1, ASCII_CHARS / CHAR_BIT);
3465
0
    else
3466
0
      bitset_merge (accepts, utf8_sb_map);
3467
0
    if (!(dfa->syntax & RE_DOT_NEWLINE))
3468
0
      bitset_clear (accepts, '\n');
3469
0
    if (dfa->syntax & RE_DOT_NOT_NULL)
3470
0
      bitset_clear (accepts, '\0');
3471
0
  }
3472
0
      else
3473
0
  continue;
3474
3475
      /* Check the 'accepts' and sift the characters which are not
3476
   match it the context.  */
3477
0
      if (constraint)
3478
0
  {
3479
0
    if (constraint & NEXT_NEWLINE_CONSTRAINT)
3480
0
      {
3481
0
        bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3482
0
        bitset_empty (accepts);
3483
0
        if (accepts_newline)
3484
0
    bitset_set (accepts, NEWLINE_CHAR);
3485
0
        else
3486
0
    continue;
3487
0
      }
3488
0
    if (constraint & NEXT_ENDBUF_CONSTRAINT)
3489
0
      {
3490
0
        bitset_empty (accepts);
3491
0
        continue;
3492
0
      }
3493
3494
0
    if (constraint & NEXT_WORD_CONSTRAINT)
3495
0
      {
3496
0
        bitset_word_t any_set = 0;
3497
0
        if (type == CHARACTER && !node->word_char)
3498
0
    {
3499
0
      bitset_empty (accepts);
3500
0
      continue;
3501
0
    }
3502
0
        if (dfa->mb_cur_max > 1)
3503
0
    for (j = 0; j < BITSET_WORDS; ++j)
3504
0
      any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3505
0
        else
3506
0
    for (j = 0; j < BITSET_WORDS; ++j)
3507
0
      any_set |= (accepts[j] &= dfa->word_char[j]);
3508
0
        if (!any_set)
3509
0
    continue;
3510
0
      }
3511
0
    if (constraint & NEXT_NOTWORD_CONSTRAINT)
3512
0
      {
3513
0
        bitset_word_t any_set = 0;
3514
0
        if (type == CHARACTER && node->word_char)
3515
0
    {
3516
0
      bitset_empty (accepts);
3517
0
      continue;
3518
0
    }
3519
0
        if (dfa->mb_cur_max > 1)
3520
0
    for (j = 0; j < BITSET_WORDS; ++j)
3521
0
      any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3522
0
        else
3523
0
    for (j = 0; j < BITSET_WORDS; ++j)
3524
0
      any_set |= (accepts[j] &= ~dfa->word_char[j]);
3525
0
        if (!any_set)
3526
0
    continue;
3527
0
      }
3528
0
  }
3529
3530
      /* Then divide 'accepts' into DFA states, or create a new
3531
   state.  Above, we make sure that accepts is not empty.  */
3532
0
      for (j = 0; j < ndests; ++j)
3533
0
  {
3534
0
    bitset_t intersec; /* Intersection sets, see below.  */
3535
0
    bitset_t remains;
3536
    /* Flags, see below.  */
3537
0
    bitset_word_t has_intersec, not_subset, not_consumed;
3538
3539
    /* Optimization, skip if this state doesn't accept the character.  */
3540
0
    if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3541
0
      continue;
3542
3543
    /* Enumerate the intersection set of this state and 'accepts'.  */
3544
0
    has_intersec = 0;
3545
0
    for (k = 0; k < BITSET_WORDS; ++k)
3546
0
      has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3547
    /* And skip if the intersection set is empty.  */
3548
0
    if (!has_intersec)
3549
0
      continue;
3550
3551
    /* Then check if this state is a subset of 'accepts'.  */
3552
0
    not_subset = not_consumed = 0;
3553
0
    for (k = 0; k < BITSET_WORDS; ++k)
3554
0
      {
3555
0
        not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3556
0
        not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3557
0
      }
3558
3559
    /* If this state isn't a subset of 'accepts', create a
3560
       new group state, which has the 'remains'. */
3561
0
    if (not_subset)
3562
0
      {
3563
0
        bitset_copy (dests_ch[ndests], remains);
3564
0
        bitset_copy (dests_ch[j], intersec);
3565
0
        err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3566
0
        if (__glibc_unlikely (err != REG_NOERROR))
3567
0
    goto error_return;
3568
0
        ++ndests;
3569
0
      }
3570
3571
    /* Put the position in the current group. */
3572
0
    ok = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3573
0
    if (__glibc_unlikely (! ok))
3574
0
      goto error_return;
3575
3576
    /* If all characters are consumed, go to next node. */
3577
0
    if (!not_consumed)
3578
0
      break;
3579
0
  }
3580
      /* Some characters remain, create a new group. */
3581
0
      if (j == ndests)
3582
0
  {
3583
0
    bitset_copy (dests_ch[ndests], accepts);
3584
0
    err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3585
0
    if (__glibc_unlikely (err != REG_NOERROR))
3586
0
      goto error_return;
3587
0
    ++ndests;
3588
0
    bitset_empty (accepts);
3589
0
  }
3590
0
    }
3591
0
  assume (ndests <= SBC_MAX);
3592
0
  return ndests;
3593
0
 error_return:
3594
0
  for (j = 0; j < ndests; ++j)
3595
0
    re_node_set_free (dests_node + j);
3596
0
  return -1;
3597
0
}
3598
3599
/* Check how many bytes the node 'dfa->nodes[node_idx]' accepts.
3600
   Return the number of the bytes the node accepts.
3601
   STR_IDX is the current index of the input string.
3602
3603
   This function handles the nodes which can accept one character, or
3604
   one collating element like '.', '[a-z]', opposite to the other nodes
3605
   can only accept one byte.  */
3606
3607
#ifdef _LIBC
3608
# include <locale/weight.h>
3609
#endif
3610
3611
static int
3612
check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
3613
       const re_string_t *input, Idx str_idx)
3614
0
{
3615
0
  const re_token_t *node = dfa->nodes + node_idx;
3616
0
  int char_len, elem_len;
3617
0
  Idx i;
3618
3619
0
  if (__glibc_unlikely (node->type == OP_UTF8_PERIOD))
3620
0
    {
3621
0
      unsigned char c = re_string_byte_at (input, str_idx), d;
3622
0
      if (__glibc_likely (c < 0xc2))
3623
0
  return 0;
3624
3625
0
      if (str_idx + 2 > input->len)
3626
0
  return 0;
3627
3628
0
      d = re_string_byte_at (input, str_idx + 1);
3629
0
      if (c < 0xe0)
3630
0
  return (d < 0x80 || d > 0xbf) ? 0 : 2;
3631
0
      else if (c < 0xf0)
3632
0
  {
3633
0
    char_len = 3;
3634
0
    if (c == 0xe0 && d < 0xa0)
3635
0
      return 0;
3636
0
  }
3637
0
      else if (c < 0xf8)
3638
0
  {
3639
0
    char_len = 4;
3640
0
    if (c == 0xf0 && d < 0x90)
3641
0
      return 0;
3642
0
  }
3643
0
      else if (c < 0xfc)
3644
0
  {
3645
0
    char_len = 5;
3646
0
    if (c == 0xf8 && d < 0x88)
3647
0
      return 0;
3648
0
  }
3649
0
      else if (c < 0xfe)
3650
0
  {
3651
0
    char_len = 6;
3652
0
    if (c == 0xfc && d < 0x84)
3653
0
      return 0;
3654
0
  }
3655
0
      else
3656
0
  return 0;
3657
3658
0
      if (str_idx + char_len > input->len)
3659
0
  return 0;
3660
3661
0
      for (i = 1; i < char_len; ++i)
3662
0
  {
3663
0
    d = re_string_byte_at (input, str_idx + i);
3664
0
    if (d < 0x80 || d > 0xbf)
3665
0
      return 0;
3666
0
  }
3667
0
      return char_len;
3668
0
    }
3669
3670
0
  char_len = re_string_char_size_at (input, str_idx);
3671
0
  if (node->type == OP_PERIOD)
3672
0
    {
3673
0
      if (char_len <= 1)
3674
0
  return 0;
3675
      /* FIXME: I don't think this if is needed, as both '\n'
3676
   and '\0' are char_len == 1.  */
3677
      /* '.' accepts any one character except the following two cases.  */
3678
0
      if ((!(dfa->syntax & RE_DOT_NEWLINE)
3679
0
     && re_string_byte_at (input, str_idx) == '\n')
3680
0
    || ((dfa->syntax & RE_DOT_NOT_NULL)
3681
0
        && re_string_byte_at (input, str_idx) == '\0'))
3682
0
  return 0;
3683
0
      return char_len;
3684
0
    }
3685
3686
0
  elem_len = re_string_elem_size_at (input, str_idx);
3687
0
  if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3688
0
    return 0;
3689
3690
0
  if (node->type == COMPLEX_BRACKET)
3691
0
    {
3692
0
      const re_charset_t *cset = node->opr.mbcset;
3693
#ifdef _LIBC
3694
      const unsigned char *pin
3695
  = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3696
      Idx j;
3697
      uint32_t nrules;
3698
#endif
3699
0
      int match_len = 0;
3700
0
      wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3701
0
        ? re_string_wchar_at (input, str_idx) : 0);
3702
3703
      /* match with multibyte character?  */
3704
0
      for (i = 0; i < cset->nmbchars; ++i)
3705
0
  if (wc == cset->mbchars[i])
3706
0
    {
3707
0
      match_len = char_len;
3708
0
      goto check_node_accept_bytes_match;
3709
0
    }
3710
      /* match with character_class?  */
3711
0
      for (i = 0; i < cset->nchar_classes; ++i)
3712
0
  {
3713
0
    wctype_t wt = cset->char_classes[i];
3714
0
    if (__iswctype (wc, wt))
3715
0
      {
3716
0
        match_len = char_len;
3717
0
        goto check_node_accept_bytes_match;
3718
0
      }
3719
0
  }
3720
3721
#ifdef _LIBC
3722
      nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3723
      if (nrules != 0)
3724
  {
3725
    unsigned int in_collseq = 0;
3726
    const int32_t *table, *indirect;
3727
    const unsigned char *weights, *extra;
3728
    const char *collseqwc;
3729
3730
    /* match with collating_symbol?  */
3731
    if (cset->ncoll_syms)
3732
      extra = (const unsigned char *)
3733
        _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3734
    for (i = 0; i < cset->ncoll_syms; ++i)
3735
      {
3736
        const unsigned char *coll_sym = extra + cset->coll_syms[i];
3737
        /* Compare the length of input collating element and
3738
     the length of current collating element.  */
3739
        if (*coll_sym != elem_len)
3740
    continue;
3741
        /* Compare each bytes.  */
3742
        for (j = 0; j < *coll_sym; j++)
3743
    if (pin[j] != coll_sym[1 + j])
3744
      break;
3745
        if (j == *coll_sym)
3746
    {
3747
      /* Match if every bytes is equal.  */
3748
      match_len = j;
3749
      goto check_node_accept_bytes_match;
3750
    }
3751
      }
3752
3753
    if (cset->nranges)
3754
      {
3755
        if (elem_len <= char_len)
3756
    {
3757
      collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3758
      in_collseq = __collseq_table_lookup (collseqwc, wc);
3759
    }
3760
        else
3761
    in_collseq = find_collation_sequence_value (pin, elem_len);
3762
      }
3763
    /* match with range expression?  */
3764
    /* FIXME: Implement rational ranges here, too.  */
3765
    for (i = 0; i < cset->nranges; ++i)
3766
      if (cset->range_starts[i] <= in_collseq
3767
    && in_collseq <= cset->range_ends[i])
3768
        {
3769
    match_len = elem_len;
3770
    goto check_node_accept_bytes_match;
3771
        }
3772
3773
    /* match with equivalence_class?  */
3774
    if (cset->nequiv_classes)
3775
      {
3776
        const unsigned char *cp = pin;
3777
        table = (const int32_t *)
3778
    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3779
        weights = (const unsigned char *)
3780
    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3781
        extra = (const unsigned char *)
3782
    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3783
        indirect = (const int32_t *)
3784
    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3785
        int32_t idx = findidx (table, indirect, extra, &cp, elem_len);
3786
        int32_t rule = idx >> 24;
3787
        idx &= 0xffffff;
3788
        if (idx > 0)
3789
    {
3790
      size_t weight_len = weights[idx];
3791
      for (i = 0; i < cset->nequiv_classes; ++i)
3792
        {
3793
          int32_t equiv_class_idx = cset->equiv_classes[i];
3794
          int32_t equiv_class_rule = equiv_class_idx >> 24;
3795
          equiv_class_idx &= 0xffffff;
3796
          if (weights[equiv_class_idx] == weight_len
3797
        && equiv_class_rule == rule
3798
        && memcmp (weights + idx + 1,
3799
             weights + equiv_class_idx + 1,
3800
             weight_len) == 0)
3801
      {
3802
        match_len = elem_len;
3803
        goto check_node_accept_bytes_match;
3804
      }
3805
        }
3806
    }
3807
      }
3808
  }
3809
      else
3810
#endif /* _LIBC */
3811
0
  {
3812
    /* match with range expression?  */
3813
0
    for (i = 0; i < cset->nranges; ++i)
3814
0
      {
3815
0
        if (cset->range_starts[i] <= wc && wc <= cset->range_ends[i])
3816
0
    {
3817
0
      match_len = char_len;
3818
0
      goto check_node_accept_bytes_match;
3819
0
    }
3820
0
      }
3821
0
  }
3822
0
    check_node_accept_bytes_match:
3823
0
      if (!cset->non_match)
3824
0
  return match_len;
3825
0
      else
3826
0
  {
3827
0
    if (match_len > 0)
3828
0
      return 0;
3829
0
    else
3830
0
      return (elem_len > char_len) ? elem_len : char_len;
3831
0
  }
3832
0
    }
3833
0
  return 0;
3834
0
}
3835
3836
#ifdef _LIBC
3837
static unsigned int
3838
find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3839
{
3840
  uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3841
  if (nrules == 0)
3842
    {
3843
      if (mbs_len == 1)
3844
  {
3845
    /* No valid character.  Match it as a single byte character.  */
3846
    const unsigned char *collseq = (const unsigned char *)
3847
      _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3848
    return collseq[mbs[0]];
3849
  }
3850
      return UINT_MAX;
3851
    }
3852
  else
3853
    {
3854
      int32_t idx;
3855
      const unsigned char *extra = (const unsigned char *)
3856
  _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3857
      int32_t extrasize = (const unsigned char *)
3858
  _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3859
3860
      for (idx = 0; idx < extrasize;)
3861
  {
3862
    int mbs_cnt;
3863
    bool found = false;
3864
    int32_t elem_mbs_len;
3865
    /* Skip the name of collating element name.  */
3866
    idx = idx + extra[idx] + 1;
3867
    elem_mbs_len = extra[idx++];
3868
    if (mbs_len == elem_mbs_len)
3869
      {
3870
        for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3871
    if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3872
      break;
3873
        if (mbs_cnt == elem_mbs_len)
3874
    /* Found the entry.  */
3875
    found = true;
3876
      }
3877
    /* Skip the byte sequence of the collating element.  */
3878
    idx += elem_mbs_len;
3879
    /* Adjust for the alignment.  */
3880
    idx = (idx + 3) & ~3;
3881
    /* Skip the collation sequence value.  */
3882
    idx += sizeof (uint32_t);
3883
    /* Skip the wide char sequence of the collating element.  */
3884
    idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
3885
    /* If we found the entry, return the sequence value.  */
3886
    if (found)
3887
      return *(uint32_t *) (extra + idx);
3888
    /* Skip the collation sequence value.  */
3889
    idx += sizeof (uint32_t);
3890
  }
3891
      return UINT_MAX;
3892
    }
3893
}
3894
#endif /* _LIBC */
3895
3896
/* Check whether the node accepts the byte which is IDX-th
3897
   byte of the INPUT.  */
3898
3899
static bool
3900
check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
3901
       Idx idx)
3902
0
{
3903
0
  unsigned char ch;
3904
0
  ch = re_string_byte_at (&mctx->input, idx);
3905
0
  switch (node->type)
3906
0
    {
3907
0
    case CHARACTER:
3908
0
      if (node->opr.c != ch)
3909
0
        return false;
3910
0
      break;
3911
3912
0
    case SIMPLE_BRACKET:
3913
0
      if (!bitset_contain (node->opr.sbcset, ch))
3914
0
        return false;
3915
0
      break;
3916
3917
0
    case OP_UTF8_PERIOD:
3918
0
      if (ch >= ASCII_CHARS)
3919
0
        return false;
3920
0
      FALLTHROUGH;
3921
0
    case OP_PERIOD:
3922
0
      if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
3923
0
    || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
3924
0
  return false;
3925
0
      break;
3926
3927
0
    default:
3928
0
      return false;
3929
0
    }
3930
3931
0
  if (node->constraint)
3932
0
    {
3933
      /* The node has constraints.  Check whether the current context
3934
   satisfies the constraints.  */
3935
0
      unsigned int context = re_string_context_at (&mctx->input, idx,
3936
0
               mctx->eflags);
3937
0
      if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
3938
0
  return false;
3939
0
    }
3940
3941
0
  return true;
3942
0
}
3943
3944
/* Extend the buffers, if the buffers have run out.  */
3945
3946
static reg_errcode_t
3947
__attribute_warn_unused_result__
3948
extend_buffers (re_match_context_t *mctx, int min_len)
3949
0
{
3950
0
  reg_errcode_t ret;
3951
0
  re_string_t *pstr = &mctx->input;
3952
3953
  /* Avoid overflow.  */
3954
0
  if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *)) / 2
3955
0
      <= pstr->bufs_len))
3956
0
    return REG_ESPACE;
3957
3958
  /* Double the lengths of the buffers, but allocate at least MIN_LEN.  */
3959
0
  ret = re_string_realloc_buffers (pstr,
3960
0
           MAX (min_len,
3961
0
          MIN (pstr->len, pstr->bufs_len * 2)));
3962
0
  if (__glibc_unlikely (ret != REG_NOERROR))
3963
0
    return ret;
3964
3965
0
  if (mctx->state_log != NULL)
3966
0
    {
3967
      /* And double the length of state_log.  */
3968
      /* XXX We have no indication of the size of this buffer.  If this
3969
   allocation fail we have no indication that the state_log array
3970
   does not have the right size.  */
3971
0
      re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
3972
0
                pstr->bufs_len + 1);
3973
0
      if (__glibc_unlikely (new_array == NULL))
3974
0
  return REG_ESPACE;
3975
0
      mctx->state_log = new_array;
3976
0
    }
3977
3978
  /* Then reconstruct the buffers.  */
3979
0
  if (pstr->icase)
3980
0
    {
3981
0
      if (pstr->mb_cur_max > 1)
3982
0
  {
3983
0
    ret = build_wcs_upper_buffer (pstr);
3984
0
    if (__glibc_unlikely (ret != REG_NOERROR))
3985
0
      return ret;
3986
0
  }
3987
0
      else
3988
0
  build_upper_buffer (pstr);
3989
0
    }
3990
0
  else
3991
0
    {
3992
0
      if (pstr->mb_cur_max > 1)
3993
0
  build_wcs_buffer (pstr);
3994
0
      else
3995
0
  {
3996
0
    if (pstr->trans != NULL)
3997
0
      re_string_translate_buffer (pstr);
3998
0
  }
3999
0
    }
4000
0
  return REG_NOERROR;
4001
0
}
4002
4003

4004
/* Functions for matching context.  */
4005
4006
/* Initialize MCTX.  */
4007
4008
static reg_errcode_t
4009
__attribute_warn_unused_result__
4010
match_ctx_init (re_match_context_t *mctx, int eflags, Idx n)
4011
0
{
4012
0
  mctx->eflags = eflags;
4013
0
  mctx->match_last = -1;
4014
0
  if (n > 0)
4015
0
    {
4016
      /* Avoid overflow.  */
4017
0
      size_t max_object_size =
4018
0
  MAX (sizeof (struct re_backref_cache_entry),
4019
0
       sizeof (re_sub_match_top_t *));
4020
0
      if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) < n))
4021
0
  return REG_ESPACE;
4022
4023
0
      mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4024
0
      mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4025
0
      if (__glibc_unlikely (mctx->bkref_ents == NULL || mctx->sub_tops == NULL))
4026
0
  return REG_ESPACE;
4027
0
    }
4028
  /* Already zero-ed by the caller.
4029
     else
4030
       mctx->bkref_ents = NULL;
4031
     mctx->nbkref_ents = 0;
4032
     mctx->nsub_tops = 0;  */
4033
0
  mctx->abkref_ents = n;
4034
0
  mctx->max_mb_elem_len = 1;
4035
0
  mctx->asub_tops = n;
4036
0
  return REG_NOERROR;
4037
0
}
4038
4039
/* Clean the entries which depend on the current input in MCTX.
4040
   This function must be invoked when the matcher changes the start index
4041
   of the input, or changes the input string.  */
4042
4043
static void
4044
match_ctx_clean (re_match_context_t *mctx)
4045
0
{
4046
0
  Idx st_idx;
4047
0
  for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4048
0
    {
4049
0
      Idx sl_idx;
4050
0
      re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4051
0
      for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4052
0
  {
4053
0
    re_sub_match_last_t *last = top->lasts[sl_idx];
4054
0
    re_free (last->path.array);
4055
0
    re_free (last);
4056
0
  }
4057
0
      re_free (top->lasts);
4058
0
      if (top->path)
4059
0
  {
4060
0
    re_free (top->path->array);
4061
0
    re_free (top->path);
4062
0
  }
4063
0
      re_free (top);
4064
0
    }
4065
4066
0
  mctx->nsub_tops = 0;
4067
0
  mctx->nbkref_ents = 0;
4068
0
}
4069
4070
/* Free all the memory associated with MCTX.  */
4071
4072
static void
4073
match_ctx_free (re_match_context_t *mctx)
4074
0
{
4075
  /* First, free all the memory associated with MCTX->SUB_TOPS.  */
4076
0
  match_ctx_clean (mctx);
4077
0
  re_free (mctx->sub_tops);
4078
0
  re_free (mctx->bkref_ents);
4079
0
}
4080
4081
/* Add a new backreference entry to MCTX.
4082
   Note that we assume that caller never call this function with duplicate
4083
   entry, and call with STR_IDX which isn't smaller than any existing entry.
4084
*/
4085
4086
static reg_errcode_t
4087
__attribute_warn_unused_result__
4088
match_ctx_add_entry (re_match_context_t *mctx, Idx node, Idx str_idx, Idx from,
4089
         Idx to)
4090
0
{
4091
0
  if (mctx->nbkref_ents >= mctx->abkref_ents)
4092
0
    {
4093
0
      struct re_backref_cache_entry* new_entry;
4094
0
      new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4095
0
            mctx->abkref_ents * 2);
4096
0
      if (__glibc_unlikely (new_entry == NULL))
4097
0
  {
4098
0
    re_free (mctx->bkref_ents);
4099
0
    return REG_ESPACE;
4100
0
  }
4101
0
      mctx->bkref_ents = new_entry;
4102
0
      memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4103
0
        sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4104
0
      mctx->abkref_ents *= 2;
4105
0
    }
4106
0
  if (mctx->nbkref_ents > 0
4107
0
      && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4108
0
    mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4109
4110
0
  mctx->bkref_ents[mctx->nbkref_ents].node = node;
4111
0
  mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4112
0
  mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4113
0
  mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4114
4115
  /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4116
     If bit N is clear, means that this entry won't epsilon-transition to
4117
     an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
4118
     it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4119
     such node.
4120
4121
     A backreference does not epsilon-transition unless it is empty, so set
4122
     to all zeros if FROM != TO.  */
4123
0
  mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4124
0
    = (from == to ? -1 : 0);
4125
4126
0
  mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4127
0
  if (mctx->max_mb_elem_len < to - from)
4128
0
    mctx->max_mb_elem_len = to - from;
4129
0
  return REG_NOERROR;
4130
0
}
4131
4132
/* Return the first entry with the same str_idx, or -1 if none is
4133
   found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4134
4135
static Idx
4136
search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
4137
0
{
4138
0
  Idx left, right, mid, last;
4139
0
  last = right = mctx->nbkref_ents;
4140
0
  for (left = 0; left < right;)
4141
0
    {
4142
0
      mid = (left + right) / 2;
4143
0
      if (mctx->bkref_ents[mid].str_idx < str_idx)
4144
0
  left = mid + 1;
4145
0
      else
4146
0
  right = mid;
4147
0
    }
4148
0
  if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4149
0
    return left;
4150
0
  else
4151
0
    return -1;
4152
0
}
4153
4154
/* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4155
   at STR_IDX.  */
4156
4157
static reg_errcode_t
4158
__attribute_warn_unused_result__
4159
match_ctx_add_subtop (re_match_context_t *mctx, Idx node, Idx str_idx)
4160
0
{
4161
0
  DEBUG_ASSERT (mctx->sub_tops != NULL);
4162
0
  DEBUG_ASSERT (mctx->asub_tops > 0);
4163
0
  if (__glibc_unlikely (mctx->nsub_tops == mctx->asub_tops))
4164
0
    {
4165
0
      Idx new_asub_tops = mctx->asub_tops * 2;
4166
0
      re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4167
0
               re_sub_match_top_t *,
4168
0
               new_asub_tops);
4169
0
      if (__glibc_unlikely (new_array == NULL))
4170
0
  return REG_ESPACE;
4171
0
      mctx->sub_tops = new_array;
4172
0
      mctx->asub_tops = new_asub_tops;
4173
0
    }
4174
0
  mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4175
0
  if (__glibc_unlikely (mctx->sub_tops[mctx->nsub_tops] == NULL))
4176
0
    return REG_ESPACE;
4177
0
  mctx->sub_tops[mctx->nsub_tops]->node = node;
4178
0
  mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4179
0
  return REG_NOERROR;
4180
0
}
4181
4182
/* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4183
   at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.
4184
   Return the new entry if successful, NULL if memory is exhausted.  */
4185
4186
static re_sub_match_last_t *
4187
match_ctx_add_sublast (re_sub_match_top_t *subtop, Idx node, Idx str_idx)
4188
0
{
4189
0
  re_sub_match_last_t *new_entry;
4190
0
  if (__glibc_unlikely (subtop->nlasts == subtop->alasts))
4191
0
    {
4192
0
      Idx new_alasts = 2 * subtop->alasts + 1;
4193
0
      re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4194
0
                re_sub_match_last_t *,
4195
0
                new_alasts);
4196
0
      if (__glibc_unlikely (new_array == NULL))
4197
0
  return NULL;
4198
0
      subtop->lasts = new_array;
4199
0
      subtop->alasts = new_alasts;
4200
0
    }
4201
0
  new_entry = calloc (1, sizeof (re_sub_match_last_t));
4202
0
  if (__glibc_likely (new_entry != NULL))
4203
0
    {
4204
0
      subtop->lasts[subtop->nlasts] = new_entry;
4205
0
      new_entry->node = node;
4206
0
      new_entry->str_idx = str_idx;
4207
0
      ++subtop->nlasts;
4208
0
    }
4209
0
  return new_entry;
4210
0
}
4211
4212
static void
4213
sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4214
         re_dfastate_t **limited_sts, Idx last_node, Idx last_str_idx)
4215
0
{
4216
0
  sctx->sifted_states = sifted_sts;
4217
0
  sctx->limited_states = limited_sts;
4218
0
  sctx->last_node = last_node;
4219
0
  sctx->last_str_idx = last_str_idx;
4220
0
  re_node_set_init_empty (&sctx->limits);
4221
0
}