Coverage Report

Created: 2023-09-23 07:04

/src/augeas/gnulib/lib/regex_internal.c
Line
Count
Source (jump to first uncovered line)
1
/* Extended regular expression matching and search library.
2
   Copyright (C) 2002-2022 Free Software Foundation, Inc.
3
   This file is part of the GNU C Library.
4
   Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6
   The GNU C Library is free software; you can redistribute it and/or
7
   modify it under the terms of the GNU Lesser General Public
8
   License as published by the Free Software Foundation; either
9
   version 2.1 of the License, or (at your option) any later version.
10
11
   The GNU C Library is distributed in the hope that it will be useful,
12
   but WITHOUT ANY WARRANTY; without even the implied warranty of
13
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
   Lesser General Public License for more details.
15
16
   You should have received a copy of the GNU Lesser General Public
17
   License along with the GNU C Library; if not, see
18
   <https://www.gnu.org/licenses/>.  */
19
20
static void re_string_construct_common (const char *str, Idx len,
21
          re_string_t *pstr,
22
          RE_TRANSLATE_TYPE trans, bool icase,
23
          const re_dfa_t *dfa);
24
static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
25
            const re_node_set *nodes,
26
            re_hashval_t hash);
27
static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
28
            const re_node_set *nodes,
29
            unsigned int context,
30
            re_hashval_t hash);
31
static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr,
32
            Idx new_buf_len);
33
static void build_wcs_buffer (re_string_t *pstr);
34
static reg_errcode_t build_wcs_upper_buffer (re_string_t *pstr);
35
static void build_upper_buffer (re_string_t *pstr);
36
static void re_string_translate_buffer (re_string_t *pstr);
37
static unsigned int re_string_context_at (const re_string_t *input, Idx idx,
38
            int eflags) __attribute__ ((pure));
39

40
/* Functions for string operation.  */
41
42
/* This function allocate the buffers.  It is necessary to call
43
   re_string_reconstruct before using the object.  */
44
45
static reg_errcode_t
46
__attribute_warn_unused_result__
47
re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
48
        RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
49
0
{
50
0
  reg_errcode_t ret;
51
0
  Idx init_buf_len;
52
53
  /* Ensure at least one character fits into the buffers.  */
54
0
  if (init_len < dfa->mb_cur_max)
55
0
    init_len = dfa->mb_cur_max;
56
0
  init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
57
0
  re_string_construct_common (str, len, pstr, trans, icase, dfa);
58
59
0
  ret = re_string_realloc_buffers (pstr, init_buf_len);
60
0
  if (__glibc_unlikely (ret != REG_NOERROR))
61
0
    return ret;
62
63
0
  pstr->word_char = dfa->word_char;
64
0
  pstr->word_ops_used = dfa->word_ops_used;
65
0
  pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
66
0
  pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
67
0
  pstr->valid_raw_len = pstr->valid_len;
68
0
  return REG_NOERROR;
69
0
}
70
71
/* This function allocate the buffers, and initialize them.  */
72
73
static reg_errcode_t
74
__attribute_warn_unused_result__
75
re_string_construct (re_string_t *pstr, const char *str, Idx len,
76
         RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
77
0
{
78
0
  reg_errcode_t ret;
79
0
  memset (pstr, '\0', sizeof (re_string_t));
80
0
  re_string_construct_common (str, len, pstr, trans, icase, dfa);
81
82
0
  if (len > 0)
83
0
    {
84
0
      ret = re_string_realloc_buffers (pstr, len + 1);
85
0
      if (__glibc_unlikely (ret != REG_NOERROR))
86
0
  return ret;
87
0
    }
88
0
  pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
89
90
0
  if (icase)
91
0
    {
92
0
      if (dfa->mb_cur_max > 1)
93
0
  {
94
0
    while (1)
95
0
      {
96
0
        ret = build_wcs_upper_buffer (pstr);
97
0
        if (__glibc_unlikely (ret != REG_NOERROR))
98
0
    return ret;
99
0
        if (pstr->valid_raw_len >= len)
100
0
    break;
101
0
        if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
102
0
    break;
103
0
        ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
104
0
        if (__glibc_unlikely (ret != REG_NOERROR))
105
0
    return ret;
106
0
      }
107
0
  }
108
0
      else
109
0
  build_upper_buffer (pstr);
110
0
    }
111
0
  else
112
0
    {
113
0
      if (dfa->mb_cur_max > 1)
114
0
  build_wcs_buffer (pstr);
115
0
      else
116
0
  {
117
0
    if (trans != NULL)
118
0
      re_string_translate_buffer (pstr);
119
0
    else
120
0
      {
121
0
        pstr->valid_len = pstr->bufs_len;
122
0
        pstr->valid_raw_len = pstr->bufs_len;
123
0
      }
124
0
  }
125
0
    }
126
127
0
  return REG_NOERROR;
128
0
}
129
130
/* Helper functions for re_string_allocate, and re_string_construct.  */
131
132
static reg_errcode_t
133
__attribute_warn_unused_result__
134
re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
135
0
{
136
0
  if (pstr->mb_cur_max > 1)
137
0
    {
138
0
      wint_t *new_wcs;
139
140
      /* Avoid overflow in realloc.  */
141
0
      const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
142
0
      if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
143
0
          < new_buf_len))
144
0
  return REG_ESPACE;
145
146
0
      new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
147
0
      if (__glibc_unlikely (new_wcs == NULL))
148
0
  return REG_ESPACE;
149
0
      pstr->wcs = new_wcs;
150
0
      if (pstr->offsets != NULL)
151
0
  {
152
0
    Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
153
0
    if (__glibc_unlikely (new_offsets == NULL))
154
0
      return REG_ESPACE;
155
0
    pstr->offsets = new_offsets;
156
0
  }
157
0
    }
158
0
  if (pstr->mbs_allocated)
159
0
    {
160
0
      unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
161
0
             new_buf_len);
162
0
      if (__glibc_unlikely (new_mbs == NULL))
163
0
  return REG_ESPACE;
164
0
      pstr->mbs = new_mbs;
165
0
    }
166
0
  pstr->bufs_len = new_buf_len;
167
0
  return REG_NOERROR;
168
0
}
169
170
171
static void
172
re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
173
          RE_TRANSLATE_TYPE trans, bool icase,
174
          const re_dfa_t *dfa)
175
0
{
176
0
  pstr->raw_mbs = (const unsigned char *) str;
177
0
  pstr->len = len;
178
0
  pstr->raw_len = len;
179
0
  pstr->trans = trans;
180
0
  pstr->icase = icase;
181
0
  pstr->mbs_allocated = (trans != NULL || icase);
182
0
  pstr->mb_cur_max = dfa->mb_cur_max;
183
0
  pstr->is_utf8 = dfa->is_utf8;
184
0
  pstr->map_notascii = dfa->map_notascii;
185
0
  pstr->stop = pstr->len;
186
0
  pstr->raw_stop = pstr->stop;
187
0
}
188
189
190
/* Build wide character buffer PSTR->WCS.
191
   If the byte sequence of the string are:
192
     <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
193
   Then wide character buffer will be:
194
     <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
195
   We use WEOF for padding, they indicate that the position isn't
196
   a first byte of a multibyte character.
197
198
   Note that this function assumes PSTR->VALID_LEN elements are already
199
   built and starts from PSTR->VALID_LEN.  */
200
201
static void
202
build_wcs_buffer (re_string_t *pstr)
203
0
{
204
#ifdef _LIBC
205
  unsigned char buf[MB_LEN_MAX];
206
  DEBUG_ASSERT (MB_LEN_MAX >= pstr->mb_cur_max);
207
#else
208
0
  unsigned char buf[64];
209
0
#endif
210
0
  mbstate_t prev_st;
211
0
  Idx byte_idx, end_idx, remain_len;
212
0
  size_t mbclen;
213
214
  /* Build the buffers from pstr->valid_len to either pstr->len or
215
     pstr->bufs_len.  */
216
0
  end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
217
0
  for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
218
0
    {
219
0
      wchar_t wc;
220
0
      const char *p;
221
222
0
      remain_len = end_idx - byte_idx;
223
0
      prev_st = pstr->cur_state;
224
      /* Apply the translation if we need.  */
225
0
      if (__glibc_unlikely (pstr->trans != NULL))
226
0
  {
227
0
    int i, ch;
228
229
0
    for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
230
0
      {
231
0
        ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
232
0
        buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
233
0
      }
234
0
    p = (const char *) buf;
235
0
  }
236
0
      else
237
0
  p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
238
0
      mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
239
0
      if (__glibc_unlikely (mbclen == (size_t) -1 || mbclen == 0
240
0
          || (mbclen == (size_t) -2
241
0
        && pstr->bufs_len >= pstr->len)))
242
0
  {
243
    /* We treat these cases as a singlebyte character.  */
244
0
    mbclen = 1;
245
0
    wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
246
0
    if (__glibc_unlikely (pstr->trans != NULL))
247
0
      wc = pstr->trans[wc];
248
0
    pstr->cur_state = prev_st;
249
0
  }
250
0
      else if (__glibc_unlikely (mbclen == (size_t) -2))
251
0
  {
252
    /* The buffer doesn't have enough space, finish to build.  */
253
0
    pstr->cur_state = prev_st;
254
0
    break;
255
0
  }
256
257
      /* Write wide character and padding.  */
258
0
      pstr->wcs[byte_idx++] = wc;
259
      /* Write paddings.  */
260
0
      for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
261
0
  pstr->wcs[byte_idx++] = WEOF;
262
0
    }
263
0
  pstr->valid_len = byte_idx;
264
0
  pstr->valid_raw_len = byte_idx;
265
0
}
266
267
/* Build wide character buffer PSTR->WCS like build_wcs_buffer,
268
   but for REG_ICASE.  */
269
270
static reg_errcode_t
271
__attribute_warn_unused_result__
272
build_wcs_upper_buffer (re_string_t *pstr)
273
0
{
274
0
  mbstate_t prev_st;
275
0
  Idx src_idx, byte_idx, end_idx, remain_len;
276
0
  size_t mbclen;
277
#ifdef _LIBC
278
  char buf[MB_LEN_MAX];
279
  DEBUG_ASSERT (pstr->mb_cur_max <= MB_LEN_MAX);
280
#else
281
0
  char buf[64];
282
0
#endif
283
284
0
  byte_idx = pstr->valid_len;
285
0
  end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
286
287
  /* The following optimization assumes that ASCII characters can be
288
     mapped to wide characters with a simple cast.  */
289
0
  if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
290
0
    {
291
0
      while (byte_idx < end_idx)
292
0
  {
293
0
    wchar_t wc;
294
0
    unsigned char ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
295
296
0
    if (isascii (ch) && mbsinit (&pstr->cur_state))
297
0
      {
298
        /* The next step uses the assumption that wchar_t is encoded
299
     ASCII-safe: all ASCII values can be converted like this.  */
300
0
        wchar_t wcu = __towupper (ch);
301
0
        if (isascii (wcu))
302
0
    {
303
0
      pstr->mbs[byte_idx] = wcu;
304
0
      pstr->wcs[byte_idx] = wcu;
305
0
      byte_idx++;
306
0
      continue;
307
0
    }
308
0
      }
309
310
0
    remain_len = end_idx - byte_idx;
311
0
    prev_st = pstr->cur_state;
312
0
    mbclen = __mbrtowc (&wc,
313
0
            ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
314
0
             + byte_idx), remain_len, &pstr->cur_state);
315
0
    if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
316
0
      {
317
0
        wchar_t wcu = __towupper (wc);
318
0
        if (wcu != wc)
319
0
    {
320
0
      size_t mbcdlen;
321
322
0
      mbcdlen = __wcrtomb (buf, wcu, &prev_st);
323
0
      if (__glibc_likely (mbclen == mbcdlen))
324
0
        memcpy (pstr->mbs + byte_idx, buf, mbclen);
325
0
      else
326
0
        {
327
0
          src_idx = byte_idx;
328
0
          goto offsets_needed;
329
0
        }
330
0
    }
331
0
        else
332
0
    memcpy (pstr->mbs + byte_idx,
333
0
      pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
334
0
        pstr->wcs[byte_idx++] = wcu;
335
        /* Write paddings.  */
336
0
        for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
337
0
    pstr->wcs[byte_idx++] = WEOF;
338
0
      }
339
0
    else if (mbclen == (size_t) -1 || mbclen == 0
340
0
       || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
341
0
      {
342
        /* It is an invalid character, an incomplete character
343
     at the end of the string, or '\0'.  Just use the byte.  */
344
0
        pstr->mbs[byte_idx] = ch;
345
        /* And also cast it to wide char.  */
346
0
        pstr->wcs[byte_idx++] = (wchar_t) ch;
347
0
        if (__glibc_unlikely (mbclen == (size_t) -1))
348
0
    pstr->cur_state = prev_st;
349
0
      }
350
0
    else
351
0
      {
352
        /* The buffer doesn't have enough space, finish to build.  */
353
0
        pstr->cur_state = prev_st;
354
0
        break;
355
0
      }
356
0
  }
357
0
      pstr->valid_len = byte_idx;
358
0
      pstr->valid_raw_len = byte_idx;
359
0
      return REG_NOERROR;
360
0
    }
361
0
  else
362
0
    for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
363
0
      {
364
0
  wchar_t wc;
365
0
  const char *p;
366
0
      offsets_needed:
367
0
  remain_len = end_idx - byte_idx;
368
0
  prev_st = pstr->cur_state;
369
0
  if (__glibc_unlikely (pstr->trans != NULL))
370
0
    {
371
0
      int i, ch;
372
373
0
      for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
374
0
        {
375
0
    ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
376
0
    buf[i] = pstr->trans[ch];
377
0
        }
378
0
      p = (const char *) buf;
379
0
    }
380
0
  else
381
0
    p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
382
0
  mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
383
0
  if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
384
0
    {
385
0
      wchar_t wcu = __towupper (wc);
386
0
      if (wcu != wc)
387
0
        {
388
0
    size_t mbcdlen;
389
390
0
    mbcdlen = __wcrtomb ((char *) buf, wcu, &prev_st);
391
0
    if (__glibc_likely (mbclen == mbcdlen))
392
0
      memcpy (pstr->mbs + byte_idx, buf, mbclen);
393
0
    else if (mbcdlen != (size_t) -1)
394
0
      {
395
0
        size_t i;
396
397
0
        if (byte_idx + mbcdlen > pstr->bufs_len)
398
0
          {
399
0
      pstr->cur_state = prev_st;
400
0
      break;
401
0
          }
402
403
0
        if (pstr->offsets == NULL)
404
0
          {
405
0
      pstr->offsets = re_malloc (Idx, pstr->bufs_len);
406
407
0
      if (pstr->offsets == NULL)
408
0
        return REG_ESPACE;
409
0
          }
410
0
        if (!pstr->offsets_needed)
411
0
          {
412
0
      for (i = 0; i < (size_t) byte_idx; ++i)
413
0
        pstr->offsets[i] = i;
414
0
      pstr->offsets_needed = 1;
415
0
          }
416
417
0
        memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
418
0
        pstr->wcs[byte_idx] = wcu;
419
0
        pstr->offsets[byte_idx] = src_idx;
420
0
        for (i = 1; i < mbcdlen; ++i)
421
0
          {
422
0
      pstr->offsets[byte_idx + i]
423
0
        = src_idx + (i < mbclen ? i : mbclen - 1);
424
0
      pstr->wcs[byte_idx + i] = WEOF;
425
0
          }
426
0
        pstr->len += mbcdlen - mbclen;
427
0
        if (pstr->raw_stop > src_idx)
428
0
          pstr->stop += mbcdlen - mbclen;
429
0
        end_idx = (pstr->bufs_len > pstr->len)
430
0
            ? pstr->len : pstr->bufs_len;
431
0
        byte_idx += mbcdlen;
432
0
        src_idx += mbclen;
433
0
        continue;
434
0
      }
435
0
    else
436
0
      memcpy (pstr->mbs + byte_idx, p, mbclen);
437
0
        }
438
0
      else
439
0
        memcpy (pstr->mbs + byte_idx, p, mbclen);
440
441
0
      if (__glibc_unlikely (pstr->offsets_needed != 0))
442
0
        {
443
0
    size_t i;
444
0
    for (i = 0; i < mbclen; ++i)
445
0
      pstr->offsets[byte_idx + i] = src_idx + i;
446
0
        }
447
0
      src_idx += mbclen;
448
449
0
      pstr->wcs[byte_idx++] = wcu;
450
      /* Write paddings.  */
451
0
      for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
452
0
        pstr->wcs[byte_idx++] = WEOF;
453
0
    }
454
0
  else if (mbclen == (size_t) -1 || mbclen == 0
455
0
     || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
456
0
    {
457
      /* It is an invalid character or '\0'.  Just use the byte.  */
458
0
      int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
459
460
0
      if (__glibc_unlikely (pstr->trans != NULL))
461
0
        ch = pstr->trans [ch];
462
0
      pstr->mbs[byte_idx] = ch;
463
464
0
      if (__glibc_unlikely (pstr->offsets_needed != 0))
465
0
        pstr->offsets[byte_idx] = src_idx;
466
0
      ++src_idx;
467
468
      /* And also cast it to wide char.  */
469
0
      pstr->wcs[byte_idx++] = (wchar_t) ch;
470
0
      if (__glibc_unlikely (mbclen == (size_t) -1))
471
0
        pstr->cur_state = prev_st;
472
0
    }
473
0
  else
474
0
    {
475
      /* The buffer doesn't have enough space, finish to build.  */
476
0
      pstr->cur_state = prev_st;
477
0
      break;
478
0
    }
479
0
      }
480
0
  pstr->valid_len = byte_idx;
481
0
  pstr->valid_raw_len = src_idx;
482
0
  return REG_NOERROR;
483
0
}
484
485
/* Skip characters until the index becomes greater than NEW_RAW_IDX.
486
   Return the index.  */
487
488
static Idx
489
re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
490
0
{
491
0
  mbstate_t prev_st;
492
0
  Idx rawbuf_idx;
493
0
  size_t mbclen;
494
0
  wint_t wc = WEOF;
495
496
  /* Skip the characters which are not necessary to check.  */
497
0
  for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
498
0
       rawbuf_idx < new_raw_idx;)
499
0
    {
500
0
      wchar_t wc2;
501
0
      Idx remain_len = pstr->raw_len - rawbuf_idx;
502
0
      prev_st = pstr->cur_state;
503
0
      mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
504
0
        remain_len, &pstr->cur_state);
505
0
      if (__glibc_unlikely (mbclen == (size_t) -2 || mbclen == (size_t) -1
506
0
          || mbclen == 0))
507
0
  {
508
    /* We treat these cases as a single byte character.  */
509
0
    if (mbclen == 0 || remain_len == 0)
510
0
      wc = L'\0';
511
0
    else
512
0
      wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
513
0
    mbclen = 1;
514
0
    pstr->cur_state = prev_st;
515
0
  }
516
0
      else
517
0
  wc = wc2;
518
      /* Then proceed the next character.  */
519
0
      rawbuf_idx += mbclen;
520
0
    }
521
0
  *last_wc = wc;
522
0
  return rawbuf_idx;
523
0
}
524
525
/* Build the buffer PSTR->MBS, and apply the translation if we need.
526
   This function is used in case of REG_ICASE.  */
527
528
static void
529
build_upper_buffer (re_string_t *pstr)
530
0
{
531
0
  Idx char_idx, end_idx;
532
0
  end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
533
534
0
  for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
535
0
    {
536
0
      int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
537
0
      if (__glibc_unlikely (pstr->trans != NULL))
538
0
  ch = pstr->trans[ch];
539
0
      pstr->mbs[char_idx] = toupper (ch);
540
0
    }
541
0
  pstr->valid_len = char_idx;
542
0
  pstr->valid_raw_len = char_idx;
543
0
}
544
545
/* Apply TRANS to the buffer in PSTR.  */
546
547
static void
548
re_string_translate_buffer (re_string_t *pstr)
549
0
{
550
0
  Idx buf_idx, end_idx;
551
0
  end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
552
553
0
  for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
554
0
    {
555
0
      int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
556
0
      pstr->mbs[buf_idx] = pstr->trans[ch];
557
0
    }
558
559
0
  pstr->valid_len = buf_idx;
560
0
  pstr->valid_raw_len = buf_idx;
561
0
}
562
563
/* This function re-construct the buffers.
564
   Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
565
   convert to upper case in case of REG_ICASE, apply translation.  */
566
567
static reg_errcode_t
568
__attribute_warn_unused_result__
569
re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
570
0
{
571
0
  Idx offset;
572
573
0
  if (__glibc_unlikely (pstr->raw_mbs_idx <= idx))
574
0
    offset = idx - pstr->raw_mbs_idx;
575
0
  else
576
0
    {
577
      /* Reset buffer.  */
578
0
      if (pstr->mb_cur_max > 1)
579
0
  memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
580
0
      pstr->len = pstr->raw_len;
581
0
      pstr->stop = pstr->raw_stop;
582
0
      pstr->valid_len = 0;
583
0
      pstr->raw_mbs_idx = 0;
584
0
      pstr->valid_raw_len = 0;
585
0
      pstr->offsets_needed = 0;
586
0
      pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
587
0
         : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
588
0
      if (!pstr->mbs_allocated)
589
0
  pstr->mbs = (unsigned char *) pstr->raw_mbs;
590
0
      offset = idx;
591
0
    }
592
593
0
  if (__glibc_likely (offset != 0))
594
0
    {
595
      /* Should the already checked characters be kept?  */
596
0
      if (__glibc_likely (offset < pstr->valid_raw_len))
597
0
  {
598
    /* Yes, move them to the front of the buffer.  */
599
0
    if (__glibc_unlikely (pstr->offsets_needed))
600
0
      {
601
0
        Idx low = 0, high = pstr->valid_len, mid;
602
0
        do
603
0
    {
604
0
      mid = (high + low) / 2;
605
0
      if (pstr->offsets[mid] > offset)
606
0
        high = mid;
607
0
      else if (pstr->offsets[mid] < offset)
608
0
        low = mid + 1;
609
0
      else
610
0
        break;
611
0
    }
612
0
        while (low < high);
613
0
        if (pstr->offsets[mid] < offset)
614
0
    ++mid;
615
0
        pstr->tip_context = re_string_context_at (pstr, mid - 1,
616
0
              eflags);
617
        /* This can be quite complicated, so handle specially
618
     only the common and easy case where the character with
619
     different length representation of lower and upper
620
     case is present at or after offset.  */
621
0
        if (pstr->valid_len > offset
622
0
      && mid == offset && pstr->offsets[mid] == offset)
623
0
    {
624
0
      memmove (pstr->wcs, pstr->wcs + offset,
625
0
         (pstr->valid_len - offset) * sizeof (wint_t));
626
0
      memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
627
0
      pstr->valid_len -= offset;
628
0
      pstr->valid_raw_len -= offset;
629
0
      for (low = 0; low < pstr->valid_len; low++)
630
0
        pstr->offsets[low] = pstr->offsets[low + offset] - offset;
631
0
    }
632
0
        else
633
0
    {
634
      /* Otherwise, just find out how long the partial multibyte
635
         character at offset is and fill it with WEOF/255.  */
636
0
      pstr->len = pstr->raw_len - idx + offset;
637
0
      pstr->stop = pstr->raw_stop - idx + offset;
638
0
      pstr->offsets_needed = 0;
639
0
      while (mid > 0 && pstr->offsets[mid - 1] == offset)
640
0
        --mid;
641
0
      while (mid < pstr->valid_len)
642
0
        if (pstr->wcs[mid] != WEOF)
643
0
          break;
644
0
        else
645
0
          ++mid;
646
0
      if (mid == pstr->valid_len)
647
0
        pstr->valid_len = 0;
648
0
      else
649
0
        {
650
0
          pstr->valid_len = pstr->offsets[mid] - offset;
651
0
          if (pstr->valid_len)
652
0
      {
653
0
        for (low = 0; low < pstr->valid_len; ++low)
654
0
          pstr->wcs[low] = WEOF;
655
0
        memset (pstr->mbs, 255, pstr->valid_len);
656
0
      }
657
0
        }
658
0
      pstr->valid_raw_len = pstr->valid_len;
659
0
    }
660
0
      }
661
0
    else
662
0
      {
663
0
        pstr->tip_context = re_string_context_at (pstr, offset - 1,
664
0
              eflags);
665
0
        if (pstr->mb_cur_max > 1)
666
0
    memmove (pstr->wcs, pstr->wcs + offset,
667
0
       (pstr->valid_len - offset) * sizeof (wint_t));
668
0
        if (__glibc_unlikely (pstr->mbs_allocated))
669
0
    memmove (pstr->mbs, pstr->mbs + offset,
670
0
       pstr->valid_len - offset);
671
0
        pstr->valid_len -= offset;
672
0
        pstr->valid_raw_len -= offset;
673
0
        DEBUG_ASSERT (pstr->valid_len > 0);
674
0
      }
675
0
  }
676
0
      else
677
0
  {
678
    /* No, skip all characters until IDX.  */
679
0
    Idx prev_valid_len = pstr->valid_len;
680
681
0
    if (__glibc_unlikely (pstr->offsets_needed))
682
0
      {
683
0
        pstr->len = pstr->raw_len - idx + offset;
684
0
        pstr->stop = pstr->raw_stop - idx + offset;
685
0
        pstr->offsets_needed = 0;
686
0
      }
687
0
    pstr->valid_len = 0;
688
0
    if (pstr->mb_cur_max > 1)
689
0
      {
690
0
        Idx wcs_idx;
691
0
        wint_t wc = WEOF;
692
693
0
        if (pstr->is_utf8)
694
0
    {
695
0
      const unsigned char *raw, *p, *end;
696
697
      /* Special case UTF-8.  Multi-byte chars start with any
698
         byte other than 0x80 - 0xbf.  */
699
0
      raw = pstr->raw_mbs + pstr->raw_mbs_idx;
700
0
      end = raw + (offset - pstr->mb_cur_max);
701
0
      if (end < pstr->raw_mbs)
702
0
        end = pstr->raw_mbs;
703
0
      p = raw + offset - 1;
704
#ifdef _LIBC
705
      /* We know the wchar_t encoding is UCS4, so for the simple
706
         case, ASCII characters, skip the conversion step.  */
707
      if (isascii (*p) && __glibc_likely (pstr->trans == NULL))
708
        {
709
          memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
710
          /* pstr->valid_len = 0; */
711
          wc = (wchar_t) *p;
712
        }
713
      else
714
#endif
715
0
        for (; p >= end; --p)
716
0
          if ((*p & 0xc0) != 0x80)
717
0
      {
718
0
        mbstate_t cur_state;
719
0
        wchar_t wc2;
720
0
        Idx mlen = raw + pstr->len - p;
721
0
        unsigned char buf[6];
722
0
        size_t mbclen;
723
724
0
        const unsigned char *pp = p;
725
0
        if (__glibc_unlikely (pstr->trans != NULL))
726
0
          {
727
0
            int i = mlen < 6 ? mlen : 6;
728
0
            while (--i >= 0)
729
0
        buf[i] = pstr->trans[p[i]];
730
0
            pp = buf;
731
0
          }
732
        /* XXX Don't use mbrtowc, we know which conversion
733
           to use (UTF-8 -> UCS4).  */
734
0
        memset (&cur_state, 0, sizeof (cur_state));
735
0
        mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
736
0
                &cur_state);
737
0
        if (raw + offset - p <= mbclen
738
0
            && mbclen < (size_t) -2)
739
0
          {
740
0
            memset (&pstr->cur_state, '\0',
741
0
              sizeof (mbstate_t));
742
0
            pstr->valid_len = mbclen - (raw + offset - p);
743
0
            wc = wc2;
744
0
          }
745
0
        break;
746
0
      }
747
0
    }
748
749
0
        if (wc == WEOF)
750
0
    pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
751
0
        if (wc == WEOF)
752
0
    pstr->tip_context
753
0
      = re_string_context_at (pstr, prev_valid_len - 1, eflags);
754
0
        else
755
0
    pstr->tip_context = ((__glibc_unlikely (pstr->word_ops_used != 0)
756
0
              && IS_WIDE_WORD_CHAR (wc))
757
0
             ? CONTEXT_WORD
758
0
             : ((IS_WIDE_NEWLINE (wc)
759
0
           && pstr->newline_anchor)
760
0
          ? CONTEXT_NEWLINE : 0));
761
0
        if (__glibc_unlikely (pstr->valid_len))
762
0
    {
763
0
      for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
764
0
        pstr->wcs[wcs_idx] = WEOF;
765
0
      if (pstr->mbs_allocated)
766
0
        memset (pstr->mbs, 255, pstr->valid_len);
767
0
    }
768
0
        pstr->valid_raw_len = pstr->valid_len;
769
0
      }
770
0
    else
771
0
      {
772
0
        int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
773
0
        pstr->valid_raw_len = 0;
774
0
        if (pstr->trans)
775
0
    c = pstr->trans[c];
776
0
        pstr->tip_context = (bitset_contain (pstr->word_char, c)
777
0
           ? CONTEXT_WORD
778
0
           : ((IS_NEWLINE (c) && pstr->newline_anchor)
779
0
              ? CONTEXT_NEWLINE : 0));
780
0
      }
781
0
  }
782
0
      if (!__glibc_unlikely (pstr->mbs_allocated))
783
0
  pstr->mbs += offset;
784
0
    }
785
0
  pstr->raw_mbs_idx = idx;
786
0
  pstr->len -= offset;
787
0
  pstr->stop -= offset;
788
789
  /* Then build the buffers.  */
790
0
  if (pstr->mb_cur_max > 1)
791
0
    {
792
0
      if (pstr->icase)
793
0
  {
794
0
    reg_errcode_t ret = build_wcs_upper_buffer (pstr);
795
0
    if (__glibc_unlikely (ret != REG_NOERROR))
796
0
      return ret;
797
0
  }
798
0
      else
799
0
  build_wcs_buffer (pstr);
800
0
    }
801
0
  else
802
0
    if (__glibc_unlikely (pstr->mbs_allocated))
803
0
      {
804
0
  if (pstr->icase)
805
0
    build_upper_buffer (pstr);
806
0
  else if (pstr->trans != NULL)
807
0
    re_string_translate_buffer (pstr);
808
0
      }
809
0
    else
810
0
      pstr->valid_len = pstr->len;
811
812
0
  pstr->cur_idx = 0;
813
0
  return REG_NOERROR;
814
0
}
815
816
static unsigned char
817
__attribute__ ((pure))
818
re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
819
0
{
820
0
  int ch;
821
0
  Idx off;
822
823
  /* Handle the common (easiest) cases first.  */
824
0
  if (__glibc_likely (!pstr->mbs_allocated))
825
0
    return re_string_peek_byte (pstr, idx);
826
827
0
  if (pstr->mb_cur_max > 1
828
0
      && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
829
0
    return re_string_peek_byte (pstr, idx);
830
831
0
  off = pstr->cur_idx + idx;
832
0
  if (pstr->offsets_needed)
833
0
    off = pstr->offsets[off];
834
835
0
  ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
836
837
  /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
838
     this function returns CAPITAL LETTER I instead of first byte of
839
     DOTLESS SMALL LETTER I.  The latter would confuse the parser,
840
     since peek_byte_case doesn't advance cur_idx in any way.  */
841
0
  if (pstr->offsets_needed && !isascii (ch))
842
0
    return re_string_peek_byte (pstr, idx);
843
844
0
  return ch;
845
0
}
846
847
static unsigned char
848
re_string_fetch_byte_case (re_string_t *pstr)
849
0
{
850
0
  if (__glibc_likely (!pstr->mbs_allocated))
851
0
    return re_string_fetch_byte (pstr);
852
853
0
  if (pstr->offsets_needed)
854
0
    {
855
0
      Idx off;
856
0
      int ch;
857
858
      /* For tr_TR.UTF-8 [[:islower:]] there is
859
   [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
860
   in that case the whole multi-byte character and return
861
   the original letter.  On the other side, with
862
   [[: DOTLESS SMALL LETTER I return [[:I, as doing
863
   anything else would complicate things too much.  */
864
865
0
      if (!re_string_first_byte (pstr, pstr->cur_idx))
866
0
  return re_string_fetch_byte (pstr);
867
868
0
      off = pstr->offsets[pstr->cur_idx];
869
0
      ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
870
871
0
      if (! isascii (ch))
872
0
  return re_string_fetch_byte (pstr);
873
874
0
      re_string_skip_bytes (pstr,
875
0
          re_string_char_size_at (pstr, pstr->cur_idx));
876
0
      return ch;
877
0
    }
878
879
0
  return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
880
0
}
881
882
static void
883
re_string_destruct (re_string_t *pstr)
884
0
{
885
0
  re_free (pstr->wcs);
886
0
  re_free (pstr->offsets);
887
0
  if (pstr->mbs_allocated)
888
0
    re_free (pstr->mbs);
889
0
}
890
891
/* Return the context at IDX in INPUT.  */
892
893
static unsigned int
894
re_string_context_at (const re_string_t *input, Idx idx, int eflags)
895
0
{
896
0
  int c;
897
0
  if (__glibc_unlikely (idx < 0))
898
    /* In this case, we use the value stored in input->tip_context,
899
       since we can't know the character in input->mbs[-1] here.  */
900
0
    return input->tip_context;
901
0
  if (__glibc_unlikely (idx == input->len))
902
0
    return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
903
0
      : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
904
0
  if (input->mb_cur_max > 1)
905
0
    {
906
0
      wint_t wc;
907
0
      Idx wc_idx = idx;
908
0
      while(input->wcs[wc_idx] == WEOF)
909
0
  {
910
0
    DEBUG_ASSERT (wc_idx >= 0);
911
0
    --wc_idx;
912
0
    if (wc_idx < 0)
913
0
      return input->tip_context;
914
0
  }
915
0
      wc = input->wcs[wc_idx];
916
0
      if (__glibc_unlikely (input->word_ops_used != 0)
917
0
    && IS_WIDE_WORD_CHAR (wc))
918
0
  return CONTEXT_WORD;
919
0
      return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
920
0
        ? CONTEXT_NEWLINE : 0);
921
0
    }
922
0
  else
923
0
    {
924
0
      c = re_string_byte_at (input, idx);
925
0
      if (bitset_contain (input->word_char, c))
926
0
  return CONTEXT_WORD;
927
0
      return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
928
0
    }
929
0
}
930

931
/* Functions for set operation.  */
932
933
static reg_errcode_t
934
__attribute_warn_unused_result__
935
re_node_set_alloc (re_node_set *set, Idx size)
936
0
{
937
0
  set->alloc = size;
938
0
  set->nelem = 0;
939
0
  set->elems = re_malloc (Idx, size);
940
0
  if (__glibc_unlikely (set->elems == NULL)
941
0
      && (MALLOC_0_IS_NONNULL || size != 0))
942
0
    return REG_ESPACE;
943
0
  return REG_NOERROR;
944
0
}
945
946
static reg_errcode_t
947
__attribute_warn_unused_result__
948
re_node_set_init_1 (re_node_set *set, Idx elem)
949
0
{
950
0
  set->alloc = 1;
951
0
  set->nelem = 1;
952
0
  set->elems = re_malloc (Idx, 1);
953
0
  if (__glibc_unlikely (set->elems == NULL))
954
0
    {
955
0
      set->alloc = set->nelem = 0;
956
0
      return REG_ESPACE;
957
0
    }
958
0
  set->elems[0] = elem;
959
0
  return REG_NOERROR;
960
0
}
961
962
static reg_errcode_t
963
__attribute_warn_unused_result__
964
re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
965
0
{
966
0
  set->alloc = 2;
967
0
  set->elems = re_malloc (Idx, 2);
968
0
  if (__glibc_unlikely (set->elems == NULL))
969
0
    return REG_ESPACE;
970
0
  if (elem1 == elem2)
971
0
    {
972
0
      set->nelem = 1;
973
0
      set->elems[0] = elem1;
974
0
    }
975
0
  else
976
0
    {
977
0
      set->nelem = 2;
978
0
      if (elem1 < elem2)
979
0
  {
980
0
    set->elems[0] = elem1;
981
0
    set->elems[1] = elem2;
982
0
  }
983
0
      else
984
0
  {
985
0
    set->elems[0] = elem2;
986
0
    set->elems[1] = elem1;
987
0
  }
988
0
    }
989
0
  return REG_NOERROR;
990
0
}
991
992
static reg_errcode_t
993
__attribute_warn_unused_result__
994
re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
995
0
{
996
0
  dest->nelem = src->nelem;
997
0
  if (src->nelem > 0)
998
0
    {
999
0
      dest->alloc = dest->nelem;
1000
0
      dest->elems = re_malloc (Idx, dest->alloc);
1001
0
      if (__glibc_unlikely (dest->elems == NULL))
1002
0
  {
1003
0
    dest->alloc = dest->nelem = 0;
1004
0
    return REG_ESPACE;
1005
0
  }
1006
0
      memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1007
0
    }
1008
0
  else
1009
0
    re_node_set_init_empty (dest);
1010
0
  return REG_NOERROR;
1011
0
}
1012
1013
/* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1014
   DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1015
   Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
1016
1017
static reg_errcode_t
1018
__attribute_warn_unused_result__
1019
re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1020
         const re_node_set *src2)
1021
0
{
1022
0
  Idx i1, i2, is, id, delta, sbase;
1023
0
  if (src1->nelem == 0 || src2->nelem == 0)
1024
0
    return REG_NOERROR;
1025
1026
  /* We need dest->nelem + 2 * elems_in_intersection; this is a
1027
     conservative estimate.  */
1028
0
  if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1029
0
    {
1030
0
      Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1031
0
      Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1032
0
      if (__glibc_unlikely (new_elems == NULL))
1033
0
  return REG_ESPACE;
1034
0
      dest->elems = new_elems;
1035
0
      dest->alloc = new_alloc;
1036
0
    }
1037
1038
  /* Find the items in the intersection of SRC1 and SRC2, and copy
1039
     into the top of DEST those that are not already in DEST itself.  */
1040
0
  sbase = dest->nelem + src1->nelem + src2->nelem;
1041
0
  i1 = src1->nelem - 1;
1042
0
  i2 = src2->nelem - 1;
1043
0
  id = dest->nelem - 1;
1044
0
  for (;;)
1045
0
    {
1046
0
      if (src1->elems[i1] == src2->elems[i2])
1047
0
  {
1048
    /* Try to find the item in DEST.  Maybe we could binary search?  */
1049
0
    while (id >= 0 && dest->elems[id] > src1->elems[i1])
1050
0
      --id;
1051
1052
0
    if (id < 0 || dest->elems[id] != src1->elems[i1])
1053
0
            dest->elems[--sbase] = src1->elems[i1];
1054
1055
0
    if (--i1 < 0 || --i2 < 0)
1056
0
      break;
1057
0
  }
1058
1059
      /* Lower the highest of the two items.  */
1060
0
      else if (src1->elems[i1] < src2->elems[i2])
1061
0
  {
1062
0
    if (--i2 < 0)
1063
0
      break;
1064
0
  }
1065
0
      else
1066
0
  {
1067
0
    if (--i1 < 0)
1068
0
      break;
1069
0
  }
1070
0
    }
1071
1072
0
  id = dest->nelem - 1;
1073
0
  is = dest->nelem + src1->nelem + src2->nelem - 1;
1074
0
  delta = is - sbase + 1;
1075
1076
  /* Now copy.  When DELTA becomes zero, the remaining
1077
     DEST elements are already in place; this is more or
1078
     less the same loop that is in re_node_set_merge.  */
1079
0
  dest->nelem += delta;
1080
0
  if (delta > 0 && id >= 0)
1081
0
    for (;;)
1082
0
      {
1083
0
  if (dest->elems[is] > dest->elems[id])
1084
0
    {
1085
      /* Copy from the top.  */
1086
0
      dest->elems[id + delta--] = dest->elems[is--];
1087
0
      if (delta == 0)
1088
0
        break;
1089
0
    }
1090
0
  else
1091
0
    {
1092
      /* Slide from the bottom.  */
1093
0
      dest->elems[id + delta] = dest->elems[id];
1094
0
      if (--id < 0)
1095
0
        break;
1096
0
    }
1097
0
      }
1098
1099
  /* Copy remaining SRC elements.  */
1100
0
  memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1101
1102
0
  return REG_NOERROR;
1103
0
}
1104
1105
/* Calculate the union set of the sets SRC1 and SRC2. And store it to
1106
   DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1107
1108
static reg_errcode_t
1109
__attribute_warn_unused_result__
1110
re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1111
      const re_node_set *src2)
1112
0
{
1113
0
  Idx i1, i2, id;
1114
0
  if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1115
0
    {
1116
0
      dest->alloc = src1->nelem + src2->nelem;
1117
0
      dest->elems = re_malloc (Idx, dest->alloc);
1118
0
      if (__glibc_unlikely (dest->elems == NULL))
1119
0
  return REG_ESPACE;
1120
0
    }
1121
0
  else
1122
0
    {
1123
0
      if (src1 != NULL && src1->nelem > 0)
1124
0
  return re_node_set_init_copy (dest, src1);
1125
0
      else if (src2 != NULL && src2->nelem > 0)
1126
0
  return re_node_set_init_copy (dest, src2);
1127
0
      else
1128
0
  re_node_set_init_empty (dest);
1129
0
      return REG_NOERROR;
1130
0
    }
1131
0
  for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1132
0
    {
1133
0
      if (src1->elems[i1] > src2->elems[i2])
1134
0
  {
1135
0
    dest->elems[id++] = src2->elems[i2++];
1136
0
    continue;
1137
0
  }
1138
0
      if (src1->elems[i1] == src2->elems[i2])
1139
0
  ++i2;
1140
0
      dest->elems[id++] = src1->elems[i1++];
1141
0
    }
1142
0
  if (i1 < src1->nelem)
1143
0
    {
1144
0
      memcpy (dest->elems + id, src1->elems + i1,
1145
0
       (src1->nelem - i1) * sizeof (Idx));
1146
0
      id += src1->nelem - i1;
1147
0
    }
1148
0
  else if (i2 < src2->nelem)
1149
0
    {
1150
0
      memcpy (dest->elems + id, src2->elems + i2,
1151
0
       (src2->nelem - i2) * sizeof (Idx));
1152
0
      id += src2->nelem - i2;
1153
0
    }
1154
0
  dest->nelem = id;
1155
0
  return REG_NOERROR;
1156
0
}
1157
1158
/* Calculate the union set of the sets DEST and SRC. And store it to
1159
   DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1160
1161
static reg_errcode_t
1162
__attribute_warn_unused_result__
1163
re_node_set_merge (re_node_set *dest, const re_node_set *src)
1164
0
{
1165
0
  Idx is, id, sbase, delta;
1166
0
  if (src == NULL || src->nelem == 0)
1167
0
    return REG_NOERROR;
1168
0
  if (dest->alloc < 2 * src->nelem + dest->nelem)
1169
0
    {
1170
0
      Idx new_alloc = 2 * (src->nelem + dest->alloc);
1171
0
      Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1172
0
      if (__glibc_unlikely (new_buffer == NULL))
1173
0
  return REG_ESPACE;
1174
0
      dest->elems = new_buffer;
1175
0
      dest->alloc = new_alloc;
1176
0
    }
1177
1178
0
  if (__glibc_unlikely (dest->nelem == 0))
1179
0
    {
1180
      /* Although we already guaranteed above that dest->alloc != 0 and
1181
         therefore dest->elems != NULL, add a debug assertion to pacify
1182
         GCC 11.2.1's -fanalyzer.  */
1183
0
      DEBUG_ASSERT (dest->elems);
1184
0
      dest->nelem = src->nelem;
1185
0
      memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1186
0
      return REG_NOERROR;
1187
0
    }
1188
1189
  /* Copy into the top of DEST the items of SRC that are not
1190
     found in DEST.  Maybe we could binary search in DEST?  */
1191
0
  for (sbase = dest->nelem + 2 * src->nelem,
1192
0
       is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1193
0
    {
1194
0
      if (dest->elems[id] == src->elems[is])
1195
0
  is--, id--;
1196
0
      else if (dest->elems[id] < src->elems[is])
1197
0
  dest->elems[--sbase] = src->elems[is--];
1198
0
      else /* if (dest->elems[id] > src->elems[is]) */
1199
0
  --id;
1200
0
    }
1201
1202
0
  if (is >= 0)
1203
0
    {
1204
      /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1205
0
      sbase -= is + 1;
1206
0
      memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1207
0
    }
1208
1209
0
  id = dest->nelem - 1;
1210
0
  is = dest->nelem + 2 * src->nelem - 1;
1211
0
  delta = is - sbase + 1;
1212
0
  if (delta == 0)
1213
0
    return REG_NOERROR;
1214
1215
  /* Now copy.  When DELTA becomes zero, the remaining
1216
     DEST elements are already in place.  */
1217
0
  dest->nelem += delta;
1218
0
  for (;;)
1219
0
    {
1220
0
      if (dest->elems[is] > dest->elems[id])
1221
0
  {
1222
    /* Copy from the top.  */
1223
0
    dest->elems[id + delta--] = dest->elems[is--];
1224
0
    if (delta == 0)
1225
0
      break;
1226
0
  }
1227
0
      else
1228
0
  {
1229
    /* Slide from the bottom.  */
1230
0
    dest->elems[id + delta] = dest->elems[id];
1231
0
    if (--id < 0)
1232
0
      {
1233
        /* Copy remaining SRC elements.  */
1234
0
        memcpy (dest->elems, dest->elems + sbase,
1235
0
          delta * sizeof (Idx));
1236
0
        break;
1237
0
      }
1238
0
  }
1239
0
    }
1240
1241
0
  return REG_NOERROR;
1242
0
}
1243
1244
/* Insert the new element ELEM to the re_node_set* SET.
1245
   SET should not already have ELEM.
1246
   Return true if successful.  */
1247
1248
static bool
1249
__attribute_warn_unused_result__
1250
re_node_set_insert (re_node_set *set, Idx elem)
1251
0
{
1252
0
  Idx idx;
1253
  /* In case the set is empty.  */
1254
0
  if (set->alloc == 0)
1255
0
    return __glibc_likely (re_node_set_init_1 (set, elem) == REG_NOERROR);
1256
1257
0
  if (__glibc_unlikely (set->nelem) == 0)
1258
0
    {
1259
      /* Although we already guaranteed above that set->alloc != 0 and
1260
         therefore set->elems != NULL, add a debug assertion to pacify
1261
         GCC 11.2 -fanalyzer.  */
1262
0
      DEBUG_ASSERT (set->elems);
1263
0
      set->elems[0] = elem;
1264
0
      ++set->nelem;
1265
0
      return true;
1266
0
    }
1267
1268
  /* Realloc if we need.  */
1269
0
  if (set->alloc == set->nelem)
1270
0
    {
1271
0
      Idx *new_elems;
1272
0
      set->alloc = set->alloc * 2;
1273
0
      new_elems = re_realloc (set->elems, Idx, set->alloc);
1274
0
      if (__glibc_unlikely (new_elems == NULL))
1275
0
  return false;
1276
0
      set->elems = new_elems;
1277
0
    }
1278
1279
  /* Move the elements which follows the new element.  Test the
1280
     first element separately to skip a check in the inner loop.  */
1281
0
  if (elem < set->elems[0])
1282
0
    {
1283
0
      for (idx = set->nelem; idx > 0; idx--)
1284
0
  set->elems[idx] = set->elems[idx - 1];
1285
0
    }
1286
0
  else
1287
0
    {
1288
0
      for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1289
0
  set->elems[idx] = set->elems[idx - 1];
1290
0
      DEBUG_ASSERT (set->elems[idx - 1] < elem);
1291
0
    }
1292
1293
  /* Insert the new element.  */
1294
0
  set->elems[idx] = elem;
1295
0
  ++set->nelem;
1296
0
  return true;
1297
0
}
1298
1299
/* Insert the new element ELEM to the re_node_set* SET.
1300
   SET should not already have any element greater than or equal to ELEM.
1301
   Return true if successful.  */
1302
1303
static bool
1304
__attribute_warn_unused_result__
1305
re_node_set_insert_last (re_node_set *set, Idx elem)
1306
0
{
1307
  /* Realloc if we need.  */
1308
0
  if (set->alloc == set->nelem)
1309
0
    {
1310
0
      Idx *new_elems;
1311
0
      set->alloc = (set->alloc + 1) * 2;
1312
0
      new_elems = re_realloc (set->elems, Idx, set->alloc);
1313
0
      if (__glibc_unlikely (new_elems == NULL))
1314
0
  return false;
1315
0
      set->elems = new_elems;
1316
0
    }
1317
1318
  /* Insert the new element.  */
1319
0
  set->elems[set->nelem++] = elem;
1320
0
  return true;
1321
0
}
1322
1323
/* Compare two node sets SET1 and SET2.
1324
   Return true if SET1 and SET2 are equivalent.  */
1325
1326
static bool
1327
__attribute__ ((pure))
1328
re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1329
0
{
1330
0
  Idx i;
1331
0
  if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1332
0
    return false;
1333
0
  for (i = set1->nelem ; --i >= 0 ; )
1334
0
    if (set1->elems[i] != set2->elems[i])
1335
0
      return false;
1336
0
  return true;
1337
0
}
1338
1339
/* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1340
1341
static Idx
1342
__attribute__ ((pure))
1343
re_node_set_contains (const re_node_set *set, Idx elem)
1344
0
{
1345
0
  __re_size_t idx, right, mid;
1346
0
  if (set->nelem <= 0)
1347
0
    return 0;
1348
1349
  /* Binary search the element.  */
1350
0
  idx = 0;
1351
0
  right = set->nelem - 1;
1352
0
  while (idx < right)
1353
0
    {
1354
0
      mid = (idx + right) / 2;
1355
0
      if (set->elems[mid] < elem)
1356
0
  idx = mid + 1;
1357
0
      else
1358
0
  right = mid;
1359
0
    }
1360
0
  return set->elems[idx] == elem ? idx + 1 : 0;
1361
0
}
1362
1363
static void
1364
re_node_set_remove_at (re_node_set *set, Idx idx)
1365
0
{
1366
0
  if (idx < 0 || idx >= set->nelem)
1367
0
    return;
1368
0
  --set->nelem;
1369
0
  for (; idx < set->nelem; idx++)
1370
0
    set->elems[idx] = set->elems[idx + 1];
1371
0
}
1372

1373
1374
/* Add the token TOKEN to dfa->nodes, and return the index of the token.
1375
   Or return -1 if an error occurred.  */
1376
1377
static Idx
1378
re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1379
0
{
1380
0
  if (__glibc_unlikely (dfa->nodes_len >= dfa->nodes_alloc))
1381
0
    {
1382
0
      size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1383
0
      Idx *new_nexts, *new_indices;
1384
0
      re_node_set *new_edests, *new_eclosures;
1385
0
      re_token_t *new_nodes;
1386
1387
      /* Avoid overflows in realloc.  */
1388
0
      const size_t max_object_size = MAX (sizeof (re_token_t),
1389
0
            MAX (sizeof (re_node_set),
1390
0
                 sizeof (Idx)));
1391
0
      if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
1392
0
          < new_nodes_alloc))
1393
0
  return -1;
1394
1395
0
      new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1396
0
      if (__glibc_unlikely (new_nodes == NULL))
1397
0
  return -1;
1398
0
      dfa->nodes = new_nodes;
1399
0
      dfa->nodes_alloc = new_nodes_alloc;
1400
0
      new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1401
0
      if (new_nexts != NULL)
1402
0
  dfa->nexts = new_nexts;
1403
0
      new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1404
0
      if (new_indices != NULL)
1405
0
  dfa->org_indices = new_indices;
1406
0
      new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1407
0
      if (new_edests != NULL)
1408
0
  dfa->edests = new_edests;
1409
0
      new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1410
0
      if (new_eclosures != NULL)
1411
0
  dfa->eclosures = new_eclosures;
1412
0
      if (__glibc_unlikely (new_nexts == NULL || new_indices == NULL
1413
0
          || new_edests == NULL || new_eclosures == NULL))
1414
0
  return -1;
1415
0
    }
1416
0
  dfa->nodes[dfa->nodes_len] = token;
1417
0
  dfa->nodes[dfa->nodes_len].constraint = 0;
1418
0
  dfa->nodes[dfa->nodes_len].accept_mb =
1419
0
    ((token.type == OP_PERIOD && dfa->mb_cur_max > 1)
1420
0
     || token.type == COMPLEX_BRACKET);
1421
0
  dfa->nexts[dfa->nodes_len] = -1;
1422
0
  re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1423
0
  re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1424
0
  return dfa->nodes_len++;
1425
0
}
1426
1427
static re_hashval_t
1428
calc_state_hash (const re_node_set *nodes, unsigned int context)
1429
0
{
1430
0
  re_hashval_t hash = nodes->nelem + context;
1431
0
  Idx i;
1432
0
  for (i = 0 ; i < nodes->nelem ; i++)
1433
0
    hash += nodes->elems[i];
1434
0
  return hash;
1435
0
}
1436
1437
/* Search for the state whose node_set is equivalent to NODES.
1438
   Return the pointer to the state, if we found it in the DFA.
1439
   Otherwise create the new one and return it.  In case of an error
1440
   return NULL and set the error code in ERR.
1441
   Note: - We assume NULL as the invalid state, then it is possible that
1442
     return value is NULL and ERR is REG_NOERROR.
1443
   - We never return non-NULL value in case of any errors, it is for
1444
     optimization.  */
1445
1446
static re_dfastate_t *
1447
__attribute_warn_unused_result__
1448
re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1449
      const re_node_set *nodes)
1450
0
{
1451
0
  re_hashval_t hash;
1452
0
  re_dfastate_t *new_state;
1453
0
  struct re_state_table_entry *spot;
1454
0
  Idx i;
1455
#if defined GCC_LINT || defined lint
1456
  /* Suppress bogus uninitialized-variable warnings.  */
1457
  *err = REG_NOERROR;
1458
#endif
1459
0
  if (__glibc_unlikely (nodes->nelem == 0))
1460
0
    {
1461
0
      *err = REG_NOERROR;
1462
0
      return NULL;
1463
0
    }
1464
0
  hash = calc_state_hash (nodes, 0);
1465
0
  spot = dfa->state_table + (hash & dfa->state_hash_mask);
1466
1467
0
  for (i = 0 ; i < spot->num ; i++)
1468
0
    {
1469
0
      re_dfastate_t *state = spot->array[i];
1470
0
      if (hash != state->hash)
1471
0
  continue;
1472
0
      if (re_node_set_compare (&state->nodes, nodes))
1473
0
  return state;
1474
0
    }
1475
1476
  /* There are no appropriate state in the dfa, create the new one.  */
1477
0
  new_state = create_ci_newstate (dfa, nodes, hash);
1478
0
  if (__glibc_unlikely (new_state == NULL))
1479
0
    *err = REG_ESPACE;
1480
1481
0
  return new_state;
1482
0
}
1483
1484
/* Search for the state whose node_set is equivalent to NODES and
1485
   whose context is equivalent to CONTEXT.
1486
   Return the pointer to the state, if we found it in the DFA.
1487
   Otherwise create the new one and return it.  In case of an error
1488
   return NULL and set the error code in ERR.
1489
   Note: - We assume NULL as the invalid state, then it is possible that
1490
     return value is NULL and ERR is REG_NOERROR.
1491
   - We never return non-NULL value in case of any errors, it is for
1492
     optimization.  */
1493
1494
static re_dfastate_t *
1495
__attribute_warn_unused_result__
1496
re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1497
        const re_node_set *nodes, unsigned int context)
1498
0
{
1499
0
  re_hashval_t hash;
1500
0
  re_dfastate_t *new_state;
1501
0
  struct re_state_table_entry *spot;
1502
0
  Idx i;
1503
#if defined GCC_LINT || defined lint
1504
  /* Suppress bogus uninitialized-variable warnings.  */
1505
  *err = REG_NOERROR;
1506
#endif
1507
0
  if (nodes->nelem == 0)
1508
0
    {
1509
0
      *err = REG_NOERROR;
1510
0
      return NULL;
1511
0
    }
1512
0
  hash = calc_state_hash (nodes, context);
1513
0
  spot = dfa->state_table + (hash & dfa->state_hash_mask);
1514
1515
0
  for (i = 0 ; i < spot->num ; i++)
1516
0
    {
1517
0
      re_dfastate_t *state = spot->array[i];
1518
0
      if (state->hash == hash
1519
0
    && state->context == context
1520
0
    && re_node_set_compare (state->entrance_nodes, nodes))
1521
0
  return state;
1522
0
    }
1523
  /* There are no appropriate state in 'dfa', create the new one.  */
1524
0
  new_state = create_cd_newstate (dfa, nodes, context, hash);
1525
0
  if (__glibc_unlikely (new_state == NULL))
1526
0
    *err = REG_ESPACE;
1527
1528
0
  return new_state;
1529
0
}
1530
1531
/* Finish initialization of the new state NEWSTATE, and using its hash value
1532
   HASH put in the appropriate bucket of DFA's state table.  Return value
1533
   indicates the error code if failed.  */
1534
1535
static reg_errcode_t
1536
__attribute_warn_unused_result__
1537
register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1538
    re_hashval_t hash)
1539
0
{
1540
0
  struct re_state_table_entry *spot;
1541
0
  reg_errcode_t err;
1542
0
  Idx i;
1543
1544
0
  newstate->hash = hash;
1545
0
  err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1546
0
  if (__glibc_unlikely (err != REG_NOERROR))
1547
0
    return REG_ESPACE;
1548
0
  for (i = 0; i < newstate->nodes.nelem; i++)
1549
0
    {
1550
0
      Idx elem = newstate->nodes.elems[i];
1551
0
      if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1552
0
  if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem))
1553
0
    return REG_ESPACE;
1554
0
    }
1555
1556
0
  spot = dfa->state_table + (hash & dfa->state_hash_mask);
1557
0
  if (__glibc_unlikely (spot->alloc <= spot->num))
1558
0
    {
1559
0
      Idx new_alloc = 2 * spot->num + 2;
1560
0
      re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1561
0
                new_alloc);
1562
0
      if (__glibc_unlikely (new_array == NULL))
1563
0
  return REG_ESPACE;
1564
0
      spot->array = new_array;
1565
0
      spot->alloc = new_alloc;
1566
0
    }
1567
0
  spot->array[spot->num++] = newstate;
1568
0
  return REG_NOERROR;
1569
0
}
1570
1571
static void
1572
free_state (re_dfastate_t *state)
1573
0
{
1574
0
  re_node_set_free (&state->non_eps_nodes);
1575
0
  re_node_set_free (&state->inveclosure);
1576
0
  if (state->entrance_nodes != &state->nodes)
1577
0
    {
1578
0
      re_node_set_free (state->entrance_nodes);
1579
0
      re_free (state->entrance_nodes);
1580
0
    }
1581
0
  re_node_set_free (&state->nodes);
1582
0
  re_free (state->word_trtable);
1583
0
  re_free (state->trtable);
1584
0
  re_free (state);
1585
0
}
1586
1587
/* Create the new state which is independent of contexts.
1588
   Return the new state if succeeded, otherwise return NULL.  */
1589
1590
static re_dfastate_t *
1591
__attribute_warn_unused_result__
1592
create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1593
        re_hashval_t hash)
1594
0
{
1595
0
  Idx i;
1596
0
  reg_errcode_t err;
1597
0
  re_dfastate_t *newstate;
1598
1599
0
  newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1600
0
  if (__glibc_unlikely (newstate == NULL))
1601
0
    return NULL;
1602
0
  err = re_node_set_init_copy (&newstate->nodes, nodes);
1603
0
  if (__glibc_unlikely (err != REG_NOERROR))
1604
0
    {
1605
0
      re_free (newstate);
1606
0
      return NULL;
1607
0
    }
1608
1609
0
  newstate->entrance_nodes = &newstate->nodes;
1610
0
  for (i = 0 ; i < nodes->nelem ; i++)
1611
0
    {
1612
0
      re_token_t *node = dfa->nodes + nodes->elems[i];
1613
0
      re_token_type_t type = node->type;
1614
0
      if (type == CHARACTER && !node->constraint)
1615
0
  continue;
1616
0
      newstate->accept_mb |= node->accept_mb;
1617
1618
      /* If the state has the halt node, the state is a halt state.  */
1619
0
      if (type == END_OF_RE)
1620
0
  newstate->halt = 1;
1621
0
      else if (type == OP_BACK_REF)
1622
0
  newstate->has_backref = 1;
1623
0
      else if (type == ANCHOR || node->constraint)
1624
0
  newstate->has_constraint = 1;
1625
0
    }
1626
0
  err = register_state (dfa, newstate, hash);
1627
0
  if (__glibc_unlikely (err != REG_NOERROR))
1628
0
    {
1629
0
      free_state (newstate);
1630
0
      newstate = NULL;
1631
0
    }
1632
0
  return newstate;
1633
0
}
1634
1635
/* Create the new state which is depend on the context CONTEXT.
1636
   Return the new state if succeeded, otherwise return NULL.  */
1637
1638
static re_dfastate_t *
1639
__attribute_warn_unused_result__
1640
create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1641
        unsigned int context, re_hashval_t hash)
1642
0
{
1643
0
  Idx i, nctx_nodes = 0;
1644
0
  reg_errcode_t err;
1645
0
  re_dfastate_t *newstate;
1646
1647
0
  newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1648
0
  if (__glibc_unlikely (newstate == NULL))
1649
0
    return NULL;
1650
0
  err = re_node_set_init_copy (&newstate->nodes, nodes);
1651
0
  if (__glibc_unlikely (err != REG_NOERROR))
1652
0
    {
1653
0
      re_free (newstate);
1654
0
      return NULL;
1655
0
    }
1656
1657
0
  newstate->context = context;
1658
0
  newstate->entrance_nodes = &newstate->nodes;
1659
1660
0
  for (i = 0 ; i < nodes->nelem ; i++)
1661
0
    {
1662
0
      re_token_t *node = dfa->nodes + nodes->elems[i];
1663
0
      re_token_type_t type = node->type;
1664
0
      unsigned int constraint = node->constraint;
1665
1666
0
      if (type == CHARACTER && !constraint)
1667
0
  continue;
1668
0
      newstate->accept_mb |= node->accept_mb;
1669
1670
      /* If the state has the halt node, the state is a halt state.  */
1671
0
      if (type == END_OF_RE)
1672
0
  newstate->halt = 1;
1673
0
      else if (type == OP_BACK_REF)
1674
0
  newstate->has_backref = 1;
1675
1676
0
      if (constraint)
1677
0
  {
1678
0
    if (newstate->entrance_nodes == &newstate->nodes)
1679
0
      {
1680
0
        re_node_set *entrance_nodes = re_malloc (re_node_set, 1);
1681
0
        if (__glibc_unlikely (entrance_nodes == NULL))
1682
0
    {
1683
0
      free_state (newstate);
1684
0
      return NULL;
1685
0
    }
1686
0
        newstate->entrance_nodes = entrance_nodes;
1687
0
        if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1688
0
      != REG_NOERROR)
1689
0
    {
1690
0
      free_state (newstate);
1691
0
      return NULL;
1692
0
    }
1693
0
        nctx_nodes = 0;
1694
0
        newstate->has_constraint = 1;
1695
0
      }
1696
1697
0
    if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1698
0
      {
1699
0
        re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1700
0
        ++nctx_nodes;
1701
0
      }
1702
0
  }
1703
0
    }
1704
0
  err = register_state (dfa, newstate, hash);
1705
0
  if (__glibc_unlikely (err != REG_NOERROR))
1706
0
    {
1707
0
      free_state (newstate);
1708
0
      newstate = NULL;
1709
0
    }
1710
0
  return  newstate;
1711
0
}