/src/wget/lib/regex_internal.c
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | /* Extended regular expression matching and search library.  | 
2  |  |    Copyright (C) 2002-2025 Free Software Foundation, Inc.  | 
3  |  |    This file is part of the GNU C Library.  | 
4  |  |    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.  | 
5  |  |  | 
6  |  |    The GNU C Library is free software; you can redistribute it and/or  | 
7  |  |    modify it under the terms of the GNU Lesser General Public  | 
8  |  |    License as published by the Free Software Foundation; either  | 
9  |  |    version 2.1 of the License, or (at your option) any later version.  | 
10  |  |  | 
11  |  |    The GNU C Library is distributed in the hope that it will be useful,  | 
12  |  |    but WITHOUT ANY WARRANTY; without even the implied warranty of  | 
13  |  |    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU  | 
14  |  |    Lesser General Public License for more details.  | 
15  |  |  | 
16  |  |    You should have received a copy of the GNU Lesser General Public  | 
17  |  |    License along with the GNU C Library; if not, see  | 
18  |  |    <https://www.gnu.org/licenses/>.  */  | 
19  |  |  | 
20  |  | static void re_string_construct_common (const char *str, Idx len,  | 
21  |  |           re_string_t *pstr,  | 
22  |  |           RE_TRANSLATE_TYPE trans, bool icase,  | 
23  |  |           const re_dfa_t *dfa);  | 
24  |  | static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,  | 
25  |  |             const re_node_set *nodes,  | 
26  |  |             re_hashval_t hash);  | 
27  |  | static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,  | 
28  |  |             const re_node_set *nodes,  | 
29  |  |             unsigned int context,  | 
30  |  |             re_hashval_t hash);  | 
31  |  | static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr,  | 
32  |  |             Idx new_buf_len);  | 
33  |  | static void build_wcs_buffer (re_string_t *pstr);  | 
34  |  | static reg_errcode_t build_wcs_upper_buffer (re_string_t *pstr);  | 
35  |  | static void build_upper_buffer (re_string_t *pstr);  | 
36  |  | static void re_string_translate_buffer (re_string_t *pstr);  | 
37  |  | static unsigned int re_string_context_at (const re_string_t *input, Idx idx,  | 
38  |  |             int eflags) __attribute__ ((pure));  | 
39  |  |  | 
40  |  | /* Functions for string operation.  */  | 
41  |  |  | 
42  |  | /* This function allocate the buffers.  It is necessary to call  | 
43  |  |    re_string_reconstruct before using the object.  */  | 
44  |  |  | 
45  |  | static reg_errcode_t  | 
46  |  | __attribute_warn_unused_result__  | 
47  |  | re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,  | 
48  |  |         RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)  | 
49  | 0  | { | 
50  | 0  |   reg_errcode_t ret;  | 
51  | 0  |   Idx init_buf_len;  | 
52  |  |  | 
53  |  |   /* Ensure at least one character fits into the buffers.  */  | 
54  | 0  |   if (init_len < dfa->mb_cur_max)  | 
55  | 0  |     init_len = dfa->mb_cur_max;  | 
56  | 0  |   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;  | 
57  | 0  |   re_string_construct_common (str, len, pstr, trans, icase, dfa);  | 
58  |  | 
  | 
59  | 0  |   ret = re_string_realloc_buffers (pstr, init_buf_len);  | 
60  | 0  |   if (__glibc_unlikely (ret != REG_NOERROR))  | 
61  | 0  |     return ret;  | 
62  |  |  | 
63  | 0  |   pstr->word_char = dfa->word_char;  | 
64  | 0  |   pstr->word_ops_used = dfa->word_ops_used;  | 
65  | 0  |   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;  | 
66  | 0  |   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;  | 
67  | 0  |   pstr->valid_raw_len = pstr->valid_len;  | 
68  | 0  |   return REG_NOERROR;  | 
69  | 0  | }  | 
70  |  |  | 
71  |  | /* This function allocate the buffers, and initialize them.  */  | 
72  |  |  | 
73  |  | static reg_errcode_t  | 
74  |  | __attribute_warn_unused_result__  | 
75  |  | re_string_construct (re_string_t *pstr, const char *str, Idx len,  | 
76  |  |          RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)  | 
77  | 0  | { | 
78  | 0  |   reg_errcode_t ret;  | 
79  | 0  |   memset (pstr, '\0', sizeof (re_string_t));  | 
80  | 0  |   re_string_construct_common (str, len, pstr, trans, icase, dfa);  | 
81  |  | 
  | 
82  | 0  |   if (len > 0)  | 
83  | 0  |     { | 
84  | 0  |       ret = re_string_realloc_buffers (pstr, len + 1);  | 
85  | 0  |       if (__glibc_unlikely (ret != REG_NOERROR))  | 
86  | 0  |   return ret;  | 
87  | 0  |     }  | 
88  | 0  |   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;  | 
89  |  | 
  | 
90  | 0  |   if (icase)  | 
91  | 0  |     { | 
92  | 0  |       if (dfa->mb_cur_max > 1)  | 
93  | 0  |   { | 
94  | 0  |     while (1)  | 
95  | 0  |       { | 
96  | 0  |         ret = build_wcs_upper_buffer (pstr);  | 
97  | 0  |         if (__glibc_unlikely (ret != REG_NOERROR))  | 
98  | 0  |     return ret;  | 
99  | 0  |         if (pstr->valid_raw_len >= len)  | 
100  | 0  |     break;  | 
101  | 0  |         if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)  | 
102  | 0  |     break;  | 
103  | 0  |         ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);  | 
104  | 0  |         if (__glibc_unlikely (ret != REG_NOERROR))  | 
105  | 0  |     return ret;  | 
106  | 0  |       }  | 
107  | 0  |   }  | 
108  | 0  |       else  | 
109  | 0  |   build_upper_buffer (pstr);  | 
110  | 0  |     }  | 
111  | 0  |   else  | 
112  | 0  |     { | 
113  | 0  |       if (dfa->mb_cur_max > 1)  | 
114  | 0  |   build_wcs_buffer (pstr);  | 
115  | 0  |       else  | 
116  | 0  |   { | 
117  | 0  |     if (trans != NULL)  | 
118  | 0  |       re_string_translate_buffer (pstr);  | 
119  | 0  |     else  | 
120  | 0  |       { | 
121  | 0  |         pstr->valid_len = pstr->bufs_len;  | 
122  | 0  |         pstr->valid_raw_len = pstr->bufs_len;  | 
123  | 0  |       }  | 
124  | 0  |   }  | 
125  | 0  |     }  | 
126  |  |  | 
127  | 0  |   return REG_NOERROR;  | 
128  | 0  | }  | 
129  |  |  | 
130  |  | /* Helper functions for re_string_allocate, and re_string_construct.  */  | 
131  |  |  | 
132  |  | static reg_errcode_t  | 
133  |  | __attribute_warn_unused_result__  | 
134  |  | re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)  | 
135  | 0  | { | 
136  | 0  |   if (pstr->mb_cur_max > 1)  | 
137  | 0  |     { | 
138  | 0  |       wint_t *new_wcs;  | 
139  |  |  | 
140  |  |       /* Avoid overflow in realloc.  */  | 
141  | 0  |       const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));  | 
142  | 0  |       if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)  | 
143  | 0  |           < new_buf_len))  | 
144  | 0  |   return REG_ESPACE;  | 
145  |  |  | 
146  | 0  |       new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);  | 
147  | 0  |       if (__glibc_unlikely (new_wcs == NULL))  | 
148  | 0  |   return REG_ESPACE;  | 
149  | 0  |       pstr->wcs = new_wcs;  | 
150  | 0  |       if (pstr->offsets != NULL)  | 
151  | 0  |   { | 
152  | 0  |     Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);  | 
153  | 0  |     if (__glibc_unlikely (new_offsets == NULL))  | 
154  | 0  |       return REG_ESPACE;  | 
155  | 0  |     pstr->offsets = new_offsets;  | 
156  | 0  |   }  | 
157  | 0  |     }  | 
158  | 0  |   if (pstr->mbs_allocated)  | 
159  | 0  |     { | 
160  | 0  |       unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,  | 
161  | 0  |              new_buf_len);  | 
162  | 0  |       if (__glibc_unlikely (new_mbs == NULL))  | 
163  | 0  |   return REG_ESPACE;  | 
164  | 0  |       pstr->mbs = new_mbs;  | 
165  | 0  |     }  | 
166  | 0  |   pstr->bufs_len = new_buf_len;  | 
167  | 0  |   return REG_NOERROR;  | 
168  | 0  | }  | 
169  |  |  | 
170  |  |  | 
171  |  | static void  | 
172  |  | re_string_construct_common (const char *str, Idx len, re_string_t *pstr,  | 
173  |  |           RE_TRANSLATE_TYPE trans, bool icase,  | 
174  |  |           const re_dfa_t *dfa)  | 
175  | 0  | { | 
176  | 0  |   pstr->raw_mbs = (const unsigned char *) str;  | 
177  | 0  |   pstr->len = len;  | 
178  | 0  |   pstr->raw_len = len;  | 
179  | 0  |   pstr->trans = trans;  | 
180  | 0  |   pstr->icase = icase;  | 
181  | 0  |   pstr->mbs_allocated = (trans != NULL || icase);  | 
182  | 0  |   pstr->mb_cur_max = dfa->mb_cur_max;  | 
183  | 0  |   pstr->is_utf8 = dfa->is_utf8;  | 
184  | 0  |   pstr->map_notascii = dfa->map_notascii;  | 
185  | 0  |   pstr->stop = pstr->len;  | 
186  | 0  |   pstr->raw_stop = pstr->stop;  | 
187  | 0  | }  | 
188  |  |  | 
189  |  |  | 
190  |  | /* Build wide character buffer PSTR->WCS.  | 
191  |  |    If the byte sequence of the string are:  | 
192  |  |      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>  | 
193  |  |    Then wide character buffer will be:  | 
194  |  |      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>  | 
195  |  |    We use WEOF for padding, they indicate that the position isn't  | 
196  |  |    a first byte of a multibyte character.  | 
197  |  |  | 
198  |  |    Note that this function assumes PSTR->VALID_LEN elements are already  | 
199  |  |    built and starts from PSTR->VALID_LEN.  */  | 
200  |  |  | 
201  |  | static void  | 
202  |  | build_wcs_buffer (re_string_t *pstr)  | 
203  | 0  | { | 
204  |  | #ifdef _LIBC  | 
205  |  |   unsigned char buf[MB_LEN_MAX];  | 
206  |  |   DEBUG_ASSERT (MB_LEN_MAX >= pstr->mb_cur_max);  | 
207  |  | #else  | 
208  | 0  |   unsigned char buf[64];  | 
209  | 0  | #endif  | 
210  | 0  |   mbstate_t prev_st;  | 
211  | 0  |   Idx byte_idx, end_idx, remain_len;  | 
212  | 0  |   size_t mbclen;  | 
213  |  |  | 
214  |  |   /* Build the buffers from pstr->valid_len to either pstr->len or  | 
215  |  |      pstr->bufs_len.  */  | 
216  | 0  |   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;  | 
217  | 0  |   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)  | 
218  | 0  |     { | 
219  | 0  |       wchar_t wc;  | 
220  | 0  |       const char *p;  | 
221  |  | 
  | 
222  | 0  |       remain_len = end_idx - byte_idx;  | 
223  | 0  |       prev_st = pstr->cur_state;  | 
224  |  |       /* Apply the translation if we need.  */  | 
225  | 0  |       if (__glibc_unlikely (pstr->trans != NULL))  | 
226  | 0  |   { | 
227  | 0  |     int i, ch;  | 
228  |  | 
  | 
229  | 0  |     for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)  | 
230  | 0  |       { | 
231  | 0  |         ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];  | 
232  | 0  |         buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];  | 
233  | 0  |       }  | 
234  | 0  |     p = (const char *) buf;  | 
235  | 0  |   }  | 
236  | 0  |       else  | 
237  | 0  |   p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;  | 
238  | 0  |       mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);  | 
239  | 0  |       if (__glibc_unlikely (mbclen == (size_t) -1 || mbclen == 0  | 
240  | 0  |           || (mbclen == (size_t) -2  | 
241  | 0  |         && pstr->bufs_len >= pstr->len)))  | 
242  | 0  |   { | 
243  |  |     /* We treat these cases as a singlebyte character.  */  | 
244  | 0  |     mbclen = 1;  | 
245  | 0  |     wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];  | 
246  | 0  |     if (__glibc_unlikely (pstr->trans != NULL))  | 
247  | 0  |       wc = pstr->trans[wc];  | 
248  | 0  |     pstr->cur_state = prev_st;  | 
249  | 0  |   }  | 
250  | 0  |       else if (__glibc_unlikely (mbclen == (size_t) -2))  | 
251  | 0  |   { | 
252  |  |     /* The buffer doesn't have enough space, finish to build.  */  | 
253  | 0  |     pstr->cur_state = prev_st;  | 
254  | 0  |     break;  | 
255  | 0  |   }  | 
256  |  |  | 
257  |  |       /* Write wide character and padding.  */  | 
258  | 0  |       pstr->wcs[byte_idx++] = wc;  | 
259  |  |       /* Write paddings.  */  | 
260  | 0  |       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)  | 
261  | 0  |   pstr->wcs[byte_idx++] = WEOF;  | 
262  | 0  |     }  | 
263  | 0  |   pstr->valid_len = byte_idx;  | 
264  | 0  |   pstr->valid_raw_len = byte_idx;  | 
265  | 0  | }  | 
266  |  |  | 
267  |  | /* Build wide character buffer PSTR->WCS like build_wcs_buffer,  | 
268  |  |    but for REG_ICASE.  */  | 
269  |  |  | 
270  |  | static reg_errcode_t  | 
271  |  | __attribute_warn_unused_result__  | 
272  |  | build_wcs_upper_buffer (re_string_t *pstr)  | 
273  | 0  | { | 
274  | 0  |   mbstate_t prev_st;  | 
275  | 0  |   Idx src_idx, byte_idx, end_idx, remain_len;  | 
276  | 0  |   size_t mbclen;  | 
277  |  | #ifdef _LIBC  | 
278  |  |   char buf[MB_LEN_MAX];  | 
279  |  |   DEBUG_ASSERT (pstr->mb_cur_max <= MB_LEN_MAX);  | 
280  |  | #else  | 
281  | 0  |   char buf[64];  | 
282  | 0  | #endif  | 
283  |  | 
  | 
284  | 0  |   byte_idx = pstr->valid_len;  | 
285  | 0  |   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;  | 
286  |  |  | 
287  |  |   /* The following optimization assumes that ASCII characters can be  | 
288  |  |      mapped to wide characters with a simple cast.  */  | 
289  | 0  |   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)  | 
290  | 0  |     { | 
291  | 0  |       while (byte_idx < end_idx)  | 
292  | 0  |   { | 
293  | 0  |     wchar_t wc;  | 
294  | 0  |     unsigned char ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];  | 
295  |  | 
  | 
296  | 0  |     if (isascii (ch) && mbsinit (&pstr->cur_state))  | 
297  | 0  |       { | 
298  |  |         /* The next step uses the assumption that wchar_t is encoded  | 
299  |  |      ASCII-safe: all ASCII values can be converted like this.  */  | 
300  | 0  |         wchar_t wcu = __towupper (ch);  | 
301  | 0  |         if (isascii (wcu))  | 
302  | 0  |     { | 
303  | 0  |       pstr->mbs[byte_idx] = wcu;  | 
304  | 0  |       pstr->wcs[byte_idx] = wcu;  | 
305  | 0  |       byte_idx++;  | 
306  | 0  |       continue;  | 
307  | 0  |     }  | 
308  | 0  |       }  | 
309  |  |  | 
310  | 0  |     remain_len = end_idx - byte_idx;  | 
311  | 0  |     prev_st = pstr->cur_state;  | 
312  | 0  |     mbclen = __mbrtowc (&wc,  | 
313  | 0  |             ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx  | 
314  | 0  |              + byte_idx), remain_len, &pstr->cur_state);  | 
315  | 0  |     if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))  | 
316  | 0  |       { | 
317  | 0  |         wchar_t wcu = __towupper (wc);  | 
318  | 0  |         if (wcu != wc)  | 
319  | 0  |     { | 
320  | 0  |       size_t mbcdlen;  | 
321  |  | 
  | 
322  | 0  |       mbcdlen = __wcrtomb (buf, wcu, &prev_st);  | 
323  | 0  |       if (__glibc_likely (mbclen == mbcdlen))  | 
324  | 0  |         memcpy (pstr->mbs + byte_idx, buf, mbclen);  | 
325  | 0  |       else  | 
326  | 0  |         { | 
327  | 0  |           src_idx = byte_idx;  | 
328  | 0  |           goto offsets_needed;  | 
329  | 0  |         }  | 
330  | 0  |     }  | 
331  | 0  |         else  | 
332  | 0  |     memcpy (pstr->mbs + byte_idx,  | 
333  | 0  |       pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);  | 
334  | 0  |         pstr->wcs[byte_idx++] = wcu;  | 
335  |  |         /* Write paddings.  */  | 
336  | 0  |         for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)  | 
337  | 0  |     pstr->wcs[byte_idx++] = WEOF;  | 
338  | 0  |       }  | 
339  | 0  |     else if (mbclen == (size_t) -1 || mbclen == 0  | 
340  | 0  |        || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))  | 
341  | 0  |       { | 
342  |  |         /* It is an invalid character, an incomplete character  | 
343  |  |      at the end of the string, or '\0'.  Just use the byte.  */  | 
344  | 0  |         pstr->mbs[byte_idx] = ch;  | 
345  |  |         /* And also cast it to wide char.  */  | 
346  | 0  |         pstr->wcs[byte_idx++] = (wchar_t) ch;  | 
347  | 0  |         if (__glibc_unlikely (mbclen == (size_t) -1))  | 
348  | 0  |     pstr->cur_state = prev_st;  | 
349  | 0  |       }  | 
350  | 0  |     else  | 
351  | 0  |       { | 
352  |  |         /* The buffer doesn't have enough space, finish to build.  */  | 
353  | 0  |         pstr->cur_state = prev_st;  | 
354  | 0  |         break;  | 
355  | 0  |       }  | 
356  | 0  |   }  | 
357  | 0  |       pstr->valid_len = byte_idx;  | 
358  | 0  |       pstr->valid_raw_len = byte_idx;  | 
359  | 0  |       return REG_NOERROR;  | 
360  | 0  |     }  | 
361  | 0  |   else  | 
362  | 0  |     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)  | 
363  | 0  |       { | 
364  | 0  |   wchar_t wc;  | 
365  | 0  |   const char *p;  | 
366  | 0  |       offsets_needed:  | 
367  | 0  |   remain_len = end_idx - byte_idx;  | 
368  | 0  |   prev_st = pstr->cur_state;  | 
369  | 0  |   if (__glibc_unlikely (pstr->trans != NULL))  | 
370  | 0  |     { | 
371  | 0  |       int i, ch;  | 
372  |  | 
  | 
373  | 0  |       for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)  | 
374  | 0  |         { | 
375  | 0  |     ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];  | 
376  | 0  |     buf[i] = pstr->trans[ch];  | 
377  | 0  |         }  | 
378  | 0  |       p = (const char *) buf;  | 
379  | 0  |     }  | 
380  | 0  |   else  | 
381  | 0  |     p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;  | 
382  | 0  |   mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);  | 
383  | 0  |   if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))  | 
384  | 0  |     { | 
385  | 0  |       wchar_t wcu = __towupper (wc);  | 
386  | 0  |       if (wcu != wc)  | 
387  | 0  |         { | 
388  | 0  |     size_t mbcdlen;  | 
389  |  | 
  | 
390  | 0  |     mbcdlen = __wcrtomb ((char *) buf, wcu, &prev_st);  | 
391  | 0  |     if (__glibc_likely (mbclen == mbcdlen))  | 
392  | 0  |       memcpy (pstr->mbs + byte_idx, buf, mbclen);  | 
393  | 0  |     else if (mbcdlen != (size_t) -1)  | 
394  | 0  |       { | 
395  | 0  |         size_t i;  | 
396  |  | 
  | 
397  | 0  |         if (byte_idx + mbcdlen > pstr->bufs_len)  | 
398  | 0  |           { | 
399  | 0  |       pstr->cur_state = prev_st;  | 
400  | 0  |       break;  | 
401  | 0  |           }  | 
402  |  |  | 
403  | 0  |         if (pstr->offsets == NULL)  | 
404  | 0  |           { | 
405  | 0  |       pstr->offsets = re_malloc (Idx, pstr->bufs_len);  | 
406  |  | 
  | 
407  | 0  |       if (pstr->offsets == NULL)  | 
408  | 0  |         return REG_ESPACE;  | 
409  | 0  |           }  | 
410  | 0  |         if (!pstr->offsets_needed)  | 
411  | 0  |           { | 
412  | 0  |       for (i = 0; i < (size_t) byte_idx; ++i)  | 
413  | 0  |         pstr->offsets[i] = i;  | 
414  | 0  |       pstr->offsets_needed = 1;  | 
415  | 0  |           }  | 
416  |  | 
  | 
417  | 0  |         memcpy (pstr->mbs + byte_idx, buf, mbcdlen);  | 
418  | 0  |         pstr->wcs[byte_idx] = wcu;  | 
419  | 0  |         pstr->offsets[byte_idx] = src_idx;  | 
420  | 0  |         for (i = 1; i < mbcdlen; ++i)  | 
421  | 0  |           { | 
422  | 0  |       pstr->offsets[byte_idx + i]  | 
423  | 0  |         = src_idx + (i < mbclen ? i : mbclen - 1);  | 
424  | 0  |       pstr->wcs[byte_idx + i] = WEOF;  | 
425  | 0  |           }  | 
426  | 0  |         pstr->len += mbcdlen - mbclen;  | 
427  | 0  |         if (pstr->raw_stop > src_idx)  | 
428  | 0  |           pstr->stop += mbcdlen - mbclen;  | 
429  | 0  |         end_idx = (pstr->bufs_len > pstr->len)  | 
430  | 0  |             ? pstr->len : pstr->bufs_len;  | 
431  | 0  |         byte_idx += mbcdlen;  | 
432  | 0  |         src_idx += mbclen;  | 
433  | 0  |         continue;  | 
434  | 0  |       }  | 
435  | 0  |     else  | 
436  | 0  |       memcpy (pstr->mbs + byte_idx, p, mbclen);  | 
437  | 0  |         }  | 
438  | 0  |       else  | 
439  | 0  |         memcpy (pstr->mbs + byte_idx, p, mbclen);  | 
440  |  |  | 
441  | 0  |       if (__glibc_unlikely (pstr->offsets_needed != 0))  | 
442  | 0  |         { | 
443  | 0  |     size_t i;  | 
444  | 0  |     for (i = 0; i < mbclen; ++i)  | 
445  | 0  |       pstr->offsets[byte_idx + i] = src_idx + i;  | 
446  | 0  |         }  | 
447  | 0  |       src_idx += mbclen;  | 
448  |  | 
  | 
449  | 0  |       pstr->wcs[byte_idx++] = wcu;  | 
450  |  |       /* Write paddings.  */  | 
451  | 0  |       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)  | 
452  | 0  |         pstr->wcs[byte_idx++] = WEOF;  | 
453  | 0  |     }  | 
454  | 0  |   else if (mbclen == (size_t) -1 || mbclen == 0  | 
455  | 0  |      || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))  | 
456  | 0  |     { | 
457  |  |       /* It is an invalid character or '\0'.  Just use the byte.  */  | 
458  | 0  |       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];  | 
459  |  | 
  | 
460  | 0  |       if (__glibc_unlikely (pstr->trans != NULL))  | 
461  | 0  |         ch = pstr->trans [ch];  | 
462  | 0  |       pstr->mbs[byte_idx] = ch;  | 
463  |  | 
  | 
464  | 0  |       if (__glibc_unlikely (pstr->offsets_needed != 0))  | 
465  | 0  |         pstr->offsets[byte_idx] = src_idx;  | 
466  | 0  |       ++src_idx;  | 
467  |  |  | 
468  |  |       /* And also cast it to wide char.  */  | 
469  | 0  |       pstr->wcs[byte_idx++] = (wchar_t) ch;  | 
470  | 0  |       if (__glibc_unlikely (mbclen == (size_t) -1))  | 
471  | 0  |         pstr->cur_state = prev_st;  | 
472  | 0  |     }  | 
473  | 0  |   else  | 
474  | 0  |     { | 
475  |  |       /* The buffer doesn't have enough space, finish to build.  */  | 
476  | 0  |       pstr->cur_state = prev_st;  | 
477  | 0  |       break;  | 
478  | 0  |     }  | 
479  | 0  |       }  | 
480  | 0  |   pstr->valid_len = byte_idx;  | 
481  | 0  |   pstr->valid_raw_len = src_idx;  | 
482  | 0  |   return REG_NOERROR;  | 
483  | 0  | }  | 
484  |  |  | 
485  |  | /* Skip characters until the index becomes greater than NEW_RAW_IDX.  | 
486  |  |    Return the index.  */  | 
487  |  |  | 
488  |  | static Idx  | 
489  |  | re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)  | 
490  | 0  | { | 
491  | 0  |   mbstate_t prev_st;  | 
492  | 0  |   Idx rawbuf_idx;  | 
493  | 0  |   size_t mbclen;  | 
494  | 0  |   wint_t wc = WEOF;  | 
495  |  |  | 
496  |  |   /* Skip the characters which are not necessary to check.  */  | 
497  | 0  |   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;  | 
498  | 0  |        rawbuf_idx < new_raw_idx;)  | 
499  | 0  |     { | 
500  | 0  |       wchar_t wc2;  | 
501  | 0  |       Idx remain_len = pstr->raw_len - rawbuf_idx;  | 
502  | 0  |       prev_st = pstr->cur_state;  | 
503  | 0  |       mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,  | 
504  | 0  |         remain_len, &pstr->cur_state);  | 
505  | 0  |       if (__glibc_unlikely (mbclen == (size_t) -2 || mbclen == (size_t) -1  | 
506  | 0  |           || mbclen == 0))  | 
507  | 0  |   { | 
508  |  |     /* We treat these cases as a single byte character.  */  | 
509  | 0  |     if (mbclen == 0 || remain_len == 0)  | 
510  | 0  |       wc = L'\0';  | 
511  | 0  |     else  | 
512  | 0  |       wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);  | 
513  | 0  |     mbclen = 1;  | 
514  | 0  |     pstr->cur_state = prev_st;  | 
515  | 0  |   }  | 
516  | 0  |       else  | 
517  | 0  |   wc = wc2;  | 
518  |  |       /* Then proceed the next character.  */  | 
519  | 0  |       rawbuf_idx += mbclen;  | 
520  | 0  |     }  | 
521  | 0  |   *last_wc = wc;  | 
522  | 0  |   return rawbuf_idx;  | 
523  | 0  | }  | 
524  |  |  | 
525  |  | /* Build the buffer PSTR->MBS, and apply the translation if we need.  | 
526  |  |    This function is used in case of REG_ICASE.  */  | 
527  |  |  | 
528  |  | static void  | 
529  |  | build_upper_buffer (re_string_t *pstr)  | 
530  | 0  | { | 
531  | 0  |   Idx char_idx, end_idx;  | 
532  | 0  |   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;  | 
533  |  | 
  | 
534  | 0  |   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)  | 
535  | 0  |     { | 
536  | 0  |       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];  | 
537  | 0  |       if (__glibc_unlikely (pstr->trans != NULL))  | 
538  | 0  |   ch = pstr->trans[ch];  | 
539  | 0  |       pstr->mbs[char_idx] = toupper (ch);  | 
540  | 0  |     }  | 
541  | 0  |   pstr->valid_len = char_idx;  | 
542  | 0  |   pstr->valid_raw_len = char_idx;  | 
543  | 0  | }  | 
544  |  |  | 
545  |  | /* Apply TRANS to the buffer in PSTR.  */  | 
546  |  |  | 
547  |  | static void  | 
548  |  | re_string_translate_buffer (re_string_t *pstr)  | 
549  | 0  | { | 
550  | 0  |   Idx buf_idx, end_idx;  | 
551  | 0  |   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;  | 
552  |  | 
  | 
553  | 0  |   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)  | 
554  | 0  |     { | 
555  | 0  |       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];  | 
556  | 0  |       pstr->mbs[buf_idx] = pstr->trans[ch];  | 
557  | 0  |     }  | 
558  |  | 
  | 
559  | 0  |   pstr->valid_len = buf_idx;  | 
560  | 0  |   pstr->valid_raw_len = buf_idx;  | 
561  | 0  | }  | 
562  |  |  | 
563  |  | /* This function re-construct the buffers.  | 
564  |  |    Concretely, convert to wide character in case of pstr->mb_cur_max > 1,  | 
565  |  |    convert to upper case in case of REG_ICASE, apply translation.  */  | 
566  |  |  | 
567  |  | static reg_errcode_t  | 
568  |  | __attribute_warn_unused_result__  | 
569  |  | re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)  | 
570  | 0  | { | 
571  | 0  |   Idx offset;  | 
572  |  | 
  | 
573  | 0  |   if (__glibc_unlikely (pstr->raw_mbs_idx <= idx))  | 
574  | 0  |     offset = idx - pstr->raw_mbs_idx;  | 
575  | 0  |   else  | 
576  | 0  |     { | 
577  |  |       /* Reset buffer.  */  | 
578  | 0  |       if (pstr->mb_cur_max > 1)  | 
579  | 0  |   memset (&pstr->cur_state, '\0', sizeof (mbstate_t));  | 
580  | 0  |       pstr->len = pstr->raw_len;  | 
581  | 0  |       pstr->stop = pstr->raw_stop;  | 
582  | 0  |       pstr->valid_len = 0;  | 
583  | 0  |       pstr->raw_mbs_idx = 0;  | 
584  | 0  |       pstr->valid_raw_len = 0;  | 
585  | 0  |       pstr->offsets_needed = 0;  | 
586  | 0  |       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF  | 
587  | 0  |          : CONTEXT_NEWLINE | CONTEXT_BEGBUF);  | 
588  | 0  |       if (!pstr->mbs_allocated)  | 
589  | 0  |   pstr->mbs = (unsigned char *) pstr->raw_mbs;  | 
590  | 0  |       offset = idx;  | 
591  | 0  |     }  | 
592  |  | 
  | 
593  | 0  |   if (__glibc_likely (offset != 0))  | 
594  | 0  |     { | 
595  |  |       /* Should the already checked characters be kept?  */  | 
596  | 0  |       if (__glibc_likely (offset < pstr->valid_raw_len))  | 
597  | 0  |   { | 
598  |  |     /* Yes, move them to the front of the buffer.  */  | 
599  | 0  |     if (__glibc_unlikely (pstr->offsets_needed))  | 
600  | 0  |       { | 
601  | 0  |         Idx low = 0, high = pstr->valid_len, mid;  | 
602  | 0  |         do  | 
603  | 0  |     { | 
604  | 0  |       mid = (high + low) / 2;  | 
605  | 0  |       if (pstr->offsets[mid] > offset)  | 
606  | 0  |         high = mid;  | 
607  | 0  |       else if (pstr->offsets[mid] < offset)  | 
608  | 0  |         low = mid + 1;  | 
609  | 0  |       else  | 
610  | 0  |         break;  | 
611  | 0  |     }  | 
612  | 0  |         while (low < high);  | 
613  | 0  |         if (pstr->offsets[mid] < offset)  | 
614  | 0  |     ++mid;  | 
615  | 0  |         pstr->tip_context = re_string_context_at (pstr, mid - 1,  | 
616  | 0  |               eflags);  | 
617  |  |         /* This can be quite complicated, so handle specially  | 
618  |  |      only the common and easy case where the character with  | 
619  |  |      different length representation of lower and upper  | 
620  |  |      case is present at or after offset.  */  | 
621  | 0  |         if (pstr->valid_len > offset  | 
622  | 0  |       && mid == offset && pstr->offsets[mid] == offset)  | 
623  | 0  |     { | 
624  | 0  |       memmove (pstr->wcs, pstr->wcs + offset,  | 
625  | 0  |          (pstr->valid_len - offset) * sizeof (wint_t));  | 
626  | 0  |       memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);  | 
627  | 0  |       pstr->valid_len -= offset;  | 
628  | 0  |       pstr->valid_raw_len -= offset;  | 
629  | 0  |       for (low = 0; low < pstr->valid_len; low++)  | 
630  | 0  |         pstr->offsets[low] = pstr->offsets[low + offset] - offset;  | 
631  | 0  |     }  | 
632  | 0  |         else  | 
633  | 0  |     { | 
634  |  |       /* Otherwise, just find out how long the partial multibyte  | 
635  |  |          character at offset is and fill it with WEOF/255.  */  | 
636  | 0  |       pstr->len = pstr->raw_len - idx + offset;  | 
637  | 0  |       pstr->stop = pstr->raw_stop - idx + offset;  | 
638  | 0  |       pstr->offsets_needed = 0;  | 
639  | 0  |       while (mid > 0 && pstr->offsets[mid - 1] == offset)  | 
640  | 0  |         --mid;  | 
641  | 0  |       while (mid < pstr->valid_len)  | 
642  | 0  |         if (pstr->wcs[mid] != WEOF)  | 
643  | 0  |           break;  | 
644  | 0  |         else  | 
645  | 0  |           ++mid;  | 
646  | 0  |       if (mid == pstr->valid_len)  | 
647  | 0  |         pstr->valid_len = 0;  | 
648  | 0  |       else  | 
649  | 0  |         { | 
650  | 0  |           pstr->valid_len = pstr->offsets[mid] - offset;  | 
651  | 0  |           if (pstr->valid_len)  | 
652  | 0  |       { | 
653  | 0  |         for (low = 0; low < pstr->valid_len; ++low)  | 
654  | 0  |           pstr->wcs[low] = WEOF;  | 
655  | 0  |         memset (pstr->mbs, 255, pstr->valid_len);  | 
656  | 0  |       }  | 
657  | 0  |         }  | 
658  | 0  |       pstr->valid_raw_len = pstr->valid_len;  | 
659  | 0  |     }  | 
660  | 0  |       }  | 
661  | 0  |     else  | 
662  | 0  |       { | 
663  | 0  |         pstr->tip_context = re_string_context_at (pstr, offset - 1,  | 
664  | 0  |               eflags);  | 
665  | 0  |         if (pstr->mb_cur_max > 1)  | 
666  | 0  |     memmove (pstr->wcs, pstr->wcs + offset,  | 
667  | 0  |        (pstr->valid_len - offset) * sizeof (wint_t));  | 
668  | 0  |         if (__glibc_unlikely (pstr->mbs_allocated))  | 
669  | 0  |     memmove (pstr->mbs, pstr->mbs + offset,  | 
670  | 0  |        pstr->valid_len - offset);  | 
671  | 0  |         pstr->valid_len -= offset;  | 
672  | 0  |         pstr->valid_raw_len -= offset;  | 
673  | 0  |         DEBUG_ASSERT (pstr->valid_len > 0);  | 
674  | 0  |       }  | 
675  | 0  |   }  | 
676  | 0  |       else  | 
677  | 0  |   { | 
678  |  |     /* No, skip all characters until IDX.  */  | 
679  | 0  |     Idx prev_valid_len = pstr->valid_len;  | 
680  |  | 
  | 
681  | 0  |     if (__glibc_unlikely (pstr->offsets_needed))  | 
682  | 0  |       { | 
683  | 0  |         pstr->len = pstr->raw_len - idx + offset;  | 
684  | 0  |         pstr->stop = pstr->raw_stop - idx + offset;  | 
685  | 0  |         pstr->offsets_needed = 0;  | 
686  | 0  |       }  | 
687  | 0  |     pstr->valid_len = 0;  | 
688  | 0  |     if (pstr->mb_cur_max > 1)  | 
689  | 0  |       { | 
690  | 0  |         Idx wcs_idx;  | 
691  | 0  |         wint_t wc = WEOF;  | 
692  |  | 
  | 
693  | 0  |         if (pstr->is_utf8)  | 
694  | 0  |     { | 
695  | 0  |       const unsigned char *raw, *p, *end;  | 
696  |  |  | 
697  |  |       /* Special case UTF-8.  Multi-byte chars start with any  | 
698  |  |          byte other than 0x80 - 0xbf.  */  | 
699  | 0  |       raw = pstr->raw_mbs + pstr->raw_mbs_idx;  | 
700  | 0  |       end = raw + (offset - pstr->mb_cur_max);  | 
701  | 0  |       if (end < pstr->raw_mbs)  | 
702  | 0  |         end = pstr->raw_mbs;  | 
703  | 0  |       p = raw + offset - 1;  | 
704  |  | #ifdef _LIBC  | 
705  |  |       /* We know the wchar_t encoding is UCS4, so for the simple  | 
706  |  |          case, ASCII characters, skip the conversion step.  */  | 
707  |  |       if (isascii (*p) && __glibc_likely (pstr->trans == NULL))  | 
708  |  |         { | 
709  |  |           memset (&pstr->cur_state, '\0', sizeof (mbstate_t));  | 
710  |  |           /* pstr->valid_len = 0; */  | 
711  |  |           wc = (wchar_t) *p;  | 
712  |  |         }  | 
713  |  |       else  | 
714  |  | #endif  | 
715  | 0  |         for (; p >= end; --p)  | 
716  | 0  |           if ((*p & 0xc0) != 0x80)  | 
717  | 0  |       { | 
718  | 0  |         mbstate_t cur_state;  | 
719  | 0  |         wchar_t wc2;  | 
720  | 0  |         Idx mlen = raw + pstr->len - p;  | 
721  | 0  |         unsigned char buf[6];  | 
722  | 0  |         size_t mbclen;  | 
723  |  | 
  | 
724  | 0  |         const unsigned char *pp = p;  | 
725  | 0  |         if (__glibc_unlikely (pstr->trans != NULL))  | 
726  | 0  |           { | 
727  | 0  |             int i = mlen < 6 ? mlen : 6;  | 
728  | 0  |             while (--i >= 0)  | 
729  | 0  |         buf[i] = pstr->trans[p[i]];  | 
730  | 0  |             pp = buf;  | 
731  | 0  |           }  | 
732  |  |         /* XXX Don't use mbrtowc, we know which conversion  | 
733  |  |            to use (UTF-8 -> UCS4).  */  | 
734  | 0  |         memset (&cur_state, 0, sizeof (cur_state));  | 
735  | 0  |         mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,  | 
736  | 0  |                 &cur_state);  | 
737  | 0  |         if (raw + offset - p <= mbclen  | 
738  | 0  |             && mbclen < (size_t) -2)  | 
739  | 0  |           { | 
740  | 0  |             memset (&pstr->cur_state, '\0',  | 
741  | 0  |               sizeof (mbstate_t));  | 
742  | 0  |             pstr->valid_len = mbclen - (raw + offset - p);  | 
743  | 0  |             wc = wc2;  | 
744  | 0  |           }  | 
745  | 0  |         break;  | 
746  | 0  |       }  | 
747  | 0  |     }  | 
748  |  | 
  | 
749  | 0  |         if (wc == WEOF)  | 
750  | 0  |     pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;  | 
751  | 0  |         if (wc == WEOF)  | 
752  | 0  |     pstr->tip_context  | 
753  | 0  |       = re_string_context_at (pstr, prev_valid_len - 1, eflags);  | 
754  | 0  |         else  | 
755  | 0  |     pstr->tip_context = ((__glibc_unlikely (pstr->word_ops_used != 0)  | 
756  | 0  |               && IS_WIDE_WORD_CHAR (wc))  | 
757  | 0  |              ? CONTEXT_WORD  | 
758  | 0  |              : ((IS_WIDE_NEWLINE (wc)  | 
759  | 0  |            && pstr->newline_anchor)  | 
760  | 0  |           ? CONTEXT_NEWLINE : 0));  | 
761  | 0  |         if (__glibc_unlikely (pstr->valid_len))  | 
762  | 0  |     { | 
763  | 0  |       for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)  | 
764  | 0  |         pstr->wcs[wcs_idx] = WEOF;  | 
765  | 0  |       if (pstr->mbs_allocated)  | 
766  | 0  |         memset (pstr->mbs, 255, pstr->valid_len);  | 
767  | 0  |     }  | 
768  | 0  |         pstr->valid_raw_len = pstr->valid_len;  | 
769  | 0  |       }  | 
770  | 0  |     else  | 
771  | 0  |       { | 
772  | 0  |         int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];  | 
773  | 0  |         pstr->valid_raw_len = 0;  | 
774  | 0  |         if (pstr->trans)  | 
775  | 0  |     c = pstr->trans[c];  | 
776  | 0  |         pstr->tip_context = (bitset_contain (pstr->word_char, c)  | 
777  | 0  |            ? CONTEXT_WORD  | 
778  | 0  |            : ((IS_NEWLINE (c) && pstr->newline_anchor)  | 
779  | 0  |               ? CONTEXT_NEWLINE : 0));  | 
780  | 0  |       }  | 
781  | 0  |   }  | 
782  | 0  |       if (!__glibc_unlikely (pstr->mbs_allocated))  | 
783  | 0  |   pstr->mbs += offset;  | 
784  | 0  |     }  | 
785  | 0  |   pstr->raw_mbs_idx = idx;  | 
786  | 0  |   pstr->len -= offset;  | 
787  | 0  |   pstr->stop -= offset;  | 
788  |  |  | 
789  |  |   /* Then build the buffers.  */  | 
790  | 0  |   if (pstr->mb_cur_max > 1)  | 
791  | 0  |     { | 
792  | 0  |       if (pstr->icase)  | 
793  | 0  |   { | 
794  | 0  |     reg_errcode_t ret = build_wcs_upper_buffer (pstr);  | 
795  | 0  |     if (__glibc_unlikely (ret != REG_NOERROR))  | 
796  | 0  |       return ret;  | 
797  | 0  |   }  | 
798  | 0  |       else  | 
799  | 0  |   build_wcs_buffer (pstr);  | 
800  | 0  |     }  | 
801  | 0  |   else  | 
802  | 0  |     if (__glibc_unlikely (pstr->mbs_allocated))  | 
803  | 0  |       { | 
804  | 0  |   if (pstr->icase)  | 
805  | 0  |     build_upper_buffer (pstr);  | 
806  | 0  |   else if (pstr->trans != NULL)  | 
807  | 0  |     re_string_translate_buffer (pstr);  | 
808  | 0  |       }  | 
809  | 0  |     else  | 
810  | 0  |       pstr->valid_len = pstr->len;  | 
811  |  |  | 
812  | 0  |   pstr->cur_idx = 0;  | 
813  | 0  |   return REG_NOERROR;  | 
814  | 0  | }  | 
815  |  |  | 
816  |  | static unsigned char  | 
817  |  | __attribute__ ((pure))  | 
818  |  | re_string_peek_byte_case (const re_string_t *pstr, Idx idx)  | 
819  | 0  | { | 
820  | 0  |   int ch;  | 
821  | 0  |   Idx off;  | 
822  |  |  | 
823  |  |   /* Handle the common (easiest) cases first.  */  | 
824  | 0  |   if (__glibc_likely (!pstr->mbs_allocated))  | 
825  | 0  |     return re_string_peek_byte (pstr, idx);  | 
826  |  |  | 
827  | 0  |   if (pstr->mb_cur_max > 1  | 
828  | 0  |       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))  | 
829  | 0  |     return re_string_peek_byte (pstr, idx);  | 
830  |  |  | 
831  | 0  |   off = pstr->cur_idx + idx;  | 
832  | 0  |   if (pstr->offsets_needed)  | 
833  | 0  |     off = pstr->offsets[off];  | 
834  |  | 
  | 
835  | 0  |   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];  | 
836  |  |  | 
837  |  |   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I  | 
838  |  |      this function returns CAPITAL LETTER I instead of first byte of  | 
839  |  |      DOTLESS SMALL LETTER I.  The latter would confuse the parser,  | 
840  |  |      since peek_byte_case doesn't advance cur_idx in any way.  */  | 
841  | 0  |   if (pstr->offsets_needed && !isascii (ch))  | 
842  | 0  |     return re_string_peek_byte (pstr, idx);  | 
843  |  |  | 
844  | 0  |   return ch;  | 
845  | 0  | }  | 
846  |  |  | 
847  |  | static unsigned char  | 
848  |  | re_string_fetch_byte_case (re_string_t *pstr)  | 
849  | 0  | { | 
850  | 0  |   if (__glibc_likely (!pstr->mbs_allocated))  | 
851  | 0  |     return re_string_fetch_byte (pstr);  | 
852  |  |  | 
853  | 0  |   if (pstr->offsets_needed)  | 
854  | 0  |     { | 
855  | 0  |       Idx off;  | 
856  | 0  |       int ch;  | 
857  |  |  | 
858  |  |       /* For tr_TR.UTF-8 [[:islower:]] there is  | 
859  |  |    [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip  | 
860  |  |    in that case the whole multi-byte character and return  | 
861  |  |    the original letter.  On the other side, with  | 
862  |  |    [[: DOTLESS SMALL LETTER I return [[:I, as doing  | 
863  |  |    anything else would complicate things too much.  */  | 
864  |  | 
  | 
865  | 0  |       if (!re_string_first_byte (pstr, pstr->cur_idx))  | 
866  | 0  |   return re_string_fetch_byte (pstr);  | 
867  |  |  | 
868  | 0  |       off = pstr->offsets[pstr->cur_idx];  | 
869  | 0  |       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];  | 
870  |  | 
  | 
871  | 0  |       if (! isascii (ch))  | 
872  | 0  |   return re_string_fetch_byte (pstr);  | 
873  |  |  | 
874  | 0  |       re_string_skip_bytes (pstr,  | 
875  | 0  |           re_string_char_size_at (pstr, pstr->cur_idx));  | 
876  | 0  |       return ch;  | 
877  | 0  |     }  | 
878  |  |  | 
879  | 0  |   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];  | 
880  | 0  | }  | 
881  |  |  | 
882  |  | static void  | 
883  |  | re_string_destruct (re_string_t *pstr)  | 
884  | 0  | { | 
885  | 0  |   re_free (pstr->wcs);  | 
886  | 0  |   re_free (pstr->offsets);  | 
887  | 0  |   if (pstr->mbs_allocated)  | 
888  | 0  |     re_free (pstr->mbs);  | 
889  | 0  | }  | 
890  |  |  | 
891  |  | /* Return the context at IDX in INPUT.  */  | 
892  |  |  | 
893  |  | static unsigned int  | 
894  |  | re_string_context_at (const re_string_t *input, Idx idx, int eflags)  | 
895  | 0  | { | 
896  | 0  |   int c;  | 
897  | 0  |   if (__glibc_unlikely (idx < 0))  | 
898  |  |     /* In this case, we use the value stored in input->tip_context,  | 
899  |  |        since we can't know the character in input->mbs[-1] here.  */  | 
900  | 0  |     return input->tip_context;  | 
901  | 0  |   if (__glibc_unlikely (idx == input->len))  | 
902  | 0  |     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF  | 
903  | 0  |       : CONTEXT_NEWLINE | CONTEXT_ENDBUF);  | 
904  | 0  |   if (input->mb_cur_max > 1)  | 
905  | 0  |     { | 
906  | 0  |       wint_t wc;  | 
907  | 0  |       Idx wc_idx = idx;  | 
908  | 0  |       while(input->wcs[wc_idx] == WEOF)  | 
909  | 0  |   { | 
910  | 0  |     DEBUG_ASSERT (wc_idx >= 0);  | 
911  | 0  |     --wc_idx;  | 
912  | 0  |     if (wc_idx < 0)  | 
913  | 0  |       return input->tip_context;  | 
914  | 0  |   }  | 
915  | 0  |       wc = input->wcs[wc_idx];  | 
916  | 0  |       if (__glibc_unlikely (input->word_ops_used != 0)  | 
917  | 0  |     && IS_WIDE_WORD_CHAR (wc))  | 
918  | 0  |   return CONTEXT_WORD;  | 
919  | 0  |       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor  | 
920  | 0  |         ? CONTEXT_NEWLINE : 0);  | 
921  | 0  |     }  | 
922  | 0  |   else  | 
923  | 0  |     { | 
924  | 0  |       c = re_string_byte_at (input, idx);  | 
925  | 0  |       if (bitset_contain (input->word_char, c))  | 
926  | 0  |   return CONTEXT_WORD;  | 
927  | 0  |       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;  | 
928  | 0  |     }  | 
929  | 0  | }  | 
930  |  |  | 
931  |  | /* Functions for set operation.  */  | 
932  |  |  | 
933  |  | static reg_errcode_t  | 
934  |  | __attribute_warn_unused_result__  | 
935  |  | re_node_set_alloc (re_node_set *set, Idx size)  | 
936  | 0  | { | 
937  | 0  |   set->alloc = size;  | 
938  | 0  |   set->nelem = 0;  | 
939  | 0  |   set->elems = re_malloc (Idx, size);  | 
940  | 0  |   if (__glibc_unlikely (set->elems == NULL))  | 
941  | 0  |     return REG_ESPACE;  | 
942  | 0  |   return REG_NOERROR;  | 
943  | 0  | }  | 
944  |  |  | 
945  |  | static reg_errcode_t  | 
946  |  | __attribute_warn_unused_result__  | 
947  |  | re_node_set_init_1 (re_node_set *set, Idx elem)  | 
948  | 0  | { | 
949  | 0  |   set->alloc = 1;  | 
950  | 0  |   set->nelem = 1;  | 
951  | 0  |   set->elems = re_malloc (Idx, 1);  | 
952  | 0  |   if (__glibc_unlikely (set->elems == NULL))  | 
953  | 0  |     { | 
954  | 0  |       set->alloc = set->nelem = 0;  | 
955  | 0  |       return REG_ESPACE;  | 
956  | 0  |     }  | 
957  | 0  |   set->elems[0] = elem;  | 
958  | 0  |   return REG_NOERROR;  | 
959  | 0  | }  | 
960  |  |  | 
961  |  | static reg_errcode_t  | 
962  |  | __attribute_warn_unused_result__  | 
963  |  | re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)  | 
964  | 0  | { | 
965  | 0  |   set->alloc = 2;  | 
966  | 0  |   set->elems = re_malloc (Idx, 2);  | 
967  | 0  |   if (__glibc_unlikely (set->elems == NULL))  | 
968  | 0  |     return REG_ESPACE;  | 
969  | 0  |   if (elem1 == elem2)  | 
970  | 0  |     { | 
971  | 0  |       set->nelem = 1;  | 
972  | 0  |       set->elems[0] = elem1;  | 
973  | 0  |     }  | 
974  | 0  |   else  | 
975  | 0  |     { | 
976  | 0  |       set->nelem = 2;  | 
977  | 0  |       if (elem1 < elem2)  | 
978  | 0  |   { | 
979  | 0  |     set->elems[0] = elem1;  | 
980  | 0  |     set->elems[1] = elem2;  | 
981  | 0  |   }  | 
982  | 0  |       else  | 
983  | 0  |   { | 
984  | 0  |     set->elems[0] = elem2;  | 
985  | 0  |     set->elems[1] = elem1;  | 
986  | 0  |   }  | 
987  | 0  |     }  | 
988  | 0  |   return REG_NOERROR;  | 
989  | 0  | }  | 
990  |  |  | 
991  |  | static reg_errcode_t  | 
992  |  | __attribute_warn_unused_result__  | 
993  |  | re_node_set_init_copy (re_node_set *dest, const re_node_set *src)  | 
994  | 0  | { | 
995  | 0  |   dest->nelem = src->nelem;  | 
996  | 0  |   if (src->nelem > 0)  | 
997  | 0  |     { | 
998  | 0  |       dest->alloc = dest->nelem;  | 
999  | 0  |       dest->elems = re_malloc (Idx, dest->alloc);  | 
1000  | 0  |       if (__glibc_unlikely (dest->elems == NULL))  | 
1001  | 0  |   { | 
1002  | 0  |     dest->alloc = dest->nelem = 0;  | 
1003  | 0  |     return REG_ESPACE;  | 
1004  | 0  |   }  | 
1005  | 0  |       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));  | 
1006  | 0  |     }  | 
1007  | 0  |   else  | 
1008  | 0  |     re_node_set_init_empty (dest);  | 
1009  | 0  |   return REG_NOERROR;  | 
1010  | 0  | }  | 
1011  |  |  | 
1012  |  | /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to  | 
1013  |  |    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  | 
1014  |  |    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */  | 
1015  |  |  | 
1016  |  | static reg_errcode_t  | 
1017  |  | __attribute_warn_unused_result__  | 
1018  |  | re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,  | 
1019  |  |          const re_node_set *src2)  | 
1020  | 0  | { | 
1021  | 0  |   Idx i1, i2, is, id, delta, sbase;  | 
1022  | 0  |   if (src1->nelem == 0 || src2->nelem == 0)  | 
1023  | 0  |     return REG_NOERROR;  | 
1024  |  |  | 
1025  |  |   /* We need dest->nelem + 2 * elems_in_intersection; this is a  | 
1026  |  |      conservative estimate.  */  | 
1027  | 0  |   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)  | 
1028  | 0  |     { | 
1029  | 0  |       Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;  | 
1030  | 0  |       Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);  | 
1031  | 0  |       if (__glibc_unlikely (new_elems == NULL))  | 
1032  | 0  |   return REG_ESPACE;  | 
1033  | 0  |       dest->elems = new_elems;  | 
1034  | 0  |       dest->alloc = new_alloc;  | 
1035  | 0  |     }  | 
1036  |  |  | 
1037  |  |   /* Find the items in the intersection of SRC1 and SRC2, and copy  | 
1038  |  |      into the top of DEST those that are not already in DEST itself.  */  | 
1039  | 0  |   sbase = dest->nelem + src1->nelem + src2->nelem;  | 
1040  | 0  |   i1 = src1->nelem - 1;  | 
1041  | 0  |   i2 = src2->nelem - 1;  | 
1042  | 0  |   id = dest->nelem - 1;  | 
1043  | 0  |   for (;;)  | 
1044  | 0  |     { | 
1045  | 0  |       if (src1->elems[i1] == src2->elems[i2])  | 
1046  | 0  |   { | 
1047  |  |     /* Try to find the item in DEST.  Maybe we could binary search?  */  | 
1048  | 0  |     while (id >= 0 && dest->elems[id] > src1->elems[i1])  | 
1049  | 0  |       --id;  | 
1050  |  | 
  | 
1051  | 0  |     if (id < 0 || dest->elems[id] != src1->elems[i1])  | 
1052  | 0  |             dest->elems[--sbase] = src1->elems[i1];  | 
1053  |  | 
  | 
1054  | 0  |     if (--i1 < 0 || --i2 < 0)  | 
1055  | 0  |       break;  | 
1056  | 0  |   }  | 
1057  |  |  | 
1058  |  |       /* Lower the highest of the two items.  */  | 
1059  | 0  |       else if (src1->elems[i1] < src2->elems[i2])  | 
1060  | 0  |   { | 
1061  | 0  |     if (--i2 < 0)  | 
1062  | 0  |       break;  | 
1063  | 0  |   }  | 
1064  | 0  |       else  | 
1065  | 0  |   { | 
1066  | 0  |     if (--i1 < 0)  | 
1067  | 0  |       break;  | 
1068  | 0  |   }  | 
1069  | 0  |     }  | 
1070  |  | 
  | 
1071  | 0  |   id = dest->nelem - 1;  | 
1072  | 0  |   is = dest->nelem + src1->nelem + src2->nelem - 1;  | 
1073  | 0  |   delta = is - sbase + 1;  | 
1074  |  |  | 
1075  |  |   /* Now copy.  When DELTA becomes zero, the remaining  | 
1076  |  |      DEST elements are already in place; this is more or  | 
1077  |  |      less the same loop that is in re_node_set_merge.  */  | 
1078  | 0  |   dest->nelem += delta;  | 
1079  | 0  |   if (delta > 0 && id >= 0)  | 
1080  | 0  |     for (;;)  | 
1081  | 0  |       { | 
1082  | 0  |   if (dest->elems[is] > dest->elems[id])  | 
1083  | 0  |     { | 
1084  |  |       /* Copy from the top.  */  | 
1085  | 0  |       dest->elems[id + delta--] = dest->elems[is--];  | 
1086  | 0  |       if (delta == 0)  | 
1087  | 0  |         break;  | 
1088  | 0  |     }  | 
1089  | 0  |   else  | 
1090  | 0  |     { | 
1091  |  |       /* Slide from the bottom.  */  | 
1092  | 0  |       dest->elems[id + delta] = dest->elems[id];  | 
1093  | 0  |       if (--id < 0)  | 
1094  | 0  |         break;  | 
1095  | 0  |     }  | 
1096  | 0  |       }  | 
1097  |  |  | 
1098  |  |   /* Copy remaining SRC elements.  */  | 
1099  | 0  |   memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));  | 
1100  |  | 
  | 
1101  | 0  |   return REG_NOERROR;  | 
1102  | 0  | }  | 
1103  |  |  | 
1104  |  | /* Calculate the union set of the sets SRC1 and SRC2. And store it to  | 
1105  |  |    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */  | 
1106  |  |  | 
1107  |  | static reg_errcode_t  | 
1108  |  | __attribute_warn_unused_result__  | 
1109  |  | re_node_set_init_union (re_node_set *dest, const re_node_set *src1,  | 
1110  |  |       const re_node_set *src2)  | 
1111  | 0  | { | 
1112  | 0  |   Idx i1, i2, id;  | 
1113  | 0  |   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)  | 
1114  | 0  |     { | 
1115  | 0  |       dest->alloc = src1->nelem + src2->nelem;  | 
1116  | 0  |       dest->elems = re_malloc (Idx, dest->alloc);  | 
1117  | 0  |       if (__glibc_unlikely (dest->elems == NULL))  | 
1118  | 0  |   return REG_ESPACE;  | 
1119  | 0  |     }  | 
1120  | 0  |   else  | 
1121  | 0  |     { | 
1122  | 0  |       if (src1 != NULL && src1->nelem > 0)  | 
1123  | 0  |   return re_node_set_init_copy (dest, src1);  | 
1124  | 0  |       else if (src2 != NULL && src2->nelem > 0)  | 
1125  | 0  |   return re_node_set_init_copy (dest, src2);  | 
1126  | 0  |       else  | 
1127  | 0  |   re_node_set_init_empty (dest);  | 
1128  | 0  |       return REG_NOERROR;  | 
1129  | 0  |     }  | 
1130  | 0  |   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)  | 
1131  | 0  |     { | 
1132  | 0  |       if (src1->elems[i1] > src2->elems[i2])  | 
1133  | 0  |   { | 
1134  | 0  |     dest->elems[id++] = src2->elems[i2++];  | 
1135  | 0  |     continue;  | 
1136  | 0  |   }  | 
1137  | 0  |       if (src1->elems[i1] == src2->elems[i2])  | 
1138  | 0  |   ++i2;  | 
1139  | 0  |       dest->elems[id++] = src1->elems[i1++];  | 
1140  | 0  |     }  | 
1141  | 0  |   if (i1 < src1->nelem)  | 
1142  | 0  |     { | 
1143  | 0  |       memcpy (dest->elems + id, src1->elems + i1,  | 
1144  | 0  |        (src1->nelem - i1) * sizeof (Idx));  | 
1145  | 0  |       id += src1->nelem - i1;  | 
1146  | 0  |     }  | 
1147  | 0  |   else if (i2 < src2->nelem)  | 
1148  | 0  |     { | 
1149  | 0  |       memcpy (dest->elems + id, src2->elems + i2,  | 
1150  | 0  |        (src2->nelem - i2) * sizeof (Idx));  | 
1151  | 0  |       id += src2->nelem - i2;  | 
1152  | 0  |     }  | 
1153  | 0  |   dest->nelem = id;  | 
1154  | 0  |   return REG_NOERROR;  | 
1155  | 0  | }  | 
1156  |  |  | 
1157  |  | /* Calculate the union set of the sets DEST and SRC. And store it to  | 
1158  |  |    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */  | 
1159  |  |  | 
1160  |  | static reg_errcode_t  | 
1161  |  | __attribute_warn_unused_result__  | 
1162  |  | re_node_set_merge (re_node_set *dest, const re_node_set *src)  | 
1163  | 0  | { | 
1164  | 0  |   Idx is, id, sbase, delta;  | 
1165  | 0  |   if (src == NULL || src->nelem == 0)  | 
1166  | 0  |     return REG_NOERROR;  | 
1167  | 0  |   if (dest->alloc < 2 * src->nelem + dest->nelem)  | 
1168  | 0  |     { | 
1169  | 0  |       Idx new_alloc = 2 * (src->nelem + dest->alloc);  | 
1170  | 0  |       Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);  | 
1171  | 0  |       if (__glibc_unlikely (new_buffer == NULL))  | 
1172  | 0  |   return REG_ESPACE;  | 
1173  | 0  |       dest->elems = new_buffer;  | 
1174  | 0  |       dest->alloc = new_alloc;  | 
1175  | 0  |     }  | 
1176  |  |  | 
1177  | 0  |   if (__glibc_unlikely (dest->nelem == 0))  | 
1178  | 0  |     { | 
1179  |  |       /* Although we already guaranteed above that dest->alloc != 0 and  | 
1180  |  |          therefore dest->elems != NULL, add a debug assertion to pacify  | 
1181  |  |          GCC 11.2.1's -fanalyzer.  */  | 
1182  | 0  |       DEBUG_ASSERT (dest->elems);  | 
1183  | 0  |       dest->nelem = src->nelem;  | 
1184  | 0  |       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));  | 
1185  | 0  |       return REG_NOERROR;  | 
1186  | 0  |     }  | 
1187  |  |  | 
1188  |  |   /* Copy into the top of DEST the items of SRC that are not  | 
1189  |  |      found in DEST.  Maybe we could binary search in DEST?  */  | 
1190  | 0  |   for (sbase = dest->nelem + 2 * src->nelem,  | 
1191  | 0  |        is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )  | 
1192  | 0  |     { | 
1193  | 0  |       if (dest->elems[id] == src->elems[is])  | 
1194  | 0  |   is--, id--;  | 
1195  | 0  |       else if (dest->elems[id] < src->elems[is])  | 
1196  | 0  |   dest->elems[--sbase] = src->elems[is--];  | 
1197  | 0  |       else /* if (dest->elems[id] > src->elems[is]) */  | 
1198  | 0  |   --id;  | 
1199  | 0  |     }  | 
1200  |  | 
  | 
1201  | 0  |   if (is >= 0)  | 
1202  | 0  |     { | 
1203  |  |       /* If DEST is exhausted, the remaining items of SRC must be unique.  */  | 
1204  | 0  |       sbase -= is + 1;  | 
1205  | 0  |       memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));  | 
1206  | 0  |     }  | 
1207  |  | 
  | 
1208  | 0  |   id = dest->nelem - 1;  | 
1209  | 0  |   is = dest->nelem + 2 * src->nelem - 1;  | 
1210  | 0  |   delta = is - sbase + 1;  | 
1211  | 0  |   if (delta == 0)  | 
1212  | 0  |     return REG_NOERROR;  | 
1213  |  |  | 
1214  |  |   /* Now copy.  When DELTA becomes zero, the remaining  | 
1215  |  |      DEST elements are already in place.  */  | 
1216  | 0  |   dest->nelem += delta;  | 
1217  | 0  |   for (;;)  | 
1218  | 0  |     { | 
1219  | 0  |       if (dest->elems[is] > dest->elems[id])  | 
1220  | 0  |   { | 
1221  |  |     /* Copy from the top.  */  | 
1222  | 0  |     dest->elems[id + delta--] = dest->elems[is--];  | 
1223  | 0  |     if (delta == 0)  | 
1224  | 0  |       break;  | 
1225  | 0  |   }  | 
1226  | 0  |       else  | 
1227  | 0  |   { | 
1228  |  |     /* Slide from the bottom.  */  | 
1229  | 0  |     dest->elems[id + delta] = dest->elems[id];  | 
1230  | 0  |     if (--id < 0)  | 
1231  | 0  |       { | 
1232  |  |         /* Copy remaining SRC elements.  */  | 
1233  | 0  |         memcpy (dest->elems, dest->elems + sbase,  | 
1234  | 0  |           delta * sizeof (Idx));  | 
1235  | 0  |         break;  | 
1236  | 0  |       }  | 
1237  | 0  |   }  | 
1238  | 0  |     }  | 
1239  |  | 
  | 
1240  | 0  |   return REG_NOERROR;  | 
1241  | 0  | }  | 
1242  |  |  | 
1243  |  | /* Insert the new element ELEM to the re_node_set* SET.  | 
1244  |  |    SET should not already have ELEM.  | 
1245  |  |    Return true if successful.  */  | 
1246  |  |  | 
1247  |  | static bool  | 
1248  |  | __attribute_warn_unused_result__  | 
1249  |  | re_node_set_insert (re_node_set *set, Idx elem)  | 
1250  | 0  | { | 
1251  | 0  |   Idx idx;  | 
1252  |  |   /* In case the set is empty.  */  | 
1253  | 0  |   if (set->alloc == 0)  | 
1254  | 0  |     return __glibc_likely (re_node_set_init_1 (set, elem) == REG_NOERROR);  | 
1255  |  |  | 
1256  | 0  |   if (__glibc_unlikely (set->nelem) == 0)  | 
1257  | 0  |     { | 
1258  |  |       /* Although we already guaranteed above that set->alloc != 0 and  | 
1259  |  |          therefore set->elems != NULL, add a debug assertion to pacify  | 
1260  |  |          GCC 11.2 -fanalyzer.  */  | 
1261  | 0  |       DEBUG_ASSERT (set->elems);  | 
1262  | 0  |       set->elems[0] = elem;  | 
1263  | 0  |       ++set->nelem;  | 
1264  | 0  |       return true;  | 
1265  | 0  |     }  | 
1266  |  |  | 
1267  |  |   /* Realloc if we need.  */  | 
1268  | 0  |   if (set->alloc == set->nelem)  | 
1269  | 0  |     { | 
1270  | 0  |       Idx *new_elems;  | 
1271  | 0  |       set->alloc = set->alloc * 2;  | 
1272  | 0  |       new_elems = re_realloc (set->elems, Idx, set->alloc);  | 
1273  | 0  |       if (__glibc_unlikely (new_elems == NULL))  | 
1274  | 0  |   return false;  | 
1275  | 0  |       set->elems = new_elems;  | 
1276  | 0  |     }  | 
1277  |  |  | 
1278  |  |   /* Move the elements which follows the new element.  Test the  | 
1279  |  |      first element separately to skip a check in the inner loop.  */  | 
1280  | 0  |   if (elem < set->elems[0])  | 
1281  | 0  |     { | 
1282  | 0  |       for (idx = set->nelem; idx > 0; idx--)  | 
1283  | 0  |   set->elems[idx] = set->elems[idx - 1];  | 
1284  | 0  |     }  | 
1285  | 0  |   else  | 
1286  | 0  |     { | 
1287  | 0  |       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)  | 
1288  | 0  |   set->elems[idx] = set->elems[idx - 1];  | 
1289  | 0  |       DEBUG_ASSERT (set->elems[idx - 1] < elem);  | 
1290  | 0  |     }  | 
1291  |  |  | 
1292  |  |   /* Insert the new element.  */  | 
1293  | 0  |   set->elems[idx] = elem;  | 
1294  | 0  |   ++set->nelem;  | 
1295  | 0  |   return true;  | 
1296  | 0  | }  | 
1297  |  |  | 
1298  |  | /* Insert the new element ELEM to the re_node_set* SET.  | 
1299  |  |    SET should not already have any element greater than or equal to ELEM.  | 
1300  |  |    Return true if successful.  */  | 
1301  |  |  | 
1302  |  | static bool  | 
1303  |  | __attribute_warn_unused_result__  | 
1304  |  | re_node_set_insert_last (re_node_set *set, Idx elem)  | 
1305  | 0  | { | 
1306  |  |   /* Realloc if we need.  */  | 
1307  | 0  |   if (set->alloc == set->nelem)  | 
1308  | 0  |     { | 
1309  | 0  |       Idx *new_elems;  | 
1310  | 0  |       set->alloc = (set->alloc + 1) * 2;  | 
1311  | 0  |       new_elems = re_realloc (set->elems, Idx, set->alloc);  | 
1312  | 0  |       if (__glibc_unlikely (new_elems == NULL))  | 
1313  | 0  |   return false;  | 
1314  | 0  |       set->elems = new_elems;  | 
1315  | 0  |     }  | 
1316  |  |  | 
1317  |  |   /* Insert the new element.  */  | 
1318  | 0  |   set->elems[set->nelem++] = elem;  | 
1319  | 0  |   return true;  | 
1320  | 0  | }  | 
1321  |  |  | 
1322  |  | /* Compare two node sets SET1 and SET2.  | 
1323  |  |    Return true if SET1 and SET2 are equivalent.  */  | 
1324  |  |  | 
1325  |  | static bool  | 
1326  |  | __attribute__ ((pure))  | 
1327  |  | re_node_set_compare (const re_node_set *set1, const re_node_set *set2)  | 
1328  | 0  | { | 
1329  | 0  |   Idx i;  | 
1330  | 0  |   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)  | 
1331  | 0  |     return false;  | 
1332  | 0  |   for (i = set1->nelem ; --i >= 0 ; )  | 
1333  | 0  |     if (set1->elems[i] != set2->elems[i])  | 
1334  | 0  |       return false;  | 
1335  | 0  |   return true;  | 
1336  | 0  | }  | 
1337  |  |  | 
1338  |  | /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */  | 
1339  |  |  | 
1340  |  | static Idx  | 
1341  |  | __attribute__ ((pure))  | 
1342  |  | re_node_set_contains (const re_node_set *set, Idx elem)  | 
1343  | 0  | { | 
1344  | 0  |   __re_size_t idx, right, mid;  | 
1345  | 0  |   if (set->nelem <= 0)  | 
1346  | 0  |     return 0;  | 
1347  |  |  | 
1348  |  |   /* Binary search the element.  */  | 
1349  | 0  |   idx = 0;  | 
1350  | 0  |   right = set->nelem - 1;  | 
1351  | 0  |   while (idx < right)  | 
1352  | 0  |     { | 
1353  | 0  |       mid = (idx + right) / 2;  | 
1354  | 0  |       if (set->elems[mid] < elem)  | 
1355  | 0  |   idx = mid + 1;  | 
1356  | 0  |       else  | 
1357  | 0  |   right = mid;  | 
1358  | 0  |     }  | 
1359  | 0  |   return set->elems[idx] == elem ? idx + 1 : 0;  | 
1360  | 0  | }  | 
1361  |  |  | 
1362  |  | static void  | 
1363  |  | re_node_set_remove_at (re_node_set *set, Idx idx)  | 
1364  | 0  | { | 
1365  | 0  |   if (idx < 0 || idx >= set->nelem)  | 
1366  | 0  |     return;  | 
1367  | 0  |   --set->nelem;  | 
1368  | 0  |   for (; idx < set->nelem; idx++)  | 
1369  | 0  |     set->elems[idx] = set->elems[idx + 1];  | 
1370  | 0  | }  | 
1371  |  |  | 
1372  |  |  | 
1373  |  | /* Add the token TOKEN to dfa->nodes, and return the index of the token.  | 
1374  |  |    Or return -1 if an error occurred.  */  | 
1375  |  |  | 
1376  |  | static Idx  | 
1377  |  | re_dfa_add_node (re_dfa_t *dfa, re_token_t token)  | 
1378  | 0  | { | 
1379  | 0  |   if (__glibc_unlikely (dfa->nodes_len >= dfa->nodes_alloc))  | 
1380  | 0  |     { | 
1381  | 0  |       size_t new_nodes_alloc = dfa->nodes_alloc * 2;  | 
1382  | 0  |       Idx *new_nexts, *new_indices;  | 
1383  | 0  |       re_node_set *new_edests, *new_eclosures;  | 
1384  | 0  |       re_token_t *new_nodes;  | 
1385  |  |  | 
1386  |  |       /* Avoid overflows in realloc.  */  | 
1387  | 0  |       const size_t max_object_size = MAX (sizeof (re_token_t),  | 
1388  | 0  |             MAX (sizeof (re_node_set),  | 
1389  | 0  |                  sizeof (Idx)));  | 
1390  | 0  |       if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)  | 
1391  | 0  |           < new_nodes_alloc))  | 
1392  | 0  |   return -1;  | 
1393  |  |  | 
1394  | 0  |       new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);  | 
1395  | 0  |       if (__glibc_unlikely (new_nodes == NULL))  | 
1396  | 0  |   return -1;  | 
1397  | 0  |       dfa->nodes = new_nodes;  | 
1398  | 0  |       dfa->nodes_alloc = new_nodes_alloc;  | 
1399  | 0  |       new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);  | 
1400  | 0  |       if (new_nexts != NULL)  | 
1401  | 0  |   dfa->nexts = new_nexts;  | 
1402  | 0  |       new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);  | 
1403  | 0  |       if (new_indices != NULL)  | 
1404  | 0  |   dfa->org_indices = new_indices;  | 
1405  | 0  |       new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);  | 
1406  | 0  |       if (new_edests != NULL)  | 
1407  | 0  |   dfa->edests = new_edests;  | 
1408  | 0  |       new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);  | 
1409  | 0  |       if (new_eclosures != NULL)  | 
1410  | 0  |   dfa->eclosures = new_eclosures;  | 
1411  | 0  |       if (__glibc_unlikely (new_nexts == NULL || new_indices == NULL  | 
1412  | 0  |           || new_edests == NULL || new_eclosures == NULL))  | 
1413  | 0  |   return -1;  | 
1414  | 0  |     }  | 
1415  | 0  |   dfa->nodes[dfa->nodes_len] = token;  | 
1416  | 0  |   dfa->nodes[dfa->nodes_len].constraint = 0;  | 
1417  | 0  |   dfa->nodes[dfa->nodes_len].accept_mb =  | 
1418  | 0  |     ((token.type == OP_PERIOD && dfa->mb_cur_max > 1)  | 
1419  | 0  |      || token.type == COMPLEX_BRACKET);  | 
1420  | 0  |   dfa->nexts[dfa->nodes_len] = -1;  | 
1421  | 0  |   re_node_set_init_empty (dfa->edests + dfa->nodes_len);  | 
1422  | 0  |   re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);  | 
1423  | 0  |   return dfa->nodes_len++;  | 
1424  | 0  | }  | 
1425  |  |  | 
1426  |  | static re_hashval_t  | 
1427  |  | calc_state_hash (const re_node_set *nodes, unsigned int context)  | 
1428  | 0  | { | 
1429  | 0  |   re_hashval_t hash = nodes->nelem + context;  | 
1430  | 0  |   Idx i;  | 
1431  | 0  |   for (i = 0 ; i < nodes->nelem ; i++)  | 
1432  | 0  |     hash += nodes->elems[i];  | 
1433  | 0  |   return hash;  | 
1434  | 0  | }  | 
1435  |  |  | 
1436  |  | /* Search for the state whose node_set is equivalent to NODES.  | 
1437  |  |    Return the pointer to the state, if we found it in the DFA.  | 
1438  |  |    Otherwise create the new one and return it.  In case of an error  | 
1439  |  |    return NULL and set the error code in ERR.  | 
1440  |  |    Note: - We assume NULL as the invalid state, then it is possible that  | 
1441  |  |      return value is NULL and ERR is REG_NOERROR.  | 
1442  |  |    - We never return non-NULL value in case of any errors, it is for  | 
1443  |  |      optimization.  */  | 
1444  |  |  | 
1445  |  | static re_dfastate_t *  | 
1446  |  | __attribute_warn_unused_result__  | 
1447  |  | re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,  | 
1448  |  |       const re_node_set *nodes)  | 
1449  | 0  | { | 
1450  | 0  |   re_hashval_t hash;  | 
1451  | 0  |   re_dfastate_t *new_state;  | 
1452  | 0  |   struct re_state_table_entry *spot;  | 
1453  | 0  |   Idx i;  | 
1454  |  | #if defined GCC_LINT || defined lint  | 
1455  |  |   /* Suppress bogus uninitialized-variable warnings.  */  | 
1456  |  |   *err = REG_NOERROR;  | 
1457  |  | #endif  | 
1458  | 0  |   if (__glibc_unlikely (nodes->nelem == 0))  | 
1459  | 0  |     { | 
1460  | 0  |       *err = REG_NOERROR;  | 
1461  | 0  |       return NULL;  | 
1462  | 0  |     }  | 
1463  | 0  |   hash = calc_state_hash (nodes, 0);  | 
1464  | 0  |   spot = dfa->state_table + (hash & dfa->state_hash_mask);  | 
1465  |  | 
  | 
1466  | 0  |   for (i = 0 ; i < spot->num ; i++)  | 
1467  | 0  |     { | 
1468  | 0  |       re_dfastate_t *state = spot->array[i];  | 
1469  | 0  |       if (hash != state->hash)  | 
1470  | 0  |   continue;  | 
1471  | 0  |       if (re_node_set_compare (&state->nodes, nodes))  | 
1472  | 0  |   return state;  | 
1473  | 0  |     }  | 
1474  |  |  | 
1475  |  |   /* There are no appropriate state in the dfa, create the new one.  */  | 
1476  | 0  |   new_state = create_ci_newstate (dfa, nodes, hash);  | 
1477  | 0  |   if (__glibc_unlikely (new_state == NULL))  | 
1478  | 0  |     *err = REG_ESPACE;  | 
1479  |  | 
  | 
1480  | 0  |   return new_state;  | 
1481  | 0  | }  | 
1482  |  |  | 
1483  |  | /* Search for the state whose node_set is equivalent to NODES and  | 
1484  |  |    whose context is equivalent to CONTEXT.  | 
1485  |  |    Return the pointer to the state, if we found it in the DFA.  | 
1486  |  |    Otherwise create the new one and return it.  In case of an error  | 
1487  |  |    return NULL and set the error code in ERR.  | 
1488  |  |    Note: - We assume NULL as the invalid state, then it is possible that  | 
1489  |  |      return value is NULL and ERR is REG_NOERROR.  | 
1490  |  |    - We never return non-NULL value in case of any errors, it is for  | 
1491  |  |      optimization.  */  | 
1492  |  |  | 
1493  |  | static re_dfastate_t *  | 
1494  |  | __attribute_warn_unused_result__  | 
1495  |  | re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,  | 
1496  |  |         const re_node_set *nodes, unsigned int context)  | 
1497  | 0  | { | 
1498  | 0  |   re_hashval_t hash;  | 
1499  | 0  |   re_dfastate_t *new_state;  | 
1500  | 0  |   struct re_state_table_entry *spot;  | 
1501  | 0  |   Idx i;  | 
1502  |  | #if defined GCC_LINT || defined lint  | 
1503  |  |   /* Suppress bogus uninitialized-variable warnings.  */  | 
1504  |  |   *err = REG_NOERROR;  | 
1505  |  | #endif  | 
1506  | 0  |   if (nodes->nelem == 0)  | 
1507  | 0  |     { | 
1508  | 0  |       *err = REG_NOERROR;  | 
1509  | 0  |       return NULL;  | 
1510  | 0  |     }  | 
1511  | 0  |   hash = calc_state_hash (nodes, context);  | 
1512  | 0  |   spot = dfa->state_table + (hash & dfa->state_hash_mask);  | 
1513  |  | 
  | 
1514  | 0  |   for (i = 0 ; i < spot->num ; i++)  | 
1515  | 0  |     { | 
1516  | 0  |       re_dfastate_t *state = spot->array[i];  | 
1517  | 0  |       if (state->hash == hash  | 
1518  | 0  |     && state->context == context  | 
1519  | 0  |     && re_node_set_compare (state->entrance_nodes, nodes))  | 
1520  | 0  |   return state;  | 
1521  | 0  |     }  | 
1522  |  |   /* There are no appropriate state in 'dfa', create the new one.  */  | 
1523  | 0  |   new_state = create_cd_newstate (dfa, nodes, context, hash);  | 
1524  | 0  |   if (__glibc_unlikely (new_state == NULL))  | 
1525  | 0  |     *err = REG_ESPACE;  | 
1526  |  | 
  | 
1527  | 0  |   return new_state;  | 
1528  | 0  | }  | 
1529  |  |  | 
1530  |  | /* Finish initialization of the new state NEWSTATE, and using its hash value  | 
1531  |  |    HASH put in the appropriate bucket of DFA's state table.  Return value  | 
1532  |  |    indicates the error code if failed.  */  | 
1533  |  |  | 
1534  |  | static reg_errcode_t  | 
1535  |  | __attribute_warn_unused_result__  | 
1536  |  | register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,  | 
1537  |  |     re_hashval_t hash)  | 
1538  | 0  | { | 
1539  | 0  |   struct re_state_table_entry *spot;  | 
1540  | 0  |   reg_errcode_t err;  | 
1541  | 0  |   Idx i;  | 
1542  |  | 
  | 
1543  | 0  |   newstate->hash = hash;  | 
1544  | 0  |   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);  | 
1545  | 0  |   if (__glibc_unlikely (err != REG_NOERROR))  | 
1546  | 0  |     return REG_ESPACE;  | 
1547  | 0  |   for (i = 0; i < newstate->nodes.nelem; i++)  | 
1548  | 0  |     { | 
1549  | 0  |       Idx elem = newstate->nodes.elems[i];  | 
1550  | 0  |       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))  | 
1551  | 0  |   if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem))  | 
1552  | 0  |     return REG_ESPACE;  | 
1553  | 0  |     }  | 
1554  |  |  | 
1555  | 0  |   spot = dfa->state_table + (hash & dfa->state_hash_mask);  | 
1556  | 0  |   if (__glibc_unlikely (spot->alloc <= spot->num))  | 
1557  | 0  |     { | 
1558  | 0  |       Idx new_alloc = 2 * spot->num + 2;  | 
1559  | 0  |       re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,  | 
1560  | 0  |                 new_alloc);  | 
1561  | 0  |       if (__glibc_unlikely (new_array == NULL))  | 
1562  | 0  |   return REG_ESPACE;  | 
1563  | 0  |       spot->array = new_array;  | 
1564  | 0  |       spot->alloc = new_alloc;  | 
1565  | 0  |     }  | 
1566  | 0  |   spot->array[spot->num++] = newstate;  | 
1567  | 0  |   return REG_NOERROR;  | 
1568  | 0  | }  | 
1569  |  |  | 
1570  |  | static void  | 
1571  |  | free_state (re_dfastate_t *state)  | 
1572  | 0  | { | 
1573  | 0  |   re_node_set_free (&state->non_eps_nodes);  | 
1574  | 0  |   re_node_set_free (&state->inveclosure);  | 
1575  | 0  |   if (state->entrance_nodes != &state->nodes)  | 
1576  | 0  |     { | 
1577  | 0  |       re_node_set_free (state->entrance_nodes);  | 
1578  | 0  |       re_free (state->entrance_nodes);  | 
1579  | 0  |     }  | 
1580  | 0  |   re_node_set_free (&state->nodes);  | 
1581  | 0  |   re_free (state->word_trtable);  | 
1582  | 0  |   re_free (state->trtable);  | 
1583  | 0  |   re_free (state);  | 
1584  | 0  | }  | 
1585  |  |  | 
1586  |  | /* Create the new state which is independent of contexts.  | 
1587  |  |    Return the new state if succeeded, otherwise return NULL.  */  | 
1588  |  |  | 
1589  |  | static re_dfastate_t *  | 
1590  |  | __attribute_warn_unused_result__  | 
1591  |  | create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,  | 
1592  |  |         re_hashval_t hash)  | 
1593  | 0  | { | 
1594  | 0  |   Idx i;  | 
1595  | 0  |   reg_errcode_t err;  | 
1596  | 0  |   re_dfastate_t *newstate;  | 
1597  |  | 
  | 
1598  | 0  |   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);  | 
1599  | 0  |   if (__glibc_unlikely (newstate == NULL))  | 
1600  | 0  |     return NULL;  | 
1601  | 0  |   err = re_node_set_init_copy (&newstate->nodes, nodes);  | 
1602  | 0  |   if (__glibc_unlikely (err != REG_NOERROR))  | 
1603  | 0  |     { | 
1604  | 0  |       re_free (newstate);  | 
1605  | 0  |       return NULL;  | 
1606  | 0  |     }  | 
1607  |  |  | 
1608  | 0  |   newstate->entrance_nodes = &newstate->nodes;  | 
1609  | 0  |   for (i = 0 ; i < nodes->nelem ; i++)  | 
1610  | 0  |     { | 
1611  | 0  |       re_token_t *node = dfa->nodes + nodes->elems[i];  | 
1612  | 0  |       re_token_type_t type = node->type;  | 
1613  | 0  |       if (type == CHARACTER && !node->constraint)  | 
1614  | 0  |   continue;  | 
1615  | 0  |       newstate->accept_mb |= node->accept_mb;  | 
1616  |  |  | 
1617  |  |       /* If the state has the halt node, the state is a halt state.  */  | 
1618  | 0  |       if (type == END_OF_RE)  | 
1619  | 0  |   newstate->halt = 1;  | 
1620  | 0  |       else if (type == OP_BACK_REF)  | 
1621  | 0  |   newstate->has_backref = 1;  | 
1622  | 0  |       else if (type == ANCHOR || node->constraint)  | 
1623  | 0  |   newstate->has_constraint = 1;  | 
1624  | 0  |     }  | 
1625  | 0  |   err = register_state (dfa, newstate, hash);  | 
1626  | 0  |   if (__glibc_unlikely (err != REG_NOERROR))  | 
1627  | 0  |     { | 
1628  | 0  |       free_state (newstate);  | 
1629  | 0  |       newstate = NULL;  | 
1630  | 0  |     }  | 
1631  | 0  |   return newstate;  | 
1632  | 0  | }  | 
1633  |  |  | 
1634  |  | /* Create the new state which is depend on the context CONTEXT.  | 
1635  |  |    Return the new state if succeeded, otherwise return NULL.  */  | 
1636  |  |  | 
1637  |  | static re_dfastate_t *  | 
1638  |  | __attribute_warn_unused_result__  | 
1639  |  | create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,  | 
1640  |  |         unsigned int context, re_hashval_t hash)  | 
1641  | 0  | { | 
1642  | 0  |   Idx i, nctx_nodes = 0;  | 
1643  | 0  |   reg_errcode_t err;  | 
1644  | 0  |   re_dfastate_t *newstate;  | 
1645  |  | 
  | 
1646  | 0  |   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);  | 
1647  | 0  |   if (__glibc_unlikely (newstate == NULL))  | 
1648  | 0  |     return NULL;  | 
1649  | 0  |   err = re_node_set_init_copy (&newstate->nodes, nodes);  | 
1650  | 0  |   if (__glibc_unlikely (err != REG_NOERROR))  | 
1651  | 0  |     { | 
1652  | 0  |       re_free (newstate);  | 
1653  | 0  |       return NULL;  | 
1654  | 0  |     }  | 
1655  |  |  | 
1656  | 0  |   newstate->context = context;  | 
1657  | 0  |   newstate->entrance_nodes = &newstate->nodes;  | 
1658  |  | 
  | 
1659  | 0  |   for (i = 0 ; i < nodes->nelem ; i++)  | 
1660  | 0  |     { | 
1661  | 0  |       re_token_t *node = dfa->nodes + nodes->elems[i];  | 
1662  | 0  |       re_token_type_t type = node->type;  | 
1663  | 0  |       unsigned int constraint = node->constraint;  | 
1664  |  | 
  | 
1665  | 0  |       if (type == CHARACTER && !constraint)  | 
1666  | 0  |   continue;  | 
1667  | 0  |       newstate->accept_mb |= node->accept_mb;  | 
1668  |  |  | 
1669  |  |       /* If the state has the halt node, the state is a halt state.  */  | 
1670  | 0  |       if (type == END_OF_RE)  | 
1671  | 0  |   newstate->halt = 1;  | 
1672  | 0  |       else if (type == OP_BACK_REF)  | 
1673  | 0  |   newstate->has_backref = 1;  | 
1674  |  | 
  | 
1675  | 0  |       if (constraint)  | 
1676  | 0  |   { | 
1677  | 0  |     if (newstate->entrance_nodes == &newstate->nodes)  | 
1678  | 0  |       { | 
1679  | 0  |         re_node_set *entrance_nodes = re_malloc (re_node_set, 1);  | 
1680  | 0  |         if (__glibc_unlikely (entrance_nodes == NULL))  | 
1681  | 0  |     { | 
1682  | 0  |       free_state (newstate);  | 
1683  | 0  |       return NULL;  | 
1684  | 0  |     }  | 
1685  | 0  |         newstate->entrance_nodes = entrance_nodes;  | 
1686  | 0  |         if (re_node_set_init_copy (newstate->entrance_nodes, nodes)  | 
1687  | 0  |       != REG_NOERROR)  | 
1688  | 0  |     { | 
1689  | 0  |       free_state (newstate);  | 
1690  | 0  |       return NULL;  | 
1691  | 0  |     }  | 
1692  | 0  |         nctx_nodes = 0;  | 
1693  | 0  |         newstate->has_constraint = 1;  | 
1694  | 0  |       }  | 
1695  |  |  | 
1696  | 0  |     if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))  | 
1697  | 0  |       { | 
1698  | 0  |         re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);  | 
1699  | 0  |         ++nctx_nodes;  | 
1700  | 0  |       }  | 
1701  | 0  |   }  | 
1702  | 0  |     }  | 
1703  | 0  |   err = register_state (dfa, newstate, hash);  | 
1704  | 0  |   if (__glibc_unlikely (err != REG_NOERROR))  | 
1705  | 0  |     { | 
1706  | 0  |       free_state (newstate);  | 
1707  | 0  |       newstate = NULL;  | 
1708  | 0  |     }  | 
1709  | 0  |   return  newstate;  | 
1710  | 0  | }  |