Coverage Report

Created: 2025-07-23 07:16

/src/wget/lib/str-two-way.h
Line
Count
Source (jump to first uncovered line)
1
/* Byte-wise substring search, using the Two-Way algorithm.
2
   Copyright (C) 2008-2025 Free Software Foundation, Inc.
3
   This file is part of the GNU C Library.
4
   Written by Eric Blake <ebb9@byu.net>, 2008.
5
6
   This file is free software: you can redistribute it and/or modify
7
   it under the terms of the GNU Lesser General Public License as
8
   published by the Free Software Foundation; either version 2.1 of the
9
   License, or (at your option) any later version.
10
11
   This file is distributed in the hope that it will be useful,
12
   but WITHOUT ANY WARRANTY; without even the implied warranty of
13
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
   GNU Lesser General Public License for more details.
15
16
   You should have received a copy of the GNU Lesser General Public License
17
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
18
19
/* Before including this file, you need to include <config.h> and
20
   <string.h>, and define:
21
     RETURN_TYPE             A macro that expands to the return type.
22
     AVAILABLE(h, h_l, j, n_l)
23
                             A macro that returns nonzero if there are
24
                             at least N_L bytes left starting at H[J].
25
                             H is 'unsigned char *', H_L, J, and N_L
26
                             are 'size_t'; H_L is an lvalue.  For
27
                             NUL-terminated searches, H_L can be
28
                             modified each iteration to avoid having
29
                             to compute the end of H up front.
30
31
  For case-insensitivity, you may optionally define:
32
     CMP_FUNC(p1, p2, l)     A macro that returns 0 iff the first L
33
                             characters of P1 and P2 are equal.
34
     CANON_ELEMENT(c)        A macro that canonicalizes an element right after
35
                             it has been fetched from one of the two strings.
36
                             The argument is an 'unsigned char'; the result
37
                             must be an 'unsigned char' as well.
38
39
  This file undefines the macros documented above, and defines
40
  LONG_NEEDLE_THRESHOLD.
41
*/
42
43
#include <limits.h>
44
#include <stdint.h>
45
46
/* We use the Two-Way string matching algorithm (also known as
47
   Crochemore-Perrin), which guarantees linear complexity with
48
   constant space.  Additionally, for long needles, we also use a bad
49
   character shift table similar to the Boyer-Moore algorithm to
50
   achieve improved (potentially sub-linear) performance.
51
52
   See https://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260,
53
   https://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm,
54
   https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.34.6641&rep=rep1&type=pdf
55
*/
56
57
/* Point at which computing a bad-byte shift table is likely to be
58
   worthwhile.  Small needles should not compute a table, since it
59
   adds (1 << CHAR_BIT) + NEEDLE_LEN computations of preparation for a
60
   speedup no greater than a factor of NEEDLE_LEN.  The larger the
61
   needle, the better the potential performance gain.  On the other
62
   hand, on non-POSIX systems with CHAR_BIT larger than eight, the
63
   memory required for the table is prohibitive.  */
64
#if CHAR_BIT < 10
65
542
# define LONG_NEEDLE_THRESHOLD 32U
66
#else
67
# define LONG_NEEDLE_THRESHOLD SIZE_MAX
68
#endif
69
70
#ifndef MAX
71
542
# define MAX(a, b) (((a) < (b)) ? (b) : (a)) /* Larger of A and B; one argument is evaluated twice.  */
72
#endif
73
74
#ifndef CANON_ELEMENT
75
# define CANON_ELEMENT(c) (c) /* Default: use the fetched byte unchanged.  */
76
#endif
77
#ifndef CMP_FUNC
78
# define CMP_FUNC memcmp
79
#endif
80
81
/* Perform a critical factorization of NEEDLE, of length NEEDLE_LEN.
82
   Return the index of the first byte in the right half, and set
83
   *PERIOD to the global period of the right half.
84
85
   The global period of a string is the smallest index (possibly its
86
   length) at which all remaining bytes in the string are repetitions
87
   of the prefix (the last repetition may be a subset of the prefix).
88
89
   When NEEDLE is factored into two halves, a local period is the
90
   length of the smallest word that shares a suffix with the left half
91
   and shares a prefix with the right half.  All factorizations of a
92
   non-empty NEEDLE have a local period of at least 1 and no greater
93
   than NEEDLE_LEN.
94
95
   A critical factorization has the property that the local period
96
   equals the global period.  All strings have at least one critical
97
   factorization with the left half smaller than the global period.
98
   And while some strings have more than one critical factorization,
99
   it is provable that with an ordered alphabet, at least one of the
100
   critical factorizations corresponds to a maximal suffix.
101
102
   Given an ordered alphabet, a critical factorization can be computed
103
   in linear time, with 2 * NEEDLE_LEN comparisons, by computing the
104
   shorter of two ordered maximal suffixes.  The ordered maximal
105
   suffixes are determined by lexicographic comparison while tracking
106
   periodicity.  */
107
static size_t
108
critical_factorization (const unsigned char *needle, size_t needle_len,
109
                        size_t *period)
110
542
{
111
  /* Index of last byte of left half, or SIZE_MAX if the left half is empty.  */
112
542
  size_t max_suffix, max_suffix_rev;
113
542
  size_t j; /* Index into NEEDLE for current candidate suffix.  */
114
542
  size_t k; /* Offset into current period.  */
115
542
  size_t p; /* Intermediate period.  */
116
542
  unsigned char a, b; /* Current comparison bytes.  */
117
118
  /* Special case NEEDLE_LEN of 1 or 2 (all callers already filtered
119
     out 0-length needles).  */
120
542
  if (needle_len < 3)
121
0
    {
122
0
      *period = 1;
123
0
      return needle_len - 1; /* The right half is just the last byte.  */
124
0
    }
125
126
  /* Invariants:
127
     0 <= j < NEEDLE_LEN - 1
128
     -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed)
129
     min(max_suffix, max_suffix_rev) < global period of NEEDLE
130
     1 <= p <= global period of NEEDLE
131
     p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j]
132
     1 <= k <= p
133
  */
134
135
  /* Perform lexicographic search for the maximal suffix.  */
136
542
  max_suffix = SIZE_MAX;
137
542
  j = 0;
138
542
  k = p = 1;
139
4.33k
  while (j + k < needle_len)
140
3.79k
    {
141
3.79k
      a = CANON_ELEMENT (needle[j + k]);
142
3.79k
      b = CANON_ELEMENT (needle[max_suffix + k]); /* Index wraps to k - 1 while max_suffix is SIZE_MAX.  */
143
3.79k
      if (a < b)
144
1.62k
        {
145
          /* Suffix is smaller, period is entire prefix so far.  */
146
1.62k
          j += k;
147
1.62k
          k = 1;
148
1.62k
          p = j - max_suffix;
149
1.62k
        }
150
2.16k
      else if (a == b)
151
0
        {
152
          /* Advance through repetition of the current period.  */
153
0
          if (k != p)
154
0
            ++k;
155
0
          else
156
0
            {
157
0
              j += p;
158
0
              k = 1;
159
0
            }
160
0
        }
161
2.16k
      else /* b < a */
162
2.16k
        {
163
          /* Suffix is larger, start over from current location.  */
164
2.16k
          max_suffix = j++;
165
2.16k
          k = p = 1;
166
2.16k
        }
167
3.79k
    }
168
542
  *period = p; /* Kept only if the forward suffix is returned below.  */
169
170
  /* Perform reverse lexicographic search (same scan, byte order flipped).  */
171
542
  max_suffix_rev = SIZE_MAX;
172
542
  j = 0;
173
542
  k = p = 1;
174
4.33k
  while (j + k < needle_len)
175
3.79k
    {
176
3.79k
      a = CANON_ELEMENT (needle[j + k]);
177
3.79k
      b = CANON_ELEMENT (needle[max_suffix_rev + k]);
178
3.79k
      if (b < a)
179
2.71k
        {
180
          /* Suffix is smaller, period is entire prefix so far.  */
181
2.71k
          j += k;
182
2.71k
          k = 1;
183
2.71k
          p = j - max_suffix_rev;
184
2.71k
        }
185
1.08k
      else if (a == b)
186
0
        {
187
          /* Advance through repetition of the current period.  */
188
0
          if (k != p)
189
0
            ++k;
190
0
          else
191
0
            {
192
0
              j += p;
193
0
              k = 1;
194
0
            }
195
0
        }
196
1.08k
      else /* a < b */
197
1.08k
        {
198
          /* Suffix is larger, start over from current location.  */
199
1.08k
          max_suffix_rev = j++;
200
1.08k
          k = p = 1;
201
1.08k
        }
202
3.79k
    }
203
204
  /* Choose the shorter suffix.  Return the index of the first byte of
205
     the right half, rather than the last byte of the left half.
206
207
     For some examples, 'banana' has two critical factorizations, both
208
     exposed by the two lexicographic extreme suffixes of 'anana' and
209
     'nana', where both suffixes have a period of 2.  On the other
210
     hand, with 'aab' and 'bba', both strings have a single critical
211
     factorization of the last byte, with the suffix having a period
212
     of 1.  While the maximal lexicographic suffix of 'aab' is 'b',
213
     the maximal lexicographic suffix of 'bba' is 'ba', which is not a
214
     critical factorization.  Conversely, the maximal reverse
215
     lexicographic suffix of 'a' works for 'bba', but not 'ab' for
216
     'aab'.  The shorter suffix of the two will always be a critical
217
     factorization.  */
218
542
  if (max_suffix_rev + 1 < max_suffix + 1) /* The + 1 turns SIZE_MAX into 0.  */
219
0
    return max_suffix + 1;
220
542
  *period = p; /* Reverse suffix chosen: report its period instead.  */
221
542
  return max_suffix_rev + 1;
222
542
}
223
224
/* Return the first location of non-empty NEEDLE within HAYSTACK, or
225
   NULL.  HAYSTACK_LEN is the minimum known length of HAYSTACK.  This
226
   method is optimized for NEEDLE_LEN < LONG_NEEDLE_THRESHOLD.
227
   Performance is guaranteed to be linear, with an initialization cost
228
   of 2 * NEEDLE_LEN comparisons.
229
230
   If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
231
   most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching.
232
   If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
233
   HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching.  */
234
static RETURN_TYPE _GL_ATTRIBUTE_PURE
235
two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
236
                      const unsigned char *needle, size_t needle_len)
237
542
{
238
542
  size_t i; /* Index into current byte of NEEDLE.  */
239
542
  size_t j; /* Index into current window of HAYSTACK.  */
240
542
  size_t period; /* The period of the right half of needle.  */
241
542
  size_t suffix; /* The index of the right half of needle.  */
242
243
  /* Factor the needle into two halves, such that the left half is
244
     smaller than the global period, and the right half is
245
     periodic (with a period as large as NEEDLE_LEN - suffix).  */
246
542
  suffix = critical_factorization (needle, needle_len, &period);
247
248
  /* Perform the search.  Each iteration compares the right half
249
     first.  */
250
542
  if (CMP_FUNC (needle, needle + period, suffix) == 0) /* Do the first SUFFIX bytes recur PERIOD bytes later?  */
251
0
    {
252
      /* Entire needle is periodic; a mismatch in the left half can
253
         only advance by the period, so use memory to avoid rescanning
254
         known occurrences of the period in the right half.  */
255
0
      size_t memory = 0;
256
0
      j = 0;
257
0
      while (AVAILABLE (haystack, haystack_len, j, needle_len))
258
0
        {
259
          /* Scan for matches in right half.  */
260
0
          i = MAX (suffix, memory);
261
0
          while (i < needle_len && (CANON_ELEMENT (needle[i])
262
0
                                    == CANON_ELEMENT (haystack[i + j])))
263
0
            ++i;
264
0
          if (needle_len <= i)
265
0
            {
266
              /* Scan for matches in left half.  */
267
0
              i = suffix - 1; /* Wraps to SIZE_MAX when SUFFIX is 0.  */
268
0
              while (memory < i + 1 && (CANON_ELEMENT (needle[i])
269
0
                                        == CANON_ELEMENT (haystack[i + j])))
270
0
                --i;
271
0
              if (i + 1 < memory + 1) /* Left half matched all the way down to MEMORY.  */
272
0
                return (RETURN_TYPE) (haystack + j);
273
              /* No match, so remember how many repetitions of period
274
                 on the right half were scanned.  */
275
0
              j += period;
276
0
              memory = needle_len - period;
277
0
            }
278
0
          else
279
0
            {
280
0
              j += i - suffix + 1; /* Right half of the next window starts just after the mismatch.  */
281
0
              memory = 0;
282
0
            }
283
0
        }
284
0
    }
285
542
  else
286
542
    {
287
      /* The two halves of needle are distinct; no extra memory is
288
         required, and any mismatch results in a maximal shift.  */
289
542
      period = MAX (suffix, needle_len - suffix) + 1;
290
542
      j = 0;
291
4.87k
      while (AVAILABLE (haystack, haystack_len, j, needle_len))
292
4.40k
        {
293
          /* Scan for matches in right half.  */
294
4.40k
          i = suffix;
295
4.83k
          while (i < needle_len && (CANON_ELEMENT (needle[i])
296
4.40k
                                    == CANON_ELEMENT (haystack[i + j])))
297
434
            ++i;
298
4.40k
          if (needle_len <= i)
299
434
            {
300
              /* Scan for matches in left half.  */
301
434
              i = suffix - 1; /* Wraps to SIZE_MAX when SUFFIX is 0.  */
302
1.72k
              while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
303
1.66k
                                       == CANON_ELEMENT (haystack[i + j])))
304
1.29k
                --i;
305
434
              if (i == SIZE_MAX) /* Ran off the front: the whole left half matched.  */
306
68
                return (RETURN_TYPE) (haystack + j);
307
366
              j += period;
308
366
            }
309
3.96k
          else
310
3.96k
            j += i - suffix + 1; /* Right half of the next window starts just after the mismatch.  */
311
4.40k
        }
312
542
    }
313
474
  return NULL; /* AVAILABLE reported too few bytes left; no match.  */
314
542
}
315
316
/* Return the first location of non-empty NEEDLE within HAYSTACK, or
317
   NULL.  HAYSTACK_LEN is the minimum known length of HAYSTACK.  This
318
   method is optimized for LONG_NEEDLE_THRESHOLD <= NEEDLE_LEN.
319
   Performance is guaranteed to be linear, with an initialization cost
320
   of 3 * NEEDLE_LEN + (1 << CHAR_BIT) operations.
321
322
   If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
323
   most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching,
324
   and sublinear performance O(HAYSTACK_LEN / NEEDLE_LEN) is possible.
325
   If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
326
   HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and
327
   sublinear performance is not possible.  */
328
static RETURN_TYPE _GL_ATTRIBUTE_PURE
329
two_way_long_needle (const unsigned char *haystack, size_t haystack_len,
330
                     const unsigned char *needle, size_t needle_len)
331
0
{
332
0
  size_t i; /* Index into current byte of NEEDLE.  */
333
0
  size_t j; /* Index into current window of HAYSTACK.  */
334
0
  size_t period; /* The period of the right half of needle.  */
335
0
  size_t suffix; /* The index of the right half of needle.  */
336
0
  size_t shift_table[1U << CHAR_BIT]; /* Per-byte shift distances, filled in below.  */
337
338
  /* Factor the needle into two halves, such that the left half is
339
     smaller than the global period, and the right half is
340
     periodic (with a period as large as NEEDLE_LEN - suffix).  */
341
0
  suffix = critical_factorization (needle, needle_len, &period);
342
343
  /* Populate shift_table.  For each possible byte value c,
344
     shift_table[c] is the distance from the last occurrence of c to
345
     the end of NEEDLE, or NEEDLE_LEN if c is absent from the NEEDLE.
346
     shift_table[NEEDLE[NEEDLE_LEN - 1]] contains the only 0.  */
347
0
  for (i = 0; i < 1U << CHAR_BIT; i++)
348
0
    shift_table[i] = needle_len;
349
0
  for (i = 0; i < needle_len; i++)
350
0
    shift_table[CANON_ELEMENT (needle[i])] = needle_len - i - 1; /* Later occurrences overwrite earlier ones.  */
351
352
  /* Perform the search.  Each iteration compares the right half
353
     first.  */
354
0
  if (CMP_FUNC (needle, needle + period, suffix) == 0) /* Do the first SUFFIX bytes recur PERIOD bytes later?  */
355
0
    {
356
      /* Entire needle is periodic; a mismatch in the left half can
357
         only advance by the period, so use memory to avoid rescanning
358
         known occurrences of the period in the right half.  */
359
0
      size_t memory = 0;
360
0
      size_t shift;
361
0
      j = 0;
362
0
      while (AVAILABLE (haystack, haystack_len, j, needle_len))
363
0
        {
364
          /* Check the last byte first; if it does not match, then
365
             shift to the next possible match location.  */
366
0
          shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
367
0
          if (0 < shift) /* Window's last byte differs from NEEDLE's last byte.  */
368
0
            {
369
0
              if (memory && shift < period)
370
0
                {
371
                  /* Since needle is periodic, but the last period has
372
                     a byte out of place, there can be no match until
373
                     after the mismatch.  */
374
0
                  shift = needle_len - period;
375
0
                }
376
0
              memory = 0;
377
0
              j += shift;
378
0
              continue;
379
0
            }
380
          /* Scan for matches in right half.  The last byte has
381
             already been matched, by virtue of the shift table.  */
382
0
          i = MAX (suffix, memory);
383
0
          while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
384
0
                                        == CANON_ELEMENT (haystack[i + j])))
385
0
            ++i;
386
0
          if (needle_len - 1 <= i)
387
0
            {
388
              /* Scan for matches in left half.  */
389
0
              i = suffix - 1; /* Wraps to SIZE_MAX when SUFFIX is 0.  */
390
0
              while (memory < i + 1 && (CANON_ELEMENT (needle[i])
391
0
                                        == CANON_ELEMENT (haystack[i + j])))
392
0
                --i;
393
0
              if (i + 1 < memory + 1) /* Left half matched all the way down to MEMORY.  */
394
0
                return (RETURN_TYPE) (haystack + j);
395
              /* No match, so remember how many repetitions of period
396
                 on the right half were scanned.  */
397
0
              j += period;
398
0
              memory = needle_len - period;
399
0
            }
400
0
          else
401
0
            {
402
0
              j += i - suffix + 1; /* Right half of the next window starts just after the mismatch.  */
403
0
              memory = 0;
404
0
            }
405
0
        }
406
0
    }
407
0
  else
408
0
    {
409
      /* The two halves of needle are distinct; no extra memory is
410
         required, and any mismatch results in a maximal shift.  */
411
0
      size_t shift;
412
0
      period = MAX (suffix, needle_len - suffix) + 1;
413
0
      j = 0;
414
0
      while (AVAILABLE (haystack, haystack_len, j, needle_len))
415
0
        {
416
          /* Check the last byte first; if it does not match, then
417
             shift to the next possible match location.  */
418
0
          shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
419
0
          if (0 < shift) /* Window's last byte differs from NEEDLE's last byte.  */
420
0
            {
421
0
              j += shift;
422
0
              continue;
423
0
            }
424
          /* Scan for matches in right half.  The last byte has
425
             already been matched, by virtue of the shift table.  */
426
0
          i = suffix;
427
0
          while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
428
0
                                        == CANON_ELEMENT (haystack[i + j])))
429
0
            ++i;
430
0
          if (needle_len - 1 <= i)
431
0
            {
432
              /* Scan for matches in left half.  */
433
0
              i = suffix - 1; /* Wraps to SIZE_MAX when SUFFIX is 0.  */
434
0
              while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
435
0
                                       == CANON_ELEMENT (haystack[i + j])))
436
0
                --i;
437
0
              if (i == SIZE_MAX) /* Ran off the front: the whole left half matched.  */
438
0
                return (RETURN_TYPE) (haystack + j);
439
0
              j += period;
440
0
            }
441
0
          else
442
0
            j += i - suffix + 1; /* Right half of the next window starts just after the mismatch.  */
443
0
        }
444
0
    }
445
0
  return NULL; /* AVAILABLE reported too few bytes left; no match.  */
446
0
}
447
448
#undef AVAILABLE
449
#undef CANON_ELEMENT
450
#undef CMP_FUNC
451
#undef MAX
452
#undef RETURN_TYPE