Coverage Report

Created: 2025-08-12 06:43

/src/postgres/src/backend/utils/adt/like_match.c
Line
Count
Source (jump to first uncovered line)
1
/*-------------------------------------------------------------------------
2
 *
3
 * like_match.c
4
 *    LIKE pattern matching internal code.
5
 *
6
 * This file is included by like.c four times, to provide matching code for
7
 * (1) single-byte encodings, (2) UTF8, (3) other multi-byte encodings,
8
 * and (4) case insensitive matches in single-byte encodings.
9
 * (UTF8 is a special case because we can use a much more efficient version
10
 * of NextChar than can be used for general multi-byte encodings.)
11
 *
12
 * Before the inclusion, we need to define the following macros:
13
 *
14
 * NextChar
15
 * MatchText - to name of function wanted
16
 * do_like_escape - name of function if wanted - needs CHAREQ and CopyAdvChar
17
 * MATCH_LOWER - define for case (4) to specify case folding for 1-byte chars
18
 *
19
 * Copyright (c) 1996-2025, PostgreSQL Global Development Group
20
 *
21
 * IDENTIFICATION
22
 *  src/backend/utils/adt/like_match.c
23
 *
24
 *-------------------------------------------------------------------------
25
 */
26
27
/*
28
 *  Originally written by Rich $alz, mirror!rs, Wed Nov 26 19:03:17 EST 1986.
29
 *  Rich $alz is now <rsalz@bbn.com>.
30
 *  Special thanks to Lars Mathiesen <thorinn@diku.dk> for the
31
 *  LIKE_ABORT code.
32
 *
33
 *  This code was shamelessly stolen from the "pql" code by myself and
34
 *  slightly modified :)
35
 *
36
 *  All references to the word "star" were replaced by "percent"
37
 *  All references to the word "wild" were replaced by "like"
38
 *
39
 *  All the nice shell RE matching stuff was replaced by just "_" and "%"
40
 *
41
 *  As I don't have a copy of the SQL standard handy I wasn't sure whether
42
 *  to leave in the '\' escape character handling.
43
 *
44
 *  Keith Parks. <keith@mtcc.demon.co.uk>
45
 *
46
 *  SQL lets you specify the escape character by saying
47
 *  LIKE <pattern> ESCAPE <escape character>. We are a small operation
48
 *  so we force you to use '\'. - ay 7/95
49
 *
50
 *  Now we have the like_escape() function that converts patterns with
51
 *  any specified escape character (or none at all) to the internal
52
 *  default escape character, which is still '\'. - tgl 9/2000
53
 *
54
 * The code is rewritten to avoid requiring null-terminated strings,
55
 * which in turn allows us to leave out some memcpy() operations.
56
 * This code should be faster and take less memory, but no promises...
57
 * - thomas 2000-08-06
58
 */
59
60
61
/*--------------------
62
 *  Match text and pattern, return LIKE_TRUE, LIKE_FALSE, or LIKE_ABORT.
63
 *
64
 *  LIKE_TRUE: they match
65
 *  LIKE_FALSE: they don't match
66
 *  LIKE_ABORT: not only don't they match, but the text is too short.
67
 *
68
 * If LIKE_ABORT is returned, then no suffix of the text can match the
69
 * pattern either, so an upper-level % scan can stop scanning now.
70
 *--------------------
71
 */
72
73
#ifdef MATCH_LOWER
74
0
#define GETCHAR(t, locale) MATCH_LOWER(t, locale)
75
#else
76
0
#define GETCHAR(t, locale) (t)
77
#endif
78
79
static int
80
MatchText(const char *t, int tlen, const char *p, int plen, pg_locale_t locale)
81
0
{
82
  /* Fast path for match-everything pattern */
83
0
  if (plen == 1 && *p == '%')
84
0
    return LIKE_TRUE;
85
86
  /* Since this function recurses, it could be driven to stack overflow */
87
0
  check_stack_depth();
88
89
  /*
90
   * In this loop, we advance by char when matching wildcards (and thus on
91
   * recursive entry to this function we are properly char-synced). On other
92
   * occasions it is safe to advance by byte, as the text and pattern will
93
   * be in lockstep. This allows us to perform all comparisons between the
94
   * text and pattern on a byte by byte basis, even for multi-byte
95
   * encodings.
96
   */
97
0
  while (tlen > 0 && plen > 0)
98
0
  {
99
0
    if (*p == '\\')
100
0
    {
101
      /* Next pattern byte must match literally, whatever it is */
102
0
      NextByte(p, plen);
103
      /* ... and there had better be one, per SQL standard */
104
0
      if (plen <= 0)
105
0
        ereport(ERROR,
106
0
            (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
107
0
             errmsg("LIKE pattern must not end with escape character")));
108
0
      if (GETCHAR(*p, locale) != GETCHAR(*t, locale))
109
0
        return LIKE_FALSE;
110
0
    }
111
0
    else if (*p == '%')
112
0
    {
113
0
      char    firstpat;
114
115
      /*
116
       * % processing is essentially a search for a text position at
117
       * which the remainder of the text matches the remainder of the
118
       * pattern, using a recursive call to check each potential match.
119
       *
120
       * If there are wildcards immediately following the %, we can skip
121
       * over them first, using the idea that any sequence of N _'s and
122
       * one or more %'s is equivalent to N _'s and one % (ie, it will
123
       * match any sequence of at least N text characters).  In this way
124
       * we will always run the recursive search loop using a pattern
125
       * fragment that begins with a literal character-to-match, thereby
126
       * not recursing more than we have to.
127
       */
128
0
      NextByte(p, plen);
129
130
0
      while (plen > 0)
131
0
      {
132
0
        if (*p == '%')
133
0
          NextByte(p, plen);
134
0
        else if (*p == '_')
135
0
        {
136
          /* If not enough text left to match the pattern, ABORT */
137
0
          if (tlen <= 0)
138
0
            return LIKE_ABORT;
139
0
          NextChar(t, tlen);
140
0
          NextByte(p, plen);
141
0
        }
142
0
        else
143
0
          break;   /* Reached a non-wildcard pattern char */
144
0
      }
145
146
      /*
147
       * If we're at end of pattern, match: we have a trailing % which
148
       * matches any remaining text string.
149
       */
150
0
      if (plen <= 0)
151
0
        return LIKE_TRUE;
152
153
      /*
154
       * Otherwise, scan for a text position at which we can match the
155
       * rest of the pattern.  The first remaining pattern char is known
156
       * to be a regular or escaped literal character, so we can compare
157
       * the first pattern byte to each text byte to avoid recursing
158
       * more than we have to.  This fact also guarantees that we don't
159
       * have to consider a match to the zero-length substring at the
160
       * end of the text.  With a nondeterministic collation, we can't
161
       * rely on the first bytes being equal, so we have to recurse in
162
       * any case.
163
       */
164
0
      if (*p == '\\')
165
0
      {
166
0
        if (plen < 2)
167
0
          ereport(ERROR,
168
0
              (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
169
0
               errmsg("LIKE pattern must not end with escape character")));
170
0
        firstpat = GETCHAR(p[1], locale);
171
0
      }
172
0
      else
173
0
        firstpat = GETCHAR(*p, locale);
174
175
0
      while (tlen > 0)
176
0
      {
177
0
        if (GETCHAR(*t, locale) == firstpat || (locale && !locale->deterministic))
178
0
        {
179
0
          int     matched = MatchText(t, tlen, p, plen, locale);
180
181
0
          if (matched != LIKE_FALSE)
182
0
            return matched; /* TRUE or ABORT */
183
0
        }
184
185
0
        NextChar(t, tlen);
186
0
      }
187
188
      /*
189
       * End of text with no match, so no point in trying later places
190
       * to start matching this pattern.
191
       */
192
0
      return LIKE_ABORT;
193
0
    }
194
0
    else if (*p == '_')
195
0
    {
196
      /* _ matches any single character, and we know there is one */
197
0
      NextChar(t, tlen);
198
0
      NextByte(p, plen);
199
0
      continue;
200
0
    }
201
0
    else if (locale && !locale->deterministic)
202
0
    {
203
      /*
204
       * For nondeterministic locales, we find the next substring of the
205
       * pattern that does not contain wildcards and try to find a
206
       * matching substring in the text.  Crucially, we cannot do this
207
       * character by character, as in the normal case, but must do it
208
       * substring by substring, partitioned by the wildcard characters.
209
       * (This is per SQL standard.)
210
       */
211
0
      const char *p1;
212
0
      size_t    p1len;
213
0
      const char *t1;
214
0
      size_t    t1len;
215
0
      bool    found_escape;
216
0
      const char *subpat;
217
0
      size_t    subpatlen;
218
0
      char     *buf = NULL;
219
220
      /*
221
       * Determine next substring of pattern without wildcards.  p is
222
       * the start of the subpattern, p1 is one past the last byte. Also
223
       * track if we found an escape character.
224
       */
225
0
      p1 = p;
226
0
      p1len = plen;
227
0
      found_escape = false;
228
0
      while (p1len > 0)
229
0
      {
230
0
        if (*p1 == '\\')
231
0
        {
232
0
          found_escape = true;
233
0
          NextByte(p1, p1len);
234
0
          if (p1len == 0)
235
0
            ereport(ERROR,
236
0
                (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
237
0
                 errmsg("LIKE pattern must not end with escape character")));
238
0
        }
239
0
        else if (*p1 == '_' || *p1 == '%')
240
0
          break;
241
0
        NextByte(p1, p1len);
242
0
      }
243
244
      /*
245
       * If we found an escape character, then make an unescaped copy of
246
       * the subpattern.
247
       */
248
0
      if (found_escape)
249
0
      {
250
0
        char     *b;
251
252
0
        b = buf = palloc(p1 - p);
253
0
        for (const char *c = p; c < p1; c++)
254
0
        {
255
0
          if (*c == '\\')
256
0
            ;
257
0
          else
258
0
            *(b++) = *c;
259
0
        }
260
261
0
        subpat = buf;
262
0
        subpatlen = b - buf;
263
0
      }
264
0
      else
265
0
      {
266
0
        subpat = p;
267
0
        subpatlen = p1 - p;
268
0
      }
269
270
      /*
271
       * Shortcut: If this is the end of the pattern, then the rest of
272
       * the text has to match the rest of the pattern.
273
       */
274
0
      if (p1len == 0)
275
0
      {
276
0
        int     cmp;
277
278
0
        cmp = pg_strncoll(subpat, subpatlen, t, tlen, locale);
279
280
0
        if (buf)
281
0
          pfree(buf);
282
0
        if (cmp == 0)
283
0
          return LIKE_TRUE;
284
0
        else
285
0
          return LIKE_FALSE;
286
0
      }
287
288
      /*
289
       * Now build a substring of the text and try to match it against
290
       * the subpattern.  t is the start of the text, t1 is one past the
291
       * last byte.  We start with a zero-length string.
292
       */
293
0
      t1 = t;
294
0
      t1len = tlen;
295
0
      for (;;)
296
0
      {
297
0
        int     cmp;
298
299
0
        CHECK_FOR_INTERRUPTS();
300
301
0
        cmp = pg_strncoll(subpat, subpatlen, t, (t1 - t), locale);
302
303
        /*
304
         * If we found a match, we have to test if the rest of pattern
305
         * can match against the rest of the string.  Otherwise we
306
         * have to continue here try matching with a longer substring.
307
         * (This is similar to the recursion for the '%' wildcard
308
         * above.)
309
         *
310
         * Note that we can't just wind forward p and t and continue
311
         * with the main loop.  This would fail for example with
312
         *
313
         * U&'\0061\0308bc' LIKE U&'\00E4_c' COLLATE ignore_accents
314
         *
315
         * You'd find that t=\0061 matches p=\00E4, but then the rest
316
         * won't match; but t=\0061\0308 also matches p=\00E4, and
317
         * then the rest will match.
318
         */
319
0
        if (cmp == 0)
320
0
        {
321
0
          int     matched = MatchText(t1, t1len, p1, p1len, locale);
322
323
0
          if (matched == LIKE_TRUE)
324
0
          {
325
0
            if (buf)
326
0
              pfree(buf);
327
0
            return matched;
328
0
          }
329
0
        }
330
331
        /*
332
         * Didn't match.  If we used up the whole text, then the match
333
         * fails.  Otherwise, try again with a longer substring.
334
         */
335
0
        if (t1len == 0)
336
0
        {
337
0
          if (buf)
338
0
            pfree(buf);
339
0
          return LIKE_FALSE;
340
0
        }
341
0
        else
342
0
          NextChar(t1, t1len);
343
0
      }
344
0
    }
345
0
    else if (GETCHAR(*p, locale) != GETCHAR(*t, locale))
346
0
    {
347
      /* non-wildcard pattern char fails to match text char */
348
0
      return LIKE_FALSE;
349
0
    }
350
351
    /*
352
     * Pattern and text match, so advance.
353
     *
354
     * It is safe to use NextByte instead of NextChar here, even for
355
     * multi-byte character sets, because we are not following immediately
356
     * after a wildcard character. If we are in the middle of a multibyte
357
     * character, we must already have matched at least one byte of the
358
     * character from both text and pattern; so we cannot get out-of-sync
359
     * on character boundaries.  And we know that no backend-legal
360
     * encoding allows ASCII characters such as '%' to appear as non-first
361
     * bytes of characters, so we won't mistakenly detect a new wildcard.
362
     */
363
0
    NextByte(t, tlen);
364
0
    NextByte(p, plen);
365
0
  }
366
367
0
  if (tlen > 0)
368
0
    return LIKE_FALSE;   /* end of pattern, but not of text */
369
370
  /*
371
   * End of text, but perhaps not of pattern.  Match iff the remaining
372
   * pattern can match a zero-length string, ie, it's zero or more %'s.
373
   */
374
0
  while (plen > 0 && *p == '%')
375
0
    NextByte(p, plen);
376
0
  if (plen <= 0)
377
0
    return LIKE_TRUE;
378
379
  /*
380
   * End of text with no match, so no point in trying later places to start
381
   * matching this pattern.
382
   */
383
0
  return LIKE_ABORT;
384
0
}                /* MatchText() */
Unexecuted instantiation: like.c:UTF8_MatchText
Unexecuted instantiation: like.c:MB_MatchText
Unexecuted instantiation: like.c:SB_MatchText
Unexecuted instantiation: like.c:SB_IMatchText
385
386
/*
387
 * like_escape() --- given a pattern and an ESCAPE string,
388
 * convert the pattern to use Postgres' standard backslash escape convention.
389
 */
390
#ifdef do_like_escape
391
392
static text *
393
do_like_escape(text *pat, text *esc)
394
0
{
395
0
  text     *result;
396
0
  char     *p,
397
0
         *e,
398
0
         *r;
399
0
  int     plen,
400
0
        elen;
401
0
  bool    afterescape;
402
403
0
  p = VARDATA_ANY(pat);
404
0
  plen = VARSIZE_ANY_EXHDR(pat);
405
0
  e = VARDATA_ANY(esc);
406
0
  elen = VARSIZE_ANY_EXHDR(esc);
407
408
  /*
409
   * Worst-case pattern growth is 2x --- unlikely, but it's hardly worth
410
   * trying to calculate the size more accurately than that.
411
   */
412
0
  result = (text *) palloc(plen * 2 + VARHDRSZ);
413
0
  r = VARDATA(result);
414
415
0
  if (elen == 0)
416
0
  {
417
    /*
418
     * No escape character is wanted.  Double any backslashes in the
419
     * pattern to make them act like ordinary characters.
420
     */
421
0
    while (plen > 0)
422
0
    {
423
0
      if (*p == '\\')
424
0
        *r++ = '\\';
425
0
      CopyAdvChar(r, p, plen);
426
0
    }
427
0
  }
428
0
  else
429
0
  {
430
    /*
431
     * The specified escape must be only a single character.
432
     */
433
0
    NextChar(e, elen);
434
0
    if (elen != 0)
435
0
      ereport(ERROR,
436
0
          (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
437
0
           errmsg("invalid escape string"),
438
0
           errhint("Escape string must be empty or one character.")));
439
440
0
    e = VARDATA_ANY(esc);
441
442
    /*
443
     * If specified escape is '\', just copy the pattern as-is.
444
     */
445
0
    if (*e == '\\')
446
0
    {
447
0
      memcpy(result, pat, VARSIZE_ANY(pat));
448
0
      return result;
449
0
    }
450
451
    /*
452
     * Otherwise, convert occurrences of the specified escape character to
453
     * '\', and double occurrences of '\' --- unless they immediately
454
     * follow an escape character!
455
     */
456
0
    afterescape = false;
457
0
    while (plen > 0)
458
0
    {
459
0
      if (CHAREQ(p, e) && !afterescape)
460
0
      {
461
0
        *r++ = '\\';
462
0
        NextChar(p, plen);
463
0
        afterescape = true;
464
0
      }
465
0
      else if (*p == '\\')
466
0
      {
467
0
        *r++ = '\\';
468
0
        if (!afterescape)
469
0
          *r++ = '\\';
470
0
        NextChar(p, plen);
471
0
        afterescape = false;
472
0
      }
473
0
      else
474
0
      {
475
0
        CopyAdvChar(r, p, plen);
476
0
        afterescape = false;
477
0
      }
478
0
    }
479
0
  }
480
481
0
  SET_VARSIZE(result, r - ((char *) result));
482
483
0
  return result;
484
0
}
Unexecuted instantiation: like.c:SB_do_like_escape
Unexecuted instantiation: like.c:MB_do_like_escape
485
#endif              /* do_like_escape */
486
487
#ifdef CHAREQ
488
#undef CHAREQ
489
#endif
490
491
#undef NextChar
492
#undef CopyAdvChar
493
#undef MatchText
494
495
#ifdef do_like_escape
496
#undef do_like_escape
497
#endif
498
499
#undef GETCHAR
500
501
#ifdef MATCH_LOWER
502
#undef MATCH_LOWER
503
504
#endif