/src/postgres/src/backend/utils/adt/like_match.c
Line | Count | Source (jump to first uncovered line) |
1 | | /*------------------------------------------------------------------------- |
2 | | * |
3 | | * like_match.c |
4 | | * LIKE pattern matching internal code. |
5 | | * |
6 | | * This file is included by like.c four times, to provide matching code for |
7 | | * (1) single-byte encodings, (2) UTF8, (3) other multi-byte encodings, |
8 | | * and (4) case insensitive matches in single-byte encodings. |
9 | | * (UTF8 is a special case because we can use a much more efficient version |
10 | | * of NextChar than can be used for general multi-byte encodings.) |
11 | | * |
12 | | * Before the inclusion, we need to define the following macros: |
13 | | * |
14 | | * NextChar |
15 | | * MatchText - the name to give the matching function |
16 | | * do_like_escape - name of function if wanted - needs CHAREQ and CopyAdvChar |
17 | | * MATCH_LOWER - define for case (4) to specify case folding for 1-byte chars |
18 | | * |
19 | | * Copyright (c) 1996-2025, PostgreSQL Global Development Group |
20 | | * |
21 | | * IDENTIFICATION |
22 | | * src/backend/utils/adt/like_match.c |
23 | | * |
24 | | *------------------------------------------------------------------------- |
25 | | */ |
26 | | |
27 | | /* |
28 | | * Originally written by Rich $alz, mirror!rs, Wed Nov 26 19:03:17 EST 1986. |
29 | | * Rich $alz is now <rsalz@bbn.com>. |
30 | | * Special thanks to Lars Mathiesen <thorinn@diku.dk> for the |
31 | | * LIKE_ABORT code. |
32 | | * |
33 | | * This code was shamelessly stolen from the "pql" code by myself and |
34 | | * slightly modified :) |
35 | | * |
36 | | * All references to the word "star" were replaced by "percent" |
37 | | * All references to the word "wild" were replaced by "like" |
38 | | * |
39 | | * All the nice shell RE matching stuff was replaced by just "_" and "%" |
40 | | * |
41 | | * As I don't have a copy of the SQL standard handy I wasn't sure whether |
42 | | * to leave in the '\' escape character handling. |
43 | | * |
44 | | * Keith Parks. <keith@mtcc.demon.co.uk> |
45 | | * |
46 | | * SQL lets you specify the escape character by saying |
47 | | * LIKE <pattern> ESCAPE <escape character>. We are a small operation |
48 | | * so we force you to use '\'. - ay 7/95 |
49 | | * |
50 | | * Now we have the like_escape() function that converts patterns with |
51 | | * any specified escape character (or none at all) to the internal |
52 | | * default escape character, which is still '\'. - tgl 9/2000 |
53 | | * |
54 | | * The code is rewritten to avoid requiring null-terminated strings, |
55 | | * which in turn allows us to leave out some memcpy() operations. |
56 | | * This code should be faster and take less memory, but no promises... |
57 | | * - thomas 2000-08-06 |
58 | | */ |
59 | | |
60 | | |
61 | | /*-------------------- |
62 | | * Match text and pattern, return LIKE_TRUE, LIKE_FALSE, or LIKE_ABORT. |
63 | | * |
64 | | * LIKE_TRUE: they match |
65 | | * LIKE_FALSE: they don't match |
66 | | * LIKE_ABORT: not only don't they match, but the text is too short. |
67 | | * |
68 | | * If LIKE_ABORT is returned, then no suffix of the text can match the |
69 | | * pattern either, so an upper-level % scan can stop scanning now. |
70 | | *-------------------- |
71 | | */ |
72 | | |
73 | | #ifdef MATCH_LOWER |
74 | 0 | #define GETCHAR(t, locale) MATCH_LOWER(t, locale) |
75 | | #else |
76 | 0 | #define GETCHAR(t, locale) (t) |
77 | | #endif |
78 | | |
79 | | static int |
80 | | MatchText(const char *t, int tlen, const char *p, int plen, pg_locale_t locale) |
81 | 0 | { |
82 | | /* Fast path for match-everything pattern */ |
83 | 0 | if (plen == 1 && *p == '%') |
84 | 0 | return LIKE_TRUE; |
85 | | |
86 | | /* Since this function recurses, it could be driven to stack overflow */ |
87 | 0 | check_stack_depth(); |
88 | | |
89 | | /* |
90 | | * In this loop, we advance by char when matching wildcards (and thus on |
91 | | * recursive entry to this function we are properly char-synced). On other |
92 | | * occasions it is safe to advance by byte, as the text and pattern will |
93 | | * be in lockstep. This allows us to perform all comparisons between the |
94 | | * text and pattern on a byte by byte basis, even for multi-byte |
95 | | * encodings. |
96 | | */ |
97 | 0 | while (tlen > 0 && plen > 0) |
98 | 0 | { |
99 | 0 | if (*p == '\\') |
100 | 0 | { |
101 | | /* Next pattern byte must match literally, whatever it is */ |
102 | 0 | NextByte(p, plen); |
103 | | /* ... and there had better be one, per SQL standard */ |
104 | 0 | if (plen <= 0) |
105 | 0 | ereport(ERROR, |
106 | 0 | (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE), |
107 | 0 | errmsg("LIKE pattern must not end with escape character"))); |
108 | 0 | if (GETCHAR(*p, locale) != GETCHAR(*t, locale)) |
109 | 0 | return LIKE_FALSE; |
110 | 0 | } |
111 | 0 | else if (*p == '%') |
112 | 0 | { |
113 | 0 | char firstpat; |
114 | | |
115 | | /* |
116 | | * % processing is essentially a search for a text position at |
117 | | * which the remainder of the text matches the remainder of the |
118 | | * pattern, using a recursive call to check each potential match. |
119 | | * |
120 | | * If there are wildcards immediately following the %, we can skip |
121 | | * over them first, using the idea that any sequence of N _'s and |
122 | | * one or more %'s is equivalent to N _'s and one % (ie, it will |
123 | | * match any sequence of at least N text characters). In this way |
124 | | * we will always run the recursive search loop using a pattern |
125 | | * fragment that begins with a literal character-to-match, thereby |
126 | | * not recursing more than we have to. |
127 | | */ |
128 | 0 | NextByte(p, plen); |
129 | |
|
130 | 0 | while (plen > 0) |
131 | 0 | { |
132 | 0 | if (*p == '%') |
133 | 0 | NextByte(p, plen); |
134 | 0 | else if (*p == '_') |
135 | 0 | { |
136 | | /* If not enough text left to match the pattern, ABORT */ |
137 | 0 | if (tlen <= 0) |
138 | 0 | return LIKE_ABORT; |
139 | 0 | NextChar(t, tlen); |
140 | 0 | NextByte(p, plen); |
141 | 0 | } |
142 | 0 | else |
143 | 0 | break; /* Reached a non-wildcard pattern char */ |
144 | 0 | } |
145 | | |
146 | | /* |
147 | | * If we're at end of pattern, match: we have a trailing % which |
148 | | * matches any remaining text string. |
149 | | */ |
150 | 0 | if (plen <= 0) |
151 | 0 | return LIKE_TRUE; |
152 | | |
153 | | /* |
154 | | * Otherwise, scan for a text position at which we can match the |
155 | | * rest of the pattern. The first remaining pattern char is known |
156 | | * to be a regular or escaped literal character, so we can compare |
157 | | * the first pattern byte to each text byte to avoid recursing |
158 | | * more than we have to. This fact also guarantees that we don't |
159 | | * have to consider a match to the zero-length substring at the |
160 | | * end of the text. With a nondeterministic collation, we can't |
161 | | * rely on the first bytes being equal, so we have to recurse in |
162 | | * any case. |
163 | | */ |
164 | 0 | if (*p == '\\') |
165 | 0 | { |
166 | 0 | if (plen < 2) |
167 | 0 | ereport(ERROR, |
168 | 0 | (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE), |
169 | 0 | errmsg("LIKE pattern must not end with escape character"))); |
170 | 0 | firstpat = GETCHAR(p[1], locale); |
171 | 0 | } |
172 | 0 | else |
173 | 0 | firstpat = GETCHAR(*p, locale); |
174 | | |
175 | 0 | while (tlen > 0) |
176 | 0 | { |
177 | 0 | if (GETCHAR(*t, locale) == firstpat || (locale && !locale->deterministic)) |
178 | 0 | { |
179 | 0 | int matched = MatchText(t, tlen, p, plen, locale); |
180 | |
|
181 | 0 | if (matched != LIKE_FALSE) |
182 | 0 | return matched; /* TRUE or ABORT */ |
183 | 0 | } |
184 | | |
185 | 0 | NextChar(t, tlen); |
186 | 0 | } |
187 | | |
188 | | /* |
189 | | * End of text with no match, so no point in trying later places |
190 | | * to start matching this pattern. |
191 | | */ |
192 | 0 | return LIKE_ABORT; |
193 | 0 | } |
194 | 0 | else if (*p == '_') |
195 | 0 | { |
196 | | /* _ matches any single character, and we know there is one */ |
197 | 0 | NextChar(t, tlen); |
198 | 0 | NextByte(p, plen); |
199 | 0 | continue; |
200 | 0 | } |
201 | 0 | else if (locale && !locale->deterministic) |
202 | 0 | { |
203 | | /* |
204 | | * For nondeterministic locales, we find the next substring of the |
205 | | * pattern that does not contain wildcards and try to find a |
206 | | * matching substring in the text. Crucially, we cannot do this |
207 | | * character by character, as in the normal case, but must do it |
208 | | * substring by substring, partitioned by the wildcard characters. |
209 | | * (This is per SQL standard.) |
210 | | */ |
211 | 0 | const char *p1; |
212 | 0 | size_t p1len; |
213 | 0 | const char *t1; |
214 | 0 | size_t t1len; |
215 | 0 | bool found_escape; |
216 | 0 | const char *subpat; |
217 | 0 | size_t subpatlen; |
218 | 0 | char *buf = NULL; |
219 | | |
220 | | /* |
221 | | * Determine next substring of pattern without wildcards. p is |
222 | | * the start of the subpattern, p1 is one past the last byte. Also |
223 | | * track if we found an escape character. |
224 | | */ |
225 | 0 | p1 = p; |
226 | 0 | p1len = plen; |
227 | 0 | found_escape = false; |
228 | 0 | while (p1len > 0) |
229 | 0 | { |
230 | 0 | if (*p1 == '\\') |
231 | 0 | { |
232 | 0 | found_escape = true; |
233 | 0 | NextByte(p1, p1len); |
234 | 0 | if (p1len == 0) |
235 | 0 | ereport(ERROR, |
236 | 0 | (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE), |
237 | 0 | errmsg("LIKE pattern must not end with escape character"))); |
238 | 0 | } |
239 | 0 | else if (*p1 == '_' || *p1 == '%') |
240 | 0 | break; |
241 | 0 | NextByte(p1, p1len); |
242 | 0 | } |
243 | | |
244 | | /* |
245 | | * If we found an escape character, then make an unescaped copy of |
246 | | * the subpattern. |
247 | | */ |
248 | 0 | if (found_escape) |
249 | 0 | { |
250 | 0 | char *b; |
251 | |
|
252 | 0 | b = buf = palloc(p1 - p); |
253 | 0 | for (const char *c = p; c < p1; c++) |
254 | 0 | { |
255 | 0 | if (*c == '\\') |
256 | 0 | ; |
257 | 0 | else |
258 | 0 | *(b++) = *c; |
259 | 0 | } |
260 | |
|
261 | 0 | subpat = buf; |
262 | 0 | subpatlen = b - buf; |
263 | 0 | } |
264 | 0 | else |
265 | 0 | { |
266 | 0 | subpat = p; |
267 | 0 | subpatlen = p1 - p; |
268 | 0 | } |
269 | | |
270 | | /* |
271 | | * Shortcut: If this is the end of the pattern, then the rest of |
272 | | * the text has to match the rest of the pattern. |
273 | | */ |
274 | 0 | if (p1len == 0) |
275 | 0 | { |
276 | 0 | int cmp; |
277 | |
|
278 | 0 | cmp = pg_strncoll(subpat, subpatlen, t, tlen, locale); |
279 | |
|
280 | 0 | if (buf) |
281 | 0 | pfree(buf); |
282 | 0 | if (cmp == 0) |
283 | 0 | return LIKE_TRUE; |
284 | 0 | else |
285 | 0 | return LIKE_FALSE; |
286 | 0 | } |
287 | | |
288 | | /* |
289 | | * Now build a substring of the text and try to match it against |
290 | | * the subpattern. t is the start of the text, t1 is one past the |
291 | | * last byte. We start with a zero-length string. |
292 | | */ |
293 | 0 | t1 = t; |
294 | 0 | t1len = tlen; |
295 | 0 | for (;;) |
296 | 0 | { |
297 | 0 | int cmp; |
298 | |
|
299 | 0 | CHECK_FOR_INTERRUPTS(); |
300 | |
|
301 | 0 | cmp = pg_strncoll(subpat, subpatlen, t, (t1 - t), locale); |
302 | | |
303 | | /* |
304 | | * If we found a match, we have to test if the rest of pattern |
305 | | * can match against the rest of the string. Otherwise we |
306 | | * have to continue here try matching with a longer substring. |
307 | | * (This is similar to the recursion for the '%' wildcard |
308 | | * above.) |
309 | | * |
310 | | * Note that we can't just wind forward p and t and continue |
311 | | * with the main loop. This would fail for example with |
312 | | * |
313 | | * U&'\0061\0308bc' LIKE U&'\00E4_c' COLLATE ignore_accents |
314 | | * |
315 | | * You'd find that t=\0061 matches p=\00E4, but then the rest |
316 | | * won't match; but t=\0061\0308 also matches p=\00E4, and |
317 | | * then the rest will match. |
318 | | */ |
319 | 0 | if (cmp == 0) |
320 | 0 | { |
321 | 0 | int matched = MatchText(t1, t1len, p1, p1len, locale); |
322 | |
|
323 | 0 | if (matched == LIKE_TRUE) |
324 | 0 | { |
325 | 0 | if (buf) |
326 | 0 | pfree(buf); |
327 | 0 | return matched; |
328 | 0 | } |
329 | 0 | } |
330 | | |
331 | | /* |
332 | | * Didn't match. If we used up the whole text, then the match |
333 | | * fails. Otherwise, try again with a longer substring. |
334 | | */ |
335 | 0 | if (t1len == 0) |
336 | 0 | { |
337 | 0 | if (buf) |
338 | 0 | pfree(buf); |
339 | 0 | return LIKE_FALSE; |
340 | 0 | } |
341 | 0 | else |
342 | 0 | NextChar(t1, t1len); |
343 | 0 | } |
344 | 0 | } |
345 | 0 | else if (GETCHAR(*p, locale) != GETCHAR(*t, locale)) |
346 | 0 | { |
347 | | /* non-wildcard pattern char fails to match text char */ |
348 | 0 | return LIKE_FALSE; |
349 | 0 | } |
350 | | |
351 | | /* |
352 | | * Pattern and text match, so advance. |
353 | | * |
354 | | * It is safe to use NextByte instead of NextChar here, even for |
355 | | * multi-byte character sets, because we are not following immediately |
356 | | * after a wildcard character. If we are in the middle of a multibyte |
357 | | * character, we must already have matched at least one byte of the |
358 | | * character from both text and pattern; so we cannot get out-of-sync |
359 | | * on character boundaries. And we know that no backend-legal |
360 | | * encoding allows ASCII characters such as '%' to appear as non-first |
361 | | * bytes of characters, so we won't mistakenly detect a new wildcard. |
362 | | */ |
363 | 0 | NextByte(t, tlen); |
364 | 0 | NextByte(p, plen); |
365 | 0 | } |
366 | | |
367 | 0 | if (tlen > 0) |
368 | 0 | return LIKE_FALSE; /* end of pattern, but not of text */ |
369 | | |
370 | | /* |
371 | | * End of text, but perhaps not of pattern. Match iff the remaining |
372 | | * pattern can match a zero-length string, ie, it's zero or more %'s. |
373 | | */ |
374 | 0 | while (plen > 0 && *p == '%') |
375 | 0 | NextByte(p, plen); |
376 | 0 | if (plen <= 0) |
377 | 0 | return LIKE_TRUE; |
378 | | |
379 | | /* |
380 | | * End of text with no match, so no point in trying later places to start |
381 | | * matching this pattern. |
382 | | */ |
383 | 0 | return LIKE_ABORT; |
384 | 0 | } /* MatchText() */ Unexecuted instantiation: like.c:UTF8_MatchText Unexecuted instantiation: like.c:MB_MatchText Unexecuted instantiation: like.c:SB_MatchText Unexecuted instantiation: like.c:SB_IMatchText |
385 | | |
/*
 * like_escape() --- rewrite a pattern that uses a caller-chosen ESCAPE
 * string so that it uses Postgres' standard backslash escape convention.
 *
 * An empty ESCAPE string means "no escape character": backslashes in the
 * pattern are doubled so that they act as ordinary characters.  Otherwise
 * the ESCAPE string must be exactly one character; anything longer is
 * rejected with ERRCODE_INVALID_ESCAPE_SEQUENCE.
 *
 * Returns a newly palloc'd text value.
 */
#ifdef do_like_escape

static text *
do_like_escape(text *pat, text *esc)
{
	char	   *src = VARDATA_ANY(pat);
	int			srclen = VARSIZE_ANY_EXHDR(pat);
	char	   *escch = VARDATA_ANY(esc);
	int			esclen = VARSIZE_ANY_EXHDR(esc);
	text	   *result;
	char	   *dst;

	/*
	 * The output can be at most twice as long as the input (every byte a
	 * backslash that needs doubling), so allocate for that rather than
	 * computing the exact size.
	 */
	result = (text *) palloc(srclen * 2 + VARHDRSZ);
	dst = VARDATA(result);

	if (esclen == 0)
	{
		/* No escape character: neutralize backslashes by doubling them. */
		while (srclen > 0)
		{
			if (*src == '\\')
				*dst++ = '\\';
			CopyAdvChar(dst, src, srclen);
		}
	}
	else
	{
		bool		prev_was_escape = false;

		/* Step over one character; anything left over means it's too long. */
		NextChar(escch, esclen);
		if (esclen != 0)
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
					 errmsg("invalid escape string"),
					 errhint("Escape string must be empty or one character.")));

		/* NextChar moved the pointer, so point back at the escape char. */
		escch = VARDATA_ANY(esc);

		/* Backslash is already our convention: hand back a verbatim copy. */
		if (*escch == '\\')
		{
			memcpy(result, pat, VARSIZE_ANY(pat));
			return result;
		}

		/*
		 * General case: each active escape character becomes '\', and each
		 * '\' in the input is doubled --- except that a '\' directly after
		 * an escape character is the escaped character and stays single.
		 */
		while (srclen > 0)
		{
			if (!prev_was_escape && CHAREQ(src, escch))
			{
				*dst++ = '\\';
				NextChar(src, srclen);
				prev_was_escape = true;
				continue;
			}

			if (*src == '\\')
			{
				if (!prev_was_escape)
					*dst++ = '\\';
				*dst++ = '\\';
				NextChar(src, srclen);
			}
			else
				CopyAdvChar(dst, src, srclen);

			prev_was_escape = false;
		}
	}

	/* Total size is the distance from the start of the header to dst. */
	SET_VARSIZE(result, dst - ((char *) result));

	return result;
}
#endif							/* do_like_escape */
486 | | |
487 | | #ifdef CHAREQ |
488 | | #undef CHAREQ |
489 | | #endif |
490 | | |
491 | | #undef NextChar |
492 | | #undef CopyAdvChar |
493 | | #undef MatchText |
494 | | |
495 | | #ifdef do_like_escape |
496 | | #undef do_like_escape |
497 | | #endif |
498 | | |
499 | | #undef GETCHAR |
500 | | |
501 | | #ifdef MATCH_LOWER |
502 | | #undef MATCH_LOWER |
503 | | |
504 | | #endif |