Coverage Report

Created: 2025-06-13 06:06

/src/postgres/src/backend/utils/adt/regexp.c
Line
Count
Source (jump to first uncovered line)
1
/*-------------------------------------------------------------------------
2
 *
3
 * regexp.c
4
 *    Postgres' interface to the regular expression package.
5
 *
6
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7
 * Portions Copyright (c) 1994, Regents of the University of California
8
 *
9
 *
10
 * IDENTIFICATION
11
 *    src/backend/utils/adt/regexp.c
12
 *
13
 *    Alistair Crooks added the code for the regex caching
14
 *    agc - cached the regular expressions used - there's a good chance
15
 *    that we'll get a hit, so this saves a compile step for every
16
 *    attempted match. I haven't actually measured the speed improvement,
17
 *    but it `looks' a lot quicker visually when watching regression
18
 *    test output.
19
 *
20
 *    agc - incorporated Keith Bostic's Berkeley regex code into
21
 *    the tree for all ports. To distinguish this regex code from any that
22
 *    is existent on a platform, I've prepended the string "pg_" to
23
 *    the functions regcomp, regerror, regexec and regfree.
24
 *    Fixed a bug that was originally a typo by me, where `i' was used
25
 *    instead of `oldest' when compiling regular expressions - benign
26
 *    results mostly, although occasionally it bit you...
27
 *
28
 *-------------------------------------------------------------------------
29
 */
30
#include "postgres.h"
31
32
#include "catalog/pg_type.h"
33
#include "funcapi.h"
34
#include "regex/regex.h"
35
#include "utils/array.h"
36
#include "utils/builtins.h"
37
#include "utils/memutils.h"
38
#include "utils/varlena.h"
39
40
/*
 * Fetch argument number _n with PG_GETARG_TEXT_PP, or yield NULL when the
 * function was called with too few arguments for that position to exist.
 */
#define PG_GETARG_TEXT_PP_IF_EXISTS(_n) \
	((_n) < PG_NARGS() ? PG_GETARG_TEXT_PP(_n) : NULL)
42
43
44
/* all the options of interest for regex functions */
45
typedef struct pg_re_flags
46
{
47
  int     cflags;     /* compile flags for Spencer's regex code */
48
  bool    glob;     /* do it globally (for each occurrence) */
49
} pg_re_flags;
50
51
/* cross-call state for regexp_match and regexp_split functions */
52
typedef struct regexp_matches_ctx
53
{
54
  text     *orig_str;   /* data string in original TEXT form */
55
  int     nmatches;   /* number of places where pattern matched */
56
  int     npatterns;    /* number of capturing subpatterns */
57
  /* We store start char index and end+1 char index for each match */
58
  /* so the number of entries in match_locs is nmatches * npatterns * 2 */
59
  int      *match_locs;   /* 0-based character indexes */
60
  int     next_match;   /* 0-based index of next match to process */
61
  /* workspace for build_regexp_match_result() */
62
  Datum    *elems;      /* has npatterns elements */
63
  bool     *nulls;      /* has npatterns elements */
64
  pg_wchar   *wide_str;   /* wide-char version of original string */
65
  char     *conv_buf;   /* conversion buffer, if needed */
66
  int     conv_bufsiz;  /* size thereof */
67
} regexp_matches_ctx;
68
69
/*
70
 * We cache precompiled regular expressions using a "self organizing list"
71
 * structure, in which recently-used items tend to be near the front.
72
 * Whenever we use an entry, it's moved up to the front of the list.
73
 * Over time, an item's average position corresponds to its frequency of use.
74
 *
75
 * When we first create an entry, it's inserted at the front of
76
 * the array, dropping the entry at the end of the array if necessary to
77
 * make room.  (This might seem to be weighting the new entry too heavily,
78
 * but if we insert new entries further back, we'll be unable to adjust to
79
 * a sudden shift in the query mix where we are presented with MAX_CACHED_RES
80
 * never-before-seen items used circularly.  We ought to be able to handle
81
 * that case, so we have to insert at the front.)
82
 *
83
 * Knuth mentions a variant strategy in which a used item is moved up just
84
 * one place in the list.  Although he says this uses fewer comparisons on
85
 * average, it seems not to adapt very well to the situation where you have
86
 * both some reusable patterns and a steady stream of non-reusable patterns.
87
 * A reusable pattern that isn't used at least as often as non-reusable
88
 * patterns are seen will "fail to keep up" and will drop off the end of the
89
 * cache.  With move-to-front, a reusable pattern is guaranteed to stay in
90
 * the cache as long as it's used at least once in every MAX_CACHED_RES uses.
91
 */
92
93
/*
 * Upper bound on the number of compiled regular expressions kept in the
 * cache.  May be overridden by defining MAX_CACHED_RES before this point.
 */
#if !defined(MAX_CACHED_RES)
#define MAX_CACHED_RES	32
#endif
97
98
/* A parent memory context for regular expressions. */
99
static MemoryContext RegexpCacheMemoryContext;
100
101
/* this structure describes one cached regular expression */
102
typedef struct cached_re_str
103
{
104
  MemoryContext cre_context;  /* memory context for this regexp */
105
  char     *cre_pat;    /* original RE (not null terminated!) */
106
  int     cre_pat_len;  /* length of original RE, in bytes */
107
  int     cre_flags;    /* compile flags: extended,icase etc */
108
  Oid     cre_collation;  /* collation to use */
109
  regex_t   cre_re;     /* the compiled regular expression */
110
} cached_re_str;
111
112
static int  num_res = 0;    /* # of cached re's */
113
static cached_re_str re_array[MAX_CACHED_RES];  /* cached re's */
114
115
116
/* Local functions */
117
static regexp_matches_ctx *setup_regexp_matches(text *orig_str, text *pattern,
118
                        pg_re_flags *re_flags,
119
                        int start_search,
120
                        Oid collation,
121
                        bool use_subpatterns,
122
                        bool ignore_degenerate,
123
                        bool fetching_unmatched);
124
static ArrayType *build_regexp_match_result(regexp_matches_ctx *matchctx);
125
static Datum build_regexp_split_result(regexp_matches_ctx *splitctx);
126
127
128
/*
129
 * RE_compile_and_cache - compile a RE, caching if possible
130
 *
131
 * Returns regex_t *
132
 *
133
 *  text_re --- the pattern, expressed as a TEXT object
134
 *  cflags --- compile options for the pattern
135
 *  collation --- collation to use for LC_CTYPE-dependent behavior
136
 *
137
 * Pattern is given in the database encoding.  We internally convert to
138
 * an array of pg_wchar, which is what Spencer's regex package wants.
139
 */
140
regex_t *
141
RE_compile_and_cache(text *text_re, int cflags, Oid collation)
142
0
{
143
0
  int     text_re_len = VARSIZE_ANY_EXHDR(text_re);
144
0
  char     *text_re_val = VARDATA_ANY(text_re);
145
0
  pg_wchar   *pattern;
146
0
  int     pattern_len;
147
0
  int     i;
148
0
  int     regcomp_result;
149
0
  cached_re_str re_temp;
150
0
  char    errMsg[100];
151
0
  MemoryContext oldcontext;
152
153
  /*
154
   * Look for a match among previously compiled REs.  Since the data
155
   * structure is self-organizing with most-used entries at the front, our
156
   * search strategy can just be to scan from the front.
157
   */
158
0
  for (i = 0; i < num_res; i++)
159
0
  {
160
0
    if (re_array[i].cre_pat_len == text_re_len &&
161
0
      re_array[i].cre_flags == cflags &&
162
0
      re_array[i].cre_collation == collation &&
163
0
      memcmp(re_array[i].cre_pat, text_re_val, text_re_len) == 0)
164
0
    {
165
      /*
166
       * Found a match; move it to front if not there already.
167
       */
168
0
      if (i > 0)
169
0
      {
170
0
        re_temp = re_array[i];
171
0
        memmove(&re_array[1], &re_array[0], i * sizeof(cached_re_str));
172
0
        re_array[0] = re_temp;
173
0
      }
174
175
0
      return &re_array[0].cre_re;
176
0
    }
177
0
  }
178
179
  /* Set up the cache memory on first go through. */
180
0
  if (unlikely(RegexpCacheMemoryContext == NULL))
181
0
    RegexpCacheMemoryContext =
182
0
      AllocSetContextCreate(TopMemoryContext,
183
0
                  "RegexpCacheMemoryContext",
184
0
                  ALLOCSET_SMALL_SIZES);
185
186
  /*
187
   * Couldn't find it, so try to compile the new RE.  To avoid leaking
188
   * resources on failure, we build into the re_temp local.
189
   */
190
191
  /* Convert pattern string to wide characters */
192
0
  pattern = (pg_wchar *) palloc((text_re_len + 1) * sizeof(pg_wchar));
193
0
  pattern_len = pg_mb2wchar_with_len(text_re_val,
194
0
                     pattern,
195
0
                     text_re_len);
196
197
  /*
198
   * Make a memory context for this compiled regexp.  This is initially a
199
   * child of the current memory context, so it will be cleaned up
200
   * automatically if compilation is interrupted and throws an ERROR. We'll
201
   * re-parent it under the longer lived cache context if we make it to the
202
   * bottom of this function.
203
   */
204
0
  re_temp.cre_context = AllocSetContextCreate(CurrentMemoryContext,
205
0
                        "RegexpMemoryContext",
206
0
                        ALLOCSET_SMALL_SIZES);
207
0
  oldcontext = MemoryContextSwitchTo(re_temp.cre_context);
208
209
0
  regcomp_result = pg_regcomp(&re_temp.cre_re,
210
0
                pattern,
211
0
                pattern_len,
212
0
                cflags,
213
0
                collation);
214
215
0
  pfree(pattern);
216
217
0
  if (regcomp_result != REG_OKAY)
218
0
  {
219
    /* re didn't compile (no need for pg_regfree, if so) */
220
0
    pg_regerror(regcomp_result, &re_temp.cre_re, errMsg, sizeof(errMsg));
221
0
    ereport(ERROR,
222
0
        (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
223
0
         errmsg("invalid regular expression: %s", errMsg)));
224
0
  }
225
226
  /* Copy the pattern into the per-regexp memory context. */
227
0
  re_temp.cre_pat = palloc(text_re_len + 1);
228
0
  memcpy(re_temp.cre_pat, text_re_val, text_re_len);
229
230
  /*
231
   * NUL-terminate it only for the benefit of the identifier used for the
232
   * memory context, visible in the pg_backend_memory_contexts view.
233
   */
234
0
  re_temp.cre_pat[text_re_len] = 0;
235
0
  MemoryContextSetIdentifier(re_temp.cre_context, re_temp.cre_pat);
236
237
0
  re_temp.cre_pat_len = text_re_len;
238
0
  re_temp.cre_flags = cflags;
239
0
  re_temp.cre_collation = collation;
240
241
  /*
242
   * Okay, we have a valid new item in re_temp; insert it into the storage
243
   * array.  Discard last entry if needed.
244
   */
245
0
  if (num_res >= MAX_CACHED_RES)
246
0
  {
247
0
    --num_res;
248
0
    Assert(num_res < MAX_CACHED_RES);
249
    /* Delete the memory context holding the regexp and pattern. */
250
0
    MemoryContextDelete(re_array[num_res].cre_context);
251
0
  }
252
253
  /* Re-parent the memory context to our long-lived cache context. */
254
0
  MemoryContextSetParent(re_temp.cre_context, RegexpCacheMemoryContext);
255
256
0
  if (num_res > 0)
257
0
    memmove(&re_array[1], &re_array[0], num_res * sizeof(cached_re_str));
258
259
0
  re_array[0] = re_temp;
260
0
  num_res++;
261
262
0
  MemoryContextSwitchTo(oldcontext);
263
264
0
  return &re_array[0].cre_re;
265
0
}
266
267
/*
268
 * RE_wchar_execute - execute a RE on pg_wchar data
269
 *
270
 * Returns true on match, false on no match
271
 *
272
 *  re --- the compiled pattern as returned by RE_compile_and_cache
273
 *  data --- the data to match against (need not be null-terminated)
274
 *  data_len --- the length of the data string
275
 *  start_search -- the offset in the data to start searching
276
 *  nmatch, pmatch  --- optional return area for match details
277
 *
278
 * Data is given as array of pg_wchar which is what Spencer's regex package
279
 * wants.
280
 */
281
static bool
282
RE_wchar_execute(regex_t *re, pg_wchar *data, int data_len,
283
         int start_search, int nmatch, regmatch_t *pmatch)
284
0
{
285
0
  int     regexec_result;
286
0
  char    errMsg[100];
287
288
  /* Perform RE match and return result */
289
0
  regexec_result = pg_regexec(re,
290
0
                data,
291
0
                data_len,
292
0
                start_search,
293
0
                NULL, /* no details */
294
0
                nmatch,
295
0
                pmatch,
296
0
                0);
297
298
0
  if (regexec_result != REG_OKAY && regexec_result != REG_NOMATCH)
299
0
  {
300
    /* re failed??? */
301
0
    pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
302
0
    ereport(ERROR,
303
0
        (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
304
0
         errmsg("regular expression failed: %s", errMsg)));
305
0
  }
306
307
0
  return (regexec_result == REG_OKAY);
308
0
}
309
310
/*
311
 * RE_execute - execute a RE
312
 *
313
 * Returns true on match, false on no match
314
 *
315
 *  re --- the compiled pattern as returned by RE_compile_and_cache
316
 *  dat --- the data to match against (need not be null-terminated)
317
 *  dat_len --- the length of the data string
318
 *  nmatch, pmatch  --- optional return area for match details
319
 *
320
 * Data is given in the database encoding.  We internally
321
 * convert to array of pg_wchar which is what Spencer's regex package wants.
322
 */
323
static bool
324
RE_execute(regex_t *re, char *dat, int dat_len,
325
       int nmatch, regmatch_t *pmatch)
326
0
{
327
0
  pg_wchar   *data;
328
0
  int     data_len;
329
0
  bool    match;
330
331
  /* Convert data string to wide characters */
332
0
  data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
333
0
  data_len = pg_mb2wchar_with_len(dat, data, dat_len);
334
335
  /* Perform RE match and return result */
336
0
  match = RE_wchar_execute(re, data, data_len, 0, nmatch, pmatch);
337
338
0
  pfree(data);
339
0
  return match;
340
0
}
341
342
/*
343
 * RE_compile_and_execute - compile and execute a RE
344
 *
345
 * Returns true on match, false on no match
346
 *
347
 *  text_re --- the pattern, expressed as a TEXT object
348
 *  dat --- the data to match against (need not be null-terminated)
349
 *  dat_len --- the length of the data string
350
 *  cflags --- compile options for the pattern
351
 *  collation --- collation to use for LC_CTYPE-dependent behavior
352
 *  nmatch, pmatch  --- optional return area for match details
353
 *
354
 * Both pattern and data are given in the database encoding.  We internally
355
 * convert to array of pg_wchar which is what Spencer's regex package wants.
356
 */
357
bool
358
RE_compile_and_execute(text *text_re, char *dat, int dat_len,
359
             int cflags, Oid collation,
360
             int nmatch, regmatch_t *pmatch)
361
0
{
362
0
  regex_t    *re;
363
364
  /* Use REG_NOSUB if caller does not want sub-match details */
365
0
  if (nmatch < 2)
366
0
    cflags |= REG_NOSUB;
367
368
  /* Compile RE */
369
0
  re = RE_compile_and_cache(text_re, cflags, collation);
370
371
0
  return RE_execute(re, dat, dat_len, nmatch, pmatch);
372
0
}
373
374
375
/*
376
 * parse_re_flags - parse the options argument of regexp_match and friends
377
 *
378
 *  flags --- output argument, filled with desired options
379
 *  opts --- TEXT object, or NULL for defaults
380
 *
381
 * This accepts all the options allowed by any of the callers; callers that
382
 * don't want some have to reject them after the fact.
383
 */
384
static void
385
parse_re_flags(pg_re_flags *flags, text *opts)
386
0
{
387
  /* regex flavor is always folded into the compile flags */
388
0
  flags->cflags = REG_ADVANCED;
389
0
  flags->glob = false;
390
391
0
  if (opts)
392
0
  {
393
0
    char     *opt_p = VARDATA_ANY(opts);
394
0
    int     opt_len = VARSIZE_ANY_EXHDR(opts);
395
0
    int     i;
396
397
0
    for (i = 0; i < opt_len; i++)
398
0
    {
399
0
      switch (opt_p[i])
400
0
      {
401
0
        case 'g':
402
0
          flags->glob = true;
403
0
          break;
404
0
        case 'b':   /* BREs (but why???) */
405
0
          flags->cflags &= ~(REG_ADVANCED | REG_EXTENDED | REG_QUOTE);
406
0
          break;
407
0
        case 'c':   /* case sensitive */
408
0
          flags->cflags &= ~REG_ICASE;
409
0
          break;
410
0
        case 'e':   /* plain EREs */
411
0
          flags->cflags |= REG_EXTENDED;
412
0
          flags->cflags &= ~(REG_ADVANCED | REG_QUOTE);
413
0
          break;
414
0
        case 'i':   /* case insensitive */
415
0
          flags->cflags |= REG_ICASE;
416
0
          break;
417
0
        case 'm':   /* Perloid synonym for n */
418
0
        case 'n':   /* \n affects ^ $ . [^ */
419
0
          flags->cflags |= REG_NEWLINE;
420
0
          break;
421
0
        case 'p':   /* ~Perl, \n affects . [^ */
422
0
          flags->cflags |= REG_NLSTOP;
423
0
          flags->cflags &= ~REG_NLANCH;
424
0
          break;
425
0
        case 'q':   /* literal string */
426
0
          flags->cflags |= REG_QUOTE;
427
0
          flags->cflags &= ~(REG_ADVANCED | REG_EXTENDED);
428
0
          break;
429
0
        case 's':   /* single line, \n ordinary */
430
0
          flags->cflags &= ~REG_NEWLINE;
431
0
          break;
432
0
        case 't':   /* tight syntax */
433
0
          flags->cflags &= ~REG_EXPANDED;
434
0
          break;
435
0
        case 'w':   /* weird, \n affects ^ $ only */
436
0
          flags->cflags &= ~REG_NLSTOP;
437
0
          flags->cflags |= REG_NLANCH;
438
0
          break;
439
0
        case 'x':   /* expanded syntax */
440
0
          flags->cflags |= REG_EXPANDED;
441
0
          break;
442
0
        default:
443
0
          ereport(ERROR,
444
0
              (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
445
0
               errmsg("invalid regular expression option: \"%.*s\"",
446
0
                  pg_mblen(opt_p + i), opt_p + i)));
447
0
          break;
448
0
      }
449
0
    }
450
0
  }
451
0
}
452
453
454
/*
455
 *  interface routines called by the function manager
456
 */
457
458
Datum
459
nameregexeq(PG_FUNCTION_ARGS)
460
0
{
461
0
  Name    n = PG_GETARG_NAME(0);
462
0
  text     *p = PG_GETARG_TEXT_PP(1);
463
464
0
  PG_RETURN_BOOL(RE_compile_and_execute(p,
465
0
                      NameStr(*n),
466
0
                      strlen(NameStr(*n)),
467
0
                      REG_ADVANCED,
468
0
                      PG_GET_COLLATION(),
469
0
                      0, NULL));
470
0
}
471
472
Datum
473
nameregexne(PG_FUNCTION_ARGS)
474
0
{
475
0
  Name    n = PG_GETARG_NAME(0);
476
0
  text     *p = PG_GETARG_TEXT_PP(1);
477
478
0
  PG_RETURN_BOOL(!RE_compile_and_execute(p,
479
0
                       NameStr(*n),
480
0
                       strlen(NameStr(*n)),
481
0
                       REG_ADVANCED,
482
0
                       PG_GET_COLLATION(),
483
0
                       0, NULL));
484
0
}
485
486
Datum
487
textregexeq(PG_FUNCTION_ARGS)
488
0
{
489
0
  text     *s = PG_GETARG_TEXT_PP(0);
490
0
  text     *p = PG_GETARG_TEXT_PP(1);
491
492
0
  PG_RETURN_BOOL(RE_compile_and_execute(p,
493
0
                      VARDATA_ANY(s),
494
0
                      VARSIZE_ANY_EXHDR(s),
495
0
                      REG_ADVANCED,
496
0
                      PG_GET_COLLATION(),
497
0
                      0, NULL));
498
0
}
499
500
Datum
501
textregexne(PG_FUNCTION_ARGS)
502
0
{
503
0
  text     *s = PG_GETARG_TEXT_PP(0);
504
0
  text     *p = PG_GETARG_TEXT_PP(1);
505
506
0
  PG_RETURN_BOOL(!RE_compile_and_execute(p,
507
0
                       VARDATA_ANY(s),
508
0
                       VARSIZE_ANY_EXHDR(s),
509
0
                       REG_ADVANCED,
510
0
                       PG_GET_COLLATION(),
511
0
                       0, NULL));
512
0
}
513
514
515
/*
516
 *  routines that use the regexp stuff, but ignore the case.
517
 *  for this, we use the REG_ICASE flag to pg_regcomp
518
 */
519
520
521
Datum
522
nameicregexeq(PG_FUNCTION_ARGS)
523
0
{
524
0
  Name    n = PG_GETARG_NAME(0);
525
0
  text     *p = PG_GETARG_TEXT_PP(1);
526
527
0
  PG_RETURN_BOOL(RE_compile_and_execute(p,
528
0
                      NameStr(*n),
529
0
                      strlen(NameStr(*n)),
530
0
                      REG_ADVANCED | REG_ICASE,
531
0
                      PG_GET_COLLATION(),
532
0
                      0, NULL));
533
0
}
534
535
Datum
536
nameicregexne(PG_FUNCTION_ARGS)
537
0
{
538
0
  Name    n = PG_GETARG_NAME(0);
539
0
  text     *p = PG_GETARG_TEXT_PP(1);
540
541
0
  PG_RETURN_BOOL(!RE_compile_and_execute(p,
542
0
                       NameStr(*n),
543
0
                       strlen(NameStr(*n)),
544
0
                       REG_ADVANCED | REG_ICASE,
545
0
                       PG_GET_COLLATION(),
546
0
                       0, NULL));
547
0
}
548
549
Datum
550
texticregexeq(PG_FUNCTION_ARGS)
551
0
{
552
0
  text     *s = PG_GETARG_TEXT_PP(0);
553
0
  text     *p = PG_GETARG_TEXT_PP(1);
554
555
0
  PG_RETURN_BOOL(RE_compile_and_execute(p,
556
0
                      VARDATA_ANY(s),
557
0
                      VARSIZE_ANY_EXHDR(s),
558
0
                      REG_ADVANCED | REG_ICASE,
559
0
                      PG_GET_COLLATION(),
560
0
                      0, NULL));
561
0
}
562
563
Datum
564
texticregexne(PG_FUNCTION_ARGS)
565
0
{
566
0
  text     *s = PG_GETARG_TEXT_PP(0);
567
0
  text     *p = PG_GETARG_TEXT_PP(1);
568
569
0
  PG_RETURN_BOOL(!RE_compile_and_execute(p,
570
0
                       VARDATA_ANY(s),
571
0
                       VARSIZE_ANY_EXHDR(s),
572
0
                       REG_ADVANCED | REG_ICASE,
573
0
                       PG_GET_COLLATION(),
574
0
                       0, NULL));
575
0
}
576
577
578
/*
579
 * textregexsubstr()
580
 *    Return a substring matched by a regular expression.
581
 */
582
Datum
583
textregexsubstr(PG_FUNCTION_ARGS)
584
0
{
585
0
  text     *s = PG_GETARG_TEXT_PP(0);
586
0
  text     *p = PG_GETARG_TEXT_PP(1);
587
0
  regex_t    *re;
588
0
  regmatch_t  pmatch[2];
589
0
  int     so,
590
0
        eo;
591
592
  /* Compile RE */
593
0
  re = RE_compile_and_cache(p, REG_ADVANCED, PG_GET_COLLATION());
594
595
  /*
596
   * We pass two regmatch_t structs to get info about the overall match and
597
   * the match for the first parenthesized subexpression (if any). If there
598
   * is a parenthesized subexpression, we return what it matched; else
599
   * return what the whole regexp matched.
600
   */
601
0
  if (!RE_execute(re,
602
0
          VARDATA_ANY(s), VARSIZE_ANY_EXHDR(s),
603
0
          2, pmatch))
604
0
    PG_RETURN_NULL();   /* definitely no match */
605
606
0
  if (re->re_nsub > 0)
607
0
  {
608
    /* has parenthesized subexpressions, use the first one */
609
0
    so = pmatch[1].rm_so;
610
0
    eo = pmatch[1].rm_eo;
611
0
  }
612
0
  else
613
0
  {
614
    /* no parenthesized subexpression, use whole match */
615
0
    so = pmatch[0].rm_so;
616
0
    eo = pmatch[0].rm_eo;
617
0
  }
618
619
  /*
620
   * It is possible to have a match to the whole pattern but no match for a
621
   * subexpression; for example 'foo(bar)?' is considered to match 'foo' but
622
   * there is no subexpression match.  So this extra test for match failure
623
   * is not redundant.
624
   */
625
0
  if (so < 0 || eo < 0)
626
0
    PG_RETURN_NULL();
627
628
0
  return DirectFunctionCall3(text_substr,
629
0
                 PointerGetDatum(s),
630
0
                 Int32GetDatum(so + 1),
631
0
                 Int32GetDatum(eo - so));
632
0
}
633
634
/*
635
 * textregexreplace_noopt()
636
 *    Return a string matched by a regular expression, with replacement.
637
 *
638
 * This version doesn't have an option argument: we default to case
639
 * sensitive match, replace the first instance only.
640
 */
641
Datum
642
textregexreplace_noopt(PG_FUNCTION_ARGS)
643
0
{
644
0
  text     *s = PG_GETARG_TEXT_PP(0);
645
0
  text     *p = PG_GETARG_TEXT_PP(1);
646
0
  text     *r = PG_GETARG_TEXT_PP(2);
647
648
0
  PG_RETURN_TEXT_P(replace_text_regexp(s, p, r,
649
0
                     REG_ADVANCED, PG_GET_COLLATION(),
650
0
                     0, 1));
651
0
}
652
653
/*
654
 * textregexreplace()
655
 *    Return a string matched by a regular expression, with replacement.
656
 */
657
Datum
658
textregexreplace(PG_FUNCTION_ARGS)
659
0
{
660
0
  text     *s = PG_GETARG_TEXT_PP(0);
661
0
  text     *p = PG_GETARG_TEXT_PP(1);
662
0
  text     *r = PG_GETARG_TEXT_PP(2);
663
0
  text     *opt = PG_GETARG_TEXT_PP(3);
664
0
  pg_re_flags flags;
665
666
  /*
667
   * regexp_replace() with four arguments will be preferentially resolved as
668
   * this form when the fourth argument is of type UNKNOWN.  However, the
669
   * user might have intended to call textregexreplace_extended_no_n.  If we
670
   * see flags that look like an integer, emit the same error that
671
   * parse_re_flags would, but add a HINT about how to fix it.
672
   */
673
0
  if (VARSIZE_ANY_EXHDR(opt) > 0)
674
0
  {
675
0
    char     *opt_p = VARDATA_ANY(opt);
676
677
0
    if (*opt_p >= '0' && *opt_p <= '9')
678
0
      ereport(ERROR,
679
0
          (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
680
0
           errmsg("invalid regular expression option: \"%.*s\"",
681
0
              pg_mblen(opt_p), opt_p),
682
0
           errhint("If you meant to use regexp_replace() with a start parameter, cast the fourth argument to integer explicitly.")));
683
0
  }
684
685
0
  parse_re_flags(&flags, opt);
686
687
0
  PG_RETURN_TEXT_P(replace_text_regexp(s, p, r,
688
0
                     flags.cflags, PG_GET_COLLATION(),
689
0
                     0, flags.glob ? 0 : 1));
690
0
}
691
692
/*
693
 * textregexreplace_extended()
694
 *    Return a string matched by a regular expression, with replacement.
695
 *    Extends textregexreplace by allowing a start position and the
696
 *    choice of the occurrence to replace (0 means all occurrences).
697
 */
698
Datum
699
textregexreplace_extended(PG_FUNCTION_ARGS)
700
0
{
701
0
  text     *s = PG_GETARG_TEXT_PP(0);
702
0
  text     *p = PG_GETARG_TEXT_PP(1);
703
0
  text     *r = PG_GETARG_TEXT_PP(2);
704
0
  int     start = 1;
705
0
  int     n = 1;
706
0
  text     *flags = PG_GETARG_TEXT_PP_IF_EXISTS(5);
707
0
  pg_re_flags re_flags;
708
709
  /* Collect optional parameters */
710
0
  if (PG_NARGS() > 3)
711
0
  {
712
0
    start = PG_GETARG_INT32(3);
713
0
    if (start <= 0)
714
0
      ereport(ERROR,
715
0
          (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
716
0
           errmsg("invalid value for parameter \"%s\": %d",
717
0
              "start", start)));
718
0
  }
719
0
  if (PG_NARGS() > 4)
720
0
  {
721
0
    n = PG_GETARG_INT32(4);
722
0
    if (n < 0)
723
0
      ereport(ERROR,
724
0
          (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
725
0
           errmsg("invalid value for parameter \"%s\": %d",
726
0
              "n", n)));
727
0
  }
728
729
  /* Determine options */
730
0
  parse_re_flags(&re_flags, flags);
731
732
  /* If N was not specified, deduce it from the 'g' flag */
733
0
  if (PG_NARGS() <= 4)
734
0
    n = re_flags.glob ? 0 : 1;
735
736
  /* Do the replacement(s) */
737
0
  PG_RETURN_TEXT_P(replace_text_regexp(s, p, r,
738
0
                     re_flags.cflags, PG_GET_COLLATION(),
739
0
                     start - 1, n));
740
0
}
741
742
/* This is separate to keep the opr_sanity regression test from complaining */
743
Datum
744
textregexreplace_extended_no_n(PG_FUNCTION_ARGS)
745
0
{
746
0
  return textregexreplace_extended(fcinfo);
747
0
}
748
749
/* This is separate to keep the opr_sanity regression test from complaining */
750
Datum
751
textregexreplace_extended_no_flags(PG_FUNCTION_ARGS)
752
0
{
753
0
  return textregexreplace_extended(fcinfo);
754
0
}
755
756
/*
757
 * similar_to_escape(), similar_escape()
758
 *
759
 * Convert a SQL "SIMILAR TO" regexp pattern to POSIX style, so it can be
760
 * used by our regexp engine.
761
 *
762
 * similar_escape_internal() is the common workhorse for three SQL-exposed
763
 * functions.  esc_text can be passed as NULL to select the default escape
764
 * (which is '\'), or as an empty string to select no escape character.
765
 */
766
static text *
767
similar_escape_internal(text *pat_text, text *esc_text)
768
0
{
769
0
  text     *result;
770
0
  char     *p,
771
0
         *e,
772
0
         *r;
773
0
  int     plen,
774
0
        elen;
775
0
  bool    afterescape = false;
776
0
  int     nquotes = 0;
777
0
  int     charclass_depth = 0;  /* Nesting level of character classes,
778
                     * encompassed by square brackets */
779
0
  int     charclass_start = 0;  /* State of the character class start,
780
                     * for carets */
781
782
0
  p = VARDATA_ANY(pat_text);
783
0
  plen = VARSIZE_ANY_EXHDR(pat_text);
784
0
  if (esc_text == NULL)
785
0
  {
786
    /* No ESCAPE clause provided; default to backslash as escape */
787
0
    e = "\\";
788
0
    elen = 1;
789
0
  }
790
0
  else
791
0
  {
792
0
    e = VARDATA_ANY(esc_text);
793
0
    elen = VARSIZE_ANY_EXHDR(esc_text);
794
0
    if (elen == 0)
795
0
      e = NULL;     /* no escape character */
796
0
    else if (elen > 1)
797
0
    {
798
0
      int     escape_mblen = pg_mbstrlen_with_len(e, elen);
799
800
0
      if (escape_mblen > 1)
801
0
        ereport(ERROR,
802
0
            (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
803
0
             errmsg("invalid escape string"),
804
0
             errhint("Escape string must be empty or one character.")));
805
0
    }
806
0
  }
807
808
  /*----------
809
   * We surround the transformed input string with
810
   *      ^(?: ... )$
811
   * which requires some explanation.  We need "^" and "$" to force
812
   * the pattern to match the entire input string as per the SQL spec.
813
   * The "(?:" and ")" are a non-capturing set of parens; we have to have
814
   * parens in case the string contains "|", else the "^" and "$" will
815
   * be bound into the first and last alternatives which is not what we
816
   * want, and the parens must be non capturing because we don't want them
817
   * to count when selecting output for SUBSTRING.
818
   *
819
   * When the pattern is divided into three parts by escape-double-quotes,
820
   * what we emit is
821
   *      ^(?:part1){1,1}?(part2){1,1}(?:part3)$
822
   * which requires even more explanation.  The "{1,1}?" on part1 makes it
823
   * non-greedy so that it will match the smallest possible amount of text
824
   * not the largest, as required by SQL.  The plain parens around part2
825
   * are capturing parens so that that part is what controls the result of
826
   * SUBSTRING.  The "{1,1}" forces part2 to be greedy, so that it matches
827
   * the largest possible amount of text; hence part3 must match the
828
   * smallest amount of text, as required by SQL.  We don't need an explicit
829
   * greediness marker on part3.  Note that this also confines the effects
830
   * of any "|" characters to the respective part, which is what we want.
831
   *
832
   * The SQL spec says that SUBSTRING's pattern must contain exactly two
833
   * escape-double-quotes, but we only complain if there's more than two.
834
   * With none, we act as though part1 and part3 are empty; with one, we
835
   * act as though part3 is empty.  Both behaviors fall out of omitting
836
   * the relevant part separators in the above expansion.  If the result
837
   * of this function is used in a plain regexp match (SIMILAR TO), the
838
   * escape-double-quotes have no effect on the match behavior.
839
   *----------
840
   */
841
842
  /*
843
   * We need room for the prefix/postfix and part separators, plus as many
844
   * as 3 output bytes per input byte; since the input is at most 1GB this
845
   * can't overflow size_t.
846
   */
847
0
  result = (text *) palloc(VARHDRSZ + 23 + 3 * (size_t) plen);
848
0
  r = VARDATA(result);
849
850
0
  *r++ = '^';
851
0
  *r++ = '(';
852
0
  *r++ = '?';
853
0
  *r++ = ':';
854
855
0
  while (plen > 0)
856
0
  {
857
0
    char    pchar = *p;
858
859
    /*
860
     * If both the escape character and the current character from the
861
     * pattern are multi-byte, we need to take the slow path.
862
     *
863
     * But if one of them is single-byte, we can process the pattern one
864
     * byte at a time, ignoring multi-byte characters.  (This works
865
     * because all server-encodings have the property that a valid
866
     * multi-byte character representation cannot contain the
867
     * representation of a valid single-byte character.)
868
     */
869
870
0
    if (elen > 1)
871
0
    {
872
0
      int     mblen = pg_mblen(p);
873
874
0
      if (mblen > 1)
875
0
      {
876
        /* slow, multi-byte path */
877
0
        if (afterescape)
878
0
        {
879
0
          *r++ = '\\';
880
0
          memcpy(r, p, mblen);
881
0
          r += mblen;
882
0
          afterescape = false;
883
0
        }
884
0
        else if (e && elen == mblen && memcmp(e, p, mblen) == 0)
885
0
        {
886
          /* SQL escape character; do not send to output */
887
0
          afterescape = true;
888
0
        }
889
0
        else
890
0
        {
891
          /*
892
           * We know it's a multi-byte character, so we don't need
893
           * to do all the comparisons to single-byte characters
894
           * that we do below.
895
           */
896
0
          memcpy(r, p, mblen);
897
0
          r += mblen;
898
0
        }
899
900
0
        p += mblen;
901
0
        plen -= mblen;
902
903
0
        continue;
904
0
      }
905
0
    }
906
907
    /* fast path */
908
0
    if (afterescape)
909
0
    {
910
0
      if (pchar == '"' && charclass_depth < 1) /* escape-double-quote? */
911
0
      {
912
        /* emit appropriate part separator, per notes above */
913
0
        if (nquotes == 0)
914
0
        {
915
0
          *r++ = ')';
916
0
          *r++ = '{';
917
0
          *r++ = '1';
918
0
          *r++ = ',';
919
0
          *r++ = '1';
920
0
          *r++ = '}';
921
0
          *r++ = '?';
922
0
          *r++ = '(';
923
0
        }
924
0
        else if (nquotes == 1)
925
0
        {
926
0
          *r++ = ')';
927
0
          *r++ = '{';
928
0
          *r++ = '1';
929
0
          *r++ = ',';
930
0
          *r++ = '1';
931
0
          *r++ = '}';
932
0
          *r++ = '(';
933
0
          *r++ = '?';
934
0
          *r++ = ':';
935
0
        }
936
0
        else
937
0
          ereport(ERROR,
938
0
              (errcode(ERRCODE_INVALID_USE_OF_ESCAPE_CHARACTER),
939
0
               errmsg("SQL regular expression may not contain more than two escape-double-quote separators")));
940
0
        nquotes++;
941
0
      }
942
0
      else
943
0
      {
944
        /*
945
         * We allow any character at all to be escaped; notably, this
946
         * allows access to POSIX character-class escapes such as
947
         * "\d".  The SQL spec is considerably more restrictive.
948
         */
949
0
        *r++ = '\\';
950
0
        *r++ = pchar;
951
0
      }
952
0
      afterescape = false;
953
0
    }
954
0
    else if (e && pchar == *e)
955
0
    {
956
      /* SQL escape character; do not send to output */
957
0
      afterescape = true;
958
0
    }
959
0
    else if (charclass_depth > 0)
960
0
    {
961
0
      if (pchar == '\\')
962
0
        *r++ = '\\';
963
0
      *r++ = pchar;
964
965
      /*
966
       * Ignore a closing bracket at the start of a character class.
967
       * Such a bracket is taken literally rather than closing the
968
       * class.  "charclass_start" is 1 right at the beginning of a
969
       * class and 2 after an initial caret.
970
       */
971
0
      if (pchar == ']' && charclass_start > 2)
972
0
        charclass_depth--;
973
0
      else if (pchar == '[')
974
0
        charclass_depth++;
975
976
      /*
977
       * If there is a caret right after the opening bracket, it negates
978
       * the character class, but a following closing bracket should
979
       * still be treated as a normal character.  That holds only for
980
       * the first caret, so only the values 1 and 2 mean that closing
981
       * brackets should be taken literally.
982
       */
983
0
      if (pchar == '^')
984
0
        charclass_start++;
985
0
      else
986
0
        charclass_start = 3; /* definitely past the start */
987
0
    }
988
0
    else if (pchar == '[')
989
0
    {
990
      /* start of a character class */
991
0
      *r++ = pchar;
992
0
      charclass_depth++;
993
0
      charclass_start = 1;
994
0
    }
995
0
    else if (pchar == '%')
996
0
    {
997
0
      *r++ = '.';
998
0
      *r++ = '*';
999
0
    }
1000
0
    else if (pchar == '_')
1001
0
      *r++ = '.';
1002
0
    else if (pchar == '(')
1003
0
    {
1004
      /* convert to non-capturing parenthesis */
1005
0
      *r++ = '(';
1006
0
      *r++ = '?';
1007
0
      *r++ = ':';
1008
0
    }
1009
0
    else if (pchar == '\\' || pchar == '.' ||
1010
0
         pchar == '^' || pchar == '$')
1011
0
    {
1012
0
      *r++ = '\\';
1013
0
      *r++ = pchar;
1014
0
    }
1015
0
    else
1016
0
      *r++ = pchar;
1017
0
    p++, plen--;
1018
0
  }
1019
1020
0
  *r++ = ')';
1021
0
  *r++ = '$';
1022
1023
0
  SET_VARSIZE(result, r - ((char *) result));
1024
1025
0
  return result;
1026
0
}
1027
1028
/*
1029
 * similar_to_escape(pattern, escape)
1030
 */
1031
Datum
1032
similar_to_escape_2(PG_FUNCTION_ARGS)
1033
0
{
1034
0
  text     *pat_text = PG_GETARG_TEXT_PP(0);
1035
0
  text     *esc_text = PG_GETARG_TEXT_PP(1);
1036
0
  text     *result;
1037
1038
0
  result = similar_escape_internal(pat_text, esc_text);
1039
1040
0
  PG_RETURN_TEXT_P(result);
1041
0
}
1042
1043
/*
1044
 * similar_to_escape(pattern)
1045
 * Inserts a default escape character.
1046
 */
1047
Datum
1048
similar_to_escape_1(PG_FUNCTION_ARGS)
1049
0
{
1050
0
  text     *pat_text = PG_GETARG_TEXT_PP(0);
1051
0
  text     *result;
1052
1053
0
  result = similar_escape_internal(pat_text, NULL);
1054
1055
0
  PG_RETURN_TEXT_P(result);
1056
0
}
1057
1058
/*
1059
 * similar_escape(pattern, escape)
1060
 *
1061
 * Legacy function for compatibility with views stored using the
1062
 * pre-v13 expansion of SIMILAR TO.  Unlike the above functions, this
1063
 * is non-strict, which leads to not-per-spec handling of "ESCAPE NULL".
1064
 */
1065
Datum
1066
similar_escape(PG_FUNCTION_ARGS)
1067
0
{
1068
0
  text     *pat_text;
1069
0
  text     *esc_text;
1070
0
  text     *result;
1071
1072
  /* This function is not strict, so must test explicitly */
1073
0
  if (PG_ARGISNULL(0))
1074
0
    PG_RETURN_NULL();
1075
0
  pat_text = PG_GETARG_TEXT_PP(0);
1076
1077
0
  if (PG_ARGISNULL(1))
1078
0
    esc_text = NULL;   /* use default escape character */
1079
0
  else
1080
0
    esc_text = PG_GETARG_TEXT_PP(1);
1081
1082
0
  result = similar_escape_internal(pat_text, esc_text);
1083
1084
0
  PG_RETURN_TEXT_P(result);
1085
0
}
1086
1087
/*
1088
 * regexp_count()
1089
 *    Return the number of matches of a pattern within a string.
1090
 */
1091
Datum
1092
regexp_count(PG_FUNCTION_ARGS)
1093
0
{
1094
0
  text     *str = PG_GETARG_TEXT_PP(0);
1095
0
  text     *pattern = PG_GETARG_TEXT_PP(1);
1096
0
  int     start = 1;
1097
0
  text     *flags = PG_GETARG_TEXT_PP_IF_EXISTS(3);
1098
0
  pg_re_flags re_flags;
1099
0
  regexp_matches_ctx *matchctx;
1100
1101
  /* Collect optional parameters */
1102
0
  if (PG_NARGS() > 2)
1103
0
  {
1104
0
    start = PG_GETARG_INT32(2);
1105
0
    if (start <= 0)
1106
0
      ereport(ERROR,
1107
0
          (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1108
0
           errmsg("invalid value for parameter \"%s\": %d",
1109
0
              "start", start)));
1110
0
  }
1111
1112
  /* Determine options */
1113
0
  parse_re_flags(&re_flags, flags);
1114
  /* User mustn't specify 'g' */
1115
0
  if (re_flags.glob)
1116
0
    ereport(ERROR,
1117
0
        (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1118
    /* translator: %s is a SQL function name */
1119
0
         errmsg("%s does not support the \"global\" option",
1120
0
            "regexp_count()")));
1121
  /* But we find all the matches anyway */
1122
0
  re_flags.glob = true;
1123
1124
  /* Do the matching */
1125
0
  matchctx = setup_regexp_matches(str, pattern, &re_flags, start - 1,
1126
0
                  PG_GET_COLLATION(),
1127
0
                  false,  /* can ignore subexprs */
1128
0
                  false, false);
1129
1130
0
  PG_RETURN_INT32(matchctx->nmatches);
1131
0
}
1132
1133
/* This is separate to keep the opr_sanity regression test from complaining */
1134
Datum
1135
regexp_count_no_start(PG_FUNCTION_ARGS)
1136
0
{
1137
0
  return regexp_count(fcinfo);
1138
0
}
1139
1140
/* This is separate to keep the opr_sanity regression test from complaining */
1141
Datum
1142
regexp_count_no_flags(PG_FUNCTION_ARGS)
1143
0
{
1144
0
  return regexp_count(fcinfo);
1145
0
}
1146
1147
/*
1148
 * regexp_instr()
1149
 *    Return the match's position within the string
1150
 */
1151
Datum
1152
regexp_instr(PG_FUNCTION_ARGS)
1153
0
{
1154
0
  text     *str = PG_GETARG_TEXT_PP(0);
1155
0
  text     *pattern = PG_GETARG_TEXT_PP(1);
1156
0
  int     start = 1;
1157
0
  int     n = 1;
1158
0
  int     endoption = 0;
1159
0
  text     *flags = PG_GETARG_TEXT_PP_IF_EXISTS(5);
1160
0
  int     subexpr = 0;
1161
0
  int     pos;
1162
0
  pg_re_flags re_flags;
1163
0
  regexp_matches_ctx *matchctx;
1164
1165
  /* Collect optional parameters */
1166
0
  if (PG_NARGS() > 2)
1167
0
  {
1168
0
    start = PG_GETARG_INT32(2);
1169
0
    if (start <= 0)
1170
0
      ereport(ERROR,
1171
0
          (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1172
0
           errmsg("invalid value for parameter \"%s\": %d",
1173
0
              "start", start)));
1174
0
  }
1175
0
  if (PG_NARGS() > 3)
1176
0
  {
1177
0
    n = PG_GETARG_INT32(3);
1178
0
    if (n <= 0)
1179
0
      ereport(ERROR,
1180
0
          (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1181
0
           errmsg("invalid value for parameter \"%s\": %d",
1182
0
              "n", n)));
1183
0
  }
1184
0
  if (PG_NARGS() > 4)
1185
0
  {
1186
0
    endoption = PG_GETARG_INT32(4);
1187
0
    if (endoption != 0 && endoption != 1)
1188
0
      ereport(ERROR,
1189
0
          (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1190
0
           errmsg("invalid value for parameter \"%s\": %d",
1191
0
              "endoption", endoption)));
1192
0
  }
1193
0
  if (PG_NARGS() > 6)
1194
0
  {
1195
0
    subexpr = PG_GETARG_INT32(6);
1196
0
    if (subexpr < 0)
1197
0
      ereport(ERROR,
1198
0
          (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1199
0
           errmsg("invalid value for parameter \"%s\": %d",
1200
0
              "subexpr", subexpr)));
1201
0
  }
1202
1203
  /* Determine options */
1204
0
  parse_re_flags(&re_flags, flags);
1205
  /* User mustn't specify 'g' */
1206
0
  if (re_flags.glob)
1207
0
    ereport(ERROR,
1208
0
        (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1209
    /* translator: %s is a SQL function name */
1210
0
         errmsg("%s does not support the \"global\" option",
1211
0
            "regexp_instr()")));
1212
  /* But we find all the matches anyway */
1213
0
  re_flags.glob = true;
1214
1215
  /* Do the matching */
1216
0
  matchctx = setup_regexp_matches(str, pattern, &re_flags, start - 1,
1217
0
                  PG_GET_COLLATION(),
1218
0
                  (subexpr > 0),  /* need submatches? */
1219
0
                  false, false);
1220
1221
  /* When n exceeds matches return 0 (includes case of no matches) */
1222
0
  if (n > matchctx->nmatches)
1223
0
    PG_RETURN_INT32(0);
1224
1225
  /* When subexpr exceeds number of subexpressions return 0 */
1226
0
  if (subexpr > matchctx->npatterns)
1227
0
    PG_RETURN_INT32(0);
1228
1229
  /* Select the appropriate match position to return */
1230
0
  pos = (n - 1) * matchctx->npatterns;
1231
0
  if (subexpr > 0)
1232
0
    pos += subexpr - 1;
1233
0
  pos *= 2;
1234
0
  if (endoption == 1)
1235
0
    pos += 1;
1236
1237
0
  if (matchctx->match_locs[pos] >= 0)
1238
0
    PG_RETURN_INT32(matchctx->match_locs[pos] + 1);
1239
0
  else
1240
0
    PG_RETURN_INT32(0);    /* position not identifiable */
1241
0
}
1242
1243
/* This is separate to keep the opr_sanity regression test from complaining */
1244
Datum
1245
regexp_instr_no_start(PG_FUNCTION_ARGS)
1246
0
{
1247
0
  return regexp_instr(fcinfo);
1248
0
}
1249
1250
/* This is separate to keep the opr_sanity regression test from complaining */
1251
Datum
1252
regexp_instr_no_n(PG_FUNCTION_ARGS)
1253
0
{
1254
0
  return regexp_instr(fcinfo);
1255
0
}
1256
1257
/* This is separate to keep the opr_sanity regression test from complaining */
1258
Datum
1259
regexp_instr_no_endoption(PG_FUNCTION_ARGS)
1260
0
{
1261
0
  return regexp_instr(fcinfo);
1262
0
}
1263
1264
/* This is separate to keep the opr_sanity regression test from complaining */
1265
Datum
1266
regexp_instr_no_flags(PG_FUNCTION_ARGS)
1267
0
{
1268
0
  return regexp_instr(fcinfo);
1269
0
}
1270
1271
/* This is separate to keep the opr_sanity regression test from complaining */
1272
Datum
1273
regexp_instr_no_subexpr(PG_FUNCTION_ARGS)
1274
0
{
1275
0
  return regexp_instr(fcinfo);
1276
0
}
1277
1278
/*
1279
 * regexp_like()
1280
 *    Test for a pattern match within a string.
1281
 */
1282
Datum
1283
regexp_like(PG_FUNCTION_ARGS)
1284
0
{
1285
0
  text     *str = PG_GETARG_TEXT_PP(0);
1286
0
  text     *pattern = PG_GETARG_TEXT_PP(1);
1287
0
  text     *flags = PG_GETARG_TEXT_PP_IF_EXISTS(2);
1288
0
  pg_re_flags re_flags;
1289
1290
  /* Determine options */
1291
0
  parse_re_flags(&re_flags, flags);
1292
  /* User mustn't specify 'g' */
1293
0
  if (re_flags.glob)
1294
0
    ereport(ERROR,
1295
0
        (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1296
    /* translator: %s is a SQL function name */
1297
0
         errmsg("%s does not support the \"global\" option",
1298
0
            "regexp_like()")));
1299
1300
  /* Otherwise it's like textregexeq/texticregexeq */
1301
0
  PG_RETURN_BOOL(RE_compile_and_execute(pattern,
1302
0
                      VARDATA_ANY(str),
1303
0
                      VARSIZE_ANY_EXHDR(str),
1304
0
                      re_flags.cflags,
1305
0
                      PG_GET_COLLATION(),
1306
0
                      0, NULL));
1307
0
}
1308
1309
/* This is separate to keep the opr_sanity regression test from complaining */
1310
Datum
1311
regexp_like_no_flags(PG_FUNCTION_ARGS)
1312
0
{
1313
0
  return regexp_like(fcinfo);
1314
0
}
1315
1316
/*
1317
 * regexp_match()
1318
 *    Return the first substring(s) matching a pattern within a string.
1319
 */
1320
Datum
1321
regexp_match(PG_FUNCTION_ARGS)
1322
0
{
1323
0
  text     *orig_str = PG_GETARG_TEXT_PP(0);
1324
0
  text     *pattern = PG_GETARG_TEXT_PP(1);
1325
0
  text     *flags = PG_GETARG_TEXT_PP_IF_EXISTS(2);
1326
0
  pg_re_flags re_flags;
1327
0
  regexp_matches_ctx *matchctx;
1328
1329
  /* Determine options */
1330
0
  parse_re_flags(&re_flags, flags);
1331
  /* User mustn't specify 'g' */
1332
0
  if (re_flags.glob)
1333
0
    ereport(ERROR,
1334
0
        (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1335
    /* translator: %s is a SQL function name */
1336
0
         errmsg("%s does not support the \"global\" option",
1337
0
            "regexp_match()"),
1338
0
         errhint("Use the regexp_matches function instead.")));
1339
1340
0
  matchctx = setup_regexp_matches(orig_str, pattern, &re_flags, 0,
1341
0
                  PG_GET_COLLATION(), true, false, false);
1342
1343
0
  if (matchctx->nmatches == 0)
1344
0
    PG_RETURN_NULL();
1345
1346
0
  Assert(matchctx->nmatches == 1);
1347
1348
  /* Create workspace that build_regexp_match_result needs */
1349
0
  matchctx->elems = (Datum *) palloc(sizeof(Datum) * matchctx->npatterns);
1350
0
  matchctx->nulls = (bool *) palloc(sizeof(bool) * matchctx->npatterns);
1351
1352
0
  PG_RETURN_DATUM(PointerGetDatum(build_regexp_match_result(matchctx)));
1353
0
}
1354
1355
/* This is separate to keep the opr_sanity regression test from complaining */
1356
Datum
1357
regexp_match_no_flags(PG_FUNCTION_ARGS)
1358
0
{
1359
0
  return regexp_match(fcinfo);
1360
0
}
1361
1362
/*
1363
 * regexp_matches()
1364
 *    Return a table of all matches of a pattern within a string.
1365
 */
1366
Datum
1367
regexp_matches(PG_FUNCTION_ARGS)
1368
0
{
1369
0
  FuncCallContext *funcctx;
1370
0
  regexp_matches_ctx *matchctx;
1371
1372
0
  if (SRF_IS_FIRSTCALL())
1373
0
  {
1374
0
    text     *pattern = PG_GETARG_TEXT_PP(1);
1375
0
    text     *flags = PG_GETARG_TEXT_PP_IF_EXISTS(2);
1376
0
    pg_re_flags re_flags;
1377
0
    MemoryContext oldcontext;
1378
1379
0
    funcctx = SRF_FIRSTCALL_INIT();
1380
0
    oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
1381
1382
    /* Determine options */
1383
0
    parse_re_flags(&re_flags, flags);
1384
1385
    /* be sure to copy the input string into the multi-call ctx */
1386
0
    matchctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
1387
0
                    &re_flags, 0,
1388
0
                    PG_GET_COLLATION(),
1389
0
                    true, false, false);
1390
1391
    /* Pre-create workspace that build_regexp_match_result needs */
1392
0
    matchctx->elems = (Datum *) palloc(sizeof(Datum) * matchctx->npatterns);
1393
0
    matchctx->nulls = (bool *) palloc(sizeof(bool) * matchctx->npatterns);
1394
1395
0
    MemoryContextSwitchTo(oldcontext);
1396
0
    funcctx->user_fctx = matchctx;
1397
0
  }
1398
1399
0
  funcctx = SRF_PERCALL_SETUP();
1400
0
  matchctx = (regexp_matches_ctx *) funcctx->user_fctx;
1401
1402
0
  if (matchctx->next_match < matchctx->nmatches)
1403
0
  {
1404
0
    ArrayType  *result_ary;
1405
1406
0
    result_ary = build_regexp_match_result(matchctx);
1407
0
    matchctx->next_match++;
1408
0
    SRF_RETURN_NEXT(funcctx, PointerGetDatum(result_ary));
1409
0
  }
1410
1411
0
  SRF_RETURN_DONE(funcctx);
1412
0
}
1413
1414
/* This is separate to keep the opr_sanity regression test from complaining */
1415
Datum
1416
regexp_matches_no_flags(PG_FUNCTION_ARGS)
1417
0
{
1418
0
  return regexp_matches(fcinfo);
1419
0
}
1420
1421
/*
1422
 * setup_regexp_matches --- do the initial matching for regexp_match,
1423
 *    regexp_split, and related functions
1424
 *
1425
 * To avoid having to re-find the compiled pattern on each call, we do
1426
 * all the matching in one swoop.  The returned regexp_matches_ctx contains
1427
 * the locations of all the substrings matching the pattern.
1428
 *
1429
 * start_search: the character (not byte) offset in orig_str at which to
1430
 * begin the search.  Returned positions are relative to orig_str anyway.
1431
 * use_subpatterns: collect data about matches to parenthesized subexpressions.
1432
 * ignore_degenerate: ignore zero-length matches.
1433
 * fetching_unmatched: caller wants to fetch unmatched substrings.
1434
 *
1435
 * We don't currently assume that fetching_unmatched is exclusive of fetching
1436
 * the matched text too; if it's set, the conversion buffer is large enough to
1437
 * fetch any single matched or unmatched string, but not any larger
1438
 * substring.  (In practice, when splitting the matches are usually small
1439
 * anyway, and it didn't seem worth complicating the code further.)
1440
 */
1441
static regexp_matches_ctx *
1442
setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
1443
           int start_search,
1444
           Oid collation,
1445
           bool use_subpatterns,
1446
           bool ignore_degenerate,
1447
           bool fetching_unmatched)
1448
0
{
1449
0
  regexp_matches_ctx *matchctx = palloc0(sizeof(regexp_matches_ctx));
1450
0
  int     eml = pg_database_encoding_max_length();
1451
0
  int     orig_len;
1452
0
  pg_wchar   *wide_str;
1453
0
  int     wide_len;
1454
0
  int     cflags;
1455
0
  regex_t    *cpattern;
1456
0
  regmatch_t *pmatch;
1457
0
  int     pmatch_len;
1458
0
  int     array_len;
1459
0
  int     array_idx;
1460
0
  int     prev_match_end;
1461
0
  int     prev_valid_match_end;
1462
0
  int     maxlen = 0;   /* largest fetch length in characters */
1463
1464
  /* save original string --- we'll extract result substrings from it */
1465
0
  matchctx->orig_str = orig_str;
1466
1467
  /* convert string to pg_wchar form for matching */
1468
0
  orig_len = VARSIZE_ANY_EXHDR(orig_str);
1469
0
  wide_str = (pg_wchar *) palloc(sizeof(pg_wchar) * (orig_len + 1));
1470
0
  wide_len = pg_mb2wchar_with_len(VARDATA_ANY(orig_str), wide_str, orig_len);
1471
1472
  /* set up the compiled pattern */
1473
0
  cflags = re_flags->cflags;
1474
0
  if (!use_subpatterns)
1475
0
    cflags |= REG_NOSUB;
1476
0
  cpattern = RE_compile_and_cache(pattern, cflags, collation);
1477
1478
  /* do we want to remember subpatterns? */
1479
0
  if (use_subpatterns && cpattern->re_nsub > 0)
1480
0
  {
1481
0
    matchctx->npatterns = cpattern->re_nsub;
1482
0
    pmatch_len = cpattern->re_nsub + 1;
1483
0
  }
1484
0
  else
1485
0
  {
1486
0
    use_subpatterns = false;
1487
0
    matchctx->npatterns = 1;
1488
0
    pmatch_len = 1;
1489
0
  }
1490
1491
  /* temporary output space for RE package */
1492
0
  pmatch = palloc(sizeof(regmatch_t) * pmatch_len);
1493
1494
  /*
1495
   * the real output space (grown dynamically if needed)
1496
   *
1497
   * use values 2^n-1, not 2^n, so that we hit the limit at 2^28-1 rather
1498
   * than at 2^27
1499
   */
1500
0
  array_len = re_flags->glob ? 255 : 31;
1501
0
  matchctx->match_locs = (int *) palloc(sizeof(int) * array_len);
1502
0
  array_idx = 0;
1503
1504
  /* search for the pattern, perhaps repeatedly */
1505
0
  prev_match_end = 0;
1506
0
  prev_valid_match_end = 0;
1507
0
  while (RE_wchar_execute(cpattern, wide_str, wide_len, start_search,
1508
0
              pmatch_len, pmatch))
1509
0
  {
1510
    /*
1511
     * If requested, ignore degenerate matches, which are zero-length
1512
     * matches occurring at the start or end of a string or just after a
1513
     * previous match.
1514
     */
1515
0
    if (!ignore_degenerate ||
1516
0
      (pmatch[0].rm_so < wide_len &&
1517
0
       pmatch[0].rm_eo > prev_match_end))
1518
0
    {
1519
      /* enlarge output space if needed */
1520
0
      while (array_idx + matchctx->npatterns * 2 + 1 > array_len)
1521
0
      {
1522
0
        array_len += array_len + 1; /* 2^n-1 => 2^(n+1)-1 */
1523
0
        if (array_len > MaxAllocSize / sizeof(int))
1524
0
          ereport(ERROR,
1525
0
              (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1526
0
               errmsg("too many regular expression matches")));
1527
0
        matchctx->match_locs = (int *) repalloc(matchctx->match_locs,
1528
0
                            sizeof(int) * array_len);
1529
0
      }
1530
1531
      /* save this match's locations */
1532
0
      if (use_subpatterns)
1533
0
      {
1534
0
        int     i;
1535
1536
0
        for (i = 1; i <= matchctx->npatterns; i++)
1537
0
        {
1538
0
          int     so = pmatch[i].rm_so;
1539
0
          int     eo = pmatch[i].rm_eo;
1540
1541
0
          matchctx->match_locs[array_idx++] = so;
1542
0
          matchctx->match_locs[array_idx++] = eo;
1543
0
          if (so >= 0 && eo >= 0 && (eo - so) > maxlen)
1544
0
            maxlen = (eo - so);
1545
0
        }
1546
0
      }
1547
0
      else
1548
0
      {
1549
0
        int     so = pmatch[0].rm_so;
1550
0
        int     eo = pmatch[0].rm_eo;
1551
1552
0
        matchctx->match_locs[array_idx++] = so;
1553
0
        matchctx->match_locs[array_idx++] = eo;
1554
0
        if (so >= 0 && eo >= 0 && (eo - so) > maxlen)
1555
0
          maxlen = (eo - so);
1556
0
      }
1557
0
      matchctx->nmatches++;
1558
1559
      /*
1560
       * check length of unmatched portion between end of previous valid
1561
       * (nondegenerate, or degenerate but not ignored) match and start
1562
       * of current one
1563
       */
1564
0
      if (fetching_unmatched &&
1565
0
        pmatch[0].rm_so >= 0 &&
1566
0
        (pmatch[0].rm_so - prev_valid_match_end) > maxlen)
1567
0
        maxlen = (pmatch[0].rm_so - prev_valid_match_end);
1568
0
      prev_valid_match_end = pmatch[0].rm_eo;
1569
0
    }
1570
0
    prev_match_end = pmatch[0].rm_eo;
1571
1572
    /* if not glob, stop after one match */
1573
0
    if (!re_flags->glob)
1574
0
      break;
1575
1576
    /*
1577
     * Advance search position.  Normally we start the next search at the
1578
     * end of the previous match; but if the match was of zero length, we
1579
     * have to advance by one character, or we'd just find the same match
1580
     * again.
1581
     */
1582
0
    start_search = prev_match_end;
1583
0
    if (pmatch[0].rm_so == pmatch[0].rm_eo)
1584
0
      start_search++;
1585
0
    if (start_search > wide_len)
1586
0
      break;
1587
0
  }
1588
1589
  /*
1590
   * check length of unmatched portion between end of last match and end of
1591
   * input string
1592
   */
1593
0
  if (fetching_unmatched &&
1594
0
    (wide_len - prev_valid_match_end) > maxlen)
1595
0
    maxlen = (wide_len - prev_valid_match_end);
1596
1597
  /*
1598
   * Keep a note of the end position of the string for the benefit of
1599
   * splitting code.
1600
   */
1601
0
  matchctx->match_locs[array_idx] = wide_len;
1602
1603
0
  if (eml > 1)
1604
0
  {
1605
0
    int64   maxsiz = eml * (int64) maxlen;
1606
0
    int     conv_bufsiz;
1607
1608
    /*
1609
     * Make the conversion buffer large enough for any substring of
1610
     * interest.
1611
     *
1612
     * Worst case: assume we need the maximum size (maxlen*eml), but take
1613
     * advantage of the fact that the original string length in bytes is
1614
     * an upper bound on the byte length of any fetched substring (and we
1615
     * know that len+1 is safe to allocate because the varlena header is
1616
     * longer than 1 byte).
1617
     */
1618
0
    if (maxsiz > orig_len)
1619
0
      conv_bufsiz = orig_len + 1;
1620
0
    else
1621
0
      conv_bufsiz = maxsiz + 1; /* safe since maxsiz < 2^30 */
1622
1623
0
    matchctx->conv_buf = palloc(conv_bufsiz);
1624
0
    matchctx->conv_bufsiz = conv_bufsiz;
1625
0
    matchctx->wide_str = wide_str;
1626
0
  }
1627
0
  else
1628
0
  {
1629
    /* No need to keep the wide string if we're in a single-byte charset. */
1630
0
    pfree(wide_str);
1631
0
    matchctx->wide_str = NULL;
1632
0
    matchctx->conv_buf = NULL;
1633
0
    matchctx->conv_bufsiz = 0;
1634
0
  }
1635
1636
  /* Clean up temp storage */
1637
0
  pfree(pmatch);
1638
1639
0
  return matchctx;
1640
0
}
1641
1642
/*
1643
 * build_regexp_match_result - build output array for current match
1644
 */
1645
static ArrayType *
1646
build_regexp_match_result(regexp_matches_ctx *matchctx)
1647
0
{
1648
0
  char     *buf = matchctx->conv_buf;
1649
0
  Datum    *elems = matchctx->elems;
1650
0
  bool     *nulls = matchctx->nulls;
1651
0
  int     dims[1];
1652
0
  int     lbs[1];
1653
0
  int     loc;
1654
0
  int     i;
1655
1656
  /* Extract matching substrings from the original string */
1657
0
  loc = matchctx->next_match * matchctx->npatterns * 2;
1658
0
  for (i = 0; i < matchctx->npatterns; i++)
1659
0
  {
1660
0
    int     so = matchctx->match_locs[loc++];
1661
0
    int     eo = matchctx->match_locs[loc++];
1662
1663
0
    if (so < 0 || eo < 0)
1664
0
    {
1665
0
      elems[i] = (Datum) 0;
1666
0
      nulls[i] = true;
1667
0
    }
1668
0
    else if (buf)
1669
0
    {
1670
0
      int     len = pg_wchar2mb_with_len(matchctx->wide_str + so,
1671
0
                           buf,
1672
0
                           eo - so);
1673
1674
0
      Assert(len < matchctx->conv_bufsiz);
1675
0
      elems[i] = PointerGetDatum(cstring_to_text_with_len(buf, len));
1676
0
      nulls[i] = false;
1677
0
    }
1678
0
    else
1679
0
    {
1680
0
      elems[i] = DirectFunctionCall3(text_substr,
1681
0
                       PointerGetDatum(matchctx->orig_str),
1682
0
                       Int32GetDatum(so + 1),
1683
0
                       Int32GetDatum(eo - so));
1684
0
      nulls[i] = false;
1685
0
    }
1686
0
  }
1687
1688
  /* And form an array */
1689
0
  dims[0] = matchctx->npatterns;
1690
0
  lbs[0] = 1;
1691
  /* XXX: this hardcodes assumptions about the text type */
1692
0
  return construct_md_array(elems, nulls, 1, dims, lbs,
1693
0
                TEXTOID, -1, false, TYPALIGN_INT);
1694
0
}
1695
1696
/*
1697
 * regexp_split_to_table()
1698
 *    Split the string at matches of the pattern, returning the
1699
 *    split-out substrings as a table.
1700
 */
1701
Datum
1702
regexp_split_to_table(PG_FUNCTION_ARGS)
1703
0
{
1704
0
  FuncCallContext *funcctx;
1705
0
  regexp_matches_ctx *splitctx;
1706
1707
0
  if (SRF_IS_FIRSTCALL())
1708
0
  {
1709
0
    text     *pattern = PG_GETARG_TEXT_PP(1);
1710
0
    text     *flags = PG_GETARG_TEXT_PP_IF_EXISTS(2);
1711
0
    pg_re_flags re_flags;
1712
0
    MemoryContext oldcontext;
1713
1714
0
    funcctx = SRF_FIRSTCALL_INIT();
1715
0
    oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
1716
1717
    /* Determine options */
1718
0
    parse_re_flags(&re_flags, flags);
1719
    /* User mustn't specify 'g' */
1720
0
    if (re_flags.glob)
1721
0
      ereport(ERROR,
1722
0
          (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1723
      /* translator: %s is a SQL function name */
1724
0
           errmsg("%s does not support the \"global\" option",
1725
0
              "regexp_split_to_table()")));
1726
    /* But we find all the matches anyway */
1727
0
    re_flags.glob = true;
1728
1729
    /* be sure to copy the input string into the multi-call ctx */
1730
0
    splitctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
1731
0
                    &re_flags, 0,
1732
0
                    PG_GET_COLLATION(),
1733
0
                    false, true, true);
1734
1735
0
    MemoryContextSwitchTo(oldcontext);
1736
0
    funcctx->user_fctx = splitctx;
1737
0
  }
1738
1739
0
  funcctx = SRF_PERCALL_SETUP();
1740
0
  splitctx = (regexp_matches_ctx *) funcctx->user_fctx;
1741
1742
0
  if (splitctx->next_match <= splitctx->nmatches)
1743
0
  {
1744
0
    Datum   result = build_regexp_split_result(splitctx);
1745
1746
0
    splitctx->next_match++;
1747
0
    SRF_RETURN_NEXT(funcctx, result);
1748
0
  }
1749
1750
0
  SRF_RETURN_DONE(funcctx);
1751
0
}
1752
1753
/* This is separate to keep the opr_sanity regression test from complaining */
1754
Datum
1755
regexp_split_to_table_no_flags(PG_FUNCTION_ARGS)
1756
0
{
1757
0
  return regexp_split_to_table(fcinfo);
1758
0
}
1759
1760
/*
1761
 * regexp_split_to_array()
1762
 *    Split the string at matches of the pattern, returning the
1763
 *    split-out substrings as an array.
1764
 */
1765
Datum
1766
regexp_split_to_array(PG_FUNCTION_ARGS)
1767
0
{
1768
0
  ArrayBuildState *astate = NULL;
1769
0
  pg_re_flags re_flags;
1770
0
  regexp_matches_ctx *splitctx;
1771
1772
  /* Determine options */
1773
0
  parse_re_flags(&re_flags, PG_GETARG_TEXT_PP_IF_EXISTS(2));
1774
  /* User mustn't specify 'g' */
1775
0
  if (re_flags.glob)
1776
0
    ereport(ERROR,
1777
0
        (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1778
    /* translator: %s is a SQL function name */
1779
0
         errmsg("%s does not support the \"global\" option",
1780
0
            "regexp_split_to_array()")));
1781
  /* But we find all the matches anyway */
1782
0
  re_flags.glob = true;
1783
1784
0
  splitctx = setup_regexp_matches(PG_GETARG_TEXT_PP(0),
1785
0
                  PG_GETARG_TEXT_PP(1),
1786
0
                  &re_flags, 0,
1787
0
                  PG_GET_COLLATION(),
1788
0
                  false, true, true);
1789
1790
0
  while (splitctx->next_match <= splitctx->nmatches)
1791
0
  {
1792
0
    astate = accumArrayResult(astate,
1793
0
                  build_regexp_split_result(splitctx),
1794
0
                  false,
1795
0
                  TEXTOID,
1796
0
                  CurrentMemoryContext);
1797
0
    splitctx->next_match++;
1798
0
  }
1799
1800
0
  PG_RETURN_DATUM(makeArrayResult(astate, CurrentMemoryContext));
1801
0
}
1802
1803
/* This is separate to keep the opr_sanity regression test from complaining */
1804
Datum
1805
regexp_split_to_array_no_flags(PG_FUNCTION_ARGS)
1806
0
{
1807
0
  return regexp_split_to_array(fcinfo);
1808
0
}
1809
1810
/*
1811
 * build_regexp_split_result - build output string for current match
1812
 *
1813
 * We return the string between the current match and the previous one,
1814
 * or the string after the last match when next_match == nmatches.
1815
 */
1816
static Datum
1817
build_regexp_split_result(regexp_matches_ctx *splitctx)
1818
0
{
1819
0
  char     *buf = splitctx->conv_buf;
1820
0
  int     startpos;
1821
0
  int     endpos;
1822
1823
0
  if (splitctx->next_match > 0)
1824
0
    startpos = splitctx->match_locs[splitctx->next_match * 2 - 1];
1825
0
  else
1826
0
    startpos = 0;
1827
0
  if (startpos < 0)
1828
0
    elog(ERROR, "invalid match ending position");
1829
1830
0
  endpos = splitctx->match_locs[splitctx->next_match * 2];
1831
0
  if (endpos < startpos)
1832
0
    elog(ERROR, "invalid match starting position");
1833
1834
0
  if (buf)
1835
0
  {
1836
0
    int     len;
1837
1838
0
    len = pg_wchar2mb_with_len(splitctx->wide_str + startpos,
1839
0
                   buf,
1840
0
                   endpos - startpos);
1841
0
    Assert(len < splitctx->conv_bufsiz);
1842
0
    return PointerGetDatum(cstring_to_text_with_len(buf, len));
1843
0
  }
1844
0
  else
1845
0
  {
1846
0
    return DirectFunctionCall3(text_substr,
1847
0
                   PointerGetDatum(splitctx->orig_str),
1848
0
                   Int32GetDatum(startpos + 1),
1849
0
                   Int32GetDatum(endpos - startpos));
1850
0
  }
1851
0
}
1852
1853
/*
1854
 * regexp_substr()
1855
 *    Return the substring that matches a regular expression pattern
1856
 */
1857
Datum
1858
regexp_substr(PG_FUNCTION_ARGS)
1859
0
{
1860
0
  text     *str = PG_GETARG_TEXT_PP(0);
1861
0
  text     *pattern = PG_GETARG_TEXT_PP(1);
1862
0
  int     start = 1;
1863
0
  int     n = 1;
1864
0
  text     *flags = PG_GETARG_TEXT_PP_IF_EXISTS(4);
1865
0
  int     subexpr = 0;
1866
0
  int     so,
1867
0
        eo,
1868
0
        pos;
1869
0
  pg_re_flags re_flags;
1870
0
  regexp_matches_ctx *matchctx;
1871
1872
  /* Collect optional parameters */
1873
0
  if (PG_NARGS() > 2)
1874
0
  {
1875
0
    start = PG_GETARG_INT32(2);
1876
0
    if (start <= 0)
1877
0
      ereport(ERROR,
1878
0
          (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1879
0
           errmsg("invalid value for parameter \"%s\": %d",
1880
0
              "start", start)));
1881
0
  }
1882
0
  if (PG_NARGS() > 3)
1883
0
  {
1884
0
    n = PG_GETARG_INT32(3);
1885
0
    if (n <= 0)
1886
0
      ereport(ERROR,
1887
0
          (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1888
0
           errmsg("invalid value for parameter \"%s\": %d",
1889
0
              "n", n)));
1890
0
  }
1891
0
  if (PG_NARGS() > 5)
1892
0
  {
1893
0
    subexpr = PG_GETARG_INT32(5);
1894
0
    if (subexpr < 0)
1895
0
      ereport(ERROR,
1896
0
          (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1897
0
           errmsg("invalid value for parameter \"%s\": %d",
1898
0
              "subexpr", subexpr)));
1899
0
  }
1900
1901
  /* Determine options */
1902
0
  parse_re_flags(&re_flags, flags);
1903
  /* User mustn't specify 'g' */
1904
0
  if (re_flags.glob)
1905
0
    ereport(ERROR,
1906
0
        (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1907
    /* translator: %s is a SQL function name */
1908
0
         errmsg("%s does not support the \"global\" option",
1909
0
            "regexp_substr()")));
1910
  /* But we find all the matches anyway */
1911
0
  re_flags.glob = true;
1912
1913
  /* Do the matching */
1914
0
  matchctx = setup_regexp_matches(str, pattern, &re_flags, start - 1,
1915
0
                  PG_GET_COLLATION(),
1916
0
                  (subexpr > 0),  /* need submatches? */
1917
0
                  false, false);
1918
1919
  /* When n exceeds matches return NULL (includes case of no matches) */
1920
0
  if (n > matchctx->nmatches)
1921
0
    PG_RETURN_NULL();
1922
1923
  /* When subexpr exceeds number of subexpressions return NULL */
1924
0
  if (subexpr > matchctx->npatterns)
1925
0
    PG_RETURN_NULL();
1926
1927
  /* Select the appropriate match position to return */
1928
0
  pos = (n - 1) * matchctx->npatterns;
1929
0
  if (subexpr > 0)
1930
0
    pos += subexpr - 1;
1931
0
  pos *= 2;
1932
0
  so = matchctx->match_locs[pos];
1933
0
  eo = matchctx->match_locs[pos + 1];
1934
1935
0
  if (so < 0 || eo < 0)
1936
0
    PG_RETURN_NULL();   /* unidentifiable location */
1937
1938
0
  PG_RETURN_DATUM(DirectFunctionCall3(text_substr,
1939
0
                    PointerGetDatum(matchctx->orig_str),
1940
0
                    Int32GetDatum(so + 1),
1941
0
                    Int32GetDatum(eo - so)));
1942
0
}
1943
1944
/* This is separate to keep the opr_sanity regression test from complaining */
1945
Datum
1946
regexp_substr_no_start(PG_FUNCTION_ARGS)
1947
0
{
1948
0
  return regexp_substr(fcinfo);
1949
0
}
1950
1951
/* This is separate to keep the opr_sanity regression test from complaining */
1952
Datum
1953
regexp_substr_no_n(PG_FUNCTION_ARGS)
1954
0
{
1955
0
  return regexp_substr(fcinfo);
1956
0
}
1957
1958
/* This is separate to keep the opr_sanity regression test from complaining */
1959
Datum
1960
regexp_substr_no_flags(PG_FUNCTION_ARGS)
1961
0
{
1962
0
  return regexp_substr(fcinfo);
1963
0
}
1964
1965
/* This is separate to keep the opr_sanity regression test from complaining */
1966
Datum
1967
regexp_substr_no_subexpr(PG_FUNCTION_ARGS)
1968
0
{
1969
0
  return regexp_substr(fcinfo);
1970
0
}
1971
1972
/*
1973
 * regexp_fixed_prefix - extract fixed prefix, if any, for a regexp
1974
 *
1975
 * The result is NULL if there is no fixed prefix, else a palloc'd string.
1976
 * If it is an exact match, not just a prefix, *exact is returned as true.
1977
 */
1978
char *
1979
regexp_fixed_prefix(text *text_re, bool case_insensitive, Oid collation,
1980
          bool *exact)
1981
0
{
1982
0
  char     *result;
1983
0
  regex_t    *re;
1984
0
  int     cflags;
1985
0
  int     re_result;
1986
0
  pg_wchar   *str;
1987
0
  size_t    slen;
1988
0
  size_t    maxlen;
1989
0
  char    errMsg[100];
1990
1991
0
  *exact = false;       /* default result */
1992
1993
  /* Compile RE */
1994
0
  cflags = REG_ADVANCED;
1995
0
  if (case_insensitive)
1996
0
    cflags |= REG_ICASE;
1997
1998
0
  re = RE_compile_and_cache(text_re, cflags | REG_NOSUB, collation);
1999
2000
  /* Examine it to see if there's a fixed prefix */
2001
0
  re_result = pg_regprefix(re, &str, &slen);
2002
2003
0
  switch (re_result)
2004
0
  {
2005
0
    case REG_NOMATCH:
2006
0
      return NULL;
2007
2008
0
    case REG_PREFIX:
2009
      /* continue with wchar conversion */
2010
0
      break;
2011
2012
0
    case REG_EXACT:
2013
0
      *exact = true;
2014
      /* continue with wchar conversion */
2015
0
      break;
2016
2017
0
    default:
2018
      /* re failed??? */
2019
0
      pg_regerror(re_result, re, errMsg, sizeof(errMsg));
2020
0
      ereport(ERROR,
2021
0
          (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
2022
0
           errmsg("regular expression failed: %s", errMsg)));
2023
0
      break;
2024
0
  }
2025
2026
  /* Convert pg_wchar result back to database encoding */
2027
0
  maxlen = pg_database_encoding_max_length() * slen + 1;
2028
0
  result = (char *) palloc(maxlen);
2029
0
  slen = pg_wchar2mb_with_len(str, result, slen);
2030
0
  Assert(slen < maxlen);
2031
2032
0
  pfree(str);
2033
2034
0
  return result;
2035
0
}