Coverage Report

Created: 2025-06-13 06:06

/src/postgres/src/backend/utils/adt/varlena.c
Line
Count
Source (jump to first uncovered line)
1
/*-------------------------------------------------------------------------
2
 *
3
 * varlena.c
4
 *    Functions for the variable-length built-in types.
5
 *
6
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7
 * Portions Copyright (c) 1994, Regents of the University of California
8
 *
9
 *
10
 * IDENTIFICATION
11
 *    src/backend/utils/adt/varlena.c
12
 *
13
 *-------------------------------------------------------------------------
14
 */
15
#include "postgres.h"
16
17
#include <ctype.h>
18
#include <limits.h>
19
20
#include "access/detoast.h"
21
#include "access/toast_compression.h"
22
#include "catalog/pg_collation.h"
23
#include "catalog/pg_type.h"
24
#include "common/hashfn.h"
25
#include "common/int.h"
26
#include "common/unicode_category.h"
27
#include "common/unicode_norm.h"
28
#include "common/unicode_version.h"
29
#include "funcapi.h"
30
#include "lib/hyperloglog.h"
31
#include "libpq/pqformat.h"
32
#include "miscadmin.h"
33
#include "nodes/execnodes.h"
34
#include "parser/scansup.h"
35
#include "port/pg_bswap.h"
36
#include "regex/regex.h"
37
#include "utils/builtins.h"
38
#include "utils/bytea.h"
39
#include "utils/guc.h"
40
#include "utils/lsyscache.h"
41
#include "utils/memutils.h"
42
#include "utils/pg_locale.h"
43
#include "utils/sortsupport.h"
44
#include "utils/varlena.h"
45
46
47
/* GUC variable */
48
int     bytea_output = BYTEA_OUTPUT_HEX;
49
50
typedef struct varlena VarString;
51
52
/*
53
 * State for text_position_* functions.
54
 */
55
typedef struct
56
{
57
  pg_locale_t locale;     /* collation used for substring matching */
58
  bool    is_multibyte_char_in_char;  /* need to check char boundaries? */
59
  bool    greedy;     /* find longest possible substring? */
60
61
  char     *str1;     /* haystack string */
62
  char     *str2;     /* needle string */
63
  int     len1;     /* string lengths in bytes */
64
  int     len2;
65
66
  /* Skip table for Boyer-Moore-Horspool search algorithm: */
67
  int     skiptablemask;  /* mask for ANDing with skiptable subscripts */
68
  int     skiptable[256]; /* skip distance for given mismatched char */
69
70
  /*
71
   * Note that with nondeterministic collations, the length of the last
72
   * match is not necessarily equal to the length of the "needle" passed in.
73
   */
74
  char     *last_match;   /* pointer to last match in 'str1' */
75
  int     last_match_len; /* length of last match */
76
  int     last_match_len_tmp; /* same but for internal use */
77
78
  /*
79
   * Sometimes we need to convert the byte position of a match to a
80
   * character position.  These store the last position that was converted,
81
   * so that on the next call, we can continue from that point, rather than
82
   * count characters from the very beginning.
83
   */
84
  char     *refpoint;   /* pointer within original haystack string */
85
  int     refpos;     /* 0-based character offset of the same point */
86
} TextPositionState;
87
88
typedef struct
89
{
90
  char     *buf1;     /* 1st string, or abbreviation original string
91
                 * buf */
92
  char     *buf2;     /* 2nd string, or abbreviation strxfrm() buf */
93
  int     buflen1;    /* Allocated length of buf1 */
94
  int     buflen2;    /* Allocated length of buf2 */
95
  int     last_len1;    /* Length of last buf1 string/strxfrm() input */
96
  int     last_len2;    /* Length of last buf2 string/strxfrm() blob */
97
  int     last_returned;  /* Last comparison result (cache) */
98
  bool    cache_blob;   /* Does buf2 contain strxfrm() blob, etc? */
99
  bool    collate_c;
100
  Oid     typid;      /* Actual datatype (text/bpchar/bytea/name) */
101
  hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
102
  hyperLogLogState full_card; /* Full key cardinality state */
103
  double    prop_card;    /* Required cardinality proportion */
104
  pg_locale_t locale;
105
} VarStringSortSupport;
106
107
/*
108
 * Output data for split_text(): we output either to an array or a table.
109
 * tupstore and tupdesc must be set up in advance to output to a table.
110
 */
111
typedef struct
112
{
113
  ArrayBuildState *astate;
114
  Tuplestorestate *tupstore;
115
  TupleDesc tupdesc;
116
} SplitTextOutputData;
117
118
/*
119
 * This should be large enough that most strings will fit, but small enough
120
 * that we feel comfortable putting it on the stack
121
 */
122
0
#define TEXTBUFLEN    1024
123
124
#define DatumGetVarStringP(X)   ((VarString *) PG_DETOAST_DATUM(X))
125
0
#define DatumGetVarStringPP(X)    ((VarString *) PG_DETOAST_DATUM_PACKED(X))
126
127
static int  varstrfastcmp_c(Datum x, Datum y, SortSupport ssup);
128
static int  bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup);
129
static int  namefastcmp_c(Datum x, Datum y, SortSupport ssup);
130
static int  varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup);
131
static int  namefastcmp_locale(Datum x, Datum y, SortSupport ssup);
132
static int  varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup);
133
static Datum varstr_abbrev_convert(Datum original, SortSupport ssup);
134
static bool varstr_abbrev_abort(int memtupcount, SortSupport ssup);
135
static int32 text_length(Datum str);
136
static text *text_catenate(text *t1, text *t2);
137
static text *text_substring(Datum str,
138
              int32 start,
139
              int32 length,
140
              bool length_not_specified);
141
static text *text_overlay(text *t1, text *t2, int sp, int sl);
142
static int  text_position(text *t1, text *t2, Oid collid);
143
static void text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state);
144
static bool text_position_next(TextPositionState *state);
145
static char *text_position_next_internal(char *start_ptr, TextPositionState *state);
146
static char *text_position_get_match_ptr(TextPositionState *state);
147
static int  text_position_get_match_pos(TextPositionState *state);
148
static void text_position_cleanup(TextPositionState *state);
149
static void check_collation_set(Oid collid);
150
static int  text_cmp(text *arg1, text *arg2, Oid collid);
151
static bytea *bytea_catenate(bytea *t1, bytea *t2);
152
static bytea *bytea_substring(Datum str,
153
                int S,
154
                int L,
155
                bool length_not_specified);
156
static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl);
157
static void appendStringInfoText(StringInfo str, const text *t);
158
static bool split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate);
159
static void split_text_accum_result(SplitTextOutputData *tstate,
160
                  text *field_value,
161
                  text *null_string,
162
                  Oid collation);
163
static text *array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
164
                  const char *fldsep, const char *null_string);
165
static StringInfo makeStringAggState(FunctionCallInfo fcinfo);
166
static bool text_format_parse_digits(const char **ptr, const char *end_ptr,
167
                   int *value);
168
static const char *text_format_parse_format(const char *start_ptr,
169
                      const char *end_ptr,
170
                      int *argpos, int *widthpos,
171
                      int *flags, int *width);
172
static void text_format_string_conversion(StringInfo buf, char conversion,
173
                      FmgrInfo *typOutputInfo,
174
                      Datum value, bool isNull,
175
                      int flags, int width);
176
static void text_format_append_string(StringInfo buf, const char *str,
177
                    int flags, int width);
178
179
180
/*****************************************************************************
181
 *   CONVERSION ROUTINES EXPORTED FOR USE BY C CODE              *
182
 *****************************************************************************/
183
184
/*
185
 * cstring_to_text
186
 *
187
 * Create a text value from a null-terminated C string.
188
 *
189
 * The new text value is freshly palloc'd with a full-size VARHDR.
190
 */
191
text *
192
cstring_to_text(const char *s)
193
0
{
194
0
  return cstring_to_text_with_len(s, strlen(s));
195
0
}
196
197
/*
198
 * cstring_to_text_with_len
199
 *
200
 * Same as cstring_to_text except the caller specifies the string length;
201
 * the string need not be null_terminated.
202
 */
203
text *
204
cstring_to_text_with_len(const char *s, int len)
205
0
{
206
0
  text     *result = (text *) palloc(len + VARHDRSZ);
207
208
0
  SET_VARSIZE(result, len + VARHDRSZ);
209
0
  memcpy(VARDATA(result), s, len);
210
211
0
  return result;
212
0
}
213
214
/*
215
 * text_to_cstring
216
 *
217
 * Create a palloc'd, null-terminated C string from a text value.
218
 *
219
 * We support being passed a compressed or toasted text value.
220
 * This is a bit bogus since such values shouldn't really be referred to as
221
 * "text *", but it seems useful for robustness.  If we didn't handle that
222
 * case here, we'd need another routine that did, anyway.
223
 */
224
char *
225
text_to_cstring(const text *t)
226
0
{
227
  /* must cast away the const, unfortunately */
228
0
  text     *tunpacked = pg_detoast_datum_packed(unconstify(text *, t));
229
0
  int     len = VARSIZE_ANY_EXHDR(tunpacked);
230
0
  char     *result;
231
232
0
  result = (char *) palloc(len + 1);
233
0
  memcpy(result, VARDATA_ANY(tunpacked), len);
234
0
  result[len] = '\0';
235
236
0
  if (tunpacked != t)
237
0
    pfree(tunpacked);
238
239
0
  return result;
240
0
}
241
242
/*
243
 * text_to_cstring_buffer
244
 *
245
 * Copy a text value into a caller-supplied buffer of size dst_len.
246
 *
247
 * The text string is truncated if necessary to fit.  The result is
248
 * guaranteed null-terminated (unless dst_len == 0).
249
 *
250
 * We support being passed a compressed or toasted text value.
251
 * This is a bit bogus since such values shouldn't really be referred to as
252
 * "text *", but it seems useful for robustness.  If we didn't handle that
253
 * case here, we'd need another routine that did, anyway.
254
 */
255
void
256
text_to_cstring_buffer(const text *src, char *dst, size_t dst_len)
257
0
{
258
  /* must cast away the const, unfortunately */
259
0
  text     *srcunpacked = pg_detoast_datum_packed(unconstify(text *, src));
260
0
  size_t    src_len = VARSIZE_ANY_EXHDR(srcunpacked);
261
262
0
  if (dst_len > 0)
263
0
  {
264
0
    dst_len--;
265
0
    if (dst_len >= src_len)
266
0
      dst_len = src_len;
267
0
    else          /* ensure truncation is encoding-safe */
268
0
      dst_len = pg_mbcliplen(VARDATA_ANY(srcunpacked), src_len, dst_len);
269
0
    memcpy(dst, VARDATA_ANY(srcunpacked), dst_len);
270
0
    dst[dst_len] = '\0';
271
0
  }
272
273
0
  if (srcunpacked != src)
274
0
    pfree(srcunpacked);
275
0
}
276
277
278
/*****************************************************************************
279
 *   USER I/O ROUTINES                             *
280
 *****************************************************************************/
281
282
283
0
#define VAL(CH)     ((CH) - '0')
284
0
#define DIG(VAL)    ((VAL) + '0')
285
286
/*
287
 *    byteain     - converts from printable representation of byte array
288
 *
289
 *    Non-printable characters must be passed as '\nnn' (octal) and are
290
 *    converted to internal form.  '\' must be passed as '\\'.
291
 *    ereport(ERROR, ...) if bad form.
292
 *
293
 *    BUGS:
294
 *        The input is scanned twice.
295
 *        The error checking of input is minimal.
296
 */
297
Datum
298
byteain(PG_FUNCTION_ARGS)
299
0
{
300
0
  char     *inputText = PG_GETARG_CSTRING(0);
301
0
  Node     *escontext = fcinfo->context;
302
0
  char     *tp;
303
0
  char     *rp;
304
0
  int     bc;
305
0
  bytea    *result;
306
307
  /* Recognize hex input */
308
0
  if (inputText[0] == '\\' && inputText[1] == 'x')
309
0
  {
310
0
    size_t    len = strlen(inputText);
311
312
0
    bc = (len - 2) / 2 + VARHDRSZ; /* maximum possible length */
313
0
    result = palloc(bc);
314
0
    bc = hex_decode_safe(inputText + 2, len - 2, VARDATA(result),
315
0
               escontext);
316
0
    SET_VARSIZE(result, bc + VARHDRSZ); /* actual length */
317
318
0
    PG_RETURN_BYTEA_P(result);
319
0
  }
320
321
  /* Else, it's the traditional escaped style */
322
0
  for (bc = 0, tp = inputText; *tp != '\0'; bc++)
323
0
  {
324
0
    if (tp[0] != '\\')
325
0
      tp++;
326
0
    else if ((tp[0] == '\\') &&
327
0
         (tp[1] >= '0' && tp[1] <= '3') &&
328
0
         (tp[2] >= '0' && tp[2] <= '7') &&
329
0
         (tp[3] >= '0' && tp[3] <= '7'))
330
0
      tp += 4;
331
0
    else if ((tp[0] == '\\') &&
332
0
         (tp[1] == '\\'))
333
0
      tp += 2;
334
0
    else
335
0
    {
336
      /*
337
       * one backslash, not followed by another or ### valid octal
338
       */
339
0
      ereturn(escontext, (Datum) 0,
340
0
          (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
341
0
           errmsg("invalid input syntax for type %s", "bytea")));
342
0
    }
343
0
  }
344
345
0
  bc += VARHDRSZ;
346
347
0
  result = (bytea *) palloc(bc);
348
0
  SET_VARSIZE(result, bc);
349
350
0
  tp = inputText;
351
0
  rp = VARDATA(result);
352
0
  while (*tp != '\0')
353
0
  {
354
0
    if (tp[0] != '\\')
355
0
      *rp++ = *tp++;
356
0
    else if ((tp[0] == '\\') &&
357
0
         (tp[1] >= '0' && tp[1] <= '3') &&
358
0
         (tp[2] >= '0' && tp[2] <= '7') &&
359
0
         (tp[3] >= '0' && tp[3] <= '7'))
360
0
    {
361
0
      bc = VAL(tp[1]);
362
0
      bc <<= 3;
363
0
      bc += VAL(tp[2]);
364
0
      bc <<= 3;
365
0
      *rp++ = bc + VAL(tp[3]);
366
367
0
      tp += 4;
368
0
    }
369
0
    else if ((tp[0] == '\\') &&
370
0
         (tp[1] == '\\'))
371
0
    {
372
0
      *rp++ = '\\';
373
0
      tp += 2;
374
0
    }
375
0
    else
376
0
    {
377
      /*
378
       * We should never get here. The first pass should not allow it.
379
       */
380
0
      ereturn(escontext, (Datum) 0,
381
0
          (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
382
0
           errmsg("invalid input syntax for type %s", "bytea")));
383
0
    }
384
0
  }
385
386
0
  PG_RETURN_BYTEA_P(result);
387
0
}
388
389
/*
390
 *    byteaout    - converts to printable representation of byte array
391
 *
392
 *    In the traditional escaped format, non-printable characters are
393
 *    printed as '\nnn' (octal) and '\' as '\\'.
394
 */
395
Datum
396
byteaout(PG_FUNCTION_ARGS)
397
0
{
398
0
  bytea    *vlena = PG_GETARG_BYTEA_PP(0);
399
0
  char     *result;
400
0
  char     *rp;
401
402
0
  if (bytea_output == BYTEA_OUTPUT_HEX)
403
0
  {
404
    /* Print hex format */
405
0
    rp = result = palloc(VARSIZE_ANY_EXHDR(vlena) * 2 + 2 + 1);
406
0
    *rp++ = '\\';
407
0
    *rp++ = 'x';
408
0
    rp += hex_encode(VARDATA_ANY(vlena), VARSIZE_ANY_EXHDR(vlena), rp);
409
0
  }
410
0
  else if (bytea_output == BYTEA_OUTPUT_ESCAPE)
411
0
  {
412
    /* Print traditional escaped format */
413
0
    char     *vp;
414
0
    uint64    len;
415
0
    int     i;
416
417
0
    len = 1;        /* empty string has 1 char */
418
0
    vp = VARDATA_ANY(vlena);
419
0
    for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
420
0
    {
421
0
      if (*vp == '\\')
422
0
        len += 2;
423
0
      else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
424
0
        len += 4;
425
0
      else
426
0
        len++;
427
0
    }
428
429
    /*
430
     * In principle len can't overflow uint32 if the input fit in 1GB, but
431
     * for safety let's check rather than relying on palloc's internal
432
     * check.
433
     */
434
0
    if (len > MaxAllocSize)
435
0
      ereport(ERROR,
436
0
          (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
437
0
           errmsg_internal("result of bytea output conversion is too large")));
438
0
    rp = result = (char *) palloc(len);
439
440
0
    vp = VARDATA_ANY(vlena);
441
0
    for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
442
0
    {
443
0
      if (*vp == '\\')
444
0
      {
445
0
        *rp++ = '\\';
446
0
        *rp++ = '\\';
447
0
      }
448
0
      else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
449
0
      {
450
0
        int     val;  /* holds unprintable chars */
451
452
0
        val = *vp;
453
0
        rp[0] = '\\';
454
0
        rp[3] = DIG(val & 07);
455
0
        val >>= 3;
456
0
        rp[2] = DIG(val & 07);
457
0
        val >>= 3;
458
0
        rp[1] = DIG(val & 03);
459
0
        rp += 4;
460
0
      }
461
0
      else
462
0
        *rp++ = *vp;
463
0
    }
464
0
  }
465
0
  else
466
0
  {
467
0
    elog(ERROR, "unrecognized \"bytea_output\" setting: %d",
468
0
       bytea_output);
469
0
    rp = result = NULL;   /* keep compiler quiet */
470
0
  }
471
0
  *rp = '\0';
472
0
  PG_RETURN_CSTRING(result);
473
0
}
474
475
/*
476
 *    bytearecv     - converts external binary format to bytea
477
 */
478
Datum
479
bytearecv(PG_FUNCTION_ARGS)
480
0
{
481
0
  StringInfo  buf = (StringInfo) PG_GETARG_POINTER(0);
482
0
  bytea    *result;
483
0
  int     nbytes;
484
485
0
  nbytes = buf->len - buf->cursor;
486
0
  result = (bytea *) palloc(nbytes + VARHDRSZ);
487
0
  SET_VARSIZE(result, nbytes + VARHDRSZ);
488
0
  pq_copymsgbytes(buf, VARDATA(result), nbytes);
489
0
  PG_RETURN_BYTEA_P(result);
490
0
}
491
492
/*
493
 *    byteasend     - converts bytea to binary format
494
 *
495
 * This is a special case: just copy the input...
496
 */
497
Datum
498
byteasend(PG_FUNCTION_ARGS)
499
0
{
500
0
  bytea    *vlena = PG_GETARG_BYTEA_P_COPY(0);
501
502
0
  PG_RETURN_BYTEA_P(vlena);
503
0
}
504
505
Datum
506
bytea_string_agg_transfn(PG_FUNCTION_ARGS)
507
0
{
508
0
  StringInfo  state;
509
510
0
  state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
511
512
  /* Append the value unless null, preceding it with the delimiter. */
513
0
  if (!PG_ARGISNULL(1))
514
0
  {
515
0
    bytea    *value = PG_GETARG_BYTEA_PP(1);
516
0
    bool    isfirst = false;
517
518
    /*
519
     * You might think we can just throw away the first delimiter, however
520
     * we must keep it as we may be a parallel worker doing partial
521
     * aggregation building a state to send to the main process.  We need
522
     * to keep the delimiter of every aggregation so that the combine
523
     * function can properly join up the strings of two separately
524
     * partially aggregated results.  The first delimiter is only stripped
525
     * off in the final function.  To know how much to strip off the front
526
     * of the string, we store the length of the first delimiter in the
527
     * StringInfo's cursor field, which we don't otherwise need here.
528
     */
529
0
    if (state == NULL)
530
0
    {
531
0
      state = makeStringAggState(fcinfo);
532
0
      isfirst = true;
533
0
    }
534
535
0
    if (!PG_ARGISNULL(2))
536
0
    {
537
0
      bytea    *delim = PG_GETARG_BYTEA_PP(2);
538
539
0
      appendBinaryStringInfo(state, VARDATA_ANY(delim),
540
0
                   VARSIZE_ANY_EXHDR(delim));
541
0
      if (isfirst)
542
0
        state->cursor = VARSIZE_ANY_EXHDR(delim);
543
0
    }
544
545
0
    appendBinaryStringInfo(state, VARDATA_ANY(value),
546
0
                 VARSIZE_ANY_EXHDR(value));
547
0
  }
548
549
  /*
550
   * The transition type for string_agg() is declared to be "internal",
551
   * which is a pass-by-value type the same size as a pointer.
552
   */
553
0
  if (state)
554
0
    PG_RETURN_POINTER(state);
555
0
  PG_RETURN_NULL();
556
0
}
557
558
Datum
559
bytea_string_agg_finalfn(PG_FUNCTION_ARGS)
560
0
{
561
0
  StringInfo  state;
562
563
  /* cannot be called directly because of internal-type argument */
564
0
  Assert(AggCheckCallContext(fcinfo, NULL));
565
566
0
  state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
567
568
0
  if (state != NULL)
569
0
  {
570
    /* As per comment in transfn, strip data before the cursor position */
571
0
    bytea    *result;
572
0
    int     strippedlen = state->len - state->cursor;
573
574
0
    result = (bytea *) palloc(strippedlen + VARHDRSZ);
575
0
    SET_VARSIZE(result, strippedlen + VARHDRSZ);
576
0
    memcpy(VARDATA(result), &state->data[state->cursor], strippedlen);
577
0
    PG_RETURN_BYTEA_P(result);
578
0
  }
579
0
  else
580
0
    PG_RETURN_NULL();
581
0
}
582
583
/*
584
 *    textin      - converts cstring to internal representation
585
 */
586
Datum
587
textin(PG_FUNCTION_ARGS)
588
0
{
589
0
  char     *inputText = PG_GETARG_CSTRING(0);
590
591
0
  PG_RETURN_TEXT_P(cstring_to_text(inputText));
592
0
}
593
594
/*
595
 *    textout     - converts internal representation to cstring
596
 */
597
Datum
598
textout(PG_FUNCTION_ARGS)
599
0
{
600
0
  Datum   txt = PG_GETARG_DATUM(0);
601
602
0
  PG_RETURN_CSTRING(TextDatumGetCString(txt));
603
0
}
604
605
/*
606
 *    textrecv      - converts external binary format to text
607
 */
608
Datum
609
textrecv(PG_FUNCTION_ARGS)
610
0
{
611
0
  StringInfo  buf = (StringInfo) PG_GETARG_POINTER(0);
612
0
  text     *result;
613
0
  char     *str;
614
0
  int     nbytes;
615
616
0
  str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
617
618
0
  result = cstring_to_text_with_len(str, nbytes);
619
0
  pfree(str);
620
0
  PG_RETURN_TEXT_P(result);
621
0
}
622
623
/*
624
 *    textsend      - converts text to binary format
625
 */
626
Datum
627
textsend(PG_FUNCTION_ARGS)
628
0
{
629
0
  text     *t = PG_GETARG_TEXT_PP(0);
630
0
  StringInfoData buf;
631
632
0
  pq_begintypsend(&buf);
633
0
  pq_sendtext(&buf, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
634
0
  PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
635
0
}
636
637
638
/*
639
 *    unknownin     - converts cstring to internal representation
640
 */
641
Datum
642
unknownin(PG_FUNCTION_ARGS)
643
0
{
644
0
  char     *str = PG_GETARG_CSTRING(0);
645
646
  /* representation is same as cstring */
647
0
  PG_RETURN_CSTRING(pstrdup(str));
648
0
}
649
650
/*
651
 *    unknownout      - converts internal representation to cstring
652
 */
653
Datum
654
unknownout(PG_FUNCTION_ARGS)
655
0
{
656
  /* representation is same as cstring */
657
0
  char     *str = PG_GETARG_CSTRING(0);
658
659
0
  PG_RETURN_CSTRING(pstrdup(str));
660
0
}
661
662
/*
663
 *    unknownrecv     - converts external binary format to unknown
664
 */
665
Datum
666
unknownrecv(PG_FUNCTION_ARGS)
667
0
{
668
0
  StringInfo  buf = (StringInfo) PG_GETARG_POINTER(0);
669
0
  char     *str;
670
0
  int     nbytes;
671
672
0
  str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
673
  /* representation is same as cstring */
674
0
  PG_RETURN_CSTRING(str);
675
0
}
676
677
/*
678
 *    unknownsend     - converts unknown to binary format
679
 */
680
Datum
681
unknownsend(PG_FUNCTION_ARGS)
682
0
{
683
  /* representation is same as cstring */
684
0
  char     *str = PG_GETARG_CSTRING(0);
685
0
  StringInfoData buf;
686
687
0
  pq_begintypsend(&buf);
688
0
  pq_sendtext(&buf, str, strlen(str));
689
0
  PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
690
0
}
691
692
693
/* ========== PUBLIC ROUTINES ========== */
694
695
/*
696
 * textlen -
697
 *    returns the logical length of a text*
698
 *     (which is less than the VARSIZE of the text*)
699
 */
700
Datum
701
textlen(PG_FUNCTION_ARGS)
702
0
{
703
0
  Datum   str = PG_GETARG_DATUM(0);
704
705
  /* try to avoid decompressing argument */
706
0
  PG_RETURN_INT32(text_length(str));
707
0
}
708
709
/*
710
 * text_length -
711
 *  Does the real work for textlen()
712
 *
713
 *  This is broken out so it can be called directly by other string processing
714
 *  functions.  Note that the argument is passed as a Datum, to indicate that
715
 *  it may still be in compressed form.  We can avoid decompressing it at all
716
 *  in some cases.
717
 */
718
static int32
719
text_length(Datum str)
720
0
{
721
  /* fastpath when max encoding length is one */
722
0
  if (pg_database_encoding_max_length() == 1)
723
0
    PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
724
0
  else
725
0
  {
726
0
    text     *t = DatumGetTextPP(str);
727
728
0
    PG_RETURN_INT32(pg_mbstrlen_with_len(VARDATA_ANY(t),
729
0
                       VARSIZE_ANY_EXHDR(t)));
730
0
  }
731
0
}
732
733
/*
734
 * textoctetlen -
735
 *    returns the physical length of a text*
736
 *     (which is less than the VARSIZE of the text*)
737
 */
738
Datum
739
textoctetlen(PG_FUNCTION_ARGS)
740
0
{
741
0
  Datum   str = PG_GETARG_DATUM(0);
742
743
  /* We need not detoast the input at all */
744
0
  PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
745
0
}
746
747
/*
748
 * textcat -
749
 *    takes two text* and returns a text* that is the concatenation of
750
 *    the two.
751
 *
752
 * Rewritten by Sapa, sapa@hq.icb.chel.su. 8-Jul-96.
753
 * Updated by Thomas, Thomas.Lockhart@jpl.nasa.gov 1997-07-10.
754
 * Allocate space for output in all cases.
755
 * XXX - thomas 1997-07-10
756
 */
757
Datum
758
textcat(PG_FUNCTION_ARGS)
759
0
{
760
0
  text     *t1 = PG_GETARG_TEXT_PP(0);
761
0
  text     *t2 = PG_GETARG_TEXT_PP(1);
762
763
0
  PG_RETURN_TEXT_P(text_catenate(t1, t2));
764
0
}
765
766
/*
767
 * text_catenate
768
 *  Guts of textcat(), broken out so it can be used by other functions
769
 *
770
 * Arguments can be in short-header form, but not compressed or out-of-line
771
 */
772
static text *
773
text_catenate(text *t1, text *t2)
774
0
{
775
0
  text     *result;
776
0
  int     len1,
777
0
        len2,
778
0
        len;
779
0
  char     *ptr;
780
781
0
  len1 = VARSIZE_ANY_EXHDR(t1);
782
0
  len2 = VARSIZE_ANY_EXHDR(t2);
783
784
  /* paranoia ... probably should throw error instead? */
785
0
  if (len1 < 0)
786
0
    len1 = 0;
787
0
  if (len2 < 0)
788
0
    len2 = 0;
789
790
0
  len = len1 + len2 + VARHDRSZ;
791
0
  result = (text *) palloc(len);
792
793
  /* Set size of result string... */
794
0
  SET_VARSIZE(result, len);
795
796
  /* Fill data field of result string... */
797
0
  ptr = VARDATA(result);
798
0
  if (len1 > 0)
799
0
    memcpy(ptr, VARDATA_ANY(t1), len1);
800
0
  if (len2 > 0)
801
0
    memcpy(ptr + len1, VARDATA_ANY(t2), len2);
802
803
0
  return result;
804
0
}
805
806
/*
 * charlen_to_bytelen()
 *  Work out how many bytes the n characters beginning at *p take up
 *
 * The caller must make sure that n characters really are present; the
 * string does not have to be null-terminated.
 */
static int
charlen_to_bytelen(const char *p, int n)
{
  const char *cur = p;

  /* In a single-byte encoding the two counts are the same */
  if (pg_database_encoding_max_length() == 1)
    return n;

  /* Otherwise step over one character at a time */
  while (n > 0)
  {
    cur += pg_mblen(cur);
    n--;
  }

  return cur - p;
}
831
832
/*
833
 * text_substr()
834
 * Return a substring starting at the specified position.
835
 * - thomas 1997-12-31
836
 *
837
 * Input:
838
 *  - string
839
 *  - starting position (is one-based)
840
 *  - string length
841
 *
842
 * If the starting position is zero or less, then return from the start of the string
843
 *  adjusting the length to be consistent with the "negative start" per SQL.
844
 * If the length is less than zero, an error is raised (see text_substring()).
845
 *
846
 * Added multibyte support.
847
 * - Tatsuo Ishii 1998-4-21
848
 * Changed behavior if starting position is less than one to conform to SQL behavior.
849
 * Formerly returned the entire string; now returns a portion.
850
 * - Thomas Lockhart 1998-12-10
851
 * Now uses faster TOAST-slicing interface
852
 * - John Gray 2002-02-22
853
 * Remove "#ifdef MULTIBYTE" and test for encoding_max_length instead. Change
854
 * behaviors conflicting with SQL to meet SQL (if E = S + L < S throw
855
 * error; if E < 1, return '', not entire string). Fixed MB related bug when
856
 * S > LC and < LC + 4 sometimes garbage characters are returned.
857
 * - Joe Conway 2002-08-10
858
 */
859
Datum
860
text_substr(PG_FUNCTION_ARGS)
861
0
{
862
0
  PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
863
0
                  PG_GETARG_INT32(1),
864
0
                  PG_GETARG_INT32(2),
865
0
                  false));
866
0
}
867
868
/*
869
 * text_substr_no_len -
870
 *    Wrapper to avoid opr_sanity failure due to
871
 *    one function accepting a different number of args.
872
 */
873
Datum
874
text_substr_no_len(PG_FUNCTION_ARGS)
875
0
{
876
0
  PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
877
0
                  PG_GETARG_INT32(1),
878
0
                  -1, true));
879
0
}
880
881
/*
882
 * text_substring -
883
 *  Does the real work for text_substr() and text_substr_no_len()
884
 *
885
 *  This is broken out so it can be called directly by other string processing
886
 *  functions.  Note that the argument is passed as a Datum, to indicate that
887
 *  it may still be in compressed/toasted form.  We can avoid detoasting all
888
 *  of it in some cases.
889
 *
890
 *  The result is always a freshly palloc'd datum.
891
 */
892
static text *
893
text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
894
0
{
895
0
  int32   eml = pg_database_encoding_max_length();
896
0
  int32   S = start;    /* start position */
897
0
  int32   S1;       /* adjusted start position */
898
0
  int32   L1;       /* adjusted substring length */
899
0
  int32   E;        /* end position */
900
901
  /*
902
   * SQL99 says S can be zero or negative (which we don't document), but we
903
   * still must fetch from the start of the string.
904
   * https://www.postgresql.org/message-id/170905442373.643.11536838320909376197%40wrigleys.postgresql.org
905
   */
906
0
  S1 = Max(S, 1);
907
908
  /* life is easy if the encoding max length is 1 */
909
0
  if (eml == 1)
910
0
  {
911
0
    if (length_not_specified) /* special case - get length to end of
912
                   * string */
913
0
      L1 = -1;
914
0
    else if (length < 0)
915
0
    {
916
      /* SQL99 says to throw an error for E < S, i.e., negative length */
917
0
      ereport(ERROR,
918
0
          (errcode(ERRCODE_SUBSTRING_ERROR),
919
0
           errmsg("negative substring length not allowed")));
920
0
      L1 = -1;      /* silence stupider compilers */
921
0
    }
922
0
    else if (pg_add_s32_overflow(S, length, &E))
923
0
    {
924
      /*
925
       * L could be large enough for S + L to overflow, in which case
926
       * the substring must run to end of string.
927
       */
928
0
      L1 = -1;
929
0
    }
930
0
    else
931
0
    {
932
      /*
933
       * A zero or negative value for the end position can happen if the
934
       * start was negative or one. SQL99 says to return a zero-length
935
       * string.
936
       */
937
0
      if (E < 1)
938
0
        return cstring_to_text("");
939
940
0
      L1 = E - S1;
941
0
    }
942
943
    /*
944
     * If the start position is past the end of the string, SQL99 says to
945
     * return a zero-length string -- DatumGetTextPSlice() will do that
946
     * for us.  We need only convert S1 to zero-based starting position.
947
     */
948
0
    return DatumGetTextPSlice(str, S1 - 1, L1);
949
0
  }
950
0
  else if (eml > 1)
951
0
  {
952
    /*
953
     * When encoding max length is > 1, we can't get LC without
954
     * detoasting, so we'll grab a conservatively large slice now and go
955
     * back later to do the right thing
956
     */
957
0
    int32   slice_start;
958
0
    int32   slice_size;
959
0
    int32   slice_strlen;
960
0
    text     *slice;
961
0
    int32   E1;
962
0
    int32   i;
963
0
    char     *p;
964
0
    char     *s;
965
0
    text     *ret;
966
967
    /*
968
     * We need to start at position zero because there is no way to know
969
     * in advance which byte offset corresponds to the supplied start
970
     * position.
971
     */
972
0
    slice_start = 0;
973
974
0
    if (length_not_specified) /* special case - get length to end of
975
                   * string */
976
0
      slice_size = L1 = -1;
977
0
    else if (length < 0)
978
0
    {
979
      /* SQL99 says to throw an error for E < S, i.e., negative length */
980
0
      ereport(ERROR,
981
0
          (errcode(ERRCODE_SUBSTRING_ERROR),
982
0
           errmsg("negative substring length not allowed")));
983
0
      slice_size = L1 = -1; /* silence stupider compilers */
984
0
    }
985
0
    else if (pg_add_s32_overflow(S, length, &E))
986
0
    {
987
      /*
988
       * L could be large enough for S + L to overflow, in which case
989
       * the substring must run to end of string.
990
       */
991
0
      slice_size = L1 = -1;
992
0
    }
993
0
    else
994
0
    {
995
      /*
996
       * A zero or negative value for the end position can happen if the
997
       * start was negative or one. SQL99 says to return a zero-length
998
       * string.
999
       */
1000
0
      if (E < 1)
1001
0
        return cstring_to_text("");
1002
1003
      /*
1004
       * if E is past the end of the string, the tuple toaster will
1005
       * truncate the length for us
1006
       */
1007
0
      L1 = E - S1;
1008
1009
      /*
1010
       * Total slice size in bytes can't be any longer than the start
1011
       * position plus substring length times the encoding max length.
1012
       * If that overflows, we can just use -1.
1013
       */
1014
0
      if (pg_mul_s32_overflow(E, eml, &slice_size))
1015
0
        slice_size = -1;
1016
0
    }
1017
1018
    /*
1019
     * If we're working with an untoasted source, no need to do an extra
1020
     * copying step.
1021
     */
1022
0
    if (VARATT_IS_COMPRESSED(DatumGetPointer(str)) ||
1023
0
      VARATT_IS_EXTERNAL(DatumGetPointer(str)))
1024
0
      slice = DatumGetTextPSlice(str, slice_start, slice_size);
1025
0
    else
1026
0
      slice = (text *) DatumGetPointer(str);
1027
1028
    /* see if we got back an empty string */
1029
0
    if (VARSIZE_ANY_EXHDR(slice) == 0)
1030
0
    {
1031
0
      if (slice != (text *) DatumGetPointer(str))
1032
0
        pfree(slice);
1033
0
      return cstring_to_text("");
1034
0
    }
1035
1036
    /* Now we can get the actual length of the slice in MB characters */
1037
0
    slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice),
1038
0
                      VARSIZE_ANY_EXHDR(slice));
1039
1040
    /*
1041
     * Check that the start position wasn't > slice_strlen. If so, SQL99
1042
     * says to return a zero-length string.
1043
     */
1044
0
    if (S1 > slice_strlen)
1045
0
    {
1046
0
      if (slice != (text *) DatumGetPointer(str))
1047
0
        pfree(slice);
1048
0
      return cstring_to_text("");
1049
0
    }
1050
1051
    /*
1052
     * Adjust L1 and E1 now that we know the slice string length. Again
1053
     * remember that S1 is one based, and slice_start is zero based.
1054
     */
1055
0
    if (L1 > -1)
1056
0
      E1 = Min(S1 + L1, slice_start + 1 + slice_strlen);
1057
0
    else
1058
0
      E1 = slice_start + 1 + slice_strlen;
1059
1060
    /*
1061
     * Find the start position in the slice; remember S1 is not zero based
1062
     */
1063
0
    p = VARDATA_ANY(slice);
1064
0
    for (i = 0; i < S1 - 1; i++)
1065
0
      p += pg_mblen(p);
1066
1067
    /* hang onto a pointer to our start position */
1068
0
    s = p;
1069
1070
    /*
1071
     * Count the actual bytes used by the substring of the requested
1072
     * length.
1073
     */
1074
0
    for (i = S1; i < E1; i++)
1075
0
      p += pg_mblen(p);
1076
1077
0
    ret = (text *) palloc(VARHDRSZ + (p - s));
1078
0
    SET_VARSIZE(ret, VARHDRSZ + (p - s));
1079
0
    memcpy(VARDATA(ret), s, (p - s));
1080
1081
0
    if (slice != (text *) DatumGetPointer(str))
1082
0
      pfree(slice);
1083
1084
0
    return ret;
1085
0
  }
1086
0
  else
1087
0
    elog(ERROR, "invalid backend encoding: encoding max length < 1");
1088
1089
  /* not reached: suppress compiler warning */
1090
0
  return NULL;
1091
0
}
1092
1093
/*
1094
 * textoverlay
1095
 *  Replace specified substring of first string with second
1096
 *
1097
 * The SQL standard defines OVERLAY() in terms of substring and concatenation.
1098
 * This code is a direct implementation of what the standard says.
1099
 */
1100
Datum
1101
textoverlay(PG_FUNCTION_ARGS)
1102
0
{
1103
0
  text     *t1 = PG_GETARG_TEXT_PP(0);
1104
0
  text     *t2 = PG_GETARG_TEXT_PP(1);
1105
0
  int     sp = PG_GETARG_INT32(2);  /* substring start position */
1106
0
  int     sl = PG_GETARG_INT32(3);  /* substring length */
1107
1108
0
  PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1109
0
}
1110
1111
Datum
1112
textoverlay_no_len(PG_FUNCTION_ARGS)
1113
0
{
1114
0
  text     *t1 = PG_GETARG_TEXT_PP(0);
1115
0
  text     *t2 = PG_GETARG_TEXT_PP(1);
1116
0
  int     sp = PG_GETARG_INT32(2);  /* substring start position */
1117
0
  int     sl;
1118
1119
0
  sl = text_length(PointerGetDatum(t2));  /* defaults to length(t2) */
1120
0
  PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1121
0
}
1122
1123
/*
 * text_overlay -
 *  Does the real work for textoverlay() and textoverlay_no_len().
 *
 *  Builds the result as: the part of t1 before position sp, then t2, then
 *  the part of t1 starting at position sp + sl.
 */
static text *
text_overlay(text *t1, text *t2, int sp, int sl)
{
  Datum   source = PointerGetDatum(t1);
  int     tail_start;   /* sp + sl: where the kept tail of t1 begins */
  text     *head;
  text     *tail;

  /*
   * Check for possible integer-overflow cases.  For a start position that
   * is not positive, throw a "substring length" error because that's what
   * should be expected according to the spec's definition of OVERLAY().
   */
  if (sp <= 0)
    ereport(ERROR,
        (errcode(ERRCODE_SUBSTRING_ERROR),
         errmsg("negative substring length not allowed")));
  if (pg_add_s32_overflow(sp, sl, &tail_start))
    ereport(ERROR,
        (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
         errmsg("integer out of range")));

  head = text_substring(source, 1, sp - 1, false);
  tail = text_substring(source, tail_start, -1, true);

  return text_catenate(text_catenate(head, t2), tail);
}
1152
1153
/*
1154
 * textpos -
1155
 *    Return the position of the specified substring.
1156
 *    Implements the SQL POSITION() function.
1157
 *    Ref: A Guide To The SQL Standard, Date & Darwen, 1997
1158
 * - thomas 1997-07-27
1159
 */
1160
Datum
1161
textpos(PG_FUNCTION_ARGS)
1162
0
{
1163
0
  text     *str = PG_GETARG_TEXT_PP(0);
1164
0
  text     *search_str = PG_GETARG_TEXT_PP(1);
1165
1166
0
  PG_RETURN_INT32((int32) text_position(str, search_str, PG_GET_COLLATION()));
1167
0
}
1168
1169
/*
1170
 * text_position -
1171
 *  Does the real work for textpos()
1172
 *
1173
 * Inputs:
1174
 *    t1 - string to be searched
1175
 *    t2 - pattern to match within t1
1176
 * Result:
1177
 *    Character index of the first matched char, starting from 1,
1178
 *    or 0 if no match.
1179
 *
1180
 *  This is broken out so it can be called directly by other string processing
1181
 *  functions.
1182
 */
1183
static int
1184
text_position(text *t1, text *t2, Oid collid)
1185
0
{
1186
0
  TextPositionState state;
1187
0
  int     result;
1188
1189
0
  check_collation_set(collid);
1190
1191
  /* Empty needle always matches at position 1 */
1192
0
  if (VARSIZE_ANY_EXHDR(t2) < 1)
1193
0
    return 1;
1194
1195
  /* Otherwise, can't match if haystack is shorter than needle */
1196
0
  if (VARSIZE_ANY_EXHDR(t1) < VARSIZE_ANY_EXHDR(t2) &&
1197
0
    pg_newlocale_from_collation(collid)->deterministic)
1198
0
    return 0;
1199
1200
0
  text_position_setup(t1, t2, collid, &state);
1201
  /* don't need greedy mode here */
1202
0
  state.greedy = false;
1203
1204
0
  if (!text_position_next(&state))
1205
0
    result = 0;
1206
0
  else
1207
0
    result = text_position_get_match_pos(&state);
1208
0
  text_position_cleanup(&state);
1209
0
  return result;
1210
0
}
1211
1212
1213
/*
1214
 * text_position_setup, text_position_next, text_position_cleanup -
1215
 *  Component steps of text_position()
1216
 *
1217
 * These are broken out so that a string can be efficiently searched for
1218
 * multiple occurrences of the same pattern.  text_position_next may be
1219
 * called multiple times, and it advances to the next match on each call.
1220
 * text_position_get_match_ptr() and text_position_get_match_pos() return
1221
 * a pointer or 1-based character position of the last match, respectively.
1222
 *
1223
 * The "state" variable is normally just a local variable in the caller.
1224
 *
1225
 * NOTE: text_position_next skips over the matched portion.  For example,
1226
 * searching for "xx" in "xxx" returns only one match, not two.
1227
 */
1228
1229
static void
1230
text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state)
1231
0
{
1232
0
  int     len1 = VARSIZE_ANY_EXHDR(t1);
1233
0
  int     len2 = VARSIZE_ANY_EXHDR(t2);
1234
1235
0
  check_collation_set(collid);
1236
1237
0
  state->locale = pg_newlocale_from_collation(collid);
1238
1239
  /*
1240
   * Most callers need greedy mode, but some might want to unset this to
1241
   * optimize.
1242
   */
1243
0
  state->greedy = true;
1244
1245
0
  Assert(len2 > 0);
1246
1247
  /*
1248
   * Even with a multi-byte encoding, we perform the search using the raw
1249
   * byte sequence, ignoring multibyte issues.  For UTF-8, that works fine,
1250
   * because in UTF-8 the byte sequence of one character cannot contain
1251
   * another character.  For other multi-byte encodings, we do the search
1252
   * initially as a simple byte search, ignoring multibyte issues, but
1253
   * verify afterwards that the match we found is at a character boundary,
1254
   * and continue the search if it was a false match.
1255
   */
1256
0
  if (pg_database_encoding_max_length() == 1)
1257
0
    state->is_multibyte_char_in_char = false;
1258
0
  else if (GetDatabaseEncoding() == PG_UTF8)
1259
0
    state->is_multibyte_char_in_char = false;
1260
0
  else
1261
0
    state->is_multibyte_char_in_char = true;
1262
1263
0
  state->str1 = VARDATA_ANY(t1);
1264
0
  state->str2 = VARDATA_ANY(t2);
1265
0
  state->len1 = len1;
1266
0
  state->len2 = len2;
1267
0
  state->last_match = NULL;
1268
0
  state->refpoint = state->str1;
1269
0
  state->refpos = 0;
1270
1271
  /*
1272
   * Prepare the skip table for Boyer-Moore-Horspool searching.  In these
1273
   * notes we use the terminology that the "haystack" is the string to be
1274
   * searched (t1) and the "needle" is the pattern being sought (t2).
1275
   *
1276
   * If the needle is empty or bigger than the haystack then there is no
1277
   * point in wasting cycles initializing the table.  We also choose not to
1278
   * use B-M-H for needles of length 1, since the skip table can't possibly
1279
   * save anything in that case.
1280
   *
1281
   * (With nondeterministic collations, the search is already
1282
   * multibyte-aware, so we don't need this.)
1283
   */
1284
0
  if (len1 >= len2 && len2 > 1 && state->locale->deterministic)
1285
0
  {
1286
0
    int     searchlength = len1 - len2;
1287
0
    int     skiptablemask;
1288
0
    int     last;
1289
0
    int     i;
1290
0
    const char *str2 = state->str2;
1291
1292
    /*
1293
     * First we must determine how much of the skip table to use.  The
1294
     * declaration of TextPositionState allows up to 256 elements, but for
1295
     * short search problems we don't really want to have to initialize so
1296
     * many elements --- it would take too long in comparison to the
1297
     * actual search time.  So we choose a useful skip table size based on
1298
     * the haystack length minus the needle length.  The closer the needle
1299
     * length is to the haystack length the less useful skipping becomes.
1300
     *
1301
     * Note: since we use bit-masking to select table elements, the skip
1302
     * table size MUST be a power of 2, and so the mask must be 2^N-1.
1303
     */
1304
0
    if (searchlength < 16)
1305
0
      skiptablemask = 3;
1306
0
    else if (searchlength < 64)
1307
0
      skiptablemask = 7;
1308
0
    else if (searchlength < 128)
1309
0
      skiptablemask = 15;
1310
0
    else if (searchlength < 512)
1311
0
      skiptablemask = 31;
1312
0
    else if (searchlength < 2048)
1313
0
      skiptablemask = 63;
1314
0
    else if (searchlength < 4096)
1315
0
      skiptablemask = 127;
1316
0
    else
1317
0
      skiptablemask = 255;
1318
0
    state->skiptablemask = skiptablemask;
1319
1320
    /*
1321
     * Initialize the skip table.  We set all elements to the needle
1322
     * length, since this is the correct skip distance for any character
1323
     * not found in the needle.
1324
     */
1325
0
    for (i = 0; i <= skiptablemask; i++)
1326
0
      state->skiptable[i] = len2;
1327
1328
    /*
1329
     * Now examine the needle.  For each character except the last one,
1330
     * set the corresponding table element to the appropriate skip
1331
     * distance.  Note that when two characters share the same skip table
1332
     * entry, the one later in the needle must determine the skip
1333
     * distance.
1334
     */
1335
0
    last = len2 - 1;
1336
1337
0
    for (i = 0; i < last; i++)
1338
0
      state->skiptable[(unsigned char) str2[i] & skiptablemask] = last - i;
1339
0
  }
1340
0
}
1341
1342
/*
1343
 * Advance to the next match, starting from the end of the previous match
1344
 * (or the beginning of the string, on first call).  Returns true if a match
1345
 * is found.
1346
 *
1347
 * Note that this refuses to match an empty-string needle.  Most callers
1348
 * will have handled that case specially and we'll never see it here.
1349
 */
1350
static bool
1351
text_position_next(TextPositionState *state)
1352
0
{
1353
0
  int     needle_len = state->len2;
1354
0
  char     *start_ptr;
1355
0
  char     *matchptr;
1356
1357
0
  if (needle_len <= 0)
1358
0
    return false;     /* result for empty pattern */
1359
1360
  /* Start from the point right after the previous match. */
1361
0
  if (state->last_match)
1362
0
    start_ptr = state->last_match + state->last_match_len;
1363
0
  else
1364
0
    start_ptr = state->str1;
1365
1366
0
retry:
1367
0
  matchptr = text_position_next_internal(start_ptr, state);
1368
1369
0
  if (!matchptr)
1370
0
    return false;
1371
1372
  /*
1373
   * Found a match for the byte sequence.  If this is a multibyte encoding,
1374
   * where one character's byte sequence can appear inside a longer
1375
   * multi-byte character, we need to verify that the match was at a
1376
   * character boundary, not in the middle of a multi-byte character.
1377
   */
1378
0
  if (state->is_multibyte_char_in_char && state->locale->deterministic)
1379
0
  {
1380
    /* Walk one character at a time, until we reach the match. */
1381
1382
    /* the search should never move backwards. */
1383
0
    Assert(state->refpoint <= matchptr);
1384
1385
0
    while (state->refpoint < matchptr)
1386
0
    {
1387
      /* step to next character. */
1388
0
      state->refpoint += pg_mblen(state->refpoint);
1389
0
      state->refpos++;
1390
1391
      /*
1392
       * If we stepped over the match's start position, then it was a
1393
       * false positive, where the byte sequence appeared in the middle
1394
       * of a multi-byte character.  Skip it, and continue the search at
1395
       * the next character boundary.
1396
       */
1397
0
      if (state->refpoint > matchptr)
1398
0
      {
1399
0
        start_ptr = state->refpoint;
1400
0
        goto retry;
1401
0
      }
1402
0
    }
1403
0
  }
1404
1405
0
  state->last_match = matchptr;
1406
0
  state->last_match_len = state->last_match_len_tmp;
1407
0
  return true;
1408
0
}
1409
1410
/*
1411
 * Subroutine of text_position_next().  This searches for the raw byte
1412
 * sequence, ignoring any multi-byte encoding issues.  Returns the first
1413
 * match starting at 'start_ptr', or NULL if no match is found.
1414
 */
1415
static char *
1416
text_position_next_internal(char *start_ptr, TextPositionState *state)
1417
0
{
1418
0
  int     haystack_len = state->len1;
1419
0
  int     needle_len = state->len2;
1420
0
  int     skiptablemask = state->skiptablemask;
1421
0
  const char *haystack = state->str1;
1422
0
  const char *needle = state->str2;
1423
0
  const char *haystack_end = &haystack[haystack_len];
1424
0
  const char *hptr;
1425
1426
0
  Assert(start_ptr >= haystack && start_ptr <= haystack_end);
1427
1428
0
  state->last_match_len_tmp = needle_len;
1429
1430
0
  if (!state->locale->deterministic)
1431
0
  {
1432
    /*
1433
     * With a nondeterministic collation, we have to use an unoptimized
1434
     * route.  We walk through the haystack and see if at each position
1435
     * there is a substring of the remaining string that is equal to the
1436
     * needle under the given collation.
1437
     *
1438
     * Note, the found substring could have a different length than the
1439
     * needle, including being empty.  Callers that want to skip over the
1440
     * found string need to read the length of the found substring from
1441
     * last_match_len rather than just using the length of their needle.
1442
     *
1443
     * Most callers will require "greedy" semantics, meaning that we need
1444
     * to find the longest such substring, not the shortest.  For callers
1445
     * that don't need greedy semantics, we can finish on the first match.
1446
     */
1447
0
    const char *result_hptr = NULL;
1448
1449
0
    hptr = start_ptr;
1450
0
    while (hptr < haystack_end)
1451
0
    {
1452
      /*
1453
       * First check the common case that there is a match in the
1454
       * haystack of exactly the length of the needle.
1455
       */
1456
0
      if (!state->greedy &&
1457
0
        haystack_end - hptr >= needle_len &&
1458
0
        pg_strncoll(hptr, needle_len, needle, needle_len, state->locale) == 0)
1459
0
        return (char *) hptr;
1460
1461
      /*
1462
       * Else check if any of the possible substrings starting at hptr
1463
       * are equal to the needle.
1464
       */
1465
0
      for (const char *test_end = hptr; test_end < haystack_end; test_end += pg_mblen(test_end))
1466
0
      {
1467
0
        if (pg_strncoll(hptr, (test_end - hptr), needle, needle_len, state->locale) == 0)
1468
0
        {
1469
0
          state->last_match_len_tmp = (test_end - hptr);
1470
0
          result_hptr = hptr;
1471
0
          if (!state->greedy)
1472
0
            break;
1473
0
        }
1474
0
      }
1475
0
      if (result_hptr)
1476
0
        break;
1477
1478
0
      hptr += pg_mblen(hptr);
1479
0
    }
1480
1481
0
    return (char *) result_hptr;
1482
0
  }
1483
0
  else if (needle_len == 1)
1484
0
  {
1485
    /* No point in using B-M-H for a one-character needle */
1486
0
    char    nchar = *needle;
1487
1488
0
    hptr = start_ptr;
1489
0
    while (hptr < haystack_end)
1490
0
    {
1491
0
      if (*hptr == nchar)
1492
0
        return (char *) hptr;
1493
0
      hptr++;
1494
0
    }
1495
0
  }
1496
0
  else
1497
0
  {
1498
0
    const char *needle_last = &needle[needle_len - 1];
1499
1500
    /* Start at startpos plus the length of the needle */
1501
0
    hptr = start_ptr + needle_len - 1;
1502
0
    while (hptr < haystack_end)
1503
0
    {
1504
      /* Match the needle scanning *backward* */
1505
0
      const char *nptr;
1506
0
      const char *p;
1507
1508
0
      nptr = needle_last;
1509
0
      p = hptr;
1510
0
      while (*nptr == *p)
1511
0
      {
1512
        /* Matched it all?  If so, return 1-based position */
1513
0
        if (nptr == needle)
1514
0
          return (char *) p;
1515
0
        nptr--, p--;
1516
0
      }
1517
1518
      /*
1519
       * No match, so use the haystack char at hptr to decide how far to
1520
       * advance.  If the needle had any occurrence of that character
1521
       * (or more precisely, one sharing the same skiptable entry)
1522
       * before its last character, then we advance far enough to align
1523
       * the last such needle character with that haystack position.
1524
       * Otherwise we can advance by the whole needle length.
1525
       */
1526
0
      hptr += state->skiptable[(unsigned char) *hptr & skiptablemask];
1527
0
    }
1528
0
  }
1529
1530
0
  return 0;         /* not found */
1531
0
}
1532
1533
/*
1534
 * Return a pointer to the current match.
1535
 *
1536
 * The returned pointer points into the original haystack string.
1537
 */
1538
static char *
1539
text_position_get_match_ptr(TextPositionState *state)
1540
0
{
1541
0
  return state->last_match;
1542
0
}
1543
1544
/*
1545
 * Return the offset of the current match.
1546
 *
1547
 * The offset is in characters, 1-based.
1548
 */
1549
static int
1550
text_position_get_match_pos(TextPositionState *state)
1551
0
{
1552
  /* Convert the byte position to char position. */
1553
0
  state->refpos += pg_mbstrlen_with_len(state->refpoint,
1554
0
                      state->last_match - state->refpoint);
1555
0
  state->refpoint = state->last_match;
1556
0
  return state->refpos + 1;
1557
0
}
1558
1559
/*
1560
 * Reset search state to the initial state installed by text_position_setup.
1561
 *
1562
 * The next call to text_position_next will search from the beginning
1563
 * of the string.
1564
 */
1565
static void
1566
text_position_reset(TextPositionState *state)
1567
0
{
1568
0
  state->last_match = NULL;
1569
0
  state->refpoint = state->str1;
1570
0
  state->refpos = 0;
1571
0
}
1572
1573
static void
1574
text_position_cleanup(TextPositionState *state)
1575
0
{
1576
  /* no cleanup needed */
1577
0
}
1578
1579
1580
static void
1581
check_collation_set(Oid collid)
1582
0
{
1583
0
  if (!OidIsValid(collid))
1584
0
  {
1585
    /*
1586
     * This typically means that the parser could not resolve a conflict
1587
     * of implicit collations, so report it that way.
1588
     */
1589
0
    ereport(ERROR,
1590
0
        (errcode(ERRCODE_INDETERMINATE_COLLATION),
1591
0
         errmsg("could not determine which collation to use for string comparison"),
1592
0
         errhint("Use the COLLATE clause to set the collation explicitly.")));
1593
0
  }
1594
0
}
1595
1596
/*
1597
 * varstr_cmp()
1598
 *
1599
 * Comparison function for text strings with given lengths, using the
1600
 * appropriate locale. Returns an integer less than, equal to, or greater than
1601
 * zero, indicating whether arg1 is less than, equal to, or greater than arg2.
1602
 *
1603
 * Note: many functions that depend on this are marked leakproof; therefore,
1604
 * avoid reporting the actual contents of the input when throwing errors.
1605
 * All errors herein should be things that can't happen except on corrupt
1606
 * data, anyway; otherwise we will have trouble with indexing strings that
1607
 * would cause them.
1608
 */
1609
int
1610
varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid)
1611
0
{
1612
0
  int     result;
1613
0
  pg_locale_t mylocale;
1614
1615
0
  check_collation_set(collid);
1616
1617
0
  mylocale = pg_newlocale_from_collation(collid);
1618
1619
0
  if (mylocale->collate_is_c)
1620
0
  {
1621
0
    result = memcmp(arg1, arg2, Min(len1, len2));
1622
0
    if ((result == 0) && (len1 != len2))
1623
0
      result = (len1 < len2) ? -1 : 1;
1624
0
  }
1625
0
  else
1626
0
  {
1627
    /*
1628
     * memcmp() can't tell us which of two unequal strings sorts first,
1629
     * but it's a cheap way to tell if they're equal.  Testing shows that
1630
     * memcmp() followed by strcoll() is only trivially slower than
1631
     * strcoll() by itself, so we don't lose much if this doesn't work out
1632
     * very often, and if it does - for example, because there are many
1633
     * equal strings in the input - then we win big by avoiding expensive
1634
     * collation-aware comparisons.
1635
     */
1636
0
    if (len1 == len2 && memcmp(arg1, arg2, len1) == 0)
1637
0
      return 0;
1638
1639
0
    result = pg_strncoll(arg1, len1, arg2, len2, mylocale);
1640
1641
    /* Break tie if necessary. */
1642
0
    if (result == 0 && mylocale->deterministic)
1643
0
    {
1644
0
      result = memcmp(arg1, arg2, Min(len1, len2));
1645
0
      if ((result == 0) && (len1 != len2))
1646
0
        result = (len1 < len2) ? -1 : 1;
1647
0
    }
1648
0
  }
1649
1650
0
  return result;
1651
0
}
1652
1653
/* text_cmp()
1654
 * Internal comparison function for text strings.
1655
 * Returns -1, 0 or 1
1656
 */
1657
static int
1658
text_cmp(text *arg1, text *arg2, Oid collid)
1659
0
{
1660
0
  char     *a1p,
1661
0
         *a2p;
1662
0
  int     len1,
1663
0
        len2;
1664
1665
0
  a1p = VARDATA_ANY(arg1);
1666
0
  a2p = VARDATA_ANY(arg2);
1667
1668
0
  len1 = VARSIZE_ANY_EXHDR(arg1);
1669
0
  len2 = VARSIZE_ANY_EXHDR(arg2);
1670
1671
0
  return varstr_cmp(a1p, len1, a2p, len2, collid);
1672
0
}
1673
1674
/*
1675
 * Comparison functions for text strings.
1676
 *
1677
 * Note: btree indexes need these routines not to leak memory; therefore,
1678
 * be careful to free working copies of toasted datums.  Most places don't
1679
 * need to be so careful.
1680
 */
1681
1682
Datum
1683
texteq(PG_FUNCTION_ARGS)
1684
0
{
1685
0
  Oid     collid = PG_GET_COLLATION();
1686
0
  pg_locale_t mylocale = 0;
1687
0
  bool    result;
1688
1689
0
  check_collation_set(collid);
1690
1691
0
  mylocale = pg_newlocale_from_collation(collid);
1692
1693
0
  if (mylocale->deterministic)
1694
0
  {
1695
0
    Datum   arg1 = PG_GETARG_DATUM(0);
1696
0
    Datum   arg2 = PG_GETARG_DATUM(1);
1697
0
    Size    len1,
1698
0
          len2;
1699
1700
    /*
1701
     * Since we only care about equality or not-equality, we can avoid all
1702
     * the expense of strcoll() here, and just do bitwise comparison.  In
1703
     * fact, we don't even have to do a bitwise comparison if we can show
1704
     * the lengths of the strings are unequal; which might save us from
1705
     * having to detoast one or both values.
1706
     */
1707
0
    len1 = toast_raw_datum_size(arg1);
1708
0
    len2 = toast_raw_datum_size(arg2);
1709
0
    if (len1 != len2)
1710
0
      result = false;
1711
0
    else
1712
0
    {
1713
0
      text     *targ1 = DatumGetTextPP(arg1);
1714
0
      text     *targ2 = DatumGetTextPP(arg2);
1715
1716
0
      result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1717
0
               len1 - VARHDRSZ) == 0);
1718
1719
0
      PG_FREE_IF_COPY(targ1, 0);
1720
0
      PG_FREE_IF_COPY(targ2, 1);
1721
0
    }
1722
0
  }
1723
0
  else
1724
0
  {
1725
0
    text     *arg1 = PG_GETARG_TEXT_PP(0);
1726
0
    text     *arg2 = PG_GETARG_TEXT_PP(1);
1727
1728
0
    result = (text_cmp(arg1, arg2, collid) == 0);
1729
1730
0
    PG_FREE_IF_COPY(arg1, 0);
1731
0
    PG_FREE_IF_COPY(arg2, 1);
1732
0
  }
1733
1734
0
  PG_RETURN_BOOL(result);
1735
0
}
1736
1737
Datum
1738
textne(PG_FUNCTION_ARGS)
1739
0
{
1740
0
  Oid     collid = PG_GET_COLLATION();
1741
0
  pg_locale_t mylocale;
1742
0
  bool    result;
1743
1744
0
  check_collation_set(collid);
1745
1746
0
  mylocale = pg_newlocale_from_collation(collid);
1747
1748
0
  if (mylocale->deterministic)
1749
0
  {
1750
0
    Datum   arg1 = PG_GETARG_DATUM(0);
1751
0
    Datum   arg2 = PG_GETARG_DATUM(1);
1752
0
    Size    len1,
1753
0
          len2;
1754
1755
    /* See comment in texteq() */
1756
0
    len1 = toast_raw_datum_size(arg1);
1757
0
    len2 = toast_raw_datum_size(arg2);
1758
0
    if (len1 != len2)
1759
0
      result = true;
1760
0
    else
1761
0
    {
1762
0
      text     *targ1 = DatumGetTextPP(arg1);
1763
0
      text     *targ2 = DatumGetTextPP(arg2);
1764
1765
0
      result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1766
0
               len1 - VARHDRSZ) != 0);
1767
1768
0
      PG_FREE_IF_COPY(targ1, 0);
1769
0
      PG_FREE_IF_COPY(targ2, 1);
1770
0
    }
1771
0
  }
1772
0
  else
1773
0
  {
1774
0
    text     *arg1 = PG_GETARG_TEXT_PP(0);
1775
0
    text     *arg2 = PG_GETARG_TEXT_PP(1);
1776
1777
0
    result = (text_cmp(arg1, arg2, collid) != 0);
1778
1779
0
    PG_FREE_IF_COPY(arg1, 0);
1780
0
    PG_FREE_IF_COPY(arg2, 1);
1781
0
  }
1782
1783
0
  PG_RETURN_BOOL(result);
1784
0
}
1785
1786
Datum
1787
text_lt(PG_FUNCTION_ARGS)
1788
0
{
1789
0
  text     *arg1 = PG_GETARG_TEXT_PP(0);
1790
0
  text     *arg2 = PG_GETARG_TEXT_PP(1);
1791
0
  bool    result;
1792
1793
0
  result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0);
1794
1795
0
  PG_FREE_IF_COPY(arg1, 0);
1796
0
  PG_FREE_IF_COPY(arg2, 1);
1797
1798
0
  PG_RETURN_BOOL(result);
1799
0
}
1800
1801
Datum
1802
text_le(PG_FUNCTION_ARGS)
1803
0
{
1804
0
  text     *arg1 = PG_GETARG_TEXT_PP(0);
1805
0
  text     *arg2 = PG_GETARG_TEXT_PP(1);
1806
0
  bool    result;
1807
1808
0
  result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) <= 0);
1809
1810
0
  PG_FREE_IF_COPY(arg1, 0);
1811
0
  PG_FREE_IF_COPY(arg2, 1);
1812
1813
0
  PG_RETURN_BOOL(result);
1814
0
}
1815
1816
Datum
1817
text_gt(PG_FUNCTION_ARGS)
1818
0
{
1819
0
  text     *arg1 = PG_GETARG_TEXT_PP(0);
1820
0
  text     *arg2 = PG_GETARG_TEXT_PP(1);
1821
0
  bool    result;
1822
1823
0
  result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0);
1824
1825
0
  PG_FREE_IF_COPY(arg1, 0);
1826
0
  PG_FREE_IF_COPY(arg2, 1);
1827
1828
0
  PG_RETURN_BOOL(result);
1829
0
}
1830
1831
Datum
1832
text_ge(PG_FUNCTION_ARGS)
1833
0
{
1834
0
  text     *arg1 = PG_GETARG_TEXT_PP(0);
1835
0
  text     *arg2 = PG_GETARG_TEXT_PP(1);
1836
0
  bool    result;
1837
1838
0
  result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) >= 0);
1839
1840
0
  PG_FREE_IF_COPY(arg1, 0);
1841
0
  PG_FREE_IF_COPY(arg2, 1);
1842
1843
0
  PG_RETURN_BOOL(result);
1844
0
}
1845
1846
Datum
1847
text_starts_with(PG_FUNCTION_ARGS)
1848
0
{
1849
0
  Datum   arg1 = PG_GETARG_DATUM(0);
1850
0
  Datum   arg2 = PG_GETARG_DATUM(1);
1851
0
  Oid     collid = PG_GET_COLLATION();
1852
0
  pg_locale_t mylocale;
1853
0
  bool    result;
1854
0
  Size    len1,
1855
0
        len2;
1856
1857
0
  check_collation_set(collid);
1858
1859
0
  mylocale = pg_newlocale_from_collation(collid);
1860
1861
0
  if (!mylocale->deterministic)
1862
0
    ereport(ERROR,
1863
0
        (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1864
0
         errmsg("nondeterministic collations are not supported for substring searches")));
1865
1866
0
  len1 = toast_raw_datum_size(arg1);
1867
0
  len2 = toast_raw_datum_size(arg2);
1868
0
  if (len2 > len1)
1869
0
    result = false;
1870
0
  else
1871
0
  {
1872
0
    text     *targ1 = text_substring(arg1, 1, len2, false);
1873
0
    text     *targ2 = DatumGetTextPP(arg2);
1874
1875
0
    result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1876
0
             VARSIZE_ANY_EXHDR(targ2)) == 0);
1877
1878
0
    PG_FREE_IF_COPY(targ1, 0);
1879
0
    PG_FREE_IF_COPY(targ2, 1);
1880
0
  }
1881
1882
0
  PG_RETURN_BOOL(result);
1883
0
}
1884
1885
Datum
1886
bttextcmp(PG_FUNCTION_ARGS)
1887
0
{
1888
0
  text     *arg1 = PG_GETARG_TEXT_PP(0);
1889
0
  text     *arg2 = PG_GETARG_TEXT_PP(1);
1890
0
  int32   result;
1891
1892
0
  result = text_cmp(arg1, arg2, PG_GET_COLLATION());
1893
1894
0
  PG_FREE_IF_COPY(arg1, 0);
1895
0
  PG_FREE_IF_COPY(arg2, 1);
1896
1897
0
  PG_RETURN_INT32(result);
1898
0
}
1899
1900
Datum
1901
bttextsortsupport(PG_FUNCTION_ARGS)
1902
0
{
1903
0
  SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
1904
0
  Oid     collid = ssup->ssup_collation;
1905
0
  MemoryContext oldcontext;
1906
1907
0
  oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
1908
1909
  /* Use generic string SortSupport */
1910
0
  varstr_sortsupport(ssup, TEXTOID, collid);
1911
1912
0
  MemoryContextSwitchTo(oldcontext);
1913
1914
0
  PG_RETURN_VOID();
1915
0
}
1916
1917
/*
1918
 * Generic sortsupport interface for character type's operator classes.
1919
 * Includes locale support, and support for BpChar semantics (i.e. removing
1920
 * trailing spaces before comparison).
1921
 *
1922
 * Relies on the assumption that text, VarChar, BpChar, and bytea all have the
1923
 * same representation.  Callers that always use the C collation (e.g.
1924
 * non-collatable type callers like bytea) may have NUL bytes in their strings;
1925
 * this will not work with any other collation, though.
1926
 */
1927
void
1928
varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid)
1929
0
{
1930
0
  bool    abbreviate = ssup->abbreviate;
1931
0
  bool    collate_c = false;
1932
0
  VarStringSortSupport *sss;
1933
0
  pg_locale_t locale;
1934
1935
0
  check_collation_set(collid);
1936
1937
0
  locale = pg_newlocale_from_collation(collid);
1938
1939
  /*
1940
   * If possible, set ssup->comparator to a function which can be used to
1941
   * directly compare two datums.  If we can do this, we'll avoid the
1942
   * overhead of a trip through the fmgr layer for every comparison, which
1943
   * can be substantial.
1944
   *
1945
   * Most typically, we'll set the comparator to varlenafastcmp_locale,
1946
   * which uses strcoll() to perform comparisons.  We use that for the
1947
   * BpChar case too, but type NAME uses namefastcmp_locale. However, if
1948
   * LC_COLLATE = C, we can make things quite a bit faster with
1949
   * varstrfastcmp_c, bpcharfastcmp_c, or namefastcmp_c, all of which use
1950
   * memcmp() rather than strcoll().
1951
   */
1952
0
  if (locale->collate_is_c)
1953
0
  {
1954
0
    if (typid == BPCHAROID)
1955
0
      ssup->comparator = bpcharfastcmp_c;
1956
0
    else if (typid == NAMEOID)
1957
0
    {
1958
0
      ssup->comparator = namefastcmp_c;
1959
      /* Not supporting abbreviation with type NAME, for now */
1960
0
      abbreviate = false;
1961
0
    }
1962
0
    else
1963
0
      ssup->comparator = varstrfastcmp_c;
1964
1965
0
    collate_c = true;
1966
0
  }
1967
0
  else
1968
0
  {
1969
    /*
1970
     * We use varlenafastcmp_locale except for type NAME.
1971
     */
1972
0
    if (typid == NAMEOID)
1973
0
    {
1974
0
      ssup->comparator = namefastcmp_locale;
1975
      /* Not supporting abbreviation with type NAME, for now */
1976
0
      abbreviate = false;
1977
0
    }
1978
0
    else
1979
0
      ssup->comparator = varlenafastcmp_locale;
1980
1981
    /*
1982
     * Unfortunately, it seems that abbreviation for non-C collations is
1983
     * broken on many common platforms; see pg_strxfrm_enabled().
1984
     *
1985
     * Even apart from the risk of broken locales, it's possible that
1986
     * there are platforms where the use of abbreviated keys should be
1987
     * disabled at compile time.  Having only 4 byte datums could make
1988
     * worst-case performance drastically more likely, for example.
1989
     * Moreover, macOS's strxfrm() implementation is known to not
1990
     * effectively concentrate a significant amount of entropy from the
1991
     * original string in earlier transformed blobs.  It's possible that
1992
     * other supported platforms are similarly encumbered.  So, if we ever
1993
     * get past disabling this categorically, we may still want or need to
1994
     * disable it for particular platforms.
1995
     */
1996
0
    if (!pg_strxfrm_enabled(locale))
1997
0
      abbreviate = false;
1998
0
  }
1999
2000
  /*
2001
   * If we're using abbreviated keys, or if we're using a locale-aware
2002
   * comparison, we need to initialize a VarStringSortSupport object. Both
2003
   * cases will make use of the temporary buffers we initialize here for
2004
   * scratch space (and to detect requirement for BpChar semantics from
2005
   * caller), and the abbreviation case requires additional state.
2006
   */
2007
0
  if (abbreviate || !collate_c)
2008
0
  {
2009
0
    sss = palloc(sizeof(VarStringSortSupport));
2010
0
    sss->buf1 = palloc(TEXTBUFLEN);
2011
0
    sss->buflen1 = TEXTBUFLEN;
2012
0
    sss->buf2 = palloc(TEXTBUFLEN);
2013
0
    sss->buflen2 = TEXTBUFLEN;
2014
    /* Start with invalid values */
2015
0
    sss->last_len1 = -1;
2016
0
    sss->last_len2 = -1;
2017
    /* Initialize */
2018
0
    sss->last_returned = 0;
2019
0
    if (collate_c)
2020
0
      sss->locale = NULL;
2021
0
    else
2022
0
      sss->locale = locale;
2023
2024
    /*
2025
     * To avoid somehow confusing a strxfrm() blob and an original string,
2026
     * constantly keep track of the variety of data that buf1 and buf2
2027
     * currently contain.
2028
     *
2029
     * Comparisons may be interleaved with conversion calls.  Frequently,
2030
     * conversions and comparisons are batched into two distinct phases,
2031
     * but the correctness of caching cannot hinge upon this.  For
2032
     * comparison caching, buffer state is only trusted if cache_blob is
2033
     * found set to false, whereas strxfrm() caching only trusts the state
2034
     * when cache_blob is found set to true.
2035
     *
2036
     * Arbitrarily initialize cache_blob to true.
2037
     */
2038
0
    sss->cache_blob = true;
2039
0
    sss->collate_c = collate_c;
2040
0
    sss->typid = typid;
2041
0
    ssup->ssup_extra = sss;
2042
2043
    /*
2044
     * If possible, plan to use the abbreviated keys optimization.  The
2045
     * core code may switch back to authoritative comparator should
2046
     * abbreviation be aborted.
2047
     */
2048
0
    if (abbreviate)
2049
0
    {
2050
0
      sss->prop_card = 0.20;
2051
0
      initHyperLogLog(&sss->abbr_card, 10);
2052
0
      initHyperLogLog(&sss->full_card, 10);
2053
0
      ssup->abbrev_full_comparator = ssup->comparator;
2054
0
      ssup->comparator = ssup_datum_unsigned_cmp;
2055
0
      ssup->abbrev_converter = varstr_abbrev_convert;
2056
0
      ssup->abbrev_abort = varstr_abbrev_abort;
2057
0
    }
2058
0
  }
2059
0
}
2060
2061
/*
2062
 * sortsupport comparison func (for C locale case)
2063
 */
2064
static int
2065
varstrfastcmp_c(Datum x, Datum y, SortSupport ssup)
2066
0
{
2067
0
  VarString  *arg1 = DatumGetVarStringPP(x);
2068
0
  VarString  *arg2 = DatumGetVarStringPP(y);
2069
0
  char     *a1p,
2070
0
         *a2p;
2071
0
  int     len1,
2072
0
        len2,
2073
0
        result;
2074
2075
0
  a1p = VARDATA_ANY(arg1);
2076
0
  a2p = VARDATA_ANY(arg2);
2077
2078
0
  len1 = VARSIZE_ANY_EXHDR(arg1);
2079
0
  len2 = VARSIZE_ANY_EXHDR(arg2);
2080
2081
0
  result = memcmp(a1p, a2p, Min(len1, len2));
2082
0
  if ((result == 0) && (len1 != len2))
2083
0
    result = (len1 < len2) ? -1 : 1;
2084
2085
  /* We can't afford to leak memory here. */
2086
0
  if (PointerGetDatum(arg1) != x)
2087
0
    pfree(arg1);
2088
0
  if (PointerGetDatum(arg2) != y)
2089
0
    pfree(arg2);
2090
2091
0
  return result;
2092
0
}
2093
2094
/*
2095
 * sortsupport comparison func (for BpChar C locale case)
2096
 *
2097
 * BpChar outsources its sortsupport to this module.  Specialization for the
2098
 * varstr_sortsupport BpChar case, modeled on
2099
 * internal_bpchar_pattern_compare().
2100
 */
2101
static int
2102
bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup)
2103
0
{
2104
0
  BpChar     *arg1 = DatumGetBpCharPP(x);
2105
0
  BpChar     *arg2 = DatumGetBpCharPP(y);
2106
0
  char     *a1p,
2107
0
         *a2p;
2108
0
  int     len1,
2109
0
        len2,
2110
0
        result;
2111
2112
0
  a1p = VARDATA_ANY(arg1);
2113
0
  a2p = VARDATA_ANY(arg2);
2114
2115
0
  len1 = bpchartruelen(a1p, VARSIZE_ANY_EXHDR(arg1));
2116
0
  len2 = bpchartruelen(a2p, VARSIZE_ANY_EXHDR(arg2));
2117
2118
0
  result = memcmp(a1p, a2p, Min(len1, len2));
2119
0
  if ((result == 0) && (len1 != len2))
2120
0
    result = (len1 < len2) ? -1 : 1;
2121
2122
  /* We can't afford to leak memory here. */
2123
0
  if (PointerGetDatum(arg1) != x)
2124
0
    pfree(arg1);
2125
0
  if (PointerGetDatum(arg2) != y)
2126
0
    pfree(arg2);
2127
2128
0
  return result;
2129
0
}
2130
2131
/*
2132
 * sortsupport comparison func (for NAME C locale case)
2133
 */
2134
static int
2135
namefastcmp_c(Datum x, Datum y, SortSupport ssup)
2136
0
{
2137
0
  Name    arg1 = DatumGetName(x);
2138
0
  Name    arg2 = DatumGetName(y);
2139
2140
0
  return strncmp(NameStr(*arg1), NameStr(*arg2), NAMEDATALEN);
2141
0
}
2142
2143
/*
2144
 * sortsupport comparison func (for locale case with all varlena types)
2145
 */
2146
static int
2147
varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup)
2148
0
{
2149
0
  VarString  *arg1 = DatumGetVarStringPP(x);
2150
0
  VarString  *arg2 = DatumGetVarStringPP(y);
2151
0
  char     *a1p,
2152
0
         *a2p;
2153
0
  int     len1,
2154
0
        len2,
2155
0
        result;
2156
2157
0
  a1p = VARDATA_ANY(arg1);
2158
0
  a2p = VARDATA_ANY(arg2);
2159
2160
0
  len1 = VARSIZE_ANY_EXHDR(arg1);
2161
0
  len2 = VARSIZE_ANY_EXHDR(arg2);
2162
2163
0
  result = varstrfastcmp_locale(a1p, len1, a2p, len2, ssup);
2164
2165
  /* We can't afford to leak memory here. */
2166
0
  if (PointerGetDatum(arg1) != x)
2167
0
    pfree(arg1);
2168
0
  if (PointerGetDatum(arg2) != y)
2169
0
    pfree(arg2);
2170
2171
0
  return result;
2172
0
}
2173
2174
/*
2175
 * sortsupport comparison func (for locale case with NAME type)
2176
 */
2177
static int
2178
namefastcmp_locale(Datum x, Datum y, SortSupport ssup)
2179
0
{
2180
0
  Name    arg1 = DatumGetName(x);
2181
0
  Name    arg2 = DatumGetName(y);
2182
2183
0
  return varstrfastcmp_locale(NameStr(*arg1), strlen(NameStr(*arg1)),
2184
0
                NameStr(*arg2), strlen(NameStr(*arg2)),
2185
0
                ssup);
2186
0
}
2187
2188
/*
2189
 * sortsupport comparison func for locale cases
2190
 */
2191
static int
2192
varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup)
2193
0
{
2194
0
  VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2195
0
  int     result;
2196
0
  bool    arg1_match;
2197
2198
  /* Fast pre-check for equality, as discussed in varstr_cmp() */
2199
0
  if (len1 == len2 && memcmp(a1p, a2p, len1) == 0)
2200
0
  {
2201
    /*
2202
     * No change in buf1 or buf2 contents, so avoid changing last_len1 or
2203
     * last_len2.  Existing contents of buffers might still be used by
2204
     * next call.
2205
     *
2206
     * It's fine to allow the comparison of BpChar padding bytes here,
2207
     * even though that implies that the memcmp() will usually be
2208
     * performed for BpChar callers (though multibyte characters could
2209
     * still prevent that from occurring).  The memcmp() is still very
2210
     * cheap, and BpChar's funny semantics have us remove trailing spaces
2211
     * (not limited to padding), so we need make no distinction between
2212
     * padding space characters and "real" space characters.
2213
     */
2214
0
    return 0;
2215
0
  }
2216
2217
0
  if (sss->typid == BPCHAROID)
2218
0
  {
2219
    /* Get true number of bytes, ignoring trailing spaces */
2220
0
    len1 = bpchartruelen(a1p, len1);
2221
0
    len2 = bpchartruelen(a2p, len2);
2222
0
  }
2223
2224
0
  if (len1 >= sss->buflen1)
2225
0
  {
2226
0
    sss->buflen1 = Max(len1 + 1, Min(sss->buflen1 * 2, MaxAllocSize));
2227
0
    sss->buf1 = repalloc(sss->buf1, sss->buflen1);
2228
0
  }
2229
0
  if (len2 >= sss->buflen2)
2230
0
  {
2231
0
    sss->buflen2 = Max(len2 + 1, Min(sss->buflen2 * 2, MaxAllocSize));
2232
0
    sss->buf2 = repalloc(sss->buf2, sss->buflen2);
2233
0
  }
2234
2235
  /*
2236
   * We're likely to be asked to compare the same strings repeatedly, and
2237
   * memcmp() is so much cheaper than strcoll() that it pays to try to cache
2238
   * comparisons, even though in general there is no reason to think that
2239
   * that will work out (every string datum may be unique).  Caching does
2240
   * not slow things down measurably when it doesn't work out, and can speed
2241
   * things up by rather a lot when it does.  In part, this is because the
2242
   * memcmp() compares data from cachelines that are needed in L1 cache even
2243
   * when the last comparison's result cannot be reused.
2244
   */
2245
0
  arg1_match = true;
2246
0
  if (len1 != sss->last_len1 || memcmp(sss->buf1, a1p, len1) != 0)
2247
0
  {
2248
0
    arg1_match = false;
2249
0
    memcpy(sss->buf1, a1p, len1);
2250
0
    sss->buf1[len1] = '\0';
2251
0
    sss->last_len1 = len1;
2252
0
  }
2253
2254
  /*
2255
   * If we're comparing the same two strings as last time, we can return the
2256
   * same answer without calling strcoll() again.  This is more likely than
2257
   * it seems (at least with moderate to low cardinality sets), because
2258
   * quicksort compares the same pivot against many values.
2259
   */
2260
0
  if (len2 != sss->last_len2 || memcmp(sss->buf2, a2p, len2) != 0)
2261
0
  {
2262
0
    memcpy(sss->buf2, a2p, len2);
2263
0
    sss->buf2[len2] = '\0';
2264
0
    sss->last_len2 = len2;
2265
0
  }
2266
0
  else if (arg1_match && !sss->cache_blob)
2267
0
  {
2268
    /* Use result cached following last actual strcoll() call */
2269
0
    return sss->last_returned;
2270
0
  }
2271
2272
0
  result = pg_strcoll(sss->buf1, sss->buf2, sss->locale);
2273
2274
  /* Break tie if necessary. */
2275
0
  if (result == 0 && sss->locale->deterministic)
2276
0
    result = strcmp(sss->buf1, sss->buf2);
2277
2278
  /* Cache result, perhaps saving an expensive strcoll() call next time */
2279
0
  sss->cache_blob = false;
2280
0
  sss->last_returned = result;
2281
0
  return result;
2282
0
}
2283
2284
/*
2285
 * Conversion routine for sortsupport.  Converts original to abbreviated key
2286
 * representation.  Our encoding strategy is simple -- pack the first 8 bytes
2287
 * of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are
2288
 * stored in reverse order), and treat it as an unsigned integer.  When the "C"
2289
 * locale is used, or in case of bytea, just memcpy() from original instead.
2290
 */
2291
static Datum
2292
varstr_abbrev_convert(Datum original, SortSupport ssup)
2293
0
{
2294
0
  const size_t max_prefix_bytes = sizeof(Datum);
2295
0
  VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2296
0
  VarString  *authoritative = DatumGetVarStringPP(original);
2297
0
  char     *authoritative_data = VARDATA_ANY(authoritative);
2298
2299
  /* working state */
2300
0
  Datum   res;
2301
0
  char     *pres;
2302
0
  int     len;
2303
0
  uint32    hash;
2304
2305
0
  pres = (char *) &res;
2306
  /* memset(), so any non-overwritten bytes are NUL */
2307
0
  memset(pres, 0, max_prefix_bytes);
2308
0
  len = VARSIZE_ANY_EXHDR(authoritative);
2309
2310
  /* Get number of bytes, ignoring trailing spaces */
2311
0
  if (sss->typid == BPCHAROID)
2312
0
    len = bpchartruelen(authoritative_data, len);
2313
2314
  /*
2315
   * If we're using the C collation, use memcpy(), rather than strxfrm(), to
2316
   * abbreviate keys.  The full comparator for the C locale is always
2317
   * memcmp().  It would be incorrect to allow bytea callers (callers that
2318
   * always force the C collation -- bytea isn't a collatable type, but this
2319
   * approach is convenient) to use strxfrm().  This is because bytea
2320
   * strings may contain NUL bytes.  Besides, this should be faster, too.
2321
   *
2322
   * More generally, it's okay that bytea callers can have NUL bytes in
2323
   * strings because abbreviated cmp need not make a distinction between
2324
   * terminating NUL bytes, and NUL bytes representing actual NULs in the
2325
   * authoritative representation.  Hopefully a comparison at or past one
2326
   * abbreviated key's terminating NUL byte will resolve the comparison
2327
   * without consulting the authoritative representation; specifically, some
2328
   * later non-NUL byte in the longer string can resolve the comparison
2329
   * against a subsequent terminating NUL in the shorter string.  There will
2330
   * usually be what is effectively a "length-wise" resolution there and
2331
   * then.
2332
   *
2333
   * If that doesn't work out -- if all bytes in the longer string
2334
   * positioned at or past the offset of the smaller string's (first)
2335
   * terminating NUL are actually representative of NUL bytes in the
2336
   * authoritative binary string (perhaps with some *terminating* NUL bytes
2337
   * towards the end of the longer string iff it happens to still be small)
2338
   * -- then an authoritative tie-breaker will happen, and do the right
2339
   * thing: explicitly consider string length.
2340
   */
2341
0
  if (sss->collate_c)
2342
0
    memcpy(pres, authoritative_data, Min(len, max_prefix_bytes));
2343
0
  else
2344
0
  {
2345
0
    Size    bsize;
2346
2347
    /*
2348
     * We're not using the C collation, so fall back on strxfrm or ICU
2349
     * analogs.
2350
     */
2351
2352
    /* By convention, we use buffer 1 to store and NUL-terminate */
2353
0
    if (len >= sss->buflen1)
2354
0
    {
2355
0
      sss->buflen1 = Max(len + 1, Min(sss->buflen1 * 2, MaxAllocSize));
2356
0
      sss->buf1 = repalloc(sss->buf1, sss->buflen1);
2357
0
    }
2358
2359
    /* Might be able to reuse strxfrm() blob from last call */
2360
0
    if (sss->last_len1 == len && sss->cache_blob &&
2361
0
      memcmp(sss->buf1, authoritative_data, len) == 0)
2362
0
    {
2363
0
      memcpy(pres, sss->buf2, Min(max_prefix_bytes, sss->last_len2));
2364
      /* No change affecting cardinality, so no hashing required */
2365
0
      goto done;
2366
0
    }
2367
2368
0
    memcpy(sss->buf1, authoritative_data, len);
2369
2370
    /*
2371
     * pg_strxfrm() and pg_strxfrm_prefix expect NUL-terminated strings.
2372
     */
2373
0
    sss->buf1[len] = '\0';
2374
0
    sss->last_len1 = len;
2375
2376
0
    if (pg_strxfrm_prefix_enabled(sss->locale))
2377
0
    {
2378
0
      if (sss->buflen2 < max_prefix_bytes)
2379
0
      {
2380
0
        sss->buflen2 = Max(max_prefix_bytes,
2381
0
                   Min(sss->buflen2 * 2, MaxAllocSize));
2382
0
        sss->buf2 = repalloc(sss->buf2, sss->buflen2);
2383
0
      }
2384
2385
0
      bsize = pg_strxfrm_prefix(sss->buf2, sss->buf1,
2386
0
                    max_prefix_bytes, sss->locale);
2387
0
      sss->last_len2 = bsize;
2388
0
    }
2389
0
    else
2390
0
    {
2391
      /*
2392
       * Loop: Call pg_strxfrm(), possibly enlarge buffer, and try
2393
       * again.  The pg_strxfrm() function leaves the result buffer
2394
       * content undefined if the result did not fit, so we need to
2395
       * retry until everything fits, even though we only need the first
2396
       * few bytes in the end.
2397
       */
2398
0
      for (;;)
2399
0
      {
2400
0
        bsize = pg_strxfrm(sss->buf2, sss->buf1, sss->buflen2,
2401
0
                   sss->locale);
2402
2403
0
        sss->last_len2 = bsize;
2404
0
        if (bsize < sss->buflen2)
2405
0
          break;
2406
2407
        /*
2408
         * Grow buffer and retry.
2409
         */
2410
0
        sss->buflen2 = Max(bsize + 1,
2411
0
                   Min(sss->buflen2 * 2, MaxAllocSize));
2412
0
        sss->buf2 = repalloc(sss->buf2, sss->buflen2);
2413
0
      }
2414
0
    }
2415
2416
    /*
2417
     * Every Datum byte is always compared.  This is safe because the
2418
     * strxfrm() blob is itself NUL terminated, leaving no danger of
2419
     * misinterpreting any NUL bytes not intended to be interpreted as
2420
     * logically representing termination.
2421
     *
2422
     * (Actually, even if there were NUL bytes in the blob it would be
2423
     * okay.  See remarks on bytea case above.)
2424
     */
2425
0
    memcpy(pres, sss->buf2, Min(max_prefix_bytes, bsize));
2426
0
  }
2427
2428
  /*
2429
   * Maintain approximate cardinality of both abbreviated keys and original,
2430
   * authoritative keys using HyperLogLog.  Used as cheap insurance against
2431
   * the worst case, where we do many string transformations for no saving
2432
   * in full strcoll()-based comparisons.  These statistics are used by
2433
   * varstr_abbrev_abort().
2434
   *
2435
   * First, Hash key proper, or a significant fraction of it.  Mix in length
2436
   * in order to compensate for cases where differences are past
2437
   * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing.
2438
   */
2439
0
  hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data,
2440
0
                   Min(len, PG_CACHE_LINE_SIZE)));
2441
2442
0
  if (len > PG_CACHE_LINE_SIZE)
2443
0
    hash ^= DatumGetUInt32(hash_uint32((uint32) len));
2444
2445
0
  addHyperLogLog(&sss->full_card, hash);
2446
2447
  /* Hash abbreviated key */
2448
0
#if SIZEOF_DATUM == 8
2449
0
  {
2450
0
    uint32    lohalf,
2451
0
          hihalf;
2452
2453
0
    lohalf = (uint32) res;
2454
0
    hihalf = (uint32) (res >> 32);
2455
0
    hash = DatumGetUInt32(hash_uint32(lohalf ^ hihalf));
2456
0
  }
2457
#else             /* SIZEOF_DATUM != 8 */
2458
  hash = DatumGetUInt32(hash_uint32((uint32) res));
2459
#endif
2460
2461
0
  addHyperLogLog(&sss->abbr_card, hash);
2462
2463
  /* Cache result, perhaps saving an expensive strxfrm() call next time */
2464
0
  sss->cache_blob = true;
2465
0
done:
2466
2467
  /*
2468
   * Byteswap on little-endian machines.
2469
   *
2470
   * This is needed so that ssup_datum_unsigned_cmp() (an unsigned integer
2471
   * 3-way comparator) works correctly on all platforms.  If we didn't do
2472
   * this, the comparator would have to call memcmp() with a pair of
2473
   * pointers to the first byte of each abbreviated key, which is slower.
2474
   */
2475
0
  res = DatumBigEndianToNative(res);
2476
2477
  /* Don't leak memory here */
2478
0
  if (PointerGetDatum(authoritative) != original)
2479
0
    pfree(authoritative);
2480
2481
0
  return res;
2482
0
}
2483
2484
/*
2485
 * Callback for estimating effectiveness of abbreviated key optimization, using
2486
 * heuristic rules.  Returns value indicating if the abbreviation optimization
2487
 * should be aborted, based on its projected effectiveness.
2488
 */
2489
static bool
2490
varstr_abbrev_abort(int memtupcount, SortSupport ssup)
2491
0
{
2492
0
  VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2493
0
  double    abbrev_distinct,
2494
0
        key_distinct;
2495
2496
0
  Assert(ssup->abbreviate);
2497
2498
  /* Have a little patience */
2499
0
  if (memtupcount < 100)
2500
0
    return false;
2501
2502
0
  abbrev_distinct = estimateHyperLogLog(&sss->abbr_card);
2503
0
  key_distinct = estimateHyperLogLog(&sss->full_card);
2504
2505
  /*
2506
   * Clamp cardinality estimates to at least one distinct value.  While
2507
   * NULLs are generally disregarded, if only NULL values were seen so far,
2508
   * that might misrepresent costs if we failed to clamp.
2509
   */
2510
0
  if (abbrev_distinct <= 1.0)
2511
0
    abbrev_distinct = 1.0;
2512
2513
0
  if (key_distinct <= 1.0)
2514
0
    key_distinct = 1.0;
2515
2516
  /*
2517
   * In the worst case all abbreviated keys are identical, while at the same
2518
   * time there are differences within full key strings not captured in
2519
   * abbreviations.
2520
   */
2521
0
  if (trace_sort)
2522
0
  {
2523
0
    double    norm_abbrev_card = abbrev_distinct / (double) memtupcount;
2524
2525
0
    elog(LOG, "varstr_abbrev: abbrev_distinct after %d: %f "
2526
0
       "(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)",
2527
0
       memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card,
2528
0
       sss->prop_card);
2529
0
  }
2530
2531
  /*
2532
   * If the number of distinct abbreviated keys approximately matches the
2533
   * number of distinct authoritative original keys, that's reason enough to
2534
   * proceed.  We can win even with a very low cardinality set if most
2535
   * tie-breakers only memcmp().  This is by far the most important
2536
   * consideration.
2537
   *
2538
   * While comparisons that are resolved at the abbreviated key level are
2539
   * considerably cheaper than tie-breakers resolved with memcmp(), both of
2540
   * those two outcomes are so much cheaper than a full strcoll() once
2541
   * sorting is underway that it doesn't seem worth it to weigh abbreviated
2542
   * cardinality against the overall size of the set in order to more
2543
   * accurately model costs.  Assume that an abbreviated comparison, and an
2544
   * abbreviated comparison with a cheap memcmp()-based authoritative
2545
   * resolution are equivalent.
2546
   */
2547
0
  if (abbrev_distinct > key_distinct * sss->prop_card)
2548
0
  {
2549
    /*
2550
     * When we have exceeded 10,000 tuples, decay required cardinality
2551
     * aggressively for next call.
2552
     *
2553
     * This is useful because the number of comparisons required on
2554
     * average increases at a linearithmic rate, and at roughly 10,000
2555
     * tuples that factor will start to dominate over the linear costs of
2556
     * string transformation (this is a conservative estimate).  The decay
2557
     * rate is chosen to be a little less aggressive than halving -- which
2558
     * (since we're called at points at which memtupcount has doubled)
2559
     * would never see the cost model actually abort past the first call
2560
     * following a decay.  This decay rate is mostly a precaution against
2561
     * a sudden, violent swing in how well abbreviated cardinality tracks
2562
     * full key cardinality.  The decay also serves to prevent a marginal
2563
     * case from being aborted too late, when too much has already been
2564
     * invested in string transformation.
2565
     *
2566
     * It's possible for sets of several million distinct strings with
2567
     * mere tens of thousands of distinct abbreviated keys to still
2568
     * benefit very significantly.  This will generally occur provided
2569
     * each abbreviated key is a proxy for a roughly uniform number of the
2570
     * set's full keys. If it isn't so, we hope to catch that early and
2571
     * abort.  If it isn't caught early, by the time the problem is
2572
     * apparent it's probably not worth aborting.
2573
     */
2574
0
    if (memtupcount > 10000)
2575
0
      sss->prop_card *= 0.65;
2576
2577
0
    return false;
2578
0
  }
2579
2580
  /*
2581
   * Abort abbreviation strategy.
2582
   *
2583
   * The worst case, where all abbreviated keys are identical while all
2584
   * original strings differ will typically only see a regression of about
2585
   * 10% in execution time for small to medium sized lists of strings.
2586
   * Whereas on modern CPUs where cache stalls are the dominant cost, we can
2587
   * often expect very large improvements, particularly with sets of strings
2588
   * of moderately high to high abbreviated cardinality.  There is little to
2589
   * lose but much to gain, which our strategy reflects.
2590
   */
2591
0
  if (trace_sort)
2592
0
    elog(LOG, "varstr_abbrev: aborted abbreviation at %d "
2593
0
       "(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)",
2594
0
       memtupcount, abbrev_distinct, key_distinct, sss->prop_card);
2595
2596
0
  return true;
2597
0
}
2598
2599
/*
2600
 * Generic equalimage support function for character type's operator classes.
2601
 * Disables the use of deduplication with nondeterministic collations.
2602
 */
2603
Datum
2604
btvarstrequalimage(PG_FUNCTION_ARGS)
2605
0
{
2606
  /* Oid    opcintype = PG_GETARG_OID(0); */
2607
0
  Oid     collid = PG_GET_COLLATION();
2608
0
  pg_locale_t locale;
2609
2610
0
  check_collation_set(collid);
2611
2612
0
  locale = pg_newlocale_from_collation(collid);
2613
2614
0
  PG_RETURN_BOOL(locale->deterministic);
2615
0
}
2616
2617
Datum
2618
text_larger(PG_FUNCTION_ARGS)
2619
0
{
2620
0
  text     *arg1 = PG_GETARG_TEXT_PP(0);
2621
0
  text     *arg2 = PG_GETARG_TEXT_PP(1);
2622
0
  text     *result;
2623
2624
0
  result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0) ? arg1 : arg2);
2625
2626
0
  PG_RETURN_TEXT_P(result);
2627
0
}
2628
2629
Datum
2630
text_smaller(PG_FUNCTION_ARGS)
2631
0
{
2632
0
  text     *arg1 = PG_GETARG_TEXT_PP(0);
2633
0
  text     *arg2 = PG_GETARG_TEXT_PP(1);
2634
0
  text     *result;
2635
2636
0
  result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0) ? arg1 : arg2);
2637
2638
0
  PG_RETURN_TEXT_P(result);
2639
0
}
2640
2641
2642
/*
2643
 * Cross-type comparison functions for types text and name.
2644
 */
2645
2646
Datum
2647
nameeqtext(PG_FUNCTION_ARGS)
2648
0
{
2649
0
  Name    arg1 = PG_GETARG_NAME(0);
2650
0
  text     *arg2 = PG_GETARG_TEXT_PP(1);
2651
0
  size_t    len1 = strlen(NameStr(*arg1));
2652
0
  size_t    len2 = VARSIZE_ANY_EXHDR(arg2);
2653
0
  Oid     collid = PG_GET_COLLATION();
2654
0
  bool    result;
2655
2656
0
  check_collation_set(collid);
2657
2658
0
  if (collid == C_COLLATION_OID)
2659
0
    result = (len1 == len2 &&
2660
0
          memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
2661
0
  else
2662
0
    result = (varstr_cmp(NameStr(*arg1), len1,
2663
0
               VARDATA_ANY(arg2), len2,
2664
0
               collid) == 0);
2665
2666
0
  PG_FREE_IF_COPY(arg2, 1);
2667
2668
0
  PG_RETURN_BOOL(result);
2669
0
}
2670
2671
Datum
2672
texteqname(PG_FUNCTION_ARGS)
2673
0
{
2674
0
  text     *arg1 = PG_GETARG_TEXT_PP(0);
2675
0
  Name    arg2 = PG_GETARG_NAME(1);
2676
0
  size_t    len1 = VARSIZE_ANY_EXHDR(arg1);
2677
0
  size_t    len2 = strlen(NameStr(*arg2));
2678
0
  Oid     collid = PG_GET_COLLATION();
2679
0
  bool    result;
2680
2681
0
  check_collation_set(collid);
2682
2683
0
  if (collid == C_COLLATION_OID)
2684
0
    result = (len1 == len2 &&
2685
0
          memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
2686
0
  else
2687
0
    result = (varstr_cmp(VARDATA_ANY(arg1), len1,
2688
0
               NameStr(*arg2), len2,
2689
0
               collid) == 0);
2690
2691
0
  PG_FREE_IF_COPY(arg1, 0);
2692
2693
0
  PG_RETURN_BOOL(result);
2694
0
}
2695
2696
Datum
2697
namenetext(PG_FUNCTION_ARGS)
2698
0
{
2699
0
  Name    arg1 = PG_GETARG_NAME(0);
2700
0
  text     *arg2 = PG_GETARG_TEXT_PP(1);
2701
0
  size_t    len1 = strlen(NameStr(*arg1));
2702
0
  size_t    len2 = VARSIZE_ANY_EXHDR(arg2);
2703
0
  Oid     collid = PG_GET_COLLATION();
2704
0
  bool    result;
2705
2706
0
  check_collation_set(collid);
2707
2708
0
  if (collid == C_COLLATION_OID)
2709
0
    result = !(len1 == len2 &&
2710
0
           memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
2711
0
  else
2712
0
    result = !(varstr_cmp(NameStr(*arg1), len1,
2713
0
                VARDATA_ANY(arg2), len2,
2714
0
                collid) == 0);
2715
2716
0
  PG_FREE_IF_COPY(arg2, 1);
2717
2718
0
  PG_RETURN_BOOL(result);
2719
0
}
2720
2721
Datum
2722
textnename(PG_FUNCTION_ARGS)
2723
0
{
2724
0
  text     *arg1 = PG_GETARG_TEXT_PP(0);
2725
0
  Name    arg2 = PG_GETARG_NAME(1);
2726
0
  size_t    len1 = VARSIZE_ANY_EXHDR(arg1);
2727
0
  size_t    len2 = strlen(NameStr(*arg2));
2728
0
  Oid     collid = PG_GET_COLLATION();
2729
0
  bool    result;
2730
2731
0
  check_collation_set(collid);
2732
2733
0
  if (collid == C_COLLATION_OID)
2734
0
    result = !(len1 == len2 &&
2735
0
           memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
2736
0
  else
2737
0
    result = !(varstr_cmp(VARDATA_ANY(arg1), len1,
2738
0
                NameStr(*arg2), len2,
2739
0
                collid) == 0);
2740
2741
0
  PG_FREE_IF_COPY(arg1, 0);
2742
2743
0
  PG_RETURN_BOOL(result);
2744
0
}
2745
2746
Datum
2747
btnametextcmp(PG_FUNCTION_ARGS)
2748
0
{
2749
0
  Name    arg1 = PG_GETARG_NAME(0);
2750
0
  text     *arg2 = PG_GETARG_TEXT_PP(1);
2751
0
  int32   result;
2752
2753
0
  result = varstr_cmp(NameStr(*arg1), strlen(NameStr(*arg1)),
2754
0
            VARDATA_ANY(arg2), VARSIZE_ANY_EXHDR(arg2),
2755
0
            PG_GET_COLLATION());
2756
2757
0
  PG_FREE_IF_COPY(arg2, 1);
2758
2759
0
  PG_RETURN_INT32(result);
2760
0
}
2761
2762
Datum
2763
bttextnamecmp(PG_FUNCTION_ARGS)
2764
0
{
2765
0
  text     *arg1 = PG_GETARG_TEXT_PP(0);
2766
0
  Name    arg2 = PG_GETARG_NAME(1);
2767
0
  int32   result;
2768
2769
0
  result = varstr_cmp(VARDATA_ANY(arg1), VARSIZE_ANY_EXHDR(arg1),
2770
0
            NameStr(*arg2), strlen(NameStr(*arg2)),
2771
0
            PG_GET_COLLATION());
2772
2773
0
  PG_FREE_IF_COPY(arg1, 0);
2774
2775
0
  PG_RETURN_INT32(result);
2776
0
}
2777
2778
#define CmpCall(cmpfunc) \
2779
  DatumGetInt32(DirectFunctionCall2Coll(cmpfunc, \
2780
                      PG_GET_COLLATION(), \
2781
                      PG_GETARG_DATUM(0), \
2782
                      PG_GETARG_DATUM(1)))
2783
2784
Datum
2785
namelttext(PG_FUNCTION_ARGS)
2786
0
{
2787
0
  PG_RETURN_BOOL(CmpCall(btnametextcmp) < 0);
2788
0
}
2789
2790
Datum
2791
nameletext(PG_FUNCTION_ARGS)
2792
0
{
2793
0
  PG_RETURN_BOOL(CmpCall(btnametextcmp) <= 0);
2794
0
}
2795
2796
Datum
2797
namegttext(PG_FUNCTION_ARGS)
2798
0
{
2799
0
  PG_RETURN_BOOL(CmpCall(btnametextcmp) > 0);
2800
0
}
2801
2802
Datum
2803
namegetext(PG_FUNCTION_ARGS)
2804
0
{
2805
0
  PG_RETURN_BOOL(CmpCall(btnametextcmp) >= 0);
2806
0
}
2807
2808
Datum
2809
textltname(PG_FUNCTION_ARGS)
2810
0
{
2811
0
  PG_RETURN_BOOL(CmpCall(bttextnamecmp) < 0);
2812
0
}
2813
2814
Datum
2815
textlename(PG_FUNCTION_ARGS)
2816
0
{
2817
0
  PG_RETURN_BOOL(CmpCall(bttextnamecmp) <= 0);
2818
0
}
2819
2820
Datum
2821
textgtname(PG_FUNCTION_ARGS)
2822
0
{
2823
0
  PG_RETURN_BOOL(CmpCall(bttextnamecmp) > 0);
2824
0
}
2825
2826
Datum
2827
textgename(PG_FUNCTION_ARGS)
2828
0
{
2829
0
  PG_RETURN_BOOL(CmpCall(bttextnamecmp) >= 0);
2830
0
}
2831
2832
#undef CmpCall
2833
2834
2835
/*
2836
 * The following operators support character-by-character comparison
2837
 * of text datums, to allow building indexes suitable for LIKE clauses.
2838
 * Note that the regular texteq/textne comparison operators, and regular
2839
 * support functions 1 and 2 with "C" collation are assumed to be
2840
 * compatible with these!
2841
 */
2842
2843
static int
2844
internal_text_pattern_compare(text *arg1, text *arg2)
2845
0
{
2846
0
  int     result;
2847
0
  int     len1,
2848
0
        len2;
2849
2850
0
  len1 = VARSIZE_ANY_EXHDR(arg1);
2851
0
  len2 = VARSIZE_ANY_EXHDR(arg2);
2852
2853
0
  result = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
2854
0
  if (result != 0)
2855
0
    return result;
2856
0
  else if (len1 < len2)
2857
0
    return -1;
2858
0
  else if (len1 > len2)
2859
0
    return 1;
2860
0
  else
2861
0
    return 0;
2862
0
}
2863
2864
2865
Datum
2866
text_pattern_lt(PG_FUNCTION_ARGS)
2867
0
{
2868
0
  text     *arg1 = PG_GETARG_TEXT_PP(0);
2869
0
  text     *arg2 = PG_GETARG_TEXT_PP(1);
2870
0
  int     result;
2871
2872
0
  result = internal_text_pattern_compare(arg1, arg2);
2873
2874
0
  PG_FREE_IF_COPY(arg1, 0);
2875
0
  PG_FREE_IF_COPY(arg2, 1);
2876
2877
0
  PG_RETURN_BOOL(result < 0);
2878
0
}
2879
2880
2881
Datum
2882
text_pattern_le(PG_FUNCTION_ARGS)
2883
0
{
2884
0
  text     *arg1 = PG_GETARG_TEXT_PP(0);
2885
0
  text     *arg2 = PG_GETARG_TEXT_PP(1);
2886
0
  int     result;
2887
2888
0
  result = internal_text_pattern_compare(arg1, arg2);
2889
2890
0
  PG_FREE_IF_COPY(arg1, 0);
2891
0
  PG_FREE_IF_COPY(arg2, 1);
2892
2893
0
  PG_RETURN_BOOL(result <= 0);
2894
0
}
2895
2896
2897
Datum
2898
text_pattern_ge(PG_FUNCTION_ARGS)
2899
0
{
2900
0
  text     *arg1 = PG_GETARG_TEXT_PP(0);
2901
0
  text     *arg2 = PG_GETARG_TEXT_PP(1);
2902
0
  int     result;
2903
2904
0
  result = internal_text_pattern_compare(arg1, arg2);
2905
2906
0
  PG_FREE_IF_COPY(arg1, 0);
2907
0
  PG_FREE_IF_COPY(arg2, 1);
2908
2909
0
  PG_RETURN_BOOL(result >= 0);
2910
0
}
2911
2912
2913
Datum
2914
text_pattern_gt(PG_FUNCTION_ARGS)
2915
0
{
2916
0
  text     *arg1 = PG_GETARG_TEXT_PP(0);
2917
0
  text     *arg2 = PG_GETARG_TEXT_PP(1);
2918
0
  int     result;
2919
2920
0
  result = internal_text_pattern_compare(arg1, arg2);
2921
2922
0
  PG_FREE_IF_COPY(arg1, 0);
2923
0
  PG_FREE_IF_COPY(arg2, 1);
2924
2925
0
  PG_RETURN_BOOL(result > 0);
2926
0
}
2927
2928
2929
Datum
2930
bttext_pattern_cmp(PG_FUNCTION_ARGS)
2931
0
{
2932
0
  text     *arg1 = PG_GETARG_TEXT_PP(0);
2933
0
  text     *arg2 = PG_GETARG_TEXT_PP(1);
2934
0
  int     result;
2935
2936
0
  result = internal_text_pattern_compare(arg1, arg2);
2937
2938
0
  PG_FREE_IF_COPY(arg1, 0);
2939
0
  PG_FREE_IF_COPY(arg2, 1);
2940
2941
0
  PG_RETURN_INT32(result);
2942
0
}
2943
2944
2945
Datum
2946
bttext_pattern_sortsupport(PG_FUNCTION_ARGS)
2947
0
{
2948
0
  SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
2949
0
  MemoryContext oldcontext;
2950
2951
0
  oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
2952
2953
  /* Use generic string SortSupport, forcing "C" collation */
2954
0
  varstr_sortsupport(ssup, TEXTOID, C_COLLATION_OID);
2955
2956
0
  MemoryContextSwitchTo(oldcontext);
2957
2958
0
  PG_RETURN_VOID();
2959
0
}
2960
2961
2962
/*-------------------------------------------------------------
2963
 * byteaoctetlen
2964
 *
2965
 * get the number of bytes contained in an instance of type 'bytea'
2966
 *-------------------------------------------------------------
2967
 */
2968
Datum
2969
byteaoctetlen(PG_FUNCTION_ARGS)
2970
0
{
2971
0
  Datum   str = PG_GETARG_DATUM(0);
2972
2973
  /* We need not detoast the input at all */
2974
0
  PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
2975
0
}
2976
2977
/*
2978
 * byteacat -
2979
 *    takes two bytea* and returns a bytea* that is the concatenation of
2980
 *    the two.
2981
 *
2982
 * Cloned from textcat and modified as required.
2983
 */
2984
Datum
2985
byteacat(PG_FUNCTION_ARGS)
2986
0
{
2987
0
  bytea    *t1 = PG_GETARG_BYTEA_PP(0);
2988
0
  bytea    *t2 = PG_GETARG_BYTEA_PP(1);
2989
2990
0
  PG_RETURN_BYTEA_P(bytea_catenate(t1, t2));
2991
0
}
2992
2993
/*
2994
 * bytea_catenate
2995
 *  Guts of byteacat(), broken out so it can be used by other functions
2996
 *
2997
 * Arguments can be in short-header form, but not compressed or out-of-line
2998
 */
2999
static bytea *
3000
bytea_catenate(bytea *t1, bytea *t2)
3001
0
{
3002
0
  bytea    *result;
3003
0
  int     len1,
3004
0
        len2,
3005
0
        len;
3006
0
  char     *ptr;
3007
3008
0
  len1 = VARSIZE_ANY_EXHDR(t1);
3009
0
  len2 = VARSIZE_ANY_EXHDR(t2);
3010
3011
  /* paranoia ... probably should throw error instead? */
3012
0
  if (len1 < 0)
3013
0
    len1 = 0;
3014
0
  if (len2 < 0)
3015
0
    len2 = 0;
3016
3017
0
  len = len1 + len2 + VARHDRSZ;
3018
0
  result = (bytea *) palloc(len);
3019
3020
  /* Set size of result string... */
3021
0
  SET_VARSIZE(result, len);
3022
3023
  /* Fill data field of result string... */
3024
0
  ptr = VARDATA(result);
3025
0
  if (len1 > 0)
3026
0
    memcpy(ptr, VARDATA_ANY(t1), len1);
3027
0
  if (len2 > 0)
3028
0
    memcpy(ptr + len1, VARDATA_ANY(t2), len2);
3029
3030
0
  return result;
3031
0
}
3032
3033
/* Build a bytea from a C string by passing it through byteain() */
#define PG_STR_GET_BYTEA(cstr_) \
	DatumGetByteaPP(DirectFunctionCall1(byteain, CStringGetDatum(cstr_)))
3035
3036
/*
3037
 * bytea_substr()
3038
 * Return a substring starting at the specified position.
3039
 * Cloned from text_substr and modified as required.
3040
 *
3041
 * Input:
3042
 *  - string
3043
 *  - starting position (is one-based)
3044
 *  - string length (optional)
3045
 *
3046
 * If the starting position is zero or less, then return from the start of the string
3047
 * adjusting the length to be consistent with the "negative start" per SQL.
3048
 * If the length is less than zero, an ERROR is thrown. If no third argument
3049
 * (length) is provided, the length to the end of the string is assumed.
3050
 */
3051
Datum
3052
bytea_substr(PG_FUNCTION_ARGS)
3053
0
{
3054
0
  PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
3055
0
                    PG_GETARG_INT32(1),
3056
0
                    PG_GETARG_INT32(2),
3057
0
                    false));
3058
0
}
3059
3060
/*
3061
 * bytea_substr_no_len -
3062
 *    Wrapper to avoid opr_sanity failure due to
3063
 *    one function accepting a different number of args.
3064
 */
3065
Datum
3066
bytea_substr_no_len(PG_FUNCTION_ARGS)
3067
0
{
3068
0
  PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
3069
0
                    PG_GETARG_INT32(1),
3070
0
                    -1,
3071
0
                    true));
3072
0
}
3073
3074
static bytea *
3075
bytea_substring(Datum str,
3076
        int S,
3077
        int L,
3078
        bool length_not_specified)
3079
0
{
3080
0
  int32   S1;       /* adjusted start position */
3081
0
  int32   L1;       /* adjusted substring length */
3082
0
  int32   E;        /* end position */
3083
3084
  /*
3085
   * The logic here should generally match text_substring().
3086
   */
3087
0
  S1 = Max(S, 1);
3088
3089
0
  if (length_not_specified)
3090
0
  {
3091
    /*
3092
     * Not passed a length - DatumGetByteaPSlice() grabs everything to the
3093
     * end of the string if we pass it a negative value for length.
3094
     */
3095
0
    L1 = -1;
3096
0
  }
3097
0
  else if (L < 0)
3098
0
  {
3099
    /* SQL99 says to throw an error for E < S, i.e., negative length */
3100
0
    ereport(ERROR,
3101
0
        (errcode(ERRCODE_SUBSTRING_ERROR),
3102
0
         errmsg("negative substring length not allowed")));
3103
0
    L1 = -1;        /* silence stupider compilers */
3104
0
  }
3105
0
  else if (pg_add_s32_overflow(S, L, &E))
3106
0
  {
3107
    /*
3108
     * L could be large enough for S + L to overflow, in which case the
3109
     * substring must run to end of string.
3110
     */
3111
0
    L1 = -1;
3112
0
  }
3113
0
  else
3114
0
  {
3115
    /*
3116
     * A zero or negative value for the end position can happen if the
3117
     * start was negative or one. SQL99 says to return a zero-length
3118
     * string.
3119
     */
3120
0
    if (E < 1)
3121
0
      return PG_STR_GET_BYTEA("");
3122
3123
0
    L1 = E - S1;
3124
0
  }
3125
3126
  /*
3127
   * If the start position is past the end of the string, SQL99 says to
3128
   * return a zero-length string -- DatumGetByteaPSlice() will do that for
3129
   * us.  We need only convert S1 to zero-based starting position.
3130
   */
3131
0
  return DatumGetByteaPSlice(str, S1 - 1, L1);
3132
0
}
3133
3134
/*
3135
 * byteaoverlay
3136
 *  Replace specified substring of first string with second
3137
 *
3138
 * The SQL standard defines OVERLAY() in terms of substring and concatenation.
3139
 * This code is a direct implementation of what the standard says.
3140
 */
3141
Datum
3142
byteaoverlay(PG_FUNCTION_ARGS)
3143
0
{
3144
0
  bytea    *t1 = PG_GETARG_BYTEA_PP(0);
3145
0
  bytea    *t2 = PG_GETARG_BYTEA_PP(1);
3146
0
  int     sp = PG_GETARG_INT32(2);  /* substring start position */
3147
0
  int     sl = PG_GETARG_INT32(3);  /* substring length */
3148
3149
0
  PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
3150
0
}
3151
3152
Datum
3153
byteaoverlay_no_len(PG_FUNCTION_ARGS)
3154
0
{
3155
0
  bytea    *t1 = PG_GETARG_BYTEA_PP(0);
3156
0
  bytea    *t2 = PG_GETARG_BYTEA_PP(1);
3157
0
  int     sp = PG_GETARG_INT32(2);  /* substring start position */
3158
0
  int     sl;
3159
3160
0
  sl = VARSIZE_ANY_EXHDR(t2); /* defaults to length(t2) */
3161
0
  PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
3162
0
}
3163
3164
/*
 * bytea_overlay
 *	Build the OVERLAY() result: the part of t1 before position sp, then t2,
 *	then the part of t1 starting at position sp + sl.
 */
static bytea *
bytea_overlay(bytea *t1, bytea *t2, int sp, int sl)
{
	bytea	   *head;
	bytea	   *tail;
	int			tail_start;

	/*
	 * Guard against integer-overflow cases first.  A non-positive sp is
	 * reported as a "substring length" error, since that is what the spec's
	 * definition of OVERLAY() leads one to expect.
	 */
	if (sp <= 0)
		ereport(ERROR,
				(errcode(ERRCODE_SUBSTRING_ERROR),
				 errmsg("negative substring length not allowed")));
	if (pg_add_s32_overflow(sp, sl, &tail_start))
		ereport(ERROR,
				(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
				 errmsg("integer out of range")));

	head = bytea_substring(PointerGetDatum(t1), 1, sp - 1, false);
	tail = bytea_substring(PointerGetDatum(t1), tail_start, -1, true);

	return bytea_catenate(bytea_catenate(head, t2), tail);
}
3193
3194
/*
3195
 * bit_count
3196
 */
3197
Datum
3198
bytea_bit_count(PG_FUNCTION_ARGS)
3199
0
{
3200
0
  bytea    *t1 = PG_GETARG_BYTEA_PP(0);
3201
3202
0
  PG_RETURN_INT64(pg_popcount(VARDATA_ANY(t1), VARSIZE_ANY_EXHDR(t1)));
3203
0
}
3204
3205
/*
3206
 * byteapos -
3207
 *    Return the position of the specified substring.
3208
 *    Implements the SQL POSITION() function.
3209
 * Cloned from textpos and modified as required.
3210
 */
3211
Datum
3212
byteapos(PG_FUNCTION_ARGS)
3213
0
{
3214
0
  bytea    *t1 = PG_GETARG_BYTEA_PP(0);
3215
0
  bytea    *t2 = PG_GETARG_BYTEA_PP(1);
3216
0
  int     pos;
3217
0
  int     px,
3218
0
        p;
3219
0
  int     len1,
3220
0
        len2;
3221
0
  char     *p1,
3222
0
         *p2;
3223
3224
0
  len1 = VARSIZE_ANY_EXHDR(t1);
3225
0
  len2 = VARSIZE_ANY_EXHDR(t2);
3226
3227
0
  if (len2 <= 0)
3228
0
    PG_RETURN_INT32(1);    /* result for empty pattern */
3229
3230
0
  p1 = VARDATA_ANY(t1);
3231
0
  p2 = VARDATA_ANY(t2);
3232
3233
0
  pos = 0;
3234
0
  px = (len1 - len2);
3235
0
  for (p = 0; p <= px; p++)
3236
0
  {
3237
0
    if ((*p2 == *p1) && (memcmp(p1, p2, len2) == 0))
3238
0
    {
3239
0
      pos = p + 1;
3240
0
      break;
3241
0
    };
3242
0
    p1++;
3243
0
  };
3244
3245
0
  PG_RETURN_INT32(pos);
3246
0
}
3247
3248
/*-------------------------------------------------------------
3249
 * byteaGetByte
3250
 *
3251
 * this routine treats "bytea" as an array of bytes.
3252
 * It returns the Nth byte (a number between 0 and 255).
3253
 *-------------------------------------------------------------
3254
 */
3255
Datum
3256
byteaGetByte(PG_FUNCTION_ARGS)
3257
0
{
3258
0
  bytea    *v = PG_GETARG_BYTEA_PP(0);
3259
0
  int32   n = PG_GETARG_INT32(1);
3260
0
  int     len;
3261
0
  int     byte;
3262
3263
0
  len = VARSIZE_ANY_EXHDR(v);
3264
3265
0
  if (n < 0 || n >= len)
3266
0
    ereport(ERROR,
3267
0
        (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3268
0
         errmsg("index %d out of valid range, 0..%d",
3269
0
            n, len - 1)));
3270
3271
0
  byte = ((unsigned char *) VARDATA_ANY(v))[n];
3272
3273
0
  PG_RETURN_INT32(byte);
3274
0
}
3275
3276
/*-------------------------------------------------------------
3277
 * byteaGetBit
3278
 *
3279
 * This routine treats a "bytea" type like an array of bits.
3280
 * It returns the value of the Nth bit (0 or 1).
3281
 *
3282
 *-------------------------------------------------------------
3283
 */
3284
Datum
3285
byteaGetBit(PG_FUNCTION_ARGS)
3286
0
{
3287
0
  bytea    *v = PG_GETARG_BYTEA_PP(0);
3288
0
  int64   n = PG_GETARG_INT64(1);
3289
0
  int     byteNo,
3290
0
        bitNo;
3291
0
  int     len;
3292
0
  int     byte;
3293
3294
0
  len = VARSIZE_ANY_EXHDR(v);
3295
3296
0
  if (n < 0 || n >= (int64) len * 8)
3297
0
    ereport(ERROR,
3298
0
        (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3299
0
         errmsg("index %" PRId64 " out of valid range, 0..%" PRId64,
3300
0
            n, (int64) len * 8 - 1)));
3301
3302
  /* n/8 is now known < len, so safe to cast to int */
3303
0
  byteNo = (int) (n / 8);
3304
0
  bitNo = (int) (n % 8);
3305
3306
0
  byte = ((unsigned char *) VARDATA_ANY(v))[byteNo];
3307
3308
0
  if (byte & (1 << bitNo))
3309
0
    PG_RETURN_INT32(1);
3310
0
  else
3311
0
    PG_RETURN_INT32(0);
3312
0
}
3313
3314
/*-------------------------------------------------------------
3315
 * byteaSetByte
3316
 *
3317
 * Given an instance of type 'bytea' creates a new one with
3318
 * the Nth byte set to the given value.
3319
 *
3320
 *-------------------------------------------------------------
3321
 */
3322
Datum
3323
byteaSetByte(PG_FUNCTION_ARGS)
3324
0
{
3325
0
  bytea    *res = PG_GETARG_BYTEA_P_COPY(0);
3326
0
  int32   n = PG_GETARG_INT32(1);
3327
0
  int32   newByte = PG_GETARG_INT32(2);
3328
0
  int     len;
3329
3330
0
  len = VARSIZE(res) - VARHDRSZ;
3331
3332
0
  if (n < 0 || n >= len)
3333
0
    ereport(ERROR,
3334
0
        (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3335
0
         errmsg("index %d out of valid range, 0..%d",
3336
0
            n, len - 1)));
3337
3338
  /*
3339
   * Now set the byte.
3340
   */
3341
0
  ((unsigned char *) VARDATA(res))[n] = newByte;
3342
3343
0
  PG_RETURN_BYTEA_P(res);
3344
0
}
3345
3346
/*-------------------------------------------------------------
3347
 * byteaSetBit
3348
 *
3349
 * Given an instance of type 'bytea' creates a new one with
3350
 * the Nth bit set to the given value.
3351
 *
3352
 *-------------------------------------------------------------
3353
 */
3354
Datum
3355
byteaSetBit(PG_FUNCTION_ARGS)
3356
0
{
3357
0
  bytea    *res = PG_GETARG_BYTEA_P_COPY(0);
3358
0
  int64   n = PG_GETARG_INT64(1);
3359
0
  int32   newBit = PG_GETARG_INT32(2);
3360
0
  int     len;
3361
0
  int     oldByte,
3362
0
        newByte;
3363
0
  int     byteNo,
3364
0
        bitNo;
3365
3366
0
  len = VARSIZE(res) - VARHDRSZ;
3367
3368
0
  if (n < 0 || n >= (int64) len * 8)
3369
0
    ereport(ERROR,
3370
0
        (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3371
0
         errmsg("index %" PRId64 " out of valid range, 0..%" PRId64,
3372
0
            n, (int64) len * 8 - 1)));
3373
3374
  /* n/8 is now known < len, so safe to cast to int */
3375
0
  byteNo = (int) (n / 8);
3376
0
  bitNo = (int) (n % 8);
3377
3378
  /*
3379
   * sanity check!
3380
   */
3381
0
  if (newBit != 0 && newBit != 1)
3382
0
    ereport(ERROR,
3383
0
        (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
3384
0
         errmsg("new bit must be 0 or 1")));
3385
3386
  /*
3387
   * Update the byte.
3388
   */
3389
0
  oldByte = ((unsigned char *) VARDATA(res))[byteNo];
3390
3391
0
  if (newBit == 0)
3392
0
    newByte = oldByte & (~(1 << bitNo));
3393
0
  else
3394
0
    newByte = oldByte | (1 << bitNo);
3395
3396
0
  ((unsigned char *) VARDATA(res))[byteNo] = newByte;
3397
3398
0
  PG_RETURN_BYTEA_P(res);
3399
0
}
3400
3401
/*
3402
 * Return reversed bytea
3403
 */
3404
Datum
3405
bytea_reverse(PG_FUNCTION_ARGS)
3406
0
{
3407
0
  bytea    *v = PG_GETARG_BYTEA_PP(0);
3408
0
  const char *p = VARDATA_ANY(v);
3409
0
  int     len = VARSIZE_ANY_EXHDR(v);
3410
0
  const char *endp = p + len;
3411
0
  bytea    *result = palloc(len + VARHDRSZ);
3412
0
  char     *dst = (char *) VARDATA(result) + len;
3413
3414
0
  SET_VARSIZE(result, len + VARHDRSZ);
3415
3416
0
  while (p < endp)
3417
0
    *(--dst) = *p++;
3418
3419
0
  PG_RETURN_BYTEA_P(result);
3420
0
}
3421
3422
3423
/* text_name()
3424
 * Converts a text type to a Name type.
3425
 */
3426
Datum
3427
text_name(PG_FUNCTION_ARGS)
3428
0
{
3429
0
  text     *s = PG_GETARG_TEXT_PP(0);
3430
0
  Name    result;
3431
0
  int     len;
3432
3433
0
  len = VARSIZE_ANY_EXHDR(s);
3434
3435
  /* Truncate oversize input */
3436
0
  if (len >= NAMEDATALEN)
3437
0
    len = pg_mbcliplen(VARDATA_ANY(s), len, NAMEDATALEN - 1);
3438
3439
  /* We use palloc0 here to ensure result is zero-padded */
3440
0
  result = (Name) palloc0(NAMEDATALEN);
3441
0
  memcpy(NameStr(*result), VARDATA_ANY(s), len);
3442
3443
0
  PG_RETURN_NAME(result);
3444
0
}
3445
3446
/* name_text()
3447
 * Converts a Name type to a text type.
3448
 */
3449
Datum
3450
name_text(PG_FUNCTION_ARGS)
3451
0
{
3452
0
  Name    s = PG_GETARG_NAME(0);
3453
3454
0
  PG_RETURN_TEXT_P(cstring_to_text(NameStr(*s)));
3455
0
}
3456
3457
3458
/*
3459
 * textToQualifiedNameList - convert a text object to list of names
3460
 *
3461
 * This implements the input parsing needed by nextval() and other
3462
 * functions that take a text parameter representing a qualified name.
3463
 * We split the name at dots, downcase if not double-quoted, and
3464
 * truncate names if they're too long.
3465
 */
3466
List *
3467
textToQualifiedNameList(text *textval)
3468
0
{
3469
0
  char     *rawname;
3470
0
  List     *result = NIL;
3471
0
  List     *namelist;
3472
0
  ListCell   *l;
3473
3474
  /* Convert to C string (handles possible detoasting). */
3475
  /* Note we rely on being able to modify rawname below. */
3476
0
  rawname = text_to_cstring(textval);
3477
3478
0
  if (!SplitIdentifierString(rawname, '.', &namelist))
3479
0
    ereport(ERROR,
3480
0
        (errcode(ERRCODE_INVALID_NAME),
3481
0
         errmsg("invalid name syntax")));
3482
3483
0
  if (namelist == NIL)
3484
0
    ereport(ERROR,
3485
0
        (errcode(ERRCODE_INVALID_NAME),
3486
0
         errmsg("invalid name syntax")));
3487
3488
0
  foreach(l, namelist)
3489
0
  {
3490
0
    char     *curname = (char *) lfirst(l);
3491
3492
0
    result = lappend(result, makeString(pstrdup(curname)));
3493
0
  }
3494
3495
0
  pfree(rawname);
3496
0
  list_free(namelist);
3497
3498
0
  return result;
3499
0
}
3500
3501
/*
3502
 * SplitIdentifierString --- parse a string containing identifiers
3503
 *
3504
 * This is the guts of textToQualifiedNameList, and is exported for use in
3505
 * other situations such as parsing GUC variables.  In the GUC case, it's
3506
 * important to avoid memory leaks, so the API is designed to minimize the
3507
 * amount of stuff that needs to be allocated and freed.
3508
 *
3509
 * Inputs:
3510
 *  rawstring: the input string; must be overwritable!  On return, it's
3511
 *         been modified to contain the separated identifiers.
3512
 *  separator: the separator punctuation expected between identifiers
3513
 *         (typically '.' or ',').  Whitespace may also appear around
3514
 *         identifiers.
3515
 * Outputs:
3516
 *  namelist: filled with a palloc'd list of pointers to identifiers within
3517
 *        rawstring.  Caller should list_free() this even on error return.
3518
 *
3519
 * Returns true if okay, false if there is a syntax error in the string.
3520
 *
3521
 * Note that an empty string is considered okay here, though not in
3522
 * textToQualifiedNameList.
3523
 */
3524
bool
3525
SplitIdentifierString(char *rawstring, char separator,
3526
            List **namelist)
3527
0
{
3528
0
  char     *nextp = rawstring;
3529
0
  bool    done = false;
3530
3531
0
  *namelist = NIL;
3532
3533
0
  while (scanner_isspace(*nextp))
3534
0
    nextp++;       /* skip leading whitespace */
3535
3536
0
  if (*nextp == '\0')
3537
0
    return true;     /* allow empty string */
3538
3539
  /* At the top of the loop, we are at start of a new identifier. */
3540
0
  do
3541
0
  {
3542
0
    char     *curname;
3543
0
    char     *endp;
3544
3545
0
    if (*nextp == '"')
3546
0
    {
3547
      /* Quoted name --- collapse quote-quote pairs, no downcasing */
3548
0
      curname = nextp + 1;
3549
0
      for (;;)
3550
0
      {
3551
0
        endp = strchr(nextp + 1, '"');
3552
0
        if (endp == NULL)
3553
0
          return false; /* mismatched quotes */
3554
0
        if (endp[1] != '"')
3555
0
          break;   /* found end of quoted name */
3556
        /* Collapse adjacent quotes into one quote, and look again */
3557
0
        memmove(endp, endp + 1, strlen(endp));
3558
0
        nextp = endp;
3559
0
      }
3560
      /* endp now points at the terminating quote */
3561
0
      nextp = endp + 1;
3562
0
    }
3563
0
    else
3564
0
    {
3565
      /* Unquoted name --- extends to separator or whitespace */
3566
0
      char     *downname;
3567
0
      int     len;
3568
3569
0
      curname = nextp;
3570
0
      while (*nextp && *nextp != separator &&
3571
0
           !scanner_isspace(*nextp))
3572
0
        nextp++;
3573
0
      endp = nextp;
3574
0
      if (curname == nextp)
3575
0
        return false; /* empty unquoted name not allowed */
3576
3577
      /*
3578
       * Downcase the identifier, using same code as main lexer does.
3579
       *
3580
       * XXX because we want to overwrite the input in-place, we cannot
3581
       * support a downcasing transformation that increases the string
3582
       * length.  This is not a problem given the current implementation
3583
       * of downcase_truncate_identifier, but we'll probably have to do
3584
       * something about this someday.
3585
       */
3586
0
      len = endp - curname;
3587
0
      downname = downcase_truncate_identifier(curname, len, false);
3588
0
      Assert(strlen(downname) <= len);
3589
0
      strncpy(curname, downname, len);  /* strncpy is required here */
3590
0
      pfree(downname);
3591
0
    }
3592
3593
0
    while (scanner_isspace(*nextp))
3594
0
      nextp++;     /* skip trailing whitespace */
3595
3596
0
    if (*nextp == separator)
3597
0
    {
3598
0
      nextp++;
3599
0
      while (scanner_isspace(*nextp))
3600
0
        nextp++;   /* skip leading whitespace for next */
3601
      /* we expect another name, so done remains false */
3602
0
    }
3603
0
    else if (*nextp == '\0')
3604
0
      done = true;
3605
0
    else
3606
0
      return false;   /* invalid syntax */
3607
3608
    /* Now safe to overwrite separator with a null */
3609
0
    *endp = '\0';
3610
3611
    /* Truncate name if it's overlength */
3612
0
    truncate_identifier(curname, strlen(curname), false);
3613
3614
    /*
3615
     * Finished isolating current name --- add it to list
3616
     */
3617
0
    *namelist = lappend(*namelist, curname);
3618
3619
    /* Loop back if we didn't reach end of string */
3620
0
  } while (!done);
3621
3622
0
  return true;
3623
0
}
3624
3625
3626
/*
3627
 * SplitDirectoriesString --- parse a string containing file/directory names
3628
 *
3629
 * This works fine on file names too; the function name is historical.
3630
 *
3631
 * This is similar to SplitIdentifierString, except that the parsing
3632
 * rules are meant to handle pathnames instead of identifiers: there is
3633
 * no downcasing, embedded spaces are allowed, the max length is MAXPGPATH-1,
3634
 * and we apply canonicalize_path() to each extracted string.  Because of the
3635
 * last, the returned strings are separately palloc'd rather than being
3636
 * pointers into rawstring --- but we still scribble on rawstring.
3637
 *
3638
 * Inputs:
3639
 *  rawstring: the input string; must be modifiable!
3640
 *  separator: the separator punctuation expected between directories
3641
 *         (typically ',' or ';').  Whitespace may also appear around
3642
 *         directories.
3643
 * Outputs:
3644
 *  namelist: filled with a palloc'd list of directory names.
3645
 *        Caller should list_free_deep() this even on error return.
3646
 *
3647
 * Returns true if okay, false if there is a syntax error in the string.
3648
 *
3649
 * Note that an empty string is considered okay here.
3650
 */
3651
bool
3652
SplitDirectoriesString(char *rawstring, char separator,
3653
             List **namelist)
3654
0
{
3655
0
  char     *nextp = rawstring;
3656
0
  bool    done = false;
3657
3658
0
  *namelist = NIL;
3659
3660
0
  while (scanner_isspace(*nextp))
3661
0
    nextp++;       /* skip leading whitespace */
3662
3663
0
  if (*nextp == '\0')
3664
0
    return true;     /* allow empty string */
3665
3666
  /* At the top of the loop, we are at start of a new directory. */
3667
0
  do
3668
0
  {
3669
0
    char     *curname;
3670
0
    char     *endp;
3671
3672
0
    if (*nextp == '"')
3673
0
    {
3674
      /* Quoted name --- collapse quote-quote pairs */
3675
0
      curname = nextp + 1;
3676
0
      for (;;)
3677
0
      {
3678
0
        endp = strchr(nextp + 1, '"');
3679
0
        if (endp == NULL)
3680
0
          return false; /* mismatched quotes */
3681
0
        if (endp[1] != '"')
3682
0
          break;   /* found end of quoted name */
3683
        /* Collapse adjacent quotes into one quote, and look again */
3684
0
        memmove(endp, endp + 1, strlen(endp));
3685
0
        nextp = endp;
3686
0
      }
3687
      /* endp now points at the terminating quote */
3688
0
      nextp = endp + 1;
3689
0
    }
3690
0
    else
3691
0
    {
3692
      /* Unquoted name --- extends to separator or end of string */
3693
0
      curname = endp = nextp;
3694
0
      while (*nextp && *nextp != separator)
3695
0
      {
3696
        /* trailing whitespace should not be included in name */
3697
0
        if (!scanner_isspace(*nextp))
3698
0
          endp = nextp + 1;
3699
0
        nextp++;
3700
0
      }
3701
0
      if (curname == endp)
3702
0
        return false; /* empty unquoted name not allowed */
3703
0
    }
3704
3705
0
    while (scanner_isspace(*nextp))
3706
0
      nextp++;     /* skip trailing whitespace */
3707
3708
0
    if (*nextp == separator)
3709
0
    {
3710
0
      nextp++;
3711
0
      while (scanner_isspace(*nextp))
3712
0
        nextp++;   /* skip leading whitespace for next */
3713
      /* we expect another name, so done remains false */
3714
0
    }
3715
0
    else if (*nextp == '\0')
3716
0
      done = true;
3717
0
    else
3718
0
      return false;   /* invalid syntax */
3719
3720
    /* Now safe to overwrite separator with a null */
3721
0
    *endp = '\0';
3722
3723
    /* Truncate path if it's overlength */
3724
0
    if (strlen(curname) >= MAXPGPATH)
3725
0
      curname[MAXPGPATH - 1] = '\0';
3726
3727
    /*
3728
     * Finished isolating current name --- add it to list
3729
     */
3730
0
    curname = pstrdup(curname);
3731
0
    canonicalize_path(curname);
3732
0
    *namelist = lappend(*namelist, curname);
3733
3734
    /* Loop back if we didn't reach end of string */
3735
0
  } while (!done);
3736
3737
0
  return true;
3738
0
}
3739
3740
3741
/*
3742
 * SplitGUCList --- parse a string containing identifiers or file names
3743
 *
3744
 * This is used to split the value of a GUC_LIST_QUOTE GUC variable, without
3745
 * presuming whether the elements will be taken as identifiers or file names.
3746
 * We assume the input has already been through flatten_set_variable_args(),
3747
 * so that we need never downcase (if appropriate, that was done already).
3748
 * Nor do we ever truncate, since we don't know the correct max length.
3749
 * We disallow embedded whitespace for simplicity (it shouldn't matter,
3750
 * because any embedded whitespace should have led to double-quoting).
3751
 * Otherwise the API is identical to SplitIdentifierString.
3752
 *
3753
 * XXX it's annoying to have so many copies of this string-splitting logic.
3754
 * However, it's not clear that having one function with a bunch of option
3755
 * flags would be much better.
3756
 *
3757
 * XXX there is a version of this function in src/bin/pg_dump/dumputils.c.
3758
 * Be sure to update that if you have to change this.
3759
 *
3760
 * Inputs:
3761
 *  rawstring: the input string; must be overwritable!  On return, it's
3762
 *         been modified to contain the separated identifiers.
3763
 *  separator: the separator punctuation expected between identifiers
3764
 *         (typically '.' or ',').  Whitespace may also appear around
3765
 *         identifiers.
3766
 * Outputs:
3767
 *  namelist: filled with a palloc'd list of pointers to identifiers within
3768
 *        rawstring.  Caller should list_free() this even on error return.
3769
 *
3770
 * Returns true if okay, false if there is a syntax error in the string.
3771
 */
3772
bool
3773
SplitGUCList(char *rawstring, char separator,
3774
       List **namelist)
3775
0
{
3776
0
  char     *nextp = rawstring;
3777
0
  bool    done = false;
3778
3779
0
  *namelist = NIL;
3780
3781
0
  while (scanner_isspace(*nextp))
3782
0
    nextp++;       /* skip leading whitespace */
3783
3784
0
  if (*nextp == '\0')
3785
0
    return true;     /* allow empty string */
3786
3787
  /* At the top of the loop, we are at start of a new identifier. */
3788
0
  do
3789
0
  {
3790
0
    char     *curname;
3791
0
    char     *endp;
3792
3793
0
    if (*nextp == '"')
3794
0
    {
3795
      /* Quoted name --- collapse quote-quote pairs */
3796
0
      curname = nextp + 1;
3797
0
      for (;;)
3798
0
      {
3799
0
        endp = strchr(nextp + 1, '"');
3800
0
        if (endp == NULL)
3801
0
          return false; /* mismatched quotes */
3802
0
        if (endp[1] != '"')
3803
0
          break;   /* found end of quoted name */
3804
        /* Collapse adjacent quotes into one quote, and look again */
3805
0
        memmove(endp, endp + 1, strlen(endp));
3806
0
        nextp = endp;
3807
0
      }
3808
      /* endp now points at the terminating quote */
3809
0
      nextp = endp + 1;
3810
0
    }
3811
0
    else
3812
0
    {
3813
      /* Unquoted name --- extends to separator or whitespace */
3814
0
      curname = nextp;
3815
0
      while (*nextp && *nextp != separator &&
3816
0
           !scanner_isspace(*nextp))
3817
0
        nextp++;
3818
0
      endp = nextp;
3819
0
      if (curname == nextp)
3820
0
        return false; /* empty unquoted name not allowed */
3821
0
    }
3822
3823
0
    while (scanner_isspace(*nextp))
3824
0
      nextp++;     /* skip trailing whitespace */
3825
3826
0
    if (*nextp == separator)
3827
0
    {
3828
0
      nextp++;
3829
0
      while (scanner_isspace(*nextp))
3830
0
        nextp++;   /* skip leading whitespace for next */
3831
      /* we expect another name, so done remains false */
3832
0
    }
3833
0
    else if (*nextp == '\0')
3834
0
      done = true;
3835
0
    else
3836
0
      return false;   /* invalid syntax */
3837
3838
    /* Now safe to overwrite separator with a null */
3839
0
    *endp = '\0';
3840
3841
    /*
3842
     * Finished isolating current name --- add it to list
3843
     */
3844
0
    *namelist = lappend(*namelist, curname);
3845
3846
    /* Loop back if we didn't reach end of string */
3847
0
  } while (!done);
3848
3849
0
  return true;
3850
0
}
3851
3852
3853
/*****************************************************************************
3854
 *  Comparison Functions used for bytea
3855
 *
3856
 * Note: btree indexes need these routines not to leak memory; therefore,
3857
 * be careful to free working copies of toasted datums.  Most places don't
3858
 * need to be so careful.
3859
 *****************************************************************************/
3860
3861
Datum
3862
byteaeq(PG_FUNCTION_ARGS)
3863
0
{
3864
0
  Datum   arg1 = PG_GETARG_DATUM(0);
3865
0
  Datum   arg2 = PG_GETARG_DATUM(1);
3866
0
  bool    result;
3867
0
  Size    len1,
3868
0
        len2;
3869
3870
  /*
3871
   * We can use a fast path for unequal lengths, which might save us from
3872
   * having to detoast one or both values.
3873
   */
3874
0
  len1 = toast_raw_datum_size(arg1);
3875
0
  len2 = toast_raw_datum_size(arg2);
3876
0
  if (len1 != len2)
3877
0
    result = false;
3878
0
  else
3879
0
  {
3880
0
    bytea    *barg1 = DatumGetByteaPP(arg1);
3881
0
    bytea    *barg2 = DatumGetByteaPP(arg2);
3882
3883
0
    result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
3884
0
             len1 - VARHDRSZ) == 0);
3885
3886
0
    PG_FREE_IF_COPY(barg1, 0);
3887
0
    PG_FREE_IF_COPY(barg2, 1);
3888
0
  }
3889
3890
0
  PG_RETURN_BOOL(result);
3891
0
}
3892
3893
Datum
3894
byteane(PG_FUNCTION_ARGS)
3895
0
{
3896
0
  Datum   arg1 = PG_GETARG_DATUM(0);
3897
0
  Datum   arg2 = PG_GETARG_DATUM(1);
3898
0
  bool    result;
3899
0
  Size    len1,
3900
0
        len2;
3901
3902
  /*
3903
   * We can use a fast path for unequal lengths, which might save us from
3904
   * having to detoast one or both values.
3905
   */
3906
0
  len1 = toast_raw_datum_size(arg1);
3907
0
  len2 = toast_raw_datum_size(arg2);
3908
0
  if (len1 != len2)
3909
0
    result = true;
3910
0
  else
3911
0
  {
3912
0
    bytea    *barg1 = DatumGetByteaPP(arg1);
3913
0
    bytea    *barg2 = DatumGetByteaPP(arg2);
3914
3915
0
    result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
3916
0
             len1 - VARHDRSZ) != 0);
3917
3918
0
    PG_FREE_IF_COPY(barg1, 0);
3919
0
    PG_FREE_IF_COPY(barg2, 1);
3920
0
  }
3921
3922
0
  PG_RETURN_BOOL(result);
3923
0
}
3924
3925
Datum
3926
bytealt(PG_FUNCTION_ARGS)
3927
0
{
3928
0
  bytea    *arg1 = PG_GETARG_BYTEA_PP(0);
3929
0
  bytea    *arg2 = PG_GETARG_BYTEA_PP(1);
3930
0
  int     len1,
3931
0
        len2;
3932
0
  int     cmp;
3933
3934
0
  len1 = VARSIZE_ANY_EXHDR(arg1);
3935
0
  len2 = VARSIZE_ANY_EXHDR(arg2);
3936
3937
0
  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3938
3939
0
  PG_FREE_IF_COPY(arg1, 0);
3940
0
  PG_FREE_IF_COPY(arg2, 1);
3941
3942
0
  PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 < len2)));
3943
0
}
3944
3945
Datum
3946
byteale(PG_FUNCTION_ARGS)
3947
0
{
3948
0
  bytea    *arg1 = PG_GETARG_BYTEA_PP(0);
3949
0
  bytea    *arg2 = PG_GETARG_BYTEA_PP(1);
3950
0
  int     len1,
3951
0
        len2;
3952
0
  int     cmp;
3953
3954
0
  len1 = VARSIZE_ANY_EXHDR(arg1);
3955
0
  len2 = VARSIZE_ANY_EXHDR(arg2);
3956
3957
0
  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3958
3959
0
  PG_FREE_IF_COPY(arg1, 0);
3960
0
  PG_FREE_IF_COPY(arg2, 1);
3961
3962
0
  PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 <= len2)));
3963
0
}
3964
3965
Datum
3966
byteagt(PG_FUNCTION_ARGS)
3967
0
{
3968
0
  bytea    *arg1 = PG_GETARG_BYTEA_PP(0);
3969
0
  bytea    *arg2 = PG_GETARG_BYTEA_PP(1);
3970
0
  int     len1,
3971
0
        len2;
3972
0
  int     cmp;
3973
3974
0
  len1 = VARSIZE_ANY_EXHDR(arg1);
3975
0
  len2 = VARSIZE_ANY_EXHDR(arg2);
3976
3977
0
  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3978
3979
0
  PG_FREE_IF_COPY(arg1, 0);
3980
0
  PG_FREE_IF_COPY(arg2, 1);
3981
3982
0
  PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 > len2)));
3983
0
}
3984
3985
Datum
3986
byteage(PG_FUNCTION_ARGS)
3987
0
{
3988
0
  bytea    *arg1 = PG_GETARG_BYTEA_PP(0);
3989
0
  bytea    *arg2 = PG_GETARG_BYTEA_PP(1);
3990
0
  int     len1,
3991
0
        len2;
3992
0
  int     cmp;
3993
3994
0
  len1 = VARSIZE_ANY_EXHDR(arg1);
3995
0
  len2 = VARSIZE_ANY_EXHDR(arg2);
3996
3997
0
  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3998
3999
0
  PG_FREE_IF_COPY(arg1, 0);
4000
0
  PG_FREE_IF_COPY(arg2, 1);
4001
4002
0
  PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 >= len2)));
4003
0
}
4004
4005
Datum
4006
byteacmp(PG_FUNCTION_ARGS)
4007
0
{
4008
0
  bytea    *arg1 = PG_GETARG_BYTEA_PP(0);
4009
0
  bytea    *arg2 = PG_GETARG_BYTEA_PP(1);
4010
0
  int     len1,
4011
0
        len2;
4012
0
  int     cmp;
4013
4014
0
  len1 = VARSIZE_ANY_EXHDR(arg1);
4015
0
  len2 = VARSIZE_ANY_EXHDR(arg2);
4016
4017
0
  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4018
0
  if ((cmp == 0) && (len1 != len2))
4019
0
    cmp = (len1 < len2) ? -1 : 1;
4020
4021
0
  PG_FREE_IF_COPY(arg1, 0);
4022
0
  PG_FREE_IF_COPY(arg2, 1);
4023
4024
0
  PG_RETURN_INT32(cmp);
4025
0
}
4026
4027
Datum
4028
bytea_larger(PG_FUNCTION_ARGS)
4029
0
{
4030
0
  bytea    *arg1 = PG_GETARG_BYTEA_PP(0);
4031
0
  bytea    *arg2 = PG_GETARG_BYTEA_PP(1);
4032
0
  bytea    *result;
4033
0
  int     len1,
4034
0
        len2;
4035
0
  int     cmp;
4036
4037
0
  len1 = VARSIZE_ANY_EXHDR(arg1);
4038
0
  len2 = VARSIZE_ANY_EXHDR(arg2);
4039
4040
0
  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4041
0
  result = ((cmp > 0) || ((cmp == 0) && (len1 > len2)) ? arg1 : arg2);
4042
4043
0
  PG_RETURN_BYTEA_P(result);
4044
0
}
4045
4046
Datum
4047
bytea_smaller(PG_FUNCTION_ARGS)
4048
0
{
4049
0
  bytea    *arg1 = PG_GETARG_BYTEA_PP(0);
4050
0
  bytea    *arg2 = PG_GETARG_BYTEA_PP(1);
4051
0
  bytea    *result;
4052
0
  int     len1,
4053
0
        len2;
4054
0
  int     cmp;
4055
4056
0
  len1 = VARSIZE_ANY_EXHDR(arg1);
4057
0
  len2 = VARSIZE_ANY_EXHDR(arg2);
4058
4059
0
  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4060
0
  result = ((cmp < 0) || ((cmp == 0) && (len1 < len2)) ? arg1 : arg2);
4061
4062
0
  PG_RETURN_BYTEA_P(result);
4063
0
}
4064
4065
Datum
4066
bytea_sortsupport(PG_FUNCTION_ARGS)
4067
0
{
4068
0
  SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
4069
0
  MemoryContext oldcontext;
4070
4071
0
  oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
4072
4073
  /* Use generic string SortSupport, forcing "C" collation */
4074
0
  varstr_sortsupport(ssup, BYTEAOID, C_COLLATION_OID);
4075
4076
0
  MemoryContextSwitchTo(oldcontext);
4077
4078
0
  PG_RETURN_VOID();
4079
0
}
4080
4081
/* Cast bytea -> int2 */
4082
Datum
4083
bytea_int2(PG_FUNCTION_ARGS)
4084
0
{
4085
0
  bytea    *v = PG_GETARG_BYTEA_PP(0);
4086
0
  int     len = VARSIZE_ANY_EXHDR(v);
4087
0
  uint16    result;
4088
4089
  /* Check that the byte array is not too long */
4090
0
  if (len > sizeof(result))
4091
0
    ereport(ERROR,
4092
0
        errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
4093
0
        errmsg("smallint out of range"));
4094
4095
  /* Convert it to an integer; most significant bytes come first */
4096
0
  result = 0;
4097
0
  for (int i = 0; i < len; i++)
4098
0
  {
4099
0
    result <<= BITS_PER_BYTE;
4100
0
    result |= ((unsigned char *) VARDATA_ANY(v))[i];
4101
0
  }
4102
4103
0
  PG_RETURN_INT16(result);
4104
0
}
4105
4106
/* Cast bytea -> int4 */
4107
Datum
4108
bytea_int4(PG_FUNCTION_ARGS)
4109
0
{
4110
0
  bytea    *v = PG_GETARG_BYTEA_PP(0);
4111
0
  int     len = VARSIZE_ANY_EXHDR(v);
4112
0
  uint32    result;
4113
4114
  /* Check that the byte array is not too long */
4115
0
  if (len > sizeof(result))
4116
0
    ereport(ERROR,
4117
0
        errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
4118
0
        errmsg("integer out of range"));
4119
4120
  /* Convert it to an integer; most significant bytes come first */
4121
0
  result = 0;
4122
0
  for (int i = 0; i < len; i++)
4123
0
  {
4124
0
    result <<= BITS_PER_BYTE;
4125
0
    result |= ((unsigned char *) VARDATA_ANY(v))[i];
4126
0
  }
4127
4128
0
  PG_RETURN_INT32(result);
4129
0
}
4130
4131
/* Cast bytea -> int8 */
4132
Datum
4133
bytea_int8(PG_FUNCTION_ARGS)
4134
0
{
4135
0
  bytea    *v = PG_GETARG_BYTEA_PP(0);
4136
0
  int     len = VARSIZE_ANY_EXHDR(v);
4137
0
  uint64    result;
4138
4139
  /* Check that the byte array is not too long */
4140
0
  if (len > sizeof(result))
4141
0
    ereport(ERROR,
4142
0
        errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
4143
0
        errmsg("bigint out of range"));
4144
4145
  /* Convert it to an integer; most significant bytes come first */
4146
0
  result = 0;
4147
0
  for (int i = 0; i < len; i++)
4148
0
  {
4149
0
    result <<= BITS_PER_BYTE;
4150
0
    result |= ((unsigned char *) VARDATA_ANY(v))[i];
4151
0
  }
4152
4153
0
  PG_RETURN_INT64(result);
4154
0
}
4155
4156
/* Cast int2 -> bytea; can just use int2send() */
4157
Datum
4158
int2_bytea(PG_FUNCTION_ARGS)
4159
0
{
4160
0
  return int2send(fcinfo);
4161
0
}
4162
4163
/* Cast int4 -> bytea; can just use int4send() */
4164
Datum
4165
int4_bytea(PG_FUNCTION_ARGS)
4166
0
{
4167
0
  return int4send(fcinfo);
4168
0
}
4169
4170
/* Cast int8 -> bytea; can just use int8send() */
4171
Datum
4172
int8_bytea(PG_FUNCTION_ARGS)
4173
0
{
4174
0
  return int8send(fcinfo);
4175
0
}
4176
4177
/*
4178
 * appendStringInfoText
4179
 *
4180
 * Append a text to str.
4181
 * Like appendStringInfoString(str, text_to_cstring(t)) but faster.
4182
 */
4183
static void
4184
appendStringInfoText(StringInfo str, const text *t)
4185
0
{
4186
0
  appendBinaryStringInfo(str, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
4187
0
}
4188
4189
/*
4190
 * replace_text
4191
 * replace all occurrences of 'old_sub_str' in 'orig_str'
4192
 * with 'new_sub_str' to form 'new_str'
4193
 *
4194
 * returns 'orig_str' if 'old_sub_str' == '' or 'orig_str' == ''
4195
 * otherwise returns 'new_str'
4196
 */
4197
Datum
4198
replace_text(PG_FUNCTION_ARGS)
4199
0
{
4200
0
  text     *src_text = PG_GETARG_TEXT_PP(0);
4201
0
  text     *from_sub_text = PG_GETARG_TEXT_PP(1);
4202
0
  text     *to_sub_text = PG_GETARG_TEXT_PP(2);
4203
0
  int     src_text_len;
4204
0
  int     from_sub_text_len;
4205
0
  TextPositionState state;
4206
0
  text     *ret_text;
4207
0
  int     chunk_len;
4208
0
  char     *curr_ptr;
4209
0
  char     *start_ptr;
4210
0
  StringInfoData str;
4211
0
  bool    found;
4212
4213
0
  src_text_len = VARSIZE_ANY_EXHDR(src_text);
4214
0
  from_sub_text_len = VARSIZE_ANY_EXHDR(from_sub_text);
4215
4216
  /* Return unmodified source string if empty source or pattern */
4217
0
  if (src_text_len < 1 || from_sub_text_len < 1)
4218
0
  {
4219
0
    PG_RETURN_TEXT_P(src_text);
4220
0
  }
4221
4222
0
  text_position_setup(src_text, from_sub_text, PG_GET_COLLATION(), &state);
4223
4224
0
  found = text_position_next(&state);
4225
4226
  /* When the from_sub_text is not found, there is nothing to do. */
4227
0
  if (!found)
4228
0
  {
4229
0
    text_position_cleanup(&state);
4230
0
    PG_RETURN_TEXT_P(src_text);
4231
0
  }
4232
0
  curr_ptr = text_position_get_match_ptr(&state);
4233
0
  start_ptr = VARDATA_ANY(src_text);
4234
4235
0
  initStringInfo(&str);
4236
4237
0
  do
4238
0
  {
4239
0
    CHECK_FOR_INTERRUPTS();
4240
4241
    /* copy the data skipped over by last text_position_next() */
4242
0
    chunk_len = curr_ptr - start_ptr;
4243
0
    appendBinaryStringInfo(&str, start_ptr, chunk_len);
4244
4245
0
    appendStringInfoText(&str, to_sub_text);
4246
4247
0
    start_ptr = curr_ptr + state.last_match_len;
4248
4249
0
    found = text_position_next(&state);
4250
0
    if (found)
4251
0
      curr_ptr = text_position_get_match_ptr(&state);
4252
0
  }
4253
0
  while (found);
4254
4255
  /* copy trailing data */
4256
0
  chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4257
0
  appendBinaryStringInfo(&str, start_ptr, chunk_len);
4258
4259
0
  text_position_cleanup(&state);
4260
4261
0
  ret_text = cstring_to_text_with_len(str.data, str.len);
4262
0
  pfree(str.data);
4263
4264
0
  PG_RETURN_TEXT_P(ret_text);
4265
0
}
4266
4267
/*
4268
 * check_replace_text_has_escape
4269
 *
4270
 * Returns 0 if text contains no backslashes that need processing.
4271
 * Returns 1 if text contains backslashes, but not regexp submatch specifiers.
4272
 * Returns 2 if text contains regexp submatch specifiers (\1 .. \9).
4273
 */
4274
static int
4275
check_replace_text_has_escape(const text *replace_text)
4276
0
{
4277
0
  int     result = 0;
4278
0
  const char *p = VARDATA_ANY(replace_text);
4279
0
  const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
4280
4281
0
  while (p < p_end)
4282
0
  {
4283
    /* Find next escape char, if any. */
4284
0
    p = memchr(p, '\\', p_end - p);
4285
0
    if (p == NULL)
4286
0
      break;
4287
0
    p++;
4288
    /* Note: a backslash at the end doesn't require extra processing. */
4289
0
    if (p < p_end)
4290
0
    {
4291
0
      if (*p >= '1' && *p <= '9')
4292
0
        return 2;   /* Found a submatch specifier, so done */
4293
0
      result = 1;     /* Found some other sequence, keep looking */
4294
0
      p++;
4295
0
    }
4296
0
  }
4297
0
  return result;
4298
0
}
4299
4300
/*
4301
 * appendStringInfoRegexpSubstr
4302
 *
4303
 * Append replace_text to str, substituting regexp back references for
4304
 * \n escapes.  start_ptr is the start of the match in the source string,
4305
 * at logical character position data_pos.
4306
 */
4307
static void
4308
appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
4309
               regmatch_t *pmatch,
4310
               char *start_ptr, int data_pos)
4311
0
{
4312
0
  const char *p = VARDATA_ANY(replace_text);
4313
0
  const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
4314
4315
0
  while (p < p_end)
4316
0
  {
4317
0
    const char *chunk_start = p;
4318
0
    int     so;
4319
0
    int     eo;
4320
4321
    /* Find next escape char, if any. */
4322
0
    p = memchr(p, '\\', p_end - p);
4323
0
    if (p == NULL)
4324
0
      p = p_end;
4325
4326
    /* Copy the text we just scanned over, if any. */
4327
0
    if (p > chunk_start)
4328
0
      appendBinaryStringInfo(str, chunk_start, p - chunk_start);
4329
4330
    /* Done if at end of string, else advance over escape char. */
4331
0
    if (p >= p_end)
4332
0
      break;
4333
0
    p++;
4334
4335
0
    if (p >= p_end)
4336
0
    {
4337
      /* Escape at very end of input.  Treat same as unexpected char */
4338
0
      appendStringInfoChar(str, '\\');
4339
0
      break;
4340
0
    }
4341
4342
0
    if (*p >= '1' && *p <= '9')
4343
0
    {
4344
      /* Use the back reference of regexp. */
4345
0
      int     idx = *p - '0';
4346
4347
0
      so = pmatch[idx].rm_so;
4348
0
      eo = pmatch[idx].rm_eo;
4349
0
      p++;
4350
0
    }
4351
0
    else if (*p == '&')
4352
0
    {
4353
      /* Use the entire matched string. */
4354
0
      so = pmatch[0].rm_so;
4355
0
      eo = pmatch[0].rm_eo;
4356
0
      p++;
4357
0
    }
4358
0
    else if (*p == '\\')
4359
0
    {
4360
      /* \\ means transfer one \ to output. */
4361
0
      appendStringInfoChar(str, '\\');
4362
0
      p++;
4363
0
      continue;
4364
0
    }
4365
0
    else
4366
0
    {
4367
      /*
4368
       * If escape char is not followed by any expected char, just treat
4369
       * it as ordinary data to copy.  (XXX would it be better to throw
4370
       * an error?)
4371
       */
4372
0
      appendStringInfoChar(str, '\\');
4373
0
      continue;
4374
0
    }
4375
4376
0
    if (so >= 0 && eo >= 0)
4377
0
    {
4378
      /*
4379
       * Copy the text that is back reference of regexp.  Note so and eo
4380
       * are counted in characters not bytes.
4381
       */
4382
0
      char     *chunk_start;
4383
0
      int     chunk_len;
4384
4385
0
      Assert(so >= data_pos);
4386
0
      chunk_start = start_ptr;
4387
0
      chunk_start += charlen_to_bytelen(chunk_start, so - data_pos);
4388
0
      chunk_len = charlen_to_bytelen(chunk_start, eo - so);
4389
0
      appendBinaryStringInfo(str, chunk_start, chunk_len);
4390
0
    }
4391
0
  }
4392
0
}
4393
4394
/*
4395
 * replace_text_regexp
4396
 *
4397
 * replace substring(s) in src_text that match pattern with replace_text.
4398
 * The replace_text can contain backslash markers to substitute
4399
 * (parts of) the matched text.
4400
 *
4401
 * cflags: regexp compile flags.
4402
 * collation: collation to use.
4403
 * search_start: the character (not byte) offset in src_text at which to
4404
 * begin searching.
4405
 * n: if 0, replace all matches; if > 0, replace only the N'th match.
4406
 */
4407
text *
4408
replace_text_regexp(text *src_text, text *pattern_text,
4409
          text *replace_text,
4410
          int cflags, Oid collation,
4411
          int search_start, int n)
4412
0
{
4413
0
  text     *ret_text;
4414
0
  regex_t    *re;
4415
0
  int     src_text_len = VARSIZE_ANY_EXHDR(src_text);
4416
0
  int     nmatches = 0;
4417
0
  StringInfoData buf;
4418
0
  regmatch_t  pmatch[10];   /* main match, plus \1 to \9 */
4419
0
  int     nmatch = lengthof(pmatch);
4420
0
  pg_wchar   *data;
4421
0
  size_t    data_len;
4422
0
  int     data_pos;
4423
0
  char     *start_ptr;
4424
0
  int     escape_status;
4425
4426
0
  initStringInfo(&buf);
4427
4428
  /* Convert data string to wide characters. */
4429
0
  data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar));
4430
0
  data_len = pg_mb2wchar_with_len(VARDATA_ANY(src_text), data, src_text_len);
4431
4432
  /* Check whether replace_text has escapes, especially regexp submatches. */
4433
0
  escape_status = check_replace_text_has_escape(replace_text);
4434
4435
  /* If no regexp submatches, we can use REG_NOSUB. */
4436
0
  if (escape_status < 2)
4437
0
  {
4438
0
    cflags |= REG_NOSUB;
4439
    /* Also tell pg_regexec we only want the whole-match location. */
4440
0
    nmatch = 1;
4441
0
  }
4442
4443
  /* Prepare the regexp. */
4444
0
  re = RE_compile_and_cache(pattern_text, cflags, collation);
4445
4446
  /* start_ptr points to the data_pos'th character of src_text */
4447
0
  start_ptr = (char *) VARDATA_ANY(src_text);
4448
0
  data_pos = 0;
4449
4450
0
  while (search_start <= data_len)
4451
0
  {
4452
0
    int     regexec_result;
4453
4454
0
    CHECK_FOR_INTERRUPTS();
4455
4456
0
    regexec_result = pg_regexec(re,
4457
0
                  data,
4458
0
                  data_len,
4459
0
                  search_start,
4460
0
                  NULL, /* no details */
4461
0
                  nmatch,
4462
0
                  pmatch,
4463
0
                  0);
4464
4465
0
    if (regexec_result == REG_NOMATCH)
4466
0
      break;
4467
4468
0
    if (regexec_result != REG_OKAY)
4469
0
    {
4470
0
      char    errMsg[100];
4471
4472
0
      pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
4473
0
      ereport(ERROR,
4474
0
          (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
4475
0
           errmsg("regular expression failed: %s", errMsg)));
4476
0
    }
4477
4478
    /*
4479
     * Count matches, and decide whether to replace this match.
4480
     */
4481
0
    nmatches++;
4482
0
    if (n > 0 && nmatches != n)
4483
0
    {
4484
      /*
4485
       * No, so advance search_start, but not start_ptr/data_pos. (Thus,
4486
       * we treat the matched text as if it weren't matched, and copy it
4487
       * to the output later.)
4488
       */
4489
0
      search_start = pmatch[0].rm_eo;
4490
0
      if (pmatch[0].rm_so == pmatch[0].rm_eo)
4491
0
        search_start++;
4492
0
      continue;
4493
0
    }
4494
4495
    /*
4496
     * Copy the text to the left of the match position.  Note we are given
4497
     * character not byte indexes.
4498
     */
4499
0
    if (pmatch[0].rm_so - data_pos > 0)
4500
0
    {
4501
0
      int     chunk_len;
4502
4503
0
      chunk_len = charlen_to_bytelen(start_ptr,
4504
0
                       pmatch[0].rm_so - data_pos);
4505
0
      appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4506
4507
      /*
4508
       * Advance start_ptr over that text, to avoid multiple rescans of
4509
       * it if the replace_text contains multiple back-references.
4510
       */
4511
0
      start_ptr += chunk_len;
4512
0
      data_pos = pmatch[0].rm_so;
4513
0
    }
4514
4515
    /*
4516
     * Copy the replace_text, processing escapes if any are present.
4517
     */
4518
0
    if (escape_status > 0)
4519
0
      appendStringInfoRegexpSubstr(&buf, replace_text, pmatch,
4520
0
                     start_ptr, data_pos);
4521
0
    else
4522
0
      appendStringInfoText(&buf, replace_text);
4523
4524
    /* Advance start_ptr and data_pos over the matched text. */
4525
0
    start_ptr += charlen_to_bytelen(start_ptr,
4526
0
                    pmatch[0].rm_eo - data_pos);
4527
0
    data_pos = pmatch[0].rm_eo;
4528
4529
    /*
4530
     * If we only want to replace one occurrence, we're done.
4531
     */
4532
0
    if (n > 0)
4533
0
      break;
4534
4535
    /*
4536
     * Advance search position.  Normally we start the next search at the
4537
     * end of the previous match; but if the match was of zero length, we
4538
     * have to advance by one character, or we'd just find the same match
4539
     * again.
4540
     */
4541
0
    search_start = data_pos;
4542
0
    if (pmatch[0].rm_so == pmatch[0].rm_eo)
4543
0
      search_start++;
4544
0
  }
4545
4546
  /*
4547
   * Copy the text to the right of the last match.
4548
   */
4549
0
  if (data_pos < data_len)
4550
0
  {
4551
0
    int     chunk_len;
4552
4553
0
    chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4554
0
    appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4555
0
  }
4556
4557
0
  ret_text = cstring_to_text_with_len(buf.data, buf.len);
4558
0
  pfree(buf.data);
4559
0
  pfree(data);
4560
4561
0
  return ret_text;
4562
0
}
4563
4564
/*
4565
 * split_part
4566
 * parse input string based on provided field separator
4567
 * return N'th item (1 based, negative counts from end)
4568
 */
4569
Datum
4570
split_part(PG_FUNCTION_ARGS)
4571
0
{
4572
0
  text     *inputstring = PG_GETARG_TEXT_PP(0);
4573
0
  text     *fldsep = PG_GETARG_TEXT_PP(1);
4574
0
  int     fldnum = PG_GETARG_INT32(2);
4575
0
  int     inputstring_len;
4576
0
  int     fldsep_len;
4577
0
  TextPositionState state;
4578
0
  char     *start_ptr;
4579
0
  char     *end_ptr;
4580
0
  text     *result_text;
4581
0
  bool    found;
4582
4583
  /* field number is 1 based */
4584
0
  if (fldnum == 0)
4585
0
    ereport(ERROR,
4586
0
        (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4587
0
         errmsg("field position must not be zero")));
4588
4589
0
  inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4590
0
  fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
4591
4592
  /* return empty string for empty input string */
4593
0
  if (inputstring_len < 1)
4594
0
    PG_RETURN_TEXT_P(cstring_to_text(""));
4595
4596
  /* handle empty field separator */
4597
0
  if (fldsep_len < 1)
4598
0
  {
4599
    /* if first or last field, return input string, else empty string */
4600
0
    if (fldnum == 1 || fldnum == -1)
4601
0
      PG_RETURN_TEXT_P(inputstring);
4602
0
    else
4603
0
      PG_RETURN_TEXT_P(cstring_to_text(""));
4604
0
  }
4605
4606
  /* find the first field separator */
4607
0
  text_position_setup(inputstring, fldsep, PG_GET_COLLATION(), &state);
4608
4609
0
  found = text_position_next(&state);
4610
4611
  /* special case if fldsep not found at all */
4612
0
  if (!found)
4613
0
  {
4614
0
    text_position_cleanup(&state);
4615
    /* if first or last field, return input string, else empty string */
4616
0
    if (fldnum == 1 || fldnum == -1)
4617
0
      PG_RETURN_TEXT_P(inputstring);
4618
0
    else
4619
0
      PG_RETURN_TEXT_P(cstring_to_text(""));
4620
0
  }
4621
4622
  /*
4623
   * take care of a negative field number (i.e. count from the right) by
4624
   * converting to a positive field number; we need total number of fields
4625
   */
4626
0
  if (fldnum < 0)
4627
0
  {
4628
    /* we found a fldsep, so there are at least two fields */
4629
0
    int     numfields = 2;
4630
4631
0
    while (text_position_next(&state))
4632
0
      numfields++;
4633
4634
    /* special case of last field does not require an extra pass */
4635
0
    if (fldnum == -1)
4636
0
    {
4637
0
      start_ptr = text_position_get_match_ptr(&state) + state.last_match_len;
4638
0
      end_ptr = VARDATA_ANY(inputstring) + inputstring_len;
4639
0
      text_position_cleanup(&state);
4640
0
      PG_RETURN_TEXT_P(cstring_to_text_with_len(start_ptr,
4641
0
                            end_ptr - start_ptr));
4642
0
    }
4643
4644
    /* else, convert fldnum to positive notation */
4645
0
    fldnum += numfields + 1;
4646
4647
    /* if nonexistent field, return empty string */
4648
0
    if (fldnum <= 0)
4649
0
    {
4650
0
      text_position_cleanup(&state);
4651
0
      PG_RETURN_TEXT_P(cstring_to_text(""));
4652
0
    }
4653
4654
    /* reset to pointing at first match, but now with positive fldnum */
4655
0
    text_position_reset(&state);
4656
0
    found = text_position_next(&state);
4657
0
    Assert(found);
4658
0
  }
4659
4660
  /* identify bounds of first field */
4661
0
  start_ptr = VARDATA_ANY(inputstring);
4662
0
  end_ptr = text_position_get_match_ptr(&state);
4663
4664
0
  while (found && --fldnum > 0)
4665
0
  {
4666
    /* identify bounds of next field */
4667
0
    start_ptr = end_ptr + state.last_match_len;
4668
0
    found = text_position_next(&state);
4669
0
    if (found)
4670
0
      end_ptr = text_position_get_match_ptr(&state);
4671
0
  }
4672
4673
0
  text_position_cleanup(&state);
4674
4675
0
  if (fldnum > 0)
4676
0
  {
4677
    /* N'th field separator not found */
4678
    /* if last field requested, return it, else empty string */
4679
0
    if (fldnum == 1)
4680
0
    {
4681
0
      int     last_len = start_ptr - VARDATA_ANY(inputstring);
4682
4683
0
      result_text = cstring_to_text_with_len(start_ptr,
4684
0
                           inputstring_len - last_len);
4685
0
    }
4686
0
    else
4687
0
      result_text = cstring_to_text("");
4688
0
  }
4689
0
  else
4690
0
  {
4691
    /* non-last field requested */
4692
0
    result_text = cstring_to_text_with_len(start_ptr, end_ptr - start_ptr);
4693
0
  }
4694
4695
0
  PG_RETURN_TEXT_P(result_text);
4696
0
}
4697
4698
/*
4699
 * Convenience function to return true when two text params are equal.
4700
 */
4701
static bool
4702
text_isequal(text *txt1, text *txt2, Oid collid)
4703
0
{
4704
0
  return DatumGetBool(DirectFunctionCall2Coll(texteq,
4705
0
                        collid,
4706
0
                        PointerGetDatum(txt1),
4707
0
                        PointerGetDatum(txt2)));
4708
0
}
4709
4710
/*
4711
 * text_to_array
4712
 * parse input string and return text array of elements,
4713
 * based on provided field separator
4714
 */
4715
Datum
4716
text_to_array(PG_FUNCTION_ARGS)
4717
0
{
4718
0
  SplitTextOutputData tstate;
4719
4720
  /* For array output, tstate should start as all zeroes */
4721
0
  memset(&tstate, 0, sizeof(tstate));
4722
4723
0
  if (!split_text(fcinfo, &tstate))
4724
0
    PG_RETURN_NULL();
4725
4726
0
  if (tstate.astate == NULL)
4727
0
    PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID));
4728
4729
0
  PG_RETURN_DATUM(makeArrayResult(tstate.astate,
4730
0
                  CurrentMemoryContext));
4731
0
}
4732
4733
/*
4734
 * text_to_array_null
4735
 * parse input string and return text array of elements,
4736
 * based on provided field separator and null string
4737
 *
4738
 * This is a separate entry point only to prevent the regression tests from
4739
 * complaining about different argument sets for the same internal function.
4740
 */
4741
Datum
4742
text_to_array_null(PG_FUNCTION_ARGS)
4743
0
{
4744
0
  return text_to_array(fcinfo);
4745
0
}
4746
4747
/*
4748
 * text_to_table
4749
 * parse input string and return table of elements,
4750
 * based on provided field separator
4751
 */
4752
Datum
4753
text_to_table(PG_FUNCTION_ARGS)
4754
0
{
4755
0
  ReturnSetInfo *rsi = (ReturnSetInfo *) fcinfo->resultinfo;
4756
0
  SplitTextOutputData tstate;
4757
4758
0
  tstate.astate = NULL;
4759
0
  InitMaterializedSRF(fcinfo, MAT_SRF_USE_EXPECTED_DESC);
4760
0
  tstate.tupstore = rsi->setResult;
4761
0
  tstate.tupdesc = rsi->setDesc;
4762
4763
0
  (void) split_text(fcinfo, &tstate);
4764
4765
0
  return (Datum) 0;
4766
0
}
4767
4768
/*
4769
 * text_to_table_null
4770
 * parse input string and return table of elements,
4771
 * based on provided field separator and null string
4772
 *
4773
 * This is a separate entry point only to prevent the regression tests from
4774
 * complaining about different argument sets for the same internal function.
4775
 */
4776
Datum
4777
text_to_table_null(PG_FUNCTION_ARGS)
4778
0
{
4779
0
  return text_to_table(fcinfo);
4780
0
}
4781
4782
/*
4783
 * Common code for text_to_array, text_to_array_null, text_to_table
4784
 * and text_to_table_null functions.
4785
 *
4786
 * These are not strict so we have to test for null inputs explicitly.
4787
 * Returns false if result is to be null, else returns true.
4788
 *
4789
 * Note that if the result is valid but empty (zero elements), we return
4790
 * without changing *tstate --- caller must handle that case, too.
4791
 */
4792
static bool
4793
split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate)
4794
0
{
4795
0
  text     *inputstring;
4796
0
  text     *fldsep;
4797
0
  text     *null_string;
4798
0
  Oid     collation = PG_GET_COLLATION();
4799
0
  int     inputstring_len;
4800
0
  int     fldsep_len;
4801
0
  char     *start_ptr;
4802
0
  text     *result_text;
4803
4804
  /* when input string is NULL, then result is NULL too */
4805
0
  if (PG_ARGISNULL(0))
4806
0
    return false;
4807
4808
0
  inputstring = PG_GETARG_TEXT_PP(0);
4809
4810
  /* fldsep can be NULL */
4811
0
  if (!PG_ARGISNULL(1))
4812
0
    fldsep = PG_GETARG_TEXT_PP(1);
4813
0
  else
4814
0
    fldsep = NULL;
4815
4816
  /* null_string can be NULL or omitted */
4817
0
  if (PG_NARGS() > 2 && !PG_ARGISNULL(2))
4818
0
    null_string = PG_GETARG_TEXT_PP(2);
4819
0
  else
4820
0
    null_string = NULL;
4821
4822
0
  if (fldsep != NULL)
4823
0
  {
4824
    /*
4825
     * Normal case with non-null fldsep.  Use the text_position machinery
4826
     * to search for occurrences of fldsep.
4827
     */
4828
0
    TextPositionState state;
4829
4830
0
    inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4831
0
    fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
4832
4833
    /* return empty set for empty input string */
4834
0
    if (inputstring_len < 1)
4835
0
      return true;
4836
4837
    /* empty field separator: return input string as a one-element set */
4838
0
    if (fldsep_len < 1)
4839
0
    {
4840
0
      split_text_accum_result(tstate, inputstring,
4841
0
                  null_string, collation);
4842
0
      return true;
4843
0
    }
4844
4845
0
    text_position_setup(inputstring, fldsep, collation, &state);
4846
4847
0
    start_ptr = VARDATA_ANY(inputstring);
4848
4849
0
    for (;;)
4850
0
    {
4851
0
      bool    found;
4852
0
      char     *end_ptr;
4853
0
      int     chunk_len;
4854
4855
0
      CHECK_FOR_INTERRUPTS();
4856
4857
0
      found = text_position_next(&state);
4858
0
      if (!found)
4859
0
      {
4860
        /* fetch last field */
4861
0
        chunk_len = ((char *) inputstring + VARSIZE_ANY(inputstring)) - start_ptr;
4862
0
        end_ptr = NULL; /* not used, but some compilers complain */
4863
0
      }
4864
0
      else
4865
0
      {
4866
        /* fetch non-last field */
4867
0
        end_ptr = text_position_get_match_ptr(&state);
4868
0
        chunk_len = end_ptr - start_ptr;
4869
0
      }
4870
4871
      /* build a temp text datum to pass to split_text_accum_result */
4872
0
      result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4873
4874
      /* stash away this field */
4875
0
      split_text_accum_result(tstate, result_text,
4876
0
                  null_string, collation);
4877
4878
0
      pfree(result_text);
4879
4880
0
      if (!found)
4881
0
        break;
4882
4883
0
      start_ptr = end_ptr + state.last_match_len;
4884
0
    }
4885
4886
0
    text_position_cleanup(&state);
4887
0
  }
4888
0
  else
4889
0
  {
4890
    /*
4891
     * When fldsep is NULL, each character in the input string becomes a
4892
     * separate element in the result set.  The separator is effectively
4893
     * the space between characters.
4894
     */
4895
0
    inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4896
4897
0
    start_ptr = VARDATA_ANY(inputstring);
4898
4899
0
    while (inputstring_len > 0)
4900
0
    {
4901
0
      int     chunk_len = pg_mblen(start_ptr);
4902
4903
0
      CHECK_FOR_INTERRUPTS();
4904
4905
      /* build a temp text datum to pass to split_text_accum_result */
4906
0
      result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4907
4908
      /* stash away this field */
4909
0
      split_text_accum_result(tstate, result_text,
4910
0
                  null_string, collation);
4911
4912
0
      pfree(result_text);
4913
4914
0
      start_ptr += chunk_len;
4915
0
      inputstring_len -= chunk_len;
4916
0
    }
4917
0
  }
4918
4919
0
  return true;
4920
0
}
4921
4922
/*
4923
 * Add text item to result set (table or array).
4924
 *
4925
 * This is also responsible for checking to see if the item matches
4926
 * the null_string, in which case we should emit NULL instead.
4927
 */
4928
static void
4929
split_text_accum_result(SplitTextOutputData *tstate,
4930
            text *field_value,
4931
            text *null_string,
4932
            Oid collation)
4933
0
{
4934
0
  bool    is_null = false;
4935
4936
0
  if (null_string && text_isequal(field_value, null_string, collation))
4937
0
    is_null = true;
4938
4939
0
  if (tstate->tupstore)
4940
0
  {
4941
0
    Datum   values[1];
4942
0
    bool    nulls[1];
4943
4944
0
    values[0] = PointerGetDatum(field_value);
4945
0
    nulls[0] = is_null;
4946
4947
0
    tuplestore_putvalues(tstate->tupstore,
4948
0
               tstate->tupdesc,
4949
0
               values,
4950
0
               nulls);
4951
0
  }
4952
0
  else
4953
0
  {
4954
0
    tstate->astate = accumArrayResult(tstate->astate,
4955
0
                      PointerGetDatum(field_value),
4956
0
                      is_null,
4957
0
                      TEXTOID,
4958
0
                      CurrentMemoryContext);
4959
0
  }
4960
0
}
4961
4962
/*
4963
 * array_to_text
4964
 * concatenate Cstring representation of input array elements
4965
 * using provided field separator
4966
 */
4967
Datum
4968
array_to_text(PG_FUNCTION_ARGS)
4969
0
{
4970
0
  ArrayType  *v = PG_GETARG_ARRAYTYPE_P(0);
4971
0
  char     *fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
4972
4973
0
  PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, NULL));
4974
0
}
4975
4976
/*
4977
 * array_to_text_null
4978
 * concatenate Cstring representation of input array elements
4979
 * using provided field separator and null string
4980
 *
4981
 * This version is not strict so we have to test for null inputs explicitly.
4982
 */
4983
Datum
4984
array_to_text_null(PG_FUNCTION_ARGS)
4985
0
{
4986
0
  ArrayType  *v;
4987
0
  char     *fldsep;
4988
0
  char     *null_string;
4989
4990
  /* returns NULL when first or second parameter is NULL */
4991
0
  if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
4992
0
    PG_RETURN_NULL();
4993
4994
0
  v = PG_GETARG_ARRAYTYPE_P(0);
4995
0
  fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
4996
4997
  /* NULL null string is passed through as a null pointer */
4998
0
  if (!PG_ARGISNULL(2))
4999
0
    null_string = text_to_cstring(PG_GETARG_TEXT_PP(2));
5000
0
  else
5001
0
    null_string = NULL;
5002
5003
0
  PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, null_string));
5004
0
}
5005
5006
/*
5007
 * common code for array_to_text and array_to_text_null functions
5008
 */
5009
static text *
5010
array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
5011
             const char *fldsep, const char *null_string)
5012
0
{
5013
0
  text     *result;
5014
0
  int     nitems,
5015
0
         *dims,
5016
0
        ndims;
5017
0
  Oid     element_type;
5018
0
  int     typlen;
5019
0
  bool    typbyval;
5020
0
  char    typalign;
5021
0
  StringInfoData buf;
5022
0
  bool    printed = false;
5023
0
  char     *p;
5024
0
  bits8    *bitmap;
5025
0
  int     bitmask;
5026
0
  int     i;
5027
0
  ArrayMetaState *my_extra;
5028
5029
0
  ndims = ARR_NDIM(v);
5030
0
  dims = ARR_DIMS(v);
5031
0
  nitems = ArrayGetNItems(ndims, dims);
5032
5033
  /* if there are no elements, return an empty string */
5034
0
  if (nitems == 0)
5035
0
    return cstring_to_text_with_len("", 0);
5036
5037
0
  element_type = ARR_ELEMTYPE(v);
5038
0
  initStringInfo(&buf);
5039
5040
  /*
5041
   * We arrange to look up info about element type, including its output
5042
   * conversion proc, only once per series of calls, assuming the element
5043
   * type doesn't change underneath us.
5044
   */
5045
0
  my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
5046
0
  if (my_extra == NULL)
5047
0
  {
5048
0
    fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5049
0
                            sizeof(ArrayMetaState));
5050
0
    my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
5051
0
    my_extra->element_type = ~element_type;
5052
0
  }
5053
5054
0
  if (my_extra->element_type != element_type)
5055
0
  {
5056
    /*
5057
     * Get info about element type, including its output conversion proc
5058
     */
5059
0
    get_type_io_data(element_type, IOFunc_output,
5060
0
             &my_extra->typlen, &my_extra->typbyval,
5061
0
             &my_extra->typalign, &my_extra->typdelim,
5062
0
             &my_extra->typioparam, &my_extra->typiofunc);
5063
0
    fmgr_info_cxt(my_extra->typiofunc, &my_extra->proc,
5064
0
            fcinfo->flinfo->fn_mcxt);
5065
0
    my_extra->element_type = element_type;
5066
0
  }
5067
0
  typlen = my_extra->typlen;
5068
0
  typbyval = my_extra->typbyval;
5069
0
  typalign = my_extra->typalign;
5070
5071
0
  p = ARR_DATA_PTR(v);
5072
0
  bitmap = ARR_NULLBITMAP(v);
5073
0
  bitmask = 1;
5074
5075
0
  for (i = 0; i < nitems; i++)
5076
0
  {
5077
0
    Datum   itemvalue;
5078
0
    char     *value;
5079
5080
    /* Get source element, checking for NULL */
5081
0
    if (bitmap && (*bitmap & bitmask) == 0)
5082
0
    {
5083
      /* if null_string is NULL, we just ignore null elements */
5084
0
      if (null_string != NULL)
5085
0
      {
5086
0
        if (printed)
5087
0
          appendStringInfo(&buf, "%s%s", fldsep, null_string);
5088
0
        else
5089
0
          appendStringInfoString(&buf, null_string);
5090
0
        printed = true;
5091
0
      }
5092
0
    }
5093
0
    else
5094
0
    {
5095
0
      itemvalue = fetch_att(p, typbyval, typlen);
5096
5097
0
      value = OutputFunctionCall(&my_extra->proc, itemvalue);
5098
5099
0
      if (printed)
5100
0
        appendStringInfo(&buf, "%s%s", fldsep, value);
5101
0
      else
5102
0
        appendStringInfoString(&buf, value);
5103
0
      printed = true;
5104
5105
0
      p = att_addlength_pointer(p, typlen, p);
5106
0
      p = (char *) att_align_nominal(p, typalign);
5107
0
    }
5108
5109
    /* advance bitmap pointer if any */
5110
0
    if (bitmap)
5111
0
    {
5112
0
      bitmask <<= 1;
5113
0
      if (bitmask == 0x100)
5114
0
      {
5115
0
        bitmap++;
5116
0
        bitmask = 1;
5117
0
      }
5118
0
    }
5119
0
  }
5120
5121
0
  result = cstring_to_text_with_len(buf.data, buf.len);
5122
0
  pfree(buf.data);
5123
5124
0
  return result;
5125
0
}
5126
5127
/*
5128
 * Workhorse for to_bin, to_oct, and to_hex.  Note that base must be > 1 and <=
5129
 * 16.
5130
 */
5131
static inline text *
5132
convert_to_base(uint64 value, int base)
5133
0
{
5134
0
  const char *digits = "0123456789abcdef";
5135
5136
  /* We size the buffer for to_bin's longest possible return value. */
5137
0
  char    buf[sizeof(uint64) * BITS_PER_BYTE];
5138
0
  char     *const end = buf + sizeof(buf);
5139
0
  char     *ptr = end;
5140
5141
0
  Assert(base > 1);
5142
0
  Assert(base <= 16);
5143
5144
0
  do
5145
0
  {
5146
0
    *--ptr = digits[value % base];
5147
0
    value /= base;
5148
0
  } while (ptr > buf && value);
5149
5150
0
  return cstring_to_text_with_len(ptr, end - ptr);
5151
0
}
5152
5153
/*
5154
 * Convert an integer to a string containing a base-2 (binary) representation
5155
 * of the number.
5156
 */
5157
Datum
5158
to_bin32(PG_FUNCTION_ARGS)
5159
0
{
5160
0
  uint64    value = (uint32) PG_GETARG_INT32(0);
5161
5162
0
  PG_RETURN_TEXT_P(convert_to_base(value, 2));
5163
0
}
5164
Datum
5165
to_bin64(PG_FUNCTION_ARGS)
5166
0
{
5167
0
  uint64    value = (uint64) PG_GETARG_INT64(0);
5168
5169
0
  PG_RETURN_TEXT_P(convert_to_base(value, 2));
5170
0
}
5171
5172
/*
5173
 * Convert an integer to a string containing a base-8 (oct) representation of
5174
 * the number.
5175
 */
5176
Datum
5177
to_oct32(PG_FUNCTION_ARGS)
5178
0
{
5179
0
  uint64    value = (uint32) PG_GETARG_INT32(0);
5180
5181
0
  PG_RETURN_TEXT_P(convert_to_base(value, 8));
5182
0
}
5183
Datum
5184
to_oct64(PG_FUNCTION_ARGS)
5185
0
{
5186
0
  uint64    value = (uint64) PG_GETARG_INT64(0);
5187
5188
0
  PG_RETURN_TEXT_P(convert_to_base(value, 8));
5189
0
}
5190
5191
/*
5192
 * Convert an integer to a string containing a base-16 (hex) representation of
5193
 * the number.
5194
 */
5195
Datum
5196
to_hex32(PG_FUNCTION_ARGS)
5197
0
{
5198
0
  uint64    value = (uint32) PG_GETARG_INT32(0);
5199
5200
0
  PG_RETURN_TEXT_P(convert_to_base(value, 16));
5201
0
}
5202
Datum
5203
to_hex64(PG_FUNCTION_ARGS)
5204
0
{
5205
0
  uint64    value = (uint64) PG_GETARG_INT64(0);
5206
5207
0
  PG_RETURN_TEXT_P(convert_to_base(value, 16));
5208
0
}
5209
5210
/*
5211
 * Return the size of a datum, possibly compressed
5212
 *
5213
 * Works on any data type
5214
 */
5215
Datum
5216
pg_column_size(PG_FUNCTION_ARGS)
5217
0
{
5218
0
  Datum   value = PG_GETARG_DATUM(0);
5219
0
  int32   result;
5220
0
  int     typlen;
5221
5222
  /* On first call, get the input type's typlen, and save at *fn_extra */
5223
0
  if (fcinfo->flinfo->fn_extra == NULL)
5224
0
  {
5225
    /* Lookup the datatype of the supplied argument */
5226
0
    Oid     argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
5227
5228
0
    typlen = get_typlen(argtypeid);
5229
0
    if (typlen == 0)   /* should not happen */
5230
0
      elog(ERROR, "cache lookup failed for type %u", argtypeid);
5231
5232
0
    fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5233
0
                            sizeof(int));
5234
0
    *((int *) fcinfo->flinfo->fn_extra) = typlen;
5235
0
  }
5236
0
  else
5237
0
    typlen = *((int *) fcinfo->flinfo->fn_extra);
5238
5239
0
  if (typlen == -1)
5240
0
  {
5241
    /* varlena type, possibly toasted */
5242
0
    result = toast_datum_size(value);
5243
0
  }
5244
0
  else if (typlen == -2)
5245
0
  {
5246
    /* cstring */
5247
0
    result = strlen(DatumGetCString(value)) + 1;
5248
0
  }
5249
0
  else
5250
0
  {
5251
    /* ordinary fixed-width type */
5252
0
    result = typlen;
5253
0
  }
5254
5255
0
  PG_RETURN_INT32(result);
5256
0
}
5257
5258
/*
5259
 * Return the compression method stored in the compressed attribute.  Return
5260
 * NULL for non varlena type or uncompressed data.
5261
 */
5262
Datum
5263
pg_column_compression(PG_FUNCTION_ARGS)
5264
0
{
5265
0
  int     typlen;
5266
0
  char     *result;
5267
0
  ToastCompressionId cmid;
5268
5269
  /* On first call, get the input type's typlen, and save at *fn_extra */
5270
0
  if (fcinfo->flinfo->fn_extra == NULL)
5271
0
  {
5272
    /* Lookup the datatype of the supplied argument */
5273
0
    Oid     argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
5274
5275
0
    typlen = get_typlen(argtypeid);
5276
0
    if (typlen == 0)   /* should not happen */
5277
0
      elog(ERROR, "cache lookup failed for type %u", argtypeid);
5278
5279
0
    fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5280
0
                            sizeof(int));
5281
0
    *((int *) fcinfo->flinfo->fn_extra) = typlen;
5282
0
  }
5283
0
  else
5284
0
    typlen = *((int *) fcinfo->flinfo->fn_extra);
5285
5286
0
  if (typlen != -1)
5287
0
    PG_RETURN_NULL();
5288
5289
  /* get the compression method id stored in the compressed varlena */
5290
0
  cmid = toast_get_compression_id((struct varlena *)
5291
0
                  DatumGetPointer(PG_GETARG_DATUM(0)));
5292
0
  if (cmid == TOAST_INVALID_COMPRESSION_ID)
5293
0
    PG_RETURN_NULL();
5294
5295
  /* convert compression method id to compression method name */
5296
0
  switch (cmid)
5297
0
  {
5298
0
    case TOAST_PGLZ_COMPRESSION_ID:
5299
0
      result = "pglz";
5300
0
      break;
5301
0
    case TOAST_LZ4_COMPRESSION_ID:
5302
0
      result = "lz4";
5303
0
      break;
5304
0
    default:
5305
0
      elog(ERROR, "invalid compression method id %d", cmid);
5306
0
  }
5307
5308
0
  PG_RETURN_TEXT_P(cstring_to_text(result));
5309
0
}
5310
5311
/*
5312
 * Return the chunk_id of the on-disk TOASTed value.  Return NULL if the value
5313
 * is un-TOASTed or not on-disk.
5314
 */
5315
Datum
5316
pg_column_toast_chunk_id(PG_FUNCTION_ARGS)
5317
0
{
5318
0
  int     typlen;
5319
0
  struct varlena *attr;
5320
0
  struct varatt_external toast_pointer;
5321
5322
  /* On first call, get the input type's typlen, and save at *fn_extra */
5323
0
  if (fcinfo->flinfo->fn_extra == NULL)
5324
0
  {
5325
    /* Lookup the datatype of the supplied argument */
5326
0
    Oid     argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
5327
5328
0
    typlen = get_typlen(argtypeid);
5329
0
    if (typlen == 0)   /* should not happen */
5330
0
      elog(ERROR, "cache lookup failed for type %u", argtypeid);
5331
5332
0
    fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5333
0
                            sizeof(int));
5334
0
    *((int *) fcinfo->flinfo->fn_extra) = typlen;
5335
0
  }
5336
0
  else
5337
0
    typlen = *((int *) fcinfo->flinfo->fn_extra);
5338
5339
0
  if (typlen != -1)
5340
0
    PG_RETURN_NULL();
5341
5342
0
  attr = (struct varlena *) DatumGetPointer(PG_GETARG_DATUM(0));
5343
5344
0
  if (!VARATT_IS_EXTERNAL_ONDISK(attr))
5345
0
    PG_RETURN_NULL();
5346
5347
0
  VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr);
5348
5349
0
  PG_RETURN_OID(toast_pointer.va_valueid);
5350
0
}
5351
5352
/*
5353
 * string_agg - Concatenates values and returns string.
5354
 *
5355
 * Syntax: string_agg(value text, delimiter text) RETURNS text
5356
 *
5357
 * Note: Any NULL values are ignored. The first-call delimiter isn't
5358
 * actually used at all, and on subsequent calls the delimiter precedes
5359
 * the associated value.
5360
 */
5361
5362
/* subroutine to initialize state */
5363
static StringInfo
5364
makeStringAggState(FunctionCallInfo fcinfo)
5365
0
{
5366
0
  StringInfo  state;
5367
0
  MemoryContext aggcontext;
5368
0
  MemoryContext oldcontext;
5369
5370
0
  if (!AggCheckCallContext(fcinfo, &aggcontext))
5371
0
  {
5372
    /* cannot be called directly because of internal-type argument */
5373
0
    elog(ERROR, "string_agg_transfn called in non-aggregate context");
5374
0
  }
5375
5376
  /*
5377
   * Create state in aggregate context.  It'll stay there across subsequent
5378
   * calls.
5379
   */
5380
0
  oldcontext = MemoryContextSwitchTo(aggcontext);
5381
0
  state = makeStringInfo();
5382
0
  MemoryContextSwitchTo(oldcontext);
5383
5384
0
  return state;
5385
0
}
5386
5387
Datum
5388
string_agg_transfn(PG_FUNCTION_ARGS)
5389
0
{
5390
0
  StringInfo  state;
5391
5392
0
  state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5393
5394
  /* Append the value unless null, preceding it with the delimiter. */
5395
0
  if (!PG_ARGISNULL(1))
5396
0
  {
5397
0
    text     *value = PG_GETARG_TEXT_PP(1);
5398
0
    bool    isfirst = false;
5399
5400
    /*
5401
     * You might think we can just throw away the first delimiter, however
5402
     * we must keep it as we may be a parallel worker doing partial
5403
     * aggregation building a state to send to the main process.  We need
5404
     * to keep the delimiter of every aggregation so that the combine
5405
     * function can properly join up the strings of two separately
5406
     * partially aggregated results.  The first delimiter is only stripped
5407
     * off in the final function.  To know how much to strip off the front
5408
     * of the string, we store the length of the first delimiter in the
5409
     * StringInfo's cursor field, which we don't otherwise need here.
5410
     */
5411
0
    if (state == NULL)
5412
0
    {
5413
0
      state = makeStringAggState(fcinfo);
5414
0
      isfirst = true;
5415
0
    }
5416
5417
0
    if (!PG_ARGISNULL(2))
5418
0
    {
5419
0
      text     *delim = PG_GETARG_TEXT_PP(2);
5420
5421
0
      appendStringInfoText(state, delim);
5422
0
      if (isfirst)
5423
0
        state->cursor = VARSIZE_ANY_EXHDR(delim);
5424
0
    }
5425
5426
0
    appendStringInfoText(state, value);
5427
0
  }
5428
5429
  /*
5430
   * The transition type for string_agg() is declared to be "internal",
5431
   * which is a pass-by-value type the same size as a pointer.
5432
   */
5433
0
  if (state)
5434
0
    PG_RETURN_POINTER(state);
5435
0
  PG_RETURN_NULL();
5436
0
}
5437
5438
/*
5439
 * string_agg_combine
5440
 *    Aggregate combine function for string_agg(text) and string_agg(bytea)
5441
 */
5442
Datum
5443
string_agg_combine(PG_FUNCTION_ARGS)
5444
0
{
5445
0
  StringInfo  state1;
5446
0
  StringInfo  state2;
5447
0
  MemoryContext agg_context;
5448
5449
0
  if (!AggCheckCallContext(fcinfo, &agg_context))
5450
0
    elog(ERROR, "aggregate function called in non-aggregate context");
5451
5452
0
  state1 = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5453
0
  state2 = PG_ARGISNULL(1) ? NULL : (StringInfo) PG_GETARG_POINTER(1);
5454
5455
0
  if (state2 == NULL)
5456
0
  {
5457
    /*
5458
     * NULL state2 is easy, just return state1, which we know is already
5459
     * in the agg_context
5460
     */
5461
0
    if (state1 == NULL)
5462
0
      PG_RETURN_NULL();
5463
0
    PG_RETURN_POINTER(state1);
5464
0
  }
5465
5466
0
  if (state1 == NULL)
5467
0
  {
5468
    /* We must copy state2's data into the agg_context */
5469
0
    MemoryContext old_context;
5470
5471
0
    old_context = MemoryContextSwitchTo(agg_context);
5472
0
    state1 = makeStringAggState(fcinfo);
5473
0
    appendBinaryStringInfo(state1, state2->data, state2->len);
5474
0
    state1->cursor = state2->cursor;
5475
0
    MemoryContextSwitchTo(old_context);
5476
0
  }
5477
0
  else if (state2->len > 0)
5478
0
  {
5479
    /* Combine ... state1->cursor does not change in this case */
5480
0
    appendBinaryStringInfo(state1, state2->data, state2->len);
5481
0
  }
5482
5483
0
  PG_RETURN_POINTER(state1);
5484
0
}
5485
5486
/*
5487
 * string_agg_serialize
5488
 *    Aggregate serialize function for string_agg(text) and string_agg(bytea)
5489
 *
5490
 * This is strict, so we need not handle NULL input
5491
 */
5492
Datum
5493
string_agg_serialize(PG_FUNCTION_ARGS)
5494
0
{
5495
0
  StringInfo  state;
5496
0
  StringInfoData buf;
5497
0
  bytea    *result;
5498
5499
  /* cannot be called directly because of internal-type argument */
5500
0
  Assert(AggCheckCallContext(fcinfo, NULL));
5501
5502
0
  state = (StringInfo) PG_GETARG_POINTER(0);
5503
5504
0
  pq_begintypsend(&buf);
5505
5506
  /* cursor */
5507
0
  pq_sendint(&buf, state->cursor, 4);
5508
5509
  /* data */
5510
0
  pq_sendbytes(&buf, state->data, state->len);
5511
5512
0
  result = pq_endtypsend(&buf);
5513
5514
0
  PG_RETURN_BYTEA_P(result);
5515
0
}
5516
5517
/*
5518
 * string_agg_deserialize
5519
 *    Aggregate deserial function for string_agg(text) and string_agg(bytea)
5520
 *
5521
 * This is strict, so we need not handle NULL input
5522
 */
5523
Datum
5524
string_agg_deserialize(PG_FUNCTION_ARGS)
5525
0
{
5526
0
  bytea    *sstate;
5527
0
  StringInfo  result;
5528
0
  StringInfoData buf;
5529
0
  char     *data;
5530
0
  int     datalen;
5531
5532
  /* cannot be called directly because of internal-type argument */
5533
0
  Assert(AggCheckCallContext(fcinfo, NULL));
5534
5535
0
  sstate = PG_GETARG_BYTEA_PP(0);
5536
5537
  /*
5538
   * Initialize a StringInfo so that we can "receive" it using the standard
5539
   * recv-function infrastructure.
5540
   */
5541
0
  initReadOnlyStringInfo(&buf, VARDATA_ANY(sstate),
5542
0
               VARSIZE_ANY_EXHDR(sstate));
5543
5544
0
  result = makeStringAggState(fcinfo);
5545
5546
  /* cursor */
5547
0
  result->cursor = pq_getmsgint(&buf, 4);
5548
5549
  /* data */
5550
0
  datalen = VARSIZE_ANY_EXHDR(sstate) - 4;
5551
0
  data = (char *) pq_getmsgbytes(&buf, datalen);
5552
0
  appendBinaryStringInfo(result, data, datalen);
5553
5554
0
  pq_getmsgend(&buf);
5555
5556
0
  PG_RETURN_POINTER(result);
5557
0
}
5558
5559
Datum
5560
string_agg_finalfn(PG_FUNCTION_ARGS)
5561
0
{
5562
0
  StringInfo  state;
5563
5564
  /* cannot be called directly because of internal-type argument */
5565
0
  Assert(AggCheckCallContext(fcinfo, NULL));
5566
5567
0
  state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5568
5569
0
  if (state != NULL)
5570
0
  {
5571
    /* As per comment in transfn, strip data before the cursor position */
5572
0
    PG_RETURN_TEXT_P(cstring_to_text_with_len(&state->data[state->cursor],
5573
0
                          state->len - state->cursor));
5574
0
  }
5575
0
  else
5576
0
    PG_RETURN_NULL();
5577
0
}
5578
5579
/*
5580
 * Prepare cache with fmgr info for the output functions of the datatypes of
5581
 * the arguments of a concat-like function, beginning with argument "argidx".
5582
 * (Arguments before that will have corresponding slots in the resulting
5583
 * FmgrInfo array, but we don't fill those slots.)
5584
 */
5585
static FmgrInfo *
5586
build_concat_foutcache(FunctionCallInfo fcinfo, int argidx)
5587
0
{
5588
0
  FmgrInfo   *foutcache;
5589
0
  int     i;
5590
5591
  /* We keep the info in fn_mcxt so it survives across calls */
5592
0
  foutcache = (FmgrInfo *) MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5593
0
                        PG_NARGS() * sizeof(FmgrInfo));
5594
5595
0
  for (i = argidx; i < PG_NARGS(); i++)
5596
0
  {
5597
0
    Oid     valtype;
5598
0
    Oid     typOutput;
5599
0
    bool    typIsVarlena;
5600
5601
0
    valtype = get_fn_expr_argtype(fcinfo->flinfo, i);
5602
0
    if (!OidIsValid(valtype))
5603
0
      elog(ERROR, "could not determine data type of concat() input");
5604
5605
0
    getTypeOutputInfo(valtype, &typOutput, &typIsVarlena);
5606
0
    fmgr_info_cxt(typOutput, &foutcache[i], fcinfo->flinfo->fn_mcxt);
5607
0
  }
5608
5609
0
  fcinfo->flinfo->fn_extra = foutcache;
5610
5611
0
  return foutcache;
5612
0
}
5613
5614
/*
5615
 * Implementation of both concat() and concat_ws().
5616
 *
5617
 * sepstr is the separator string to place between values.
5618
 * argidx identifies the first argument to concatenate (counting from zero);
5619
 * note that this must be constant across any one series of calls.
5620
 *
5621
 * Returns NULL if result should be NULL, else text value.
5622
 */
5623
static text *
5624
concat_internal(const char *sepstr, int argidx,
5625
        FunctionCallInfo fcinfo)
5626
0
{
5627
0
  text     *result;
5628
0
  StringInfoData str;
5629
0
  FmgrInfo   *foutcache;
5630
0
  bool    first_arg = true;
5631
0
  int     i;
5632
5633
  /*
5634
   * concat(VARIADIC some-array) is essentially equivalent to
5635
   * array_to_text(), ie concat the array elements with the given separator.
5636
   * So we just pass the case off to that code.
5637
   */
5638
0
  if (get_fn_expr_variadic(fcinfo->flinfo))
5639
0
  {
5640
0
    ArrayType  *arr;
5641
5642
    /* Should have just the one argument */
5643
0
    Assert(argidx == PG_NARGS() - 1);
5644
5645
    /* concat(VARIADIC NULL) is defined as NULL */
5646
0
    if (PG_ARGISNULL(argidx))
5647
0
      return NULL;
5648
5649
    /*
5650
     * Non-null argument had better be an array.  We assume that any call
5651
     * context that could let get_fn_expr_variadic return true will have
5652
     * checked that a VARIADIC-labeled parameter actually is an array.  So
5653
     * it should be okay to just Assert that it's an array rather than
5654
     * doing a full-fledged error check.
5655
     */
5656
0
    Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, argidx))));
5657
5658
    /* OK, safe to fetch the array value */
5659
0
    arr = PG_GETARG_ARRAYTYPE_P(argidx);
5660
5661
    /*
5662
     * And serialize the array.  We tell array_to_text to ignore null
5663
     * elements, which matches the behavior of the loop below.
5664
     */
5665
0
    return array_to_text_internal(fcinfo, arr, sepstr, NULL);
5666
0
  }
5667
5668
  /* Normal case without explicit VARIADIC marker */
5669
0
  initStringInfo(&str);
5670
5671
  /* Get output function info, building it if first time through */
5672
0
  foutcache = (FmgrInfo *) fcinfo->flinfo->fn_extra;
5673
0
  if (foutcache == NULL)
5674
0
    foutcache = build_concat_foutcache(fcinfo, argidx);
5675
5676
0
  for (i = argidx; i < PG_NARGS(); i++)
5677
0
  {
5678
0
    if (!PG_ARGISNULL(i))
5679
0
    {
5680
0
      Datum   value = PG_GETARG_DATUM(i);
5681
5682
      /* add separator if appropriate */
5683
0
      if (first_arg)
5684
0
        first_arg = false;
5685
0
      else
5686
0
        appendStringInfoString(&str, sepstr);
5687
5688
      /* call the appropriate type output function, append the result */
5689
0
      appendStringInfoString(&str,
5690
0
                   OutputFunctionCall(&foutcache[i], value));
5691
0
    }
5692
0
  }
5693
5694
0
  result = cstring_to_text_with_len(str.data, str.len);
5695
0
  pfree(str.data);
5696
5697
0
  return result;
5698
0
}
5699
5700
/*
5701
 * Concatenate all arguments. NULL arguments are ignored.
5702
 */
5703
Datum
5704
text_concat(PG_FUNCTION_ARGS)
5705
0
{
5706
0
  text     *result;
5707
5708
0
  result = concat_internal("", 0, fcinfo);
5709
0
  if (result == NULL)
5710
0
    PG_RETURN_NULL();
5711
0
  PG_RETURN_TEXT_P(result);
5712
0
}
5713
5714
/*
5715
 * Concatenate all but first argument value with separators. The first
5716
 * parameter is used as the separator. NULL arguments are ignored.
5717
 */
5718
Datum
5719
text_concat_ws(PG_FUNCTION_ARGS)
5720
0
{
5721
0
  char     *sep;
5722
0
  text     *result;
5723
5724
  /* return NULL when separator is NULL */
5725
0
  if (PG_ARGISNULL(0))
5726
0
    PG_RETURN_NULL();
5727
0
  sep = text_to_cstring(PG_GETARG_TEXT_PP(0));
5728
5729
0
  result = concat_internal(sep, 1, fcinfo);
5730
0
  if (result == NULL)
5731
0
    PG_RETURN_NULL();
5732
0
  PG_RETURN_TEXT_P(result);
5733
0
}
5734
5735
/*
5736
 * Return first n characters in the string. When n is negative,
5737
 * return all but last |n| characters.
5738
 */
5739
Datum
5740
text_left(PG_FUNCTION_ARGS)
5741
0
{
5742
0
  int     n = PG_GETARG_INT32(1);
5743
5744
0
  if (n < 0)
5745
0
  {
5746
0
    text     *str = PG_GETARG_TEXT_PP(0);
5747
0
    const char *p = VARDATA_ANY(str);
5748
0
    int     len = VARSIZE_ANY_EXHDR(str);
5749
0
    int     rlen;
5750
5751
0
    n = pg_mbstrlen_with_len(p, len) + n;
5752
0
    rlen = pg_mbcharcliplen(p, len, n);
5753
0
    PG_RETURN_TEXT_P(cstring_to_text_with_len(p, rlen));
5754
0
  }
5755
0
  else
5756
0
    PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0), 1, n, false));
5757
0
}
5758
5759
/*
5760
 * Return last n characters in the string. When n is negative,
5761
 * return all but first |n| characters.
5762
 */
5763
Datum
5764
text_right(PG_FUNCTION_ARGS)
5765
0
{
5766
0
  text     *str = PG_GETARG_TEXT_PP(0);
5767
0
  const char *p = VARDATA_ANY(str);
5768
0
  int     len = VARSIZE_ANY_EXHDR(str);
5769
0
  int     n = PG_GETARG_INT32(1);
5770
0
  int     off;
5771
5772
0
  if (n < 0)
5773
0
    n = -n;
5774
0
  else
5775
0
    n = pg_mbstrlen_with_len(p, len) - n;
5776
0
  off = pg_mbcharcliplen(p, len, n);
5777
5778
0
  PG_RETURN_TEXT_P(cstring_to_text_with_len(p + off, len - off));
5779
0
}
5780
5781
/*
5782
 * Return reversed string
5783
 */
5784
Datum
5785
text_reverse(PG_FUNCTION_ARGS)
5786
0
{
5787
0
  text     *str = PG_GETARG_TEXT_PP(0);
5788
0
  const char *p = VARDATA_ANY(str);
5789
0
  int     len = VARSIZE_ANY_EXHDR(str);
5790
0
  const char *endp = p + len;
5791
0
  text     *result;
5792
0
  char     *dst;
5793
5794
0
  result = palloc(len + VARHDRSZ);
5795
0
  dst = (char *) VARDATA(result) + len;
5796
0
  SET_VARSIZE(result, len + VARHDRSZ);
5797
5798
0
  if (pg_database_encoding_max_length() > 1)
5799
0
  {
5800
    /* multibyte version */
5801
0
    while (p < endp)
5802
0
    {
5803
0
      int     sz;
5804
5805
0
      sz = pg_mblen(p);
5806
0
      dst -= sz;
5807
0
      memcpy(dst, p, sz);
5808
0
      p += sz;
5809
0
    }
5810
0
  }
5811
0
  else
5812
0
  {
5813
    /* single byte version */
5814
0
    while (p < endp)
5815
0
      *(--dst) = *p++;
5816
0
  }
5817
5818
0
  PG_RETURN_TEXT_P(result);
5819
0
}
5820
5821
5822
/*
5823
 * Support macros for text_format()
5824
 */
5825
0
#define TEXT_FORMAT_FLAG_MINUS  0x0001  /* is minus flag present? */
5826
5827
/*
 * ADVANCE_PARSE_POINTER pre-increments "ptr" and raises an ERROR
 * ("unterminated format() type specifier") if the incremented pointer has
 * reached or passed "end_ptr".  So when the macro completes normally, "ptr"
 * still points at a character before "end_ptr".  "ptr" must be a modifiable
 * lvalue.
 */
#define ADVANCE_PARSE_POINTER(ptr,end_ptr) \
5828
0
  do { \
5829
0
    if (++(ptr) >= (end_ptr)) \
5830
0
      ereport(ERROR, \
5831
0
          (errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
5832
0
           errmsg("unterminated format() type specifier"), \
5833
0
           errhint("For a single \"%%\" use \"%%%%\"."))); \
5834
0
  } while (0)
5835
5836
/*
5837
 * Returns a formatted string
5838
 */
5839
Datum
5840
text_format(PG_FUNCTION_ARGS)
5841
0
{
5842
0
  text     *fmt;
5843
0
  StringInfoData str;
5844
0
  const char *cp;
5845
0
  const char *start_ptr;
5846
0
  const char *end_ptr;
5847
0
  text     *result;
5848
0
  int     arg;
5849
0
  bool    funcvariadic;
5850
0
  int     nargs;
5851
0
  Datum    *elements = NULL;
5852
0
  bool     *nulls = NULL;
5853
0
  Oid     element_type = InvalidOid;
5854
0
  Oid     prev_type = InvalidOid;
5855
0
  Oid     prev_width_type = InvalidOid;
5856
0
  FmgrInfo  typoutputfinfo;
5857
0
  FmgrInfo  typoutputinfo_width;
5858
5859
  /* When format string is null, immediately return null */
5860
0
  if (PG_ARGISNULL(0))
5861
0
    PG_RETURN_NULL();
5862
5863
  /* If argument is marked VARIADIC, expand array into elements */
5864
0
  if (get_fn_expr_variadic(fcinfo->flinfo))
5865
0
  {
5866
0
    ArrayType  *arr;
5867
0
    int16   elmlen;
5868
0
    bool    elmbyval;
5869
0
    char    elmalign;
5870
0
    int     nitems;
5871
5872
    /* Should have just the one argument */
5873
0
    Assert(PG_NARGS() == 2);
5874
5875
    /* If argument is NULL, we treat it as zero-length array */
5876
0
    if (PG_ARGISNULL(1))
5877
0
      nitems = 0;
5878
0
    else
5879
0
    {
5880
      /*
5881
       * Non-null argument had better be an array.  We assume that any
5882
       * call context that could let get_fn_expr_variadic return true
5883
       * will have checked that a VARIADIC-labeled parameter actually is
5884
       * an array.  So it should be okay to just Assert that it's an
5885
       * array rather than doing a full-fledged error check.
5886
       */
5887
0
      Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, 1))));
5888
5889
      /* OK, safe to fetch the array value */
5890
0
      arr = PG_GETARG_ARRAYTYPE_P(1);
5891
5892
      /* Get info about array element type */
5893
0
      element_type = ARR_ELEMTYPE(arr);
5894
0
      get_typlenbyvalalign(element_type,
5895
0
                 &elmlen, &elmbyval, &elmalign);
5896
5897
      /* Extract all array elements */
5898
0
      deconstruct_array(arr, element_type, elmlen, elmbyval, elmalign,
5899
0
                &elements, &nulls, &nitems);
5900
0
    }
5901
5902
0
    nargs = nitems + 1;
5903
0
    funcvariadic = true;
5904
0
  }
5905
0
  else
5906
0
  {
5907
    /* Non-variadic case, we'll process the arguments individually */
5908
0
    nargs = PG_NARGS();
5909
0
    funcvariadic = false;
5910
0
  }
5911
5912
  /* Setup for main loop. */
5913
0
  fmt = PG_GETARG_TEXT_PP(0);
5914
0
  start_ptr = VARDATA_ANY(fmt);
5915
0
  end_ptr = start_ptr + VARSIZE_ANY_EXHDR(fmt);
5916
0
  initStringInfo(&str);
5917
0
  arg = 1;          /* next argument position to print */
5918
5919
  /* Scan format string, looking for conversion specifiers. */
5920
0
  for (cp = start_ptr; cp < end_ptr; cp++)
5921
0
  {
5922
0
    int     argpos;
5923
0
    int     widthpos;
5924
0
    int     flags;
5925
0
    int     width;
5926
0
    Datum   value;
5927
0
    bool    isNull;
5928
0
    Oid     typid;
5929
5930
    /*
5931
     * If it's not the start of a conversion specifier, just copy it to
5932
     * the output buffer.
5933
     */
5934
0
    if (*cp != '%')
5935
0
    {
5936
0
      appendStringInfoCharMacro(&str, *cp);
5937
0
      continue;
5938
0
    }
5939
5940
0
    ADVANCE_PARSE_POINTER(cp, end_ptr);
5941
5942
    /* Easy case: %% outputs a single % */
5943
0
    if (*cp == '%')
5944
0
    {
5945
0
      appendStringInfoCharMacro(&str, *cp);
5946
0
      continue;
5947
0
    }
5948
5949
    /* Parse the optional portions of the format specifier */
5950
0
    cp = text_format_parse_format(cp, end_ptr,
5951
0
                    &argpos, &widthpos,
5952
0
                    &flags, &width);
5953
5954
    /*
5955
     * Next we should see the main conversion specifier.  Whether or not
5956
     * an argument position was present, it's known that at least one
5957
     * character remains in the string at this point.  Experience suggests
5958
     * that it's worth checking that that character is one of the expected
5959
     * ones before we try to fetch arguments, so as to produce the least
5960
     * confusing response to a mis-formatted specifier.
5961
     */
5962
0
    if (strchr("sIL", *cp) == NULL)
5963
0
      ereport(ERROR,
5964
0
          (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5965
0
           errmsg("unrecognized format() type specifier \"%.*s\"",
5966
0
              pg_mblen(cp), cp),
5967
0
           errhint("For a single \"%%\" use \"%%%%\".")));
5968
5969
    /* If indirect width was specified, get its value */
5970
0
    if (widthpos >= 0)
5971
0
    {
5972
      /* Collect the specified or next argument position */
5973
0
      if (widthpos > 0)
5974
0
        arg = widthpos;
5975
0
      if (arg >= nargs)
5976
0
        ereport(ERROR,
5977
0
            (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5978
0
             errmsg("too few arguments for format()")));
5979
5980
      /* Get the value and type of the selected argument */
5981
0
      if (!funcvariadic)
5982
0
      {
5983
0
        value = PG_GETARG_DATUM(arg);
5984
0
        isNull = PG_ARGISNULL(arg);
5985
0
        typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
5986
0
      }
5987
0
      else
5988
0
      {
5989
0
        value = elements[arg - 1];
5990
0
        isNull = nulls[arg - 1];
5991
0
        typid = element_type;
5992
0
      }
5993
0
      if (!OidIsValid(typid))
5994
0
        elog(ERROR, "could not determine data type of format() input");
5995
5996
0
      arg++;
5997
5998
      /* We can treat NULL width the same as zero */
5999
0
      if (isNull)
6000
0
        width = 0;
6001
0
      else if (typid == INT4OID)
6002
0
        width = DatumGetInt32(value);
6003
0
      else if (typid == INT2OID)
6004
0
        width = DatumGetInt16(value);
6005
0
      else
6006
0
      {
6007
        /* For less-usual datatypes, convert to text then to int */
6008
0
        char     *str;
6009
6010
0
        if (typid != prev_width_type)
6011
0
        {
6012
0
          Oid     typoutputfunc;
6013
0
          bool    typIsVarlena;
6014
6015
0
          getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
6016
0
          fmgr_info(typoutputfunc, &typoutputinfo_width);
6017
0
          prev_width_type = typid;
6018
0
        }
6019
6020
0
        str = OutputFunctionCall(&typoutputinfo_width, value);
6021
6022
        /* pg_strtoint32 will complain about bad data or overflow */
6023
0
        width = pg_strtoint32(str);
6024
6025
0
        pfree(str);
6026
0
      }
6027
0
    }
6028
6029
    /* Collect the specified or next argument position */
6030
0
    if (argpos > 0)
6031
0
      arg = argpos;
6032
0
    if (arg >= nargs)
6033
0
      ereport(ERROR,
6034
0
          (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6035
0
           errmsg("too few arguments for format()")));
6036
6037
    /* Get the value and type of the selected argument */
6038
0
    if (!funcvariadic)
6039
0
    {
6040
0
      value = PG_GETARG_DATUM(arg);
6041
0
      isNull = PG_ARGISNULL(arg);
6042
0
      typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
6043
0
    }
6044
0
    else
6045
0
    {
6046
0
      value = elements[arg - 1];
6047
0
      isNull = nulls[arg - 1];
6048
0
      typid = element_type;
6049
0
    }
6050
0
    if (!OidIsValid(typid))
6051
0
      elog(ERROR, "could not determine data type of format() input");
6052
6053
0
    arg++;
6054
6055
    /*
6056
     * Get the appropriate typOutput function, reusing previous one if
6057
     * same type as previous argument.  That's particularly useful in the
6058
     * variadic-array case, but often saves work even for ordinary calls.
6059
     */
6060
0
    if (typid != prev_type)
6061
0
    {
6062
0
      Oid     typoutputfunc;
6063
0
      bool    typIsVarlena;
6064
6065
0
      getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
6066
0
      fmgr_info(typoutputfunc, &typoutputfinfo);
6067
0
      prev_type = typid;
6068
0
    }
6069
6070
    /*
6071
     * And now we can format the value.
6072
     */
6073
0
    switch (*cp)
6074
0
    {
6075
0
      case 's':
6076
0
      case 'I':
6077
0
      case 'L':
6078
0
        text_format_string_conversion(&str, *cp, &typoutputfinfo,
6079
0
                        value, isNull,
6080
0
                        flags, width);
6081
0
        break;
6082
0
      default:
6083
        /* should not get here, because of previous check */
6084
0
        ereport(ERROR,
6085
0
            (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6086
0
             errmsg("unrecognized format() type specifier \"%.*s\"",
6087
0
                pg_mblen(cp), cp),
6088
0
             errhint("For a single \"%%\" use \"%%%%\".")));
6089
0
        break;
6090
0
    }
6091
0
  }
6092
6093
  /* Don't need deconstruct_array results anymore. */
6094
0
  if (elements != NULL)
6095
0
    pfree(elements);
6096
0
  if (nulls != NULL)
6097
0
    pfree(nulls);
6098
6099
  /* Generate results. */
6100
0
  result = cstring_to_text_with_len(str.data, str.len);
6101
0
  pfree(str.data);
6102
6103
0
  PG_RETURN_TEXT_P(result);
6104
0
}
6105
6106
/*
6107
 * Parse contiguous digits as a decimal number.
6108
 *
6109
 * Returns true if some digits could be parsed.
6110
 * The value is returned into *value, and *ptr is advanced to the next
6111
 * character to be parsed.
6112
 *
6113
 * Note parsing invariant: at least one character is known available before
6114
 * string end (end_ptr) at entry, and this is still true at exit.
6115
 */
6116
static bool
6117
text_format_parse_digits(const char **ptr, const char *end_ptr, int *value)
6118
0
{
6119
0
  bool    found = false;
6120
0
  const char *cp = *ptr;
6121
0
  int     val = 0;
6122
6123
0
  while (*cp >= '0' && *cp <= '9')
6124
0
  {
6125
0
    int8    digit = (*cp - '0');
6126
6127
0
    if (unlikely(pg_mul_s32_overflow(val, 10, &val)) ||
6128
0
      unlikely(pg_add_s32_overflow(val, digit, &val)))
6129
0
      ereport(ERROR,
6130
0
          (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
6131
0
           errmsg("number is out of range")));
6132
0
    ADVANCE_PARSE_POINTER(cp, end_ptr);
6133
0
    found = true;
6134
0
  }
6135
6136
0
  *ptr = cp;
6137
0
  *value = val;
6138
6139
0
  return found;
6140
0
}
6141
6142
/*
6143
 * Parse a format specifier (generally following the SUS printf spec).
6144
 *
6145
 * We have already advanced over the initial '%', and we are looking for
6146
 * [argpos][flags][width]type (but the type character is not consumed here).
6147
 *
6148
 * Inputs are start_ptr (the position after '%') and end_ptr (string end + 1).
6149
 * Output parameters:
6150
 *  argpos: argument position for value to be printed.  -1 means unspecified.
6151
 *  widthpos: argument position for width.  Zero means the argument position
6152
 *      was unspecified (ie, take the next arg) and -1 means no width
6153
 *      argument (width was omitted or specified as a constant).
6154
 *  flags: bitmask of flags.
6155
 *  width: directly-specified width value.  Zero means the width was omitted
6156
 *      (note it's not necessary to distinguish this case from an explicit
6157
 *      zero width value).
6158
 *
6159
 * The function result is the next character position to be parsed, ie, the
6160
 * location where the type character is/should be.
6161
 *
6162
 * Note parsing invariant: at least one character is known available before
6163
 * string end (end_ptr) at entry, and this is still true at exit.
6164
 */
6165
static const char *
6166
text_format_parse_format(const char *start_ptr, const char *end_ptr,
6167
             int *argpos, int *widthpos,
6168
             int *flags, int *width)
6169
0
{
6170
0
  const char *cp = start_ptr;
6171
0
  int     n;
6172
6173
  /* set defaults for output parameters */
6174
0
  *argpos = -1;
6175
0
  *widthpos = -1;
6176
0
  *flags = 0;
6177
0
  *width = 0;
6178
6179
  /* try to identify first number */
6180
0
  if (text_format_parse_digits(&cp, end_ptr, &n))
6181
0
  {
6182
0
    if (*cp != '$')
6183
0
    {
6184
      /* Must be just a width and a type, so we're done */
6185
0
      *width = n;
6186
0
      return cp;
6187
0
    }
6188
    /* The number was argument position */
6189
0
    *argpos = n;
6190
    /* Explicit 0 for argument index is immediately refused */
6191
0
    if (n == 0)
6192
0
      ereport(ERROR,
6193
0
          (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6194
0
           errmsg("format specifies argument 0, but arguments are numbered from 1")));
6195
0
    ADVANCE_PARSE_POINTER(cp, end_ptr);
6196
0
  }
6197
6198
  /* Handle flags (only minus is supported now) */
6199
0
  while (*cp == '-')
6200
0
  {
6201
0
    *flags |= TEXT_FORMAT_FLAG_MINUS;
6202
0
    ADVANCE_PARSE_POINTER(cp, end_ptr);
6203
0
  }
6204
6205
0
  if (*cp == '*')
6206
0
  {
6207
    /* Handle indirect width */
6208
0
    ADVANCE_PARSE_POINTER(cp, end_ptr);
6209
0
    if (text_format_parse_digits(&cp, end_ptr, &n))
6210
0
    {
6211
      /* number in this position must be closed by $ */
6212
0
      if (*cp != '$')
6213
0
        ereport(ERROR,
6214
0
            (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6215
0
             errmsg("width argument position must be ended by \"$\"")));
6216
      /* The number was width argument position */
6217
0
      *widthpos = n;
6218
      /* Explicit 0 for argument index is immediately refused */
6219
0
      if (n == 0)
6220
0
        ereport(ERROR,
6221
0
            (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6222
0
             errmsg("format specifies argument 0, but arguments are numbered from 1")));
6223
0
      ADVANCE_PARSE_POINTER(cp, end_ptr);
6224
0
    }
6225
0
    else
6226
0
      *widthpos = 0;   /* width's argument position is unspecified */
6227
0
  }
6228
0
  else
6229
0
  {
6230
    /* Check for direct width specification */
6231
0
    if (text_format_parse_digits(&cp, end_ptr, &n))
6232
0
      *width = n;
6233
0
  }
6234
6235
  /* cp should now be pointing at type character */
6236
0
  return cp;
6237
0
}
6238
6239
/*
6240
 * Format a %s, %I, or %L conversion
6241
 */
6242
static void
6243
text_format_string_conversion(StringInfo buf, char conversion,
6244
                FmgrInfo *typOutputInfo,
6245
                Datum value, bool isNull,
6246
                int flags, int width)
6247
0
{
6248
0
  char     *str;
6249
6250
  /* Handle NULL arguments before trying to stringify the value. */
6251
0
  if (isNull)
6252
0
  {
6253
0
    if (conversion == 's')
6254
0
      text_format_append_string(buf, "", flags, width);
6255
0
    else if (conversion == 'L')
6256
0
      text_format_append_string(buf, "NULL", flags, width);
6257
0
    else if (conversion == 'I')
6258
0
      ereport(ERROR,
6259
0
          (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
6260
0
           errmsg("null values cannot be formatted as an SQL identifier")));
6261
0
    return;
6262
0
  }
6263
6264
  /* Stringify. */
6265
0
  str = OutputFunctionCall(typOutputInfo, value);
6266
6267
  /* Escape. */
6268
0
  if (conversion == 'I')
6269
0
  {
6270
    /* quote_identifier may or may not allocate a new string. */
6271
0
    text_format_append_string(buf, quote_identifier(str), flags, width);
6272
0
  }
6273
0
  else if (conversion == 'L')
6274
0
  {
6275
0
    char     *qstr = quote_literal_cstr(str);
6276
6277
0
    text_format_append_string(buf, qstr, flags, width);
6278
    /* quote_literal_cstr() always allocates a new string */
6279
0
    pfree(qstr);
6280
0
  }
6281
0
  else
6282
0
    text_format_append_string(buf, str, flags, width);
6283
6284
  /* Cleanup. */
6285
0
  pfree(str);
6286
0
}
6287
6288
/*
6289
 * Append str to buf, padding as directed by flags/width
6290
 */
6291
static void
6292
text_format_append_string(StringInfo buf, const char *str,
6293
              int flags, int width)
6294
0
{
6295
0
  bool    align_to_left = false;
6296
0
  int     len;
6297
6298
  /* fast path for typical easy case */
6299
0
  if (width == 0)
6300
0
  {
6301
0
    appendStringInfoString(buf, str);
6302
0
    return;
6303
0
  }
6304
6305
0
  if (width < 0)
6306
0
  {
6307
    /* Negative width: implicit '-' flag, then take absolute value */
6308
0
    align_to_left = true;
6309
    /* -INT_MIN is undefined */
6310
0
    if (width <= INT_MIN)
6311
0
      ereport(ERROR,
6312
0
          (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
6313
0
           errmsg("number is out of range")));
6314
0
    width = -width;
6315
0
  }
6316
0
  else if (flags & TEXT_FORMAT_FLAG_MINUS)
6317
0
    align_to_left = true;
6318
6319
0
  len = pg_mbstrlen(str);
6320
0
  if (align_to_left)
6321
0
  {
6322
    /* left justify */
6323
0
    appendStringInfoString(buf, str);
6324
0
    if (len < width)
6325
0
      appendStringInfoSpaces(buf, width - len);
6326
0
  }
6327
0
  else
6328
0
  {
6329
    /* right justify */
6330
0
    if (len < width)
6331
0
      appendStringInfoSpaces(buf, width - len);
6332
0
    appendStringInfoString(buf, str);
6333
0
  }
6334
0
}
6335
6336
/*
6337
 * text_format_nv - nonvariadic wrapper for text_format function.
6338
 *
6339
 * note: this wrapper is necessary to pass the sanity check in opr_sanity,
6340
 * which checks that all built-in functions that share the implementing C
6341
 * function take the same number of arguments.
6342
 */
6343
Datum
6344
text_format_nv(PG_FUNCTION_ARGS)
6345
0
{
6346
0
  return text_format(fcinfo);
6347
0
}
6348
6349
/*
 * Helper function for Levenshtein distance functions. Faster than memcmp(),
 * for this use case.
 *
 * Compares the first "len" bytes of s1 and s2, starting from the last byte
 * and working backwards.  Returns true if they are all equal (trivially so
 * when len is not positive), false at the first difference found.
 */
static inline bool
rest_of_char_same(const char *s1, const char *s2, int len)
{
  for (int pos = len - 1; pos >= 0; pos--)
  {
    if (s1[pos] != s2[pos])
      return false;
  }
  return true;
}
6364
6365
/* Expand each Levenshtein distance variant */
6366
#include "levenshtein.c"
6367
#define LEVENSHTEIN_LESS_EQUAL
6368
#include "levenshtein.c"
6369
6370
6371
/*
6372
 * The following *ClosestMatch() functions can be used to determine whether a
6373
 * user-provided string resembles any known valid values, which is useful for
6374
 * providing hints in log messages, among other things.  Use these functions
6375
 * like so:
6376
 *
6377
 *    initClosestMatch(&state, source_string, max_distance);
6378
 *
6379
 *    for (int i = 0; i < num_valid_strings; i++)
6380
 *      updateClosestMatch(&state, valid_strings[i]);
6381
 *
6382
 *    closestMatch = getClosestMatch(&state);
6383
 */
6384
6385
/*
6386
 * Initialize the given state with the source string and maximum Levenshtein
6387
 * distance to consider.
6388
 */
6389
void
6390
initClosestMatch(ClosestMatchState *state, const char *source, int max_d)
6391
0
{
6392
0
  Assert(state);
6393
0
  Assert(max_d >= 0);
6394
6395
0
  state->source = source;
6396
0
  state->min_d = -1;
6397
0
  state->max_d = max_d;
6398
0
  state->match = NULL;
6399
0
}
6400
6401
/*
6402
 * If the candidate string is a closer match than the current one saved (or
6403
 * there is no match saved), save it as the closest match.
6404
 *
6405
 * If the source or candidate string is NULL, empty, or too long, this function
6406
 * takes no action.  Likewise, if the Levenshtein distance exceeds the maximum
6407
 * allowed or more than half the characters are different, no action is taken.
6408
 */
6409
void
6410
updateClosestMatch(ClosestMatchState *state, const char *candidate)
6411
0
{
6412
0
  int     dist;
6413
6414
0
  Assert(state);
6415
6416
0
  if (state->source == NULL || state->source[0] == '\0' ||
6417
0
    candidate == NULL || candidate[0] == '\0')
6418
0
    return;
6419
6420
  /*
6421
   * To avoid ERROR-ing, we check the lengths here instead of setting
6422
   * 'trusted' to false in the call to varstr_levenshtein_less_equal().
6423
   */
6424
0
  if (strlen(state->source) > MAX_LEVENSHTEIN_STRLEN ||
6425
0
    strlen(candidate) > MAX_LEVENSHTEIN_STRLEN)
6426
0
    return;
6427
6428
0
  dist = varstr_levenshtein_less_equal(state->source, strlen(state->source),
6429
0
                     candidate, strlen(candidate), 1, 1, 1,
6430
0
                     state->max_d, true);
6431
0
  if (dist <= state->max_d &&
6432
0
    dist <= strlen(state->source) / 2 &&
6433
0
    (state->min_d == -1 || dist < state->min_d))
6434
0
  {
6435
0
    state->min_d = dist;
6436
0
    state->match = candidate;
6437
0
  }
6438
0
}
6439
6440
/*
6441
 * Return the closest match.  If no suitable candidates were provided via
6442
 * updateClosestMatch(), return NULL.
6443
 */
6444
const char *
6445
getClosestMatch(ClosestMatchState *state)
6446
0
{
6447
0
  Assert(state);
6448
6449
0
  return state->match;
6450
0
}
6451
6452
6453
/*
6454
 * Unicode support
6455
 */
6456
6457
static UnicodeNormalizationForm
6458
unicode_norm_form_from_string(const char *formstr)
6459
0
{
6460
0
  UnicodeNormalizationForm form = -1;
6461
6462
  /*
6463
   * Might as well check this while we're here.
6464
   */
6465
0
  if (GetDatabaseEncoding() != PG_UTF8)
6466
0
    ereport(ERROR,
6467
0
        (errcode(ERRCODE_SYNTAX_ERROR),
6468
0
         errmsg("Unicode normalization can only be performed if server encoding is UTF8")));
6469
6470
0
  if (pg_strcasecmp(formstr, "NFC") == 0)
6471
0
    form = UNICODE_NFC;
6472
0
  else if (pg_strcasecmp(formstr, "NFD") == 0)
6473
0
    form = UNICODE_NFD;
6474
0
  else if (pg_strcasecmp(formstr, "NFKC") == 0)
6475
0
    form = UNICODE_NFKC;
6476
0
  else if (pg_strcasecmp(formstr, "NFKD") == 0)
6477
0
    form = UNICODE_NFKD;
6478
0
  else
6479
0
    ereport(ERROR,
6480
0
        (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6481
0
         errmsg("invalid normalization form: %s", formstr)));
6482
6483
0
  return form;
6484
0
}
6485
6486
/*
6487
 * Returns version of Unicode used by Postgres in "major.minor" format (the
6488
 * same format as the Unicode version reported by ICU). The third component
6489
 * ("update version") never involves additions to the character repertoire and
6490
 * is unimportant for most purposes.
6491
 *
6492
 * See: https://unicode.org/versions/
6493
 */
6494
Datum
6495
unicode_version(PG_FUNCTION_ARGS)
6496
0
{
6497
0
  PG_RETURN_TEXT_P(cstring_to_text(PG_UNICODE_VERSION));
6498
0
}
6499
6500
/*
6501
 * Returns version of Unicode used by ICU, if enabled; otherwise NULL.
6502
 */
6503
Datum
6504
icu_unicode_version(PG_FUNCTION_ARGS)
6505
0
{
6506
0
#ifdef USE_ICU
6507
0
  PG_RETURN_TEXT_P(cstring_to_text(U_UNICODE_VERSION));
6508
#else
6509
  PG_RETURN_NULL();
6510
#endif
6511
0
}
6512
6513
/*
6514
 * Check whether the string contains only assigned Unicode code
6515
 * points. Requires that the database encoding is UTF-8.
6516
 */
6517
Datum
6518
unicode_assigned(PG_FUNCTION_ARGS)
6519
0
{
6520
0
  text     *input = PG_GETARG_TEXT_PP(0);
6521
0
  unsigned char *p;
6522
0
  int     size;
6523
6524
0
  if (GetDatabaseEncoding() != PG_UTF8)
6525
0
    ereport(ERROR,
6526
0
        (errmsg("Unicode categorization can only be performed if server encoding is UTF8")));
6527
6528
  /* convert to pg_wchar */
6529
0
  size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
6530
0
  p = (unsigned char *) VARDATA_ANY(input);
6531
0
  for (int i = 0; i < size; i++)
6532
0
  {
6533
0
    pg_wchar  uchar = utf8_to_unicode(p);
6534
0
    int     category = unicode_category(uchar);
6535
6536
0
    if (category == PG_U_UNASSIGNED)
6537
0
      PG_RETURN_BOOL(false);
6538
6539
0
    p += pg_utf_mblen(p);
6540
0
  }
6541
6542
0
  PG_RETURN_BOOL(true);
6543
0
}
6544
6545
Datum
6546
unicode_normalize_func(PG_FUNCTION_ARGS)
6547
0
{
6548
0
  text     *input = PG_GETARG_TEXT_PP(0);
6549
0
  char     *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
6550
0
  UnicodeNormalizationForm form;
6551
0
  int     size;
6552
0
  pg_wchar   *input_chars;
6553
0
  pg_wchar   *output_chars;
6554
0
  unsigned char *p;
6555
0
  text     *result;
6556
0
  int     i;
6557
6558
0
  form = unicode_norm_form_from_string(formstr);
6559
6560
  /* convert to pg_wchar */
6561
0
  size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
6562
0
  input_chars = palloc((size + 1) * sizeof(pg_wchar));
6563
0
  p = (unsigned char *) VARDATA_ANY(input);
6564
0
  for (i = 0; i < size; i++)
6565
0
  {
6566
0
    input_chars[i] = utf8_to_unicode(p);
6567
0
    p += pg_utf_mblen(p);
6568
0
  }
6569
0
  input_chars[i] = (pg_wchar) '\0';
6570
0
  Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
6571
6572
  /* action */
6573
0
  output_chars = unicode_normalize(form, input_chars);
6574
6575
  /* convert back to UTF-8 string */
6576
0
  size = 0;
6577
0
  for (pg_wchar *wp = output_chars; *wp; wp++)
6578
0
  {
6579
0
    unsigned char buf[4];
6580
6581
0
    unicode_to_utf8(*wp, buf);
6582
0
    size += pg_utf_mblen(buf);
6583
0
  }
6584
6585
0
  result = palloc(size + VARHDRSZ);
6586
0
  SET_VARSIZE(result, size + VARHDRSZ);
6587
6588
0
  p = (unsigned char *) VARDATA_ANY(result);
6589
0
  for (pg_wchar *wp = output_chars; *wp; wp++)
6590
0
  {
6591
0
    unicode_to_utf8(*wp, p);
6592
0
    p += pg_utf_mblen(p);
6593
0
  }
6594
0
  Assert((char *) p == (char *) result + size + VARHDRSZ);
6595
6596
0
  PG_RETURN_TEXT_P(result);
6597
0
}
6598
6599
/*
6600
 * Check whether the string is in the specified Unicode normalization form.
6601
 *
6602
 * This is done by converting the string to the specified normal form and then
6603
 * comparing that to the original string.  To speed that up, we also apply the
6604
 * "quick check" algorithm specified in UAX #15, which can give a yes or no
6605
 * answer for many strings by just scanning the string once.
6606
 *
6607
 * This function should generally be optimized for the case where the string
6608
 * is in fact normalized.  In that case, we'll end up looking at the entire
6609
 * string, so it's probably not worth doing any incremental conversion etc.
6610
 */
6611
Datum
6612
unicode_is_normalized(PG_FUNCTION_ARGS)
6613
0
{
6614
0
  text     *input = PG_GETARG_TEXT_PP(0);
6615
0
  char     *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
6616
0
  UnicodeNormalizationForm form;
6617
0
  int     size;
6618
0
  pg_wchar   *input_chars;
6619
0
  pg_wchar   *output_chars;
6620
0
  unsigned char *p;
6621
0
  int     i;
6622
0
  UnicodeNormalizationQC quickcheck;
6623
0
  int     output_size;
6624
0
  bool    result;
6625
6626
0
  form = unicode_norm_form_from_string(formstr);
6627
6628
  /* convert to pg_wchar */
6629
0
  size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
6630
0
  input_chars = palloc((size + 1) * sizeof(pg_wchar));
6631
0
  p = (unsigned char *) VARDATA_ANY(input);
6632
0
  for (i = 0; i < size; i++)
6633
0
  {
6634
0
    input_chars[i] = utf8_to_unicode(p);
6635
0
    p += pg_utf_mblen(p);
6636
0
  }
6637
0
  input_chars[i] = (pg_wchar) '\0';
6638
0
  Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
6639
6640
  /* quick check (see UAX #15) */
6641
0
  quickcheck = unicode_is_normalized_quickcheck(form, input_chars);
6642
0
  if (quickcheck == UNICODE_NORM_QC_YES)
6643
0
    PG_RETURN_BOOL(true);
6644
0
  else if (quickcheck == UNICODE_NORM_QC_NO)
6645
0
    PG_RETURN_BOOL(false);
6646
6647
  /* normalize and compare with original */
6648
0
  output_chars = unicode_normalize(form, input_chars);
6649
6650
0
  output_size = 0;
6651
0
  for (pg_wchar *wp = output_chars; *wp; wp++)
6652
0
    output_size++;
6653
6654
0
  result = (size == output_size) &&
6655
0
    (memcmp(input_chars, output_chars, size * sizeof(pg_wchar)) == 0);
6656
6657
0
  PG_RETURN_BOOL(result);
6658
0
}
6659
6660
/*
 * Report whether each of the first n bytes of instr is a hexadecimal digit.
 *
 * Returns true when n is zero.  The caller must make sure that at least n
 * bytes are readable at instr.
 */
static bool
isxdigits_n(const char *instr, size_t n)
{
	const char *stop = instr + n;

	for (const char *c = instr; c < stop; c++)
	{
		/* cast first: <ctype.h> functions must not be given negative values */
		if (!isxdigit((unsigned char) *c))
			return false;
	}

	return true;
}
6672
6673
static unsigned int
6674
hexval(unsigned char c)
6675
0
{
6676
0
  if (c >= '0' && c <= '9')
6677
0
    return c - '0';
6678
0
  if (c >= 'a' && c <= 'f')
6679
0
    return c - 'a' + 0xA;
6680
0
  if (c >= 'A' && c <= 'F')
6681
0
    return c - 'A' + 0xA;
6682
0
  elog(ERROR, "invalid hexadecimal digit");
6683
0
  return 0;         /* not reached */
6684
0
}
6685
6686
/*
 * Translate string with hexadecimal digits to number
 *
 * Reads exactly n digits starting at instr, most significant digit first.
 * Each digit is converted by hexval(), which raises an error for a
 * character that is not a hexadecimal digit.  Returns 0 when n is zero.
 */
static unsigned int
hexval_n(const char *instr, size_t n)
{
	unsigned int value = 0;
	size_t		shift = 4 * n;

	/* every digit contributes four bits; the first digit is shifted furthest */
	for (const char *d = instr; shift > 0; d++)
	{
		shift -= 4;
		value |= hexval(*d) << shift;
	}

	return value;
}
6699
6700
/*
6701
 * Replaces Unicode escape sequences by Unicode characters
6702
 */
6703
Datum
6704
unistr(PG_FUNCTION_ARGS)
6705
0
{
6706
0
  text     *input_text = PG_GETARG_TEXT_PP(0);
6707
0
  char     *instr;
6708
0
  int     len;
6709
0
  StringInfoData str;
6710
0
  text     *result;
6711
0
  pg_wchar  pair_first = 0;
6712
0
  char    cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
6713
6714
0
  instr = VARDATA_ANY(input_text);
6715
0
  len = VARSIZE_ANY_EXHDR(input_text);
6716
6717
0
  initStringInfo(&str);
6718
6719
0
  while (len > 0)
6720
0
  {
6721
0
    if (instr[0] == '\\')
6722
0
    {
6723
0
      if (len >= 2 &&
6724
0
        instr[1] == '\\')
6725
0
      {
6726
0
        if (pair_first)
6727
0
          goto invalid_pair;
6728
0
        appendStringInfoChar(&str, '\\');
6729
0
        instr += 2;
6730
0
        len -= 2;
6731
0
      }
6732
0
      else if ((len >= 5 && isxdigits_n(instr + 1, 4)) ||
6733
0
           (len >= 6 && instr[1] == 'u' && isxdigits_n(instr + 2, 4)))
6734
0
      {
6735
0
        pg_wchar  unicode;
6736
0
        int     offset = instr[1] == 'u' ? 2 : 1;
6737
6738
0
        unicode = hexval_n(instr + offset, 4);
6739
6740
0
        if (!is_valid_unicode_codepoint(unicode))
6741
0
          ereport(ERROR,
6742
0
              errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6743
0
              errmsg("invalid Unicode code point: %04X", unicode));
6744
6745
0
        if (pair_first)
6746
0
        {
6747
0
          if (is_utf16_surrogate_second(unicode))
6748
0
          {
6749
0
            unicode = surrogate_pair_to_codepoint(pair_first, unicode);
6750
0
            pair_first = 0;
6751
0
          }
6752
0
          else
6753
0
            goto invalid_pair;
6754
0
        }
6755
0
        else if (is_utf16_surrogate_second(unicode))
6756
0
          goto invalid_pair;
6757
6758
0
        if (is_utf16_surrogate_first(unicode))
6759
0
          pair_first = unicode;
6760
0
        else
6761
0
        {
6762
0
          pg_unicode_to_server(unicode, (unsigned char *) cbuf);
6763
0
          appendStringInfoString(&str, cbuf);
6764
0
        }
6765
6766
0
        instr += 4 + offset;
6767
0
        len -= 4 + offset;
6768
0
      }
6769
0
      else if (len >= 8 && instr[1] == '+' && isxdigits_n(instr + 2, 6))
6770
0
      {
6771
0
        pg_wchar  unicode;
6772
6773
0
        unicode = hexval_n(instr + 2, 6);
6774
6775
0
        if (!is_valid_unicode_codepoint(unicode))
6776
0
          ereport(ERROR,
6777
0
              errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6778
0
              errmsg("invalid Unicode code point: %04X", unicode));
6779
6780
0
        if (pair_first)
6781
0
        {
6782
0
          if (is_utf16_surrogate_second(unicode))
6783
0
          {
6784
0
            unicode = surrogate_pair_to_codepoint(pair_first, unicode);
6785
0
            pair_first = 0;
6786
0
          }
6787
0
          else
6788
0
            goto invalid_pair;
6789
0
        }
6790
0
        else if (is_utf16_surrogate_second(unicode))
6791
0
          goto invalid_pair;
6792
6793
0
        if (is_utf16_surrogate_first(unicode))
6794
0
          pair_first = unicode;
6795
0
        else
6796
0
        {
6797
0
          pg_unicode_to_server(unicode, (unsigned char *) cbuf);
6798
0
          appendStringInfoString(&str, cbuf);
6799
0
        }
6800
6801
0
        instr += 8;
6802
0
        len -= 8;
6803
0
      }
6804
0
      else if (len >= 10 && instr[1] == 'U' && isxdigits_n(instr + 2, 8))
6805
0
      {
6806
0
        pg_wchar  unicode;
6807
6808
0
        unicode = hexval_n(instr + 2, 8);
6809
6810
0
        if (!is_valid_unicode_codepoint(unicode))
6811
0
          ereport(ERROR,
6812
0
              errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6813
0
              errmsg("invalid Unicode code point: %04X", unicode));
6814
6815
0
        if (pair_first)
6816
0
        {
6817
0
          if (is_utf16_surrogate_second(unicode))
6818
0
          {
6819
0
            unicode = surrogate_pair_to_codepoint(pair_first, unicode);
6820
0
            pair_first = 0;
6821
0
          }
6822
0
          else
6823
0
            goto invalid_pair;
6824
0
        }
6825
0
        else if (is_utf16_surrogate_second(unicode))
6826
0
          goto invalid_pair;
6827
6828
0
        if (is_utf16_surrogate_first(unicode))
6829
0
          pair_first = unicode;
6830
0
        else
6831
0
        {
6832
0
          pg_unicode_to_server(unicode, (unsigned char *) cbuf);
6833
0
          appendStringInfoString(&str, cbuf);
6834
0
        }
6835
6836
0
        instr += 10;
6837
0
        len -= 10;
6838
0
      }
6839
0
      else
6840
0
        ereport(ERROR,
6841
0
            (errcode(ERRCODE_SYNTAX_ERROR),
6842
0
             errmsg("invalid Unicode escape"),
6843
0
             errhint("Unicode escapes must be \\XXXX, \\+XXXXXX, \\uXXXX, or \\UXXXXXXXX.")));
6844
0
    }
6845
0
    else
6846
0
    {
6847
0
      if (pair_first)
6848
0
        goto invalid_pair;
6849
6850
0
      appendStringInfoChar(&str, *instr++);
6851
0
      len--;
6852
0
    }
6853
0
  }
6854
6855
  /* unfinished surrogate pair? */
6856
0
  if (pair_first)
6857
0
    goto invalid_pair;
6858
6859
0
  result = cstring_to_text_with_len(str.data, str.len);
6860
0
  pfree(str.data);
6861
6862
0
  PG_RETURN_TEXT_P(result);
6863
6864
0
invalid_pair:
6865
0
  ereport(ERROR,
6866
0
      (errcode(ERRCODE_SYNTAX_ERROR),
6867
0
       errmsg("invalid Unicode surrogate pair")));
6868
0
  PG_RETURN_NULL();     /* keep compiler quiet */
6869
0
}