/src/postgres/src/backend/utils/adt/varlena.c
Line | Count | Source (jump to first uncovered line) |
1 | | /*------------------------------------------------------------------------- |
2 | | * |
3 | | * varlena.c |
4 | | * Functions for the variable-length built-in types. |
5 | | * |
6 | | * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group |
7 | | * Portions Copyright (c) 1994, Regents of the University of California |
8 | | * |
9 | | * |
10 | | * IDENTIFICATION |
11 | | * src/backend/utils/adt/varlena.c |
12 | | * |
13 | | *------------------------------------------------------------------------- |
14 | | */ |
15 | | #include "postgres.h" |
16 | | |
17 | | #include <ctype.h> |
18 | | #include <limits.h> |
19 | | |
20 | | #include "access/detoast.h" |
21 | | #include "access/toast_compression.h" |
22 | | #include "catalog/pg_collation.h" |
23 | | #include "catalog/pg_type.h" |
24 | | #include "common/hashfn.h" |
25 | | #include "common/int.h" |
26 | | #include "common/unicode_category.h" |
27 | | #include "common/unicode_norm.h" |
28 | | #include "common/unicode_version.h" |
29 | | #include "funcapi.h" |
30 | | #include "lib/hyperloglog.h" |
31 | | #include "libpq/pqformat.h" |
32 | | #include "miscadmin.h" |
33 | | #include "nodes/execnodes.h" |
34 | | #include "parser/scansup.h" |
35 | | #include "port/pg_bswap.h" |
36 | | #include "regex/regex.h" |
37 | | #include "utils/builtins.h" |
38 | | #include "utils/bytea.h" |
39 | | #include "utils/guc.h" |
40 | | #include "utils/lsyscache.h" |
41 | | #include "utils/memutils.h" |
42 | | #include "utils/pg_locale.h" |
43 | | #include "utils/sortsupport.h" |
44 | | #include "utils/varlena.h" |
45 | | |
46 | | |
47 | | /* GUC variable */ |
48 | | int bytea_output = BYTEA_OUTPUT_HEX; |
49 | | |
50 | | typedef struct varlena VarString; |
51 | | |
/*
 * State for text_position_* functions.
 *
 * Carries the haystack/needle pair being searched, the skip table used by
 * the Boyer-Moore-Horspool search, and bookkeeping about the most recent
 * match.
 */
typedef struct
{
	pg_locale_t locale;			/* collation used for substring matching */
	bool		is_multibyte_char_in_char;	/* need to check char boundaries? */
	bool		greedy;			/* find longest possible substring? */

	char	   *str1;			/* haystack string */
	char	   *str2;			/* needle string */
	int			len1;			/* length of str1, in bytes */
	int			len2;			/* length of str2, in bytes */

	/* Skip table for Boyer-Moore-Horspool search algorithm: */
	int			skiptablemask;	/* mask for ANDing with skiptable subscripts */
	int			skiptable[256]; /* skip distance for given mismatched char */

	/*
	 * Note that with nondeterministic collations, the length of the last
	 * match is not necessarily equal to the length of the "needle" passed in.
	 */
	char	   *last_match;		/* pointer to last match in 'str1' */
	int			last_match_len; /* length of last match, in bytes (presumably;
								 * confirm against text_position_next) */
	int			last_match_len_tmp; /* same but for internal use */

	/*
	 * Sometimes we need to convert the byte position of a match to a
	 * character position.  These store the last position that was converted,
	 * so that on the next call, we can continue from that point, rather than
	 * count characters from the very beginning.
	 */
	char	   *refpoint;		/* pointer within original haystack string */
	int			refpos;			/* 0-based character offset of the same point */
} TextPositionState;
87 | | |
/*
 * Per-sort working state for the varstr sort-support comparators declared
 * in this file (varstrfastcmp_*, bpcharfastcmp_c, namefastcmp_*, and the
 * abbreviated-key routines).
 */
typedef struct
{
	char	   *buf1;			/* 1st string, or abbreviation original string
								 * buf */
	char	   *buf2;			/* 2nd string, or abbreviation strxfrm() buf */
	int			buflen1;		/* Allocated length of buf1 */
	int			buflen2;		/* Allocated length of buf2 */
	int			last_len1;		/* Length of last buf1 string/strxfrm() input */
	int			last_len2;		/* Length of last buf2 string/strxfrm() blob */
	int			last_returned;	/* Last comparison result (cache) */
	bool		cache_blob;		/* Does buf2 contain strxfrm() blob, etc? */
	bool		collate_c;		/* NOTE(review): presumably true when sorting
								 * under the C collation -- confirm against
								 * the sort-support setup code */
	Oid			typid;			/* Actual datatype (text/bpchar/bytea/name) */
	hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
	hyperLogLogState full_card; /* Full key cardinality state */
	double		prop_card;		/* Required cardinality proportion */
	pg_locale_t locale;			/* NOTE(review): presumably the locale used
								 * for non-C comparisons -- confirm */
} VarStringSortSupport;
106 | | |
/*
 * Output data for split_text(): we output either to an array or a table.
 * tupstore and tupdesc must be set up in advance to output to a table.
 */
typedef struct
{
	ArrayBuildState *astate;	/* accumulator used for array output */
	Tuplestorestate *tupstore;	/* destination tuplestore for table output */
	TupleDesc	tupdesc;		/* row descriptor paired with tupstore */
} SplitTextOutputData;
117 | | |
118 | | /* |
119 | | * This should be large enough that most strings will fit, but small enough |
120 | | * that we feel comfortable putting it on the stack |
121 | | */ |
122 | 0 | #define TEXTBUFLEN 1024 |
123 | | |
124 | | #define DatumGetVarStringP(X) ((VarString *) PG_DETOAST_DATUM(X)) |
125 | 0 | #define DatumGetVarStringPP(X) ((VarString *) PG_DETOAST_DATUM_PACKED(X)) |
126 | | |
127 | | static int varstrfastcmp_c(Datum x, Datum y, SortSupport ssup); |
128 | | static int bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup); |
129 | | static int namefastcmp_c(Datum x, Datum y, SortSupport ssup); |
130 | | static int varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup); |
131 | | static int namefastcmp_locale(Datum x, Datum y, SortSupport ssup); |
132 | | static int varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup); |
133 | | static Datum varstr_abbrev_convert(Datum original, SortSupport ssup); |
134 | | static bool varstr_abbrev_abort(int memtupcount, SortSupport ssup); |
135 | | static int32 text_length(Datum str); |
136 | | static text *text_catenate(text *t1, text *t2); |
137 | | static text *text_substring(Datum str, |
138 | | int32 start, |
139 | | int32 length, |
140 | | bool length_not_specified); |
141 | | static text *text_overlay(text *t1, text *t2, int sp, int sl); |
142 | | static int text_position(text *t1, text *t2, Oid collid); |
143 | | static void text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state); |
144 | | static bool text_position_next(TextPositionState *state); |
145 | | static char *text_position_next_internal(char *start_ptr, TextPositionState *state); |
146 | | static char *text_position_get_match_ptr(TextPositionState *state); |
147 | | static int text_position_get_match_pos(TextPositionState *state); |
148 | | static void text_position_cleanup(TextPositionState *state); |
149 | | static void check_collation_set(Oid collid); |
150 | | static int text_cmp(text *arg1, text *arg2, Oid collid); |
151 | | static bytea *bytea_catenate(bytea *t1, bytea *t2); |
152 | | static bytea *bytea_substring(Datum str, |
153 | | int S, |
154 | | int L, |
155 | | bool length_not_specified); |
156 | | static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl); |
157 | | static void appendStringInfoText(StringInfo str, const text *t); |
158 | | static bool split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate); |
159 | | static void split_text_accum_result(SplitTextOutputData *tstate, |
160 | | text *field_value, |
161 | | text *null_string, |
162 | | Oid collation); |
163 | | static text *array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v, |
164 | | const char *fldsep, const char *null_string); |
165 | | static StringInfo makeStringAggState(FunctionCallInfo fcinfo); |
166 | | static bool text_format_parse_digits(const char **ptr, const char *end_ptr, |
167 | | int *value); |
168 | | static const char *text_format_parse_format(const char *start_ptr, |
169 | | const char *end_ptr, |
170 | | int *argpos, int *widthpos, |
171 | | int *flags, int *width); |
172 | | static void text_format_string_conversion(StringInfo buf, char conversion, |
173 | | FmgrInfo *typOutputInfo, |
174 | | Datum value, bool isNull, |
175 | | int flags, int width); |
176 | | static void text_format_append_string(StringInfo buf, const char *str, |
177 | | int flags, int width); |
178 | | |
179 | | |
180 | | /***************************************************************************** |
181 | | * CONVERSION ROUTINES EXPORTED FOR USE BY C CODE * |
182 | | *****************************************************************************/ |
183 | | |
184 | | /* |
185 | | * cstring_to_text |
186 | | * |
187 | | * Create a text value from a null-terminated C string. |
188 | | * |
189 | | * The new text value is freshly palloc'd with a full-size VARHDR. |
190 | | */ |
191 | | text * |
192 | | cstring_to_text(const char *s) |
193 | 0 | { |
194 | 0 | return cstring_to_text_with_len(s, strlen(s)); |
195 | 0 | } |
196 | | |
197 | | /* |
198 | | * cstring_to_text_with_len |
199 | | * |
200 | | * Same as cstring_to_text except the caller specifies the string length; |
201 | | * the string need not be null_terminated. |
202 | | */ |
203 | | text * |
204 | | cstring_to_text_with_len(const char *s, int len) |
205 | 0 | { |
206 | 0 | text *result = (text *) palloc(len + VARHDRSZ); |
207 | |
|
208 | 0 | SET_VARSIZE(result, len + VARHDRSZ); |
209 | 0 | memcpy(VARDATA(result), s, len); |
210 | |
|
211 | 0 | return result; |
212 | 0 | } |
213 | | |
214 | | /* |
215 | | * text_to_cstring |
216 | | * |
217 | | * Create a palloc'd, null-terminated C string from a text value. |
218 | | * |
219 | | * We support being passed a compressed or toasted text value. |
220 | | * This is a bit bogus since such values shouldn't really be referred to as |
221 | | * "text *", but it seems useful for robustness. If we didn't handle that |
222 | | * case here, we'd need another routine that did, anyway. |
223 | | */ |
224 | | char * |
225 | | text_to_cstring(const text *t) |
226 | 0 | { |
227 | | /* must cast away the const, unfortunately */ |
228 | 0 | text *tunpacked = pg_detoast_datum_packed(unconstify(text *, t)); |
229 | 0 | int len = VARSIZE_ANY_EXHDR(tunpacked); |
230 | 0 | char *result; |
231 | |
|
232 | 0 | result = (char *) palloc(len + 1); |
233 | 0 | memcpy(result, VARDATA_ANY(tunpacked), len); |
234 | 0 | result[len] = '\0'; |
235 | |
|
236 | 0 | if (tunpacked != t) |
237 | 0 | pfree(tunpacked); |
238 | |
|
239 | 0 | return result; |
240 | 0 | } |
241 | | |
242 | | /* |
243 | | * text_to_cstring_buffer |
244 | | * |
245 | | * Copy a text value into a caller-supplied buffer of size dst_len. |
246 | | * |
247 | | * The text string is truncated if necessary to fit. The result is |
248 | | * guaranteed null-terminated (unless dst_len == 0). |
249 | | * |
250 | | * We support being passed a compressed or toasted text value. |
251 | | * This is a bit bogus since such values shouldn't really be referred to as |
252 | | * "text *", but it seems useful for robustness. If we didn't handle that |
253 | | * case here, we'd need another routine that did, anyway. |
254 | | */ |
255 | | void |
256 | | text_to_cstring_buffer(const text *src, char *dst, size_t dst_len) |
257 | 0 | { |
258 | | /* must cast away the const, unfortunately */ |
259 | 0 | text *srcunpacked = pg_detoast_datum_packed(unconstify(text *, src)); |
260 | 0 | size_t src_len = VARSIZE_ANY_EXHDR(srcunpacked); |
261 | |
|
262 | 0 | if (dst_len > 0) |
263 | 0 | { |
264 | 0 | dst_len--; |
265 | 0 | if (dst_len >= src_len) |
266 | 0 | dst_len = src_len; |
267 | 0 | else /* ensure truncation is encoding-safe */ |
268 | 0 | dst_len = pg_mbcliplen(VARDATA_ANY(srcunpacked), src_len, dst_len); |
269 | 0 | memcpy(dst, VARDATA_ANY(srcunpacked), dst_len); |
270 | 0 | dst[dst_len] = '\0'; |
271 | 0 | } |
272 | |
|
273 | 0 | if (srcunpacked != src) |
274 | 0 | pfree(srcunpacked); |
275 | 0 | } |
276 | | |
277 | | |
278 | | /***************************************************************************** |
279 | | * USER I/O ROUTINES * |
280 | | *****************************************************************************/ |
281 | | |
282 | | |
283 | 0 | #define VAL(CH) ((CH) - '0') |
284 | 0 | #define DIG(VAL) ((VAL) + '0') |
285 | | |
286 | | /* |
287 | | * byteain - converts from printable representation of byte array |
288 | | * |
289 | | * Non-printable characters must be passed as '\nnn' (octal) and are |
290 | | * converted to internal form. '\' must be passed as '\\'. |
291 | | * ereport(ERROR, ...) if bad form. |
292 | | * |
293 | | * BUGS: |
294 | | * The input is scanned twice. |
295 | | * The error checking of input is minimal. |
296 | | */ |
297 | | Datum |
298 | | byteain(PG_FUNCTION_ARGS) |
299 | 0 | { |
300 | 0 | char *inputText = PG_GETARG_CSTRING(0); |
301 | 0 | Node *escontext = fcinfo->context; |
302 | 0 | char *tp; |
303 | 0 | char *rp; |
304 | 0 | int bc; |
305 | 0 | bytea *result; |
306 | | |
307 | | /* Recognize hex input */ |
308 | 0 | if (inputText[0] == '\\' && inputText[1] == 'x') |
309 | 0 | { |
310 | 0 | size_t len = strlen(inputText); |
311 | |
|
312 | 0 | bc = (len - 2) / 2 + VARHDRSZ; /* maximum possible length */ |
313 | 0 | result = palloc(bc); |
314 | 0 | bc = hex_decode_safe(inputText + 2, len - 2, VARDATA(result), |
315 | 0 | escontext); |
316 | 0 | SET_VARSIZE(result, bc + VARHDRSZ); /* actual length */ |
317 | |
|
318 | 0 | PG_RETURN_BYTEA_P(result); |
319 | 0 | } |
320 | | |
321 | | /* Else, it's the traditional escaped style */ |
322 | 0 | for (bc = 0, tp = inputText; *tp != '\0'; bc++) |
323 | 0 | { |
324 | 0 | if (tp[0] != '\\') |
325 | 0 | tp++; |
326 | 0 | else if ((tp[0] == '\\') && |
327 | 0 | (tp[1] >= '0' && tp[1] <= '3') && |
328 | 0 | (tp[2] >= '0' && tp[2] <= '7') && |
329 | 0 | (tp[3] >= '0' && tp[3] <= '7')) |
330 | 0 | tp += 4; |
331 | 0 | else if ((tp[0] == '\\') && |
332 | 0 | (tp[1] == '\\')) |
333 | 0 | tp += 2; |
334 | 0 | else |
335 | 0 | { |
336 | | /* |
337 | | * one backslash, not followed by another or ### valid octal |
338 | | */ |
339 | 0 | ereturn(escontext, (Datum) 0, |
340 | 0 | (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), |
341 | 0 | errmsg("invalid input syntax for type %s", "bytea"))); |
342 | 0 | } |
343 | 0 | } |
344 | | |
345 | 0 | bc += VARHDRSZ; |
346 | |
|
347 | 0 | result = (bytea *) palloc(bc); |
348 | 0 | SET_VARSIZE(result, bc); |
349 | |
|
350 | 0 | tp = inputText; |
351 | 0 | rp = VARDATA(result); |
352 | 0 | while (*tp != '\0') |
353 | 0 | { |
354 | 0 | if (tp[0] != '\\') |
355 | 0 | *rp++ = *tp++; |
356 | 0 | else if ((tp[0] == '\\') && |
357 | 0 | (tp[1] >= '0' && tp[1] <= '3') && |
358 | 0 | (tp[2] >= '0' && tp[2] <= '7') && |
359 | 0 | (tp[3] >= '0' && tp[3] <= '7')) |
360 | 0 | { |
361 | 0 | bc = VAL(tp[1]); |
362 | 0 | bc <<= 3; |
363 | 0 | bc += VAL(tp[2]); |
364 | 0 | bc <<= 3; |
365 | 0 | *rp++ = bc + VAL(tp[3]); |
366 | |
|
367 | 0 | tp += 4; |
368 | 0 | } |
369 | 0 | else if ((tp[0] == '\\') && |
370 | 0 | (tp[1] == '\\')) |
371 | 0 | { |
372 | 0 | *rp++ = '\\'; |
373 | 0 | tp += 2; |
374 | 0 | } |
375 | 0 | else |
376 | 0 | { |
377 | | /* |
378 | | * We should never get here. The first pass should not allow it. |
379 | | */ |
380 | 0 | ereturn(escontext, (Datum) 0, |
381 | 0 | (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), |
382 | 0 | errmsg("invalid input syntax for type %s", "bytea"))); |
383 | 0 | } |
384 | 0 | } |
385 | | |
386 | 0 | PG_RETURN_BYTEA_P(result); |
387 | 0 | } |
388 | | |
389 | | /* |
390 | | * byteaout - converts to printable representation of byte array |
391 | | * |
392 | | * In the traditional escaped format, non-printable characters are |
393 | | * printed as '\nnn' (octal) and '\' as '\\'. |
394 | | */ |
395 | | Datum |
396 | | byteaout(PG_FUNCTION_ARGS) |
397 | 0 | { |
398 | 0 | bytea *vlena = PG_GETARG_BYTEA_PP(0); |
399 | 0 | char *result; |
400 | 0 | char *rp; |
401 | |
|
402 | 0 | if (bytea_output == BYTEA_OUTPUT_HEX) |
403 | 0 | { |
404 | | /* Print hex format */ |
405 | 0 | rp = result = palloc(VARSIZE_ANY_EXHDR(vlena) * 2 + 2 + 1); |
406 | 0 | *rp++ = '\\'; |
407 | 0 | *rp++ = 'x'; |
408 | 0 | rp += hex_encode(VARDATA_ANY(vlena), VARSIZE_ANY_EXHDR(vlena), rp); |
409 | 0 | } |
410 | 0 | else if (bytea_output == BYTEA_OUTPUT_ESCAPE) |
411 | 0 | { |
412 | | /* Print traditional escaped format */ |
413 | 0 | char *vp; |
414 | 0 | uint64 len; |
415 | 0 | int i; |
416 | |
|
417 | 0 | len = 1; /* empty string has 1 char */ |
418 | 0 | vp = VARDATA_ANY(vlena); |
419 | 0 | for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++) |
420 | 0 | { |
421 | 0 | if (*vp == '\\') |
422 | 0 | len += 2; |
423 | 0 | else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e) |
424 | 0 | len += 4; |
425 | 0 | else |
426 | 0 | len++; |
427 | 0 | } |
428 | | |
429 | | /* |
430 | | * In principle len can't overflow uint32 if the input fit in 1GB, but |
431 | | * for safety let's check rather than relying on palloc's internal |
432 | | * check. |
433 | | */ |
434 | 0 | if (len > MaxAllocSize) |
435 | 0 | ereport(ERROR, |
436 | 0 | (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), |
437 | 0 | errmsg_internal("result of bytea output conversion is too large"))); |
438 | 0 | rp = result = (char *) palloc(len); |
439 | |
|
440 | 0 | vp = VARDATA_ANY(vlena); |
441 | 0 | for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++) |
442 | 0 | { |
443 | 0 | if (*vp == '\\') |
444 | 0 | { |
445 | 0 | *rp++ = '\\'; |
446 | 0 | *rp++ = '\\'; |
447 | 0 | } |
448 | 0 | else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e) |
449 | 0 | { |
450 | 0 | int val; /* holds unprintable chars */ |
451 | |
|
452 | 0 | val = *vp; |
453 | 0 | rp[0] = '\\'; |
454 | 0 | rp[3] = DIG(val & 07); |
455 | 0 | val >>= 3; |
456 | 0 | rp[2] = DIG(val & 07); |
457 | 0 | val >>= 3; |
458 | 0 | rp[1] = DIG(val & 03); |
459 | 0 | rp += 4; |
460 | 0 | } |
461 | 0 | else |
462 | 0 | *rp++ = *vp; |
463 | 0 | } |
464 | 0 | } |
465 | 0 | else |
466 | 0 | { |
467 | 0 | elog(ERROR, "unrecognized \"bytea_output\" setting: %d", |
468 | 0 | bytea_output); |
469 | 0 | rp = result = NULL; /* keep compiler quiet */ |
470 | 0 | } |
471 | 0 | *rp = '\0'; |
472 | 0 | PG_RETURN_CSTRING(result); |
473 | 0 | } |
474 | | |
475 | | /* |
476 | | * bytearecv - converts external binary format to bytea |
477 | | */ |
478 | | Datum |
479 | | bytearecv(PG_FUNCTION_ARGS) |
480 | 0 | { |
481 | 0 | StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); |
482 | 0 | bytea *result; |
483 | 0 | int nbytes; |
484 | |
|
485 | 0 | nbytes = buf->len - buf->cursor; |
486 | 0 | result = (bytea *) palloc(nbytes + VARHDRSZ); |
487 | 0 | SET_VARSIZE(result, nbytes + VARHDRSZ); |
488 | 0 | pq_copymsgbytes(buf, VARDATA(result), nbytes); |
489 | 0 | PG_RETURN_BYTEA_P(result); |
490 | 0 | } |
491 | | |
492 | | /* |
493 | | * byteasend - converts bytea to binary format |
494 | | * |
495 | | * This is a special case: just copy the input... |
496 | | */ |
497 | | Datum |
498 | | byteasend(PG_FUNCTION_ARGS) |
499 | 0 | { |
500 | 0 | bytea *vlena = PG_GETARG_BYTEA_P_COPY(0); |
501 | |
|
502 | 0 | PG_RETURN_BYTEA_P(vlena); |
503 | 0 | } |
504 | | |
505 | | Datum |
506 | | bytea_string_agg_transfn(PG_FUNCTION_ARGS) |
507 | 0 | { |
508 | 0 | StringInfo state; |
509 | |
|
510 | 0 | state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0); |
511 | | |
512 | | /* Append the value unless null, preceding it with the delimiter. */ |
513 | 0 | if (!PG_ARGISNULL(1)) |
514 | 0 | { |
515 | 0 | bytea *value = PG_GETARG_BYTEA_PP(1); |
516 | 0 | bool isfirst = false; |
517 | | |
518 | | /* |
519 | | * You might think we can just throw away the first delimiter, however |
520 | | * we must keep it as we may be a parallel worker doing partial |
521 | | * aggregation building a state to send to the main process. We need |
522 | | * to keep the delimiter of every aggregation so that the combine |
523 | | * function can properly join up the strings of two separately |
524 | | * partially aggregated results. The first delimiter is only stripped |
525 | | * off in the final function. To know how much to strip off the front |
526 | | * of the string, we store the length of the first delimiter in the |
527 | | * StringInfo's cursor field, which we don't otherwise need here. |
528 | | */ |
529 | 0 | if (state == NULL) |
530 | 0 | { |
531 | 0 | state = makeStringAggState(fcinfo); |
532 | 0 | isfirst = true; |
533 | 0 | } |
534 | |
|
535 | 0 | if (!PG_ARGISNULL(2)) |
536 | 0 | { |
537 | 0 | bytea *delim = PG_GETARG_BYTEA_PP(2); |
538 | |
|
539 | 0 | appendBinaryStringInfo(state, VARDATA_ANY(delim), |
540 | 0 | VARSIZE_ANY_EXHDR(delim)); |
541 | 0 | if (isfirst) |
542 | 0 | state->cursor = VARSIZE_ANY_EXHDR(delim); |
543 | 0 | } |
544 | |
|
545 | 0 | appendBinaryStringInfo(state, VARDATA_ANY(value), |
546 | 0 | VARSIZE_ANY_EXHDR(value)); |
547 | 0 | } |
548 | | |
549 | | /* |
550 | | * The transition type for string_agg() is declared to be "internal", |
551 | | * which is a pass-by-value type the same size as a pointer. |
552 | | */ |
553 | 0 | if (state) |
554 | 0 | PG_RETURN_POINTER(state); |
555 | 0 | PG_RETURN_NULL(); |
556 | 0 | } |
557 | | |
558 | | Datum |
559 | | bytea_string_agg_finalfn(PG_FUNCTION_ARGS) |
560 | 0 | { |
561 | 0 | StringInfo state; |
562 | | |
563 | | /* cannot be called directly because of internal-type argument */ |
564 | 0 | Assert(AggCheckCallContext(fcinfo, NULL)); |
565 | |
|
566 | 0 | state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0); |
567 | |
|
568 | 0 | if (state != NULL) |
569 | 0 | { |
570 | | /* As per comment in transfn, strip data before the cursor position */ |
571 | 0 | bytea *result; |
572 | 0 | int strippedlen = state->len - state->cursor; |
573 | |
|
574 | 0 | result = (bytea *) palloc(strippedlen + VARHDRSZ); |
575 | 0 | SET_VARSIZE(result, strippedlen + VARHDRSZ); |
576 | 0 | memcpy(VARDATA(result), &state->data[state->cursor], strippedlen); |
577 | 0 | PG_RETURN_BYTEA_P(result); |
578 | 0 | } |
579 | 0 | else |
580 | 0 | PG_RETURN_NULL(); |
581 | 0 | } |
582 | | |
583 | | /* |
584 | | * textin - converts cstring to internal representation |
585 | | */ |
586 | | Datum |
587 | | textin(PG_FUNCTION_ARGS) |
588 | 0 | { |
589 | 0 | char *inputText = PG_GETARG_CSTRING(0); |
590 | |
|
591 | 0 | PG_RETURN_TEXT_P(cstring_to_text(inputText)); |
592 | 0 | } |
593 | | |
594 | | /* |
595 | | * textout - converts internal representation to cstring |
596 | | */ |
597 | | Datum |
598 | | textout(PG_FUNCTION_ARGS) |
599 | 0 | { |
600 | 0 | Datum txt = PG_GETARG_DATUM(0); |
601 | |
|
602 | 0 | PG_RETURN_CSTRING(TextDatumGetCString(txt)); |
603 | 0 | } |
604 | | |
605 | | /* |
606 | | * textrecv - converts external binary format to text |
607 | | */ |
608 | | Datum |
609 | | textrecv(PG_FUNCTION_ARGS) |
610 | 0 | { |
611 | 0 | StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); |
612 | 0 | text *result; |
613 | 0 | char *str; |
614 | 0 | int nbytes; |
615 | |
|
616 | 0 | str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes); |
617 | |
|
618 | 0 | result = cstring_to_text_with_len(str, nbytes); |
619 | 0 | pfree(str); |
620 | 0 | PG_RETURN_TEXT_P(result); |
621 | 0 | } |
622 | | |
623 | | /* |
624 | | * textsend - converts text to binary format |
625 | | */ |
626 | | Datum |
627 | | textsend(PG_FUNCTION_ARGS) |
628 | 0 | { |
629 | 0 | text *t = PG_GETARG_TEXT_PP(0); |
630 | 0 | StringInfoData buf; |
631 | |
|
632 | 0 | pq_begintypsend(&buf); |
633 | 0 | pq_sendtext(&buf, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t)); |
634 | 0 | PG_RETURN_BYTEA_P(pq_endtypsend(&buf)); |
635 | 0 | } |
636 | | |
637 | | |
638 | | /* |
639 | | * unknownin - converts cstring to internal representation |
640 | | */ |
641 | | Datum |
642 | | unknownin(PG_FUNCTION_ARGS) |
643 | 0 | { |
644 | 0 | char *str = PG_GETARG_CSTRING(0); |
645 | | |
646 | | /* representation is same as cstring */ |
647 | 0 | PG_RETURN_CSTRING(pstrdup(str)); |
648 | 0 | } |
649 | | |
650 | | /* |
651 | | * unknownout - converts internal representation to cstring |
652 | | */ |
653 | | Datum |
654 | | unknownout(PG_FUNCTION_ARGS) |
655 | 0 | { |
656 | | /* representation is same as cstring */ |
657 | 0 | char *str = PG_GETARG_CSTRING(0); |
658 | |
|
659 | 0 | PG_RETURN_CSTRING(pstrdup(str)); |
660 | 0 | } |
661 | | |
662 | | /* |
663 | | * unknownrecv - converts external binary format to unknown |
664 | | */ |
665 | | Datum |
666 | | unknownrecv(PG_FUNCTION_ARGS) |
667 | 0 | { |
668 | 0 | StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); |
669 | 0 | char *str; |
670 | 0 | int nbytes; |
671 | |
|
672 | 0 | str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes); |
673 | | /* representation is same as cstring */ |
674 | 0 | PG_RETURN_CSTRING(str); |
675 | 0 | } |
676 | | |
677 | | /* |
678 | | * unknownsend - converts unknown to binary format |
679 | | */ |
680 | | Datum |
681 | | unknownsend(PG_FUNCTION_ARGS) |
682 | 0 | { |
683 | | /* representation is same as cstring */ |
684 | 0 | char *str = PG_GETARG_CSTRING(0); |
685 | 0 | StringInfoData buf; |
686 | |
|
687 | 0 | pq_begintypsend(&buf); |
688 | 0 | pq_sendtext(&buf, str, strlen(str)); |
689 | 0 | PG_RETURN_BYTEA_P(pq_endtypsend(&buf)); |
690 | 0 | } |
691 | | |
692 | | |
693 | | /* ========== PUBLIC ROUTINES ========== */ |
694 | | |
695 | | /* |
696 | | * textlen - |
697 | | * returns the logical length of a text* |
698 | | * (which is less than the VARSIZE of the text*) |
699 | | */ |
700 | | Datum |
701 | | textlen(PG_FUNCTION_ARGS) |
702 | 0 | { |
703 | 0 | Datum str = PG_GETARG_DATUM(0); |
704 | | |
705 | | /* try to avoid decompressing argument */ |
706 | 0 | PG_RETURN_INT32(text_length(str)); |
707 | 0 | } |
708 | | |
709 | | /* |
710 | | * text_length - |
711 | | * Does the real work for textlen() |
712 | | * |
713 | | * This is broken out so it can be called directly by other string processing |
714 | | * functions. Note that the argument is passed as a Datum, to indicate that |
715 | | * it may still be in compressed form. We can avoid decompressing it at all |
716 | | * in some cases. |
717 | | */ |
718 | | static int32 |
719 | | text_length(Datum str) |
720 | 0 | { |
721 | | /* fastpath when max encoding length is one */ |
722 | 0 | if (pg_database_encoding_max_length() == 1) |
723 | 0 | PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ); |
724 | 0 | else |
725 | 0 | { |
726 | 0 | text *t = DatumGetTextPP(str); |
727 | |
|
728 | 0 | PG_RETURN_INT32(pg_mbstrlen_with_len(VARDATA_ANY(t), |
729 | 0 | VARSIZE_ANY_EXHDR(t))); |
730 | 0 | } |
731 | 0 | } |
732 | | |
733 | | /* |
734 | | * textoctetlen - |
735 | | * returns the physical length of a text* |
736 | | * (which is less than the VARSIZE of the text*) |
737 | | */ |
738 | | Datum |
739 | | textoctetlen(PG_FUNCTION_ARGS) |
740 | 0 | { |
741 | 0 | Datum str = PG_GETARG_DATUM(0); |
742 | | |
743 | | /* We need not detoast the input at all */ |
744 | 0 | PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ); |
745 | 0 | } |
746 | | |
747 | | /* |
748 | | * textcat - |
749 | | * takes two text* and returns a text* that is the concatenation of |
750 | | * the two. |
751 | | * |
752 | | * Rewritten by Sapa, sapa@hq.icb.chel.su. 8-Jul-96. |
753 | | * Updated by Thomas, Thomas.Lockhart@jpl.nasa.gov 1997-07-10. |
754 | | * Allocate space for output in all cases. |
755 | | * XXX - thomas 1997-07-10 |
756 | | */ |
757 | | Datum |
758 | | textcat(PG_FUNCTION_ARGS) |
759 | 0 | { |
760 | 0 | text *t1 = PG_GETARG_TEXT_PP(0); |
761 | 0 | text *t2 = PG_GETARG_TEXT_PP(1); |
762 | |
|
763 | 0 | PG_RETURN_TEXT_P(text_catenate(t1, t2)); |
764 | 0 | } |
765 | | |
766 | | /* |
767 | | * text_catenate |
768 | | * Guts of textcat(), broken out so it can be used by other functions |
769 | | * |
770 | | * Arguments can be in short-header form, but not compressed or out-of-line |
771 | | */ |
772 | | static text * |
773 | | text_catenate(text *t1, text *t2) |
774 | 0 | { |
775 | 0 | text *result; |
776 | 0 | int len1, |
777 | 0 | len2, |
778 | 0 | len; |
779 | 0 | char *ptr; |
780 | |
|
781 | 0 | len1 = VARSIZE_ANY_EXHDR(t1); |
782 | 0 | len2 = VARSIZE_ANY_EXHDR(t2); |
783 | | |
784 | | /* paranoia ... probably should throw error instead? */ |
785 | 0 | if (len1 < 0) |
786 | 0 | len1 = 0; |
787 | 0 | if (len2 < 0) |
788 | 0 | len2 = 0; |
789 | |
|
790 | 0 | len = len1 + len2 + VARHDRSZ; |
791 | 0 | result = (text *) palloc(len); |
792 | | |
793 | | /* Set size of result string... */ |
794 | 0 | SET_VARSIZE(result, len); |
795 | | |
796 | | /* Fill data field of result string... */ |
797 | 0 | ptr = VARDATA(result); |
798 | 0 | if (len1 > 0) |
799 | 0 | memcpy(ptr, VARDATA_ANY(t1), len1); |
800 | 0 | if (len2 > 0) |
801 | 0 | memcpy(ptr + len1, VARDATA_ANY(t2), len2); |
802 | |
|
803 | 0 | return result; |
804 | 0 | } |
805 | | |
/*
 * charlen_to_bytelen()
 *	Compute the number of bytes occupied by n characters starting at *p
 *
 * When the database encoding's maximum character length is 1 the answer is
 * n itself; otherwise the string is walked one character at a time using
 * pg_mblen().
 *
 * It is caller's responsibility that there actually are n characters;
 * the string need not be null-terminated.
 */
static int
charlen_to_bytelen(const char *p, int n)
{
	const char *cur = p;

	/* Optimization for single-byte encodings */
	if (pg_database_encoding_max_length() == 1)
		return n;

	while (n > 0)
	{
		cur += pg_mblen(cur);
		n--;
	}

	return cur - p;
}
831 | | |
832 | | /* |
833 | | * text_substr() |
834 | | * Return a substring starting at the specified position. |
835 | | * - thomas 1997-12-31 |
836 | | * |
837 | | * Input: |
838 | | * - string |
839 | | * - starting position (is one-based) |
840 | | * - string length |
841 | | * |
842 | | * If the starting position is zero or less, then return from the start of the string |
843 | | * adjusting the length to be consistent with the "negative start" per SQL. |
844 | | * If the length is less than zero, return the remaining string. |
845 | | * |
846 | | * Added multibyte support. |
847 | | * - Tatsuo Ishii 1998-4-21 |
848 | | * Changed behavior if starting position is less than one to conform to SQL behavior. |
849 | | * Formerly returned the entire string; now returns a portion. |
850 | | * - Thomas Lockhart 1998-12-10 |
851 | | * Now uses faster TOAST-slicing interface |
852 | | * - John Gray 2002-02-22 |
853 | | * Remove "#ifdef MULTIBYTE" and test for encoding_max_length instead. Change |
854 | | * behaviors conflicting with SQL to meet SQL (if E = S + L < S throw |
855 | | * error; if E < 1, return '', not entire string). Fixed MB related bug when |
856 | | * S > LC and < LC + 4 sometimes garbage characters are returned. |
857 | | * - Joe Conway 2002-08-10 |
858 | | */ |
859 | | Datum |
860 | | text_substr(PG_FUNCTION_ARGS) |
861 | 0 | { |
862 | 0 | PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0), |
863 | 0 | PG_GETARG_INT32(1), |
864 | 0 | PG_GETARG_INT32(2), |
865 | 0 | false)); |
866 | 0 | } |
867 | | |
868 | | /* |
869 | | * text_substr_no_len - |
870 | | * Wrapper to avoid opr_sanity failure due to |
871 | | * one function accepting a different number of args. |
872 | | */ |
873 | | Datum |
874 | | text_substr_no_len(PG_FUNCTION_ARGS) |
875 | 0 | { |
876 | 0 | PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0), |
877 | 0 | PG_GETARG_INT32(1), |
878 | 0 | -1, true)); |
879 | 0 | } |
880 | | |
881 | | /* |
882 | | * text_substring - |
883 | | * Does the real work for text_substr() and text_substr_no_len() |
884 | | * |
885 | | * This is broken out so it can be called directly by other string processing |
886 | | * functions. Note that the argument is passed as a Datum, to indicate that |
887 | | * it may still be in compressed/toasted form. We can avoid detoasting all |
888 | | * of it in some cases. |
889 | | * |
890 | | * The result is always a freshly palloc'd datum. |
891 | | */ |
892 | | static text * |
893 | | text_substring(Datum str, int32 start, int32 length, bool length_not_specified) |
894 | 0 | { |
895 | 0 | int32 eml = pg_database_encoding_max_length(); |
896 | 0 | int32 S = start; /* start position */ |
897 | 0 | int32 S1; /* adjusted start position */ |
898 | 0 | int32 L1; /* adjusted substring length */ |
899 | 0 | int32 E; /* end position */ |
900 | | |
901 | | /* |
902 | | * SQL99 says S can be zero or negative (which we don't document), but we |
903 | | * still must fetch from the start of the string. |
904 | | * https://www.postgresql.org/message-id/170905442373.643.11536838320909376197%40wrigleys.postgresql.org |
905 | | */ |
906 | 0 | S1 = Max(S, 1); |
907 | | |
908 | | /* life is easy if the encoding max length is 1 */ |
909 | 0 | if (eml == 1) |
910 | 0 | { |
911 | 0 | if (length_not_specified) /* special case - get length to end of |
912 | | * string */ |
913 | 0 | L1 = -1; |
914 | 0 | else if (length < 0) |
915 | 0 | { |
916 | | /* SQL99 says to throw an error for E < S, i.e., negative length */ |
917 | 0 | ereport(ERROR, |
918 | 0 | (errcode(ERRCODE_SUBSTRING_ERROR), |
919 | 0 | errmsg("negative substring length not allowed"))); |
920 | 0 | L1 = -1; /* silence stupider compilers */ |
921 | 0 | } |
922 | 0 | else if (pg_add_s32_overflow(S, length, &E)) |
923 | 0 | { |
924 | | /* |
925 | | * L could be large enough for S + L to overflow, in which case |
926 | | * the substring must run to end of string. |
927 | | */ |
928 | 0 | L1 = -1; |
929 | 0 | } |
930 | 0 | else |
931 | 0 | { |
932 | | /* |
933 | | * A zero or negative value for the end position can happen if the |
934 | | * start was negative or one. SQL99 says to return a zero-length |
935 | | * string. |
936 | | */ |
937 | 0 | if (E < 1) |
938 | 0 | return cstring_to_text(""); |
939 | | |
940 | 0 | L1 = E - S1; |
941 | 0 | } |
942 | | |
943 | | /* |
944 | | * If the start position is past the end of the string, SQL99 says to |
945 | | * return a zero-length string -- DatumGetTextPSlice() will do that |
946 | | * for us. We need only convert S1 to zero-based starting position. |
947 | | */ |
948 | 0 | return DatumGetTextPSlice(str, S1 - 1, L1); |
949 | 0 | } |
950 | 0 | else if (eml > 1) |
951 | 0 | { |
952 | | /* |
953 | | * When encoding max length is > 1, we can't get LC without |
954 | | * detoasting, so we'll grab a conservatively large slice now and go |
955 | | * back later to do the right thing |
956 | | */ |
957 | 0 | int32 slice_start; |
958 | 0 | int32 slice_size; |
959 | 0 | int32 slice_strlen; |
960 | 0 | text *slice; |
961 | 0 | int32 E1; |
962 | 0 | int32 i; |
963 | 0 | char *p; |
964 | 0 | char *s; |
965 | 0 | text *ret; |
966 | | |
967 | | /* |
968 | | * We need to start at position zero because there is no way to know |
969 | | * in advance which byte offset corresponds to the supplied start |
970 | | * position. |
971 | | */ |
972 | 0 | slice_start = 0; |
973 | |
|
974 | 0 | if (length_not_specified) /* special case - get length to end of |
975 | | * string */ |
976 | 0 | slice_size = L1 = -1; |
977 | 0 | else if (length < 0) |
978 | 0 | { |
979 | | /* SQL99 says to throw an error for E < S, i.e., negative length */ |
980 | 0 | ereport(ERROR, |
981 | 0 | (errcode(ERRCODE_SUBSTRING_ERROR), |
982 | 0 | errmsg("negative substring length not allowed"))); |
983 | 0 | slice_size = L1 = -1; /* silence stupider compilers */ |
984 | 0 | } |
985 | 0 | else if (pg_add_s32_overflow(S, length, &E)) |
986 | 0 | { |
987 | | /* |
988 | | * L could be large enough for S + L to overflow, in which case |
989 | | * the substring must run to end of string. |
990 | | */ |
991 | 0 | slice_size = L1 = -1; |
992 | 0 | } |
993 | 0 | else |
994 | 0 | { |
995 | | /* |
996 | | * A zero or negative value for the end position can happen if the |
997 | | * start was negative or one. SQL99 says to return a zero-length |
998 | | * string. |
999 | | */ |
1000 | 0 | if (E < 1) |
1001 | 0 | return cstring_to_text(""); |
1002 | | |
1003 | | /* |
1004 | | * if E is past the end of the string, the tuple toaster will |
1005 | | * truncate the length for us |
1006 | | */ |
1007 | 0 | L1 = E - S1; |
1008 | | |
1009 | | /* |
1010 | | * Total slice size in bytes can't be any longer than the start |
1011 | | * position plus substring length times the encoding max length. |
1012 | | * If that overflows, we can just use -1. |
1013 | | */ |
1014 | 0 | if (pg_mul_s32_overflow(E, eml, &slice_size)) |
1015 | 0 | slice_size = -1; |
1016 | 0 | } |
1017 | | |
1018 | | /* |
1019 | | * If we're working with an untoasted source, no need to do an extra |
1020 | | * copying step. |
1021 | | */ |
1022 | 0 | if (VARATT_IS_COMPRESSED(DatumGetPointer(str)) || |
1023 | 0 | VARATT_IS_EXTERNAL(DatumGetPointer(str))) |
1024 | 0 | slice = DatumGetTextPSlice(str, slice_start, slice_size); |
1025 | 0 | else |
1026 | 0 | slice = (text *) DatumGetPointer(str); |
1027 | | |
1028 | | /* see if we got back an empty string */ |
1029 | 0 | if (VARSIZE_ANY_EXHDR(slice) == 0) |
1030 | 0 | { |
1031 | 0 | if (slice != (text *) DatumGetPointer(str)) |
1032 | 0 | pfree(slice); |
1033 | 0 | return cstring_to_text(""); |
1034 | 0 | } |
1035 | | |
1036 | | /* Now we can get the actual length of the slice in MB characters */ |
1037 | 0 | slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice), |
1038 | 0 | VARSIZE_ANY_EXHDR(slice)); |
1039 | | |
1040 | | /* |
1041 | | * Check that the start position wasn't > slice_strlen. If so, SQL99 |
1042 | | * says to return a zero-length string. |
1043 | | */ |
1044 | 0 | if (S1 > slice_strlen) |
1045 | 0 | { |
1046 | 0 | if (slice != (text *) DatumGetPointer(str)) |
1047 | 0 | pfree(slice); |
1048 | 0 | return cstring_to_text(""); |
1049 | 0 | } |
1050 | | |
1051 | | /* |
1052 | | * Adjust L1 and E1 now that we know the slice string length. Again |
1053 | | * remember that S1 is one based, and slice_start is zero based. |
1054 | | */ |
1055 | 0 | if (L1 > -1) |
1056 | 0 | E1 = Min(S1 + L1, slice_start + 1 + slice_strlen); |
1057 | 0 | else |
1058 | 0 | E1 = slice_start + 1 + slice_strlen; |
1059 | | |
1060 | | /* |
1061 | | * Find the start position in the slice; remember S1 is not zero based |
1062 | | */ |
1063 | 0 | p = VARDATA_ANY(slice); |
1064 | 0 | for (i = 0; i < S1 - 1; i++) |
1065 | 0 | p += pg_mblen(p); |
1066 | | |
1067 | | /* hang onto a pointer to our start position */ |
1068 | 0 | s = p; |
1069 | | |
1070 | | /* |
1071 | | * Count the actual bytes used by the substring of the requested |
1072 | | * length. |
1073 | | */ |
1074 | 0 | for (i = S1; i < E1; i++) |
1075 | 0 | p += pg_mblen(p); |
1076 | |
|
1077 | 0 | ret = (text *) palloc(VARHDRSZ + (p - s)); |
1078 | 0 | SET_VARSIZE(ret, VARHDRSZ + (p - s)); |
1079 | 0 | memcpy(VARDATA(ret), s, (p - s)); |
1080 | |
|
1081 | 0 | if (slice != (text *) DatumGetPointer(str)) |
1082 | 0 | pfree(slice); |
1083 | |
|
1084 | 0 | return ret; |
1085 | 0 | } |
1086 | 0 | else |
1087 | 0 | elog(ERROR, "invalid backend encoding: encoding max length < 1"); |
1088 | | |
1089 | | /* not reached: suppress compiler warning */ |
1090 | 0 | return NULL; |
1091 | 0 | } |
1092 | | |
1093 | | /* |
1094 | | * textoverlay |
1095 | | * Replace specified substring of first string with second |
1096 | | * |
1097 | | * The SQL standard defines OVERLAY() in terms of substring and concatenation. |
1098 | | * This code is a direct implementation of what the standard says. |
1099 | | */ |
1100 | | Datum |
1101 | | textoverlay(PG_FUNCTION_ARGS) |
1102 | 0 | { |
1103 | 0 | text *t1 = PG_GETARG_TEXT_PP(0); |
1104 | 0 | text *t2 = PG_GETARG_TEXT_PP(1); |
1105 | 0 | int sp = PG_GETARG_INT32(2); /* substring start position */ |
1106 | 0 | int sl = PG_GETARG_INT32(3); /* substring length */ |
1107 | |
|
1108 | 0 | PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl)); |
1109 | 0 | } |
1110 | | |
1111 | | Datum |
1112 | | textoverlay_no_len(PG_FUNCTION_ARGS) |
1113 | 0 | { |
1114 | 0 | text *t1 = PG_GETARG_TEXT_PP(0); |
1115 | 0 | text *t2 = PG_GETARG_TEXT_PP(1); |
1116 | 0 | int sp = PG_GETARG_INT32(2); /* substring start position */ |
1117 | 0 | int sl; |
1118 | |
|
1119 | 0 | sl = text_length(PointerGetDatum(t2)); /* defaults to length(t2) */ |
1120 | 0 | PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl)); |
1121 | 0 | } |
1122 | | |
1123 | | static text * |
1124 | | text_overlay(text *t1, text *t2, int sp, int sl) |
1125 | 0 | { |
1126 | 0 | text *result; |
1127 | 0 | text *s1; |
1128 | 0 | text *s2; |
1129 | 0 | int sp_pl_sl; |
1130 | | |
1131 | | /* |
1132 | | * Check for possible integer-overflow cases. For negative sp, throw a |
1133 | | * "substring length" error because that's what should be expected |
1134 | | * according to the spec's definition of OVERLAY(). |
1135 | | */ |
1136 | 0 | if (sp <= 0) |
1137 | 0 | ereport(ERROR, |
1138 | 0 | (errcode(ERRCODE_SUBSTRING_ERROR), |
1139 | 0 | errmsg("negative substring length not allowed"))); |
1140 | 0 | if (pg_add_s32_overflow(sp, sl, &sp_pl_sl)) |
1141 | 0 | ereport(ERROR, |
1142 | 0 | (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), |
1143 | 0 | errmsg("integer out of range"))); |
1144 | | |
1145 | 0 | s1 = text_substring(PointerGetDatum(t1), 1, sp - 1, false); |
1146 | 0 | s2 = text_substring(PointerGetDatum(t1), sp_pl_sl, -1, true); |
1147 | 0 | result = text_catenate(s1, t2); |
1148 | 0 | result = text_catenate(result, s2); |
1149 | |
|
1150 | 0 | return result; |
1151 | 0 | } |
1152 | | |
1153 | | /* |
1154 | | * textpos - |
1155 | | * Return the position of the specified substring. |
1156 | | * Implements the SQL POSITION() function. |
1157 | | * Ref: A Guide To The SQL Standard, Date & Darwen, 1997 |
1158 | | * - thomas 1997-07-27 |
1159 | | */ |
1160 | | Datum |
1161 | | textpos(PG_FUNCTION_ARGS) |
1162 | 0 | { |
1163 | 0 | text *str = PG_GETARG_TEXT_PP(0); |
1164 | 0 | text *search_str = PG_GETARG_TEXT_PP(1); |
1165 | |
|
1166 | 0 | PG_RETURN_INT32((int32) text_position(str, search_str, PG_GET_COLLATION())); |
1167 | 0 | } |
1168 | | |
1169 | | /* |
1170 | | * text_position - |
1171 | | * Does the real work for textpos() |
1172 | | * |
1173 | | * Inputs: |
1174 | | * t1 - string to be searched |
1175 | | * t2 - pattern to match within t1 |
1176 | | * Result: |
1177 | | * Character index of the first matched char, starting from 1, |
1178 | | * or 0 if no match. |
1179 | | * |
1180 | | * This is broken out so it can be called directly by other string processing |
1181 | | * functions. |
1182 | | */ |
1183 | | static int |
1184 | | text_position(text *t1, text *t2, Oid collid) |
1185 | 0 | { |
1186 | 0 | TextPositionState state; |
1187 | 0 | int result; |
1188 | |
|
1189 | 0 | check_collation_set(collid); |
1190 | | |
1191 | | /* Empty needle always matches at position 1 */ |
1192 | 0 | if (VARSIZE_ANY_EXHDR(t2) < 1) |
1193 | 0 | return 1; |
1194 | | |
1195 | | /* Otherwise, can't match if haystack is shorter than needle */ |
1196 | 0 | if (VARSIZE_ANY_EXHDR(t1) < VARSIZE_ANY_EXHDR(t2) && |
1197 | 0 | pg_newlocale_from_collation(collid)->deterministic) |
1198 | 0 | return 0; |
1199 | | |
1200 | 0 | text_position_setup(t1, t2, collid, &state); |
1201 | | /* don't need greedy mode here */ |
1202 | 0 | state.greedy = false; |
1203 | |
|
1204 | 0 | if (!text_position_next(&state)) |
1205 | 0 | result = 0; |
1206 | 0 | else |
1207 | 0 | result = text_position_get_match_pos(&state); |
1208 | 0 | text_position_cleanup(&state); |
1209 | 0 | return result; |
1210 | 0 | } |
1211 | | |
1212 | | |
1213 | | /* |
1214 | | * text_position_setup, text_position_next, text_position_cleanup - |
1215 | | * Component steps of text_position() |
1216 | | * |
1217 | | * These are broken out so that a string can be efficiently searched for |
1218 | | * multiple occurrences of the same pattern. text_position_next may be |
1219 | | * called multiple times, and it advances to the next match on each call. |
1220 | | * text_position_get_match_ptr() and text_position_get_match_pos() return |
1221 | | * a pointer or 1-based character position of the last match, respectively. |
1222 | | * |
1223 | | * The "state" variable is normally just a local variable in the caller. |
1224 | | * |
1225 | | * NOTE: text_position_next skips over the matched portion. For example, |
1226 | | * searching for "xx" in "xxx" returns only one match, not two. |
1227 | | */ |
1228 | | |
1229 | | static void |
1230 | | text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state) |
1231 | 0 | { |
1232 | 0 | int len1 = VARSIZE_ANY_EXHDR(t1); |
1233 | 0 | int len2 = VARSIZE_ANY_EXHDR(t2); |
1234 | |
|
1235 | 0 | check_collation_set(collid); |
1236 | |
|
1237 | 0 | state->locale = pg_newlocale_from_collation(collid); |
1238 | | |
1239 | | /* |
1240 | | * Most callers need greedy mode, but some might want to unset this to |
1241 | | * optimize. |
1242 | | */ |
1243 | 0 | state->greedy = true; |
1244 | |
|
1245 | 0 | Assert(len2 > 0); |
1246 | | |
1247 | | /* |
1248 | | * Even with a multi-byte encoding, we perform the search using the raw |
1249 | | * byte sequence, ignoring multibyte issues. For UTF-8, that works fine, |
1250 | | * because in UTF-8 the byte sequence of one character cannot contain |
1251 | | * another character. For other multi-byte encodings, we do the search |
1252 | | * initially as a simple byte search, ignoring multibyte issues, but |
1253 | | * verify afterwards that the match we found is at a character boundary, |
1254 | | * and continue the search if it was a false match. |
1255 | | */ |
1256 | 0 | if (pg_database_encoding_max_length() == 1) |
1257 | 0 | state->is_multibyte_char_in_char = false; |
1258 | 0 | else if (GetDatabaseEncoding() == PG_UTF8) |
1259 | 0 | state->is_multibyte_char_in_char = false; |
1260 | 0 | else |
1261 | 0 | state->is_multibyte_char_in_char = true; |
1262 | |
|
1263 | 0 | state->str1 = VARDATA_ANY(t1); |
1264 | 0 | state->str2 = VARDATA_ANY(t2); |
1265 | 0 | state->len1 = len1; |
1266 | 0 | state->len2 = len2; |
1267 | 0 | state->last_match = NULL; |
1268 | 0 | state->refpoint = state->str1; |
1269 | 0 | state->refpos = 0; |
1270 | | |
1271 | | /* |
1272 | | * Prepare the skip table for Boyer-Moore-Horspool searching. In these |
1273 | | * notes we use the terminology that the "haystack" is the string to be |
1274 | | * searched (t1) and the "needle" is the pattern being sought (t2). |
1275 | | * |
1276 | | * If the needle is empty or bigger than the haystack then there is no |
1277 | | * point in wasting cycles initializing the table. We also choose not to |
1278 | | * use B-M-H for needles of length 1, since the skip table can't possibly |
1279 | | * save anything in that case. |
1280 | | * |
1281 | | * (With nondeterministic collations, the search is already |
1282 | | * multibyte-aware, so we don't need this.) |
1283 | | */ |
1284 | 0 | if (len1 >= len2 && len2 > 1 && state->locale->deterministic) |
1285 | 0 | { |
1286 | 0 | int searchlength = len1 - len2; |
1287 | 0 | int skiptablemask; |
1288 | 0 | int last; |
1289 | 0 | int i; |
1290 | 0 | const char *str2 = state->str2; |
1291 | | |
1292 | | /* |
1293 | | * First we must determine how much of the skip table to use. The |
1294 | | * declaration of TextPositionState allows up to 256 elements, but for |
1295 | | * short search problems we don't really want to have to initialize so |
1296 | | * many elements --- it would take too long in comparison to the |
1297 | | * actual search time. So we choose a useful skip table size based on |
1298 | | * the haystack length minus the needle length. The closer the needle |
1299 | | * length is to the haystack length the less useful skipping becomes. |
1300 | | * |
1301 | | * Note: since we use bit-masking to select table elements, the skip |
1302 | | * table size MUST be a power of 2, and so the mask must be 2^N-1. |
1303 | | */ |
1304 | 0 | if (searchlength < 16) |
1305 | 0 | skiptablemask = 3; |
1306 | 0 | else if (searchlength < 64) |
1307 | 0 | skiptablemask = 7; |
1308 | 0 | else if (searchlength < 128) |
1309 | 0 | skiptablemask = 15; |
1310 | 0 | else if (searchlength < 512) |
1311 | 0 | skiptablemask = 31; |
1312 | 0 | else if (searchlength < 2048) |
1313 | 0 | skiptablemask = 63; |
1314 | 0 | else if (searchlength < 4096) |
1315 | 0 | skiptablemask = 127; |
1316 | 0 | else |
1317 | 0 | skiptablemask = 255; |
1318 | 0 | state->skiptablemask = skiptablemask; |
1319 | | |
1320 | | /* |
1321 | | * Initialize the skip table. We set all elements to the needle |
1322 | | * length, since this is the correct skip distance for any character |
1323 | | * not found in the needle. |
1324 | | */ |
1325 | 0 | for (i = 0; i <= skiptablemask; i++) |
1326 | 0 | state->skiptable[i] = len2; |
1327 | | |
1328 | | /* |
1329 | | * Now examine the needle. For each character except the last one, |
1330 | | * set the corresponding table element to the appropriate skip |
1331 | | * distance. Note that when two characters share the same skip table |
1332 | | * entry, the one later in the needle must determine the skip |
1333 | | * distance. |
1334 | | */ |
1335 | 0 | last = len2 - 1; |
1336 | |
|
1337 | 0 | for (i = 0; i < last; i++) |
1338 | 0 | state->skiptable[(unsigned char) str2[i] & skiptablemask] = last - i; |
1339 | 0 | } |
1340 | 0 | } |
1341 | | |
1342 | | /* |
1343 | | * Advance to the next match, starting from the end of the previous match |
1344 | | * (or the beginning of the string, on first call). Returns true if a match |
1345 | | * is found. |
1346 | | * |
1347 | | * Note that this refuses to match an empty-string needle. Most callers |
1348 | | * will have handled that case specially and we'll never see it here. |
1349 | | */ |
1350 | | static bool |
1351 | | text_position_next(TextPositionState *state) |
1352 | 0 | { |
1353 | 0 | int needle_len = state->len2; |
1354 | 0 | char *start_ptr; |
1355 | 0 | char *matchptr; |
1356 | |
|
1357 | 0 | if (needle_len <= 0) |
1358 | 0 | return false; /* result for empty pattern */ |
1359 | | |
1360 | | /* Start from the point right after the previous match. */ |
1361 | 0 | if (state->last_match) |
1362 | 0 | start_ptr = state->last_match + state->last_match_len; |
1363 | 0 | else |
1364 | 0 | start_ptr = state->str1; |
1365 | |
|
1366 | 0 | retry: |
1367 | 0 | matchptr = text_position_next_internal(start_ptr, state); |
1368 | |
|
1369 | 0 | if (!matchptr) |
1370 | 0 | return false; |
1371 | | |
1372 | | /* |
1373 | | * Found a match for the byte sequence. If this is a multibyte encoding, |
1374 | | * where one character's byte sequence can appear inside a longer |
1375 | | * multi-byte character, we need to verify that the match was at a |
1376 | | * character boundary, not in the middle of a multi-byte character. |
1377 | | */ |
1378 | 0 | if (state->is_multibyte_char_in_char && state->locale->deterministic) |
1379 | 0 | { |
1380 | | /* Walk one character at a time, until we reach the match. */ |
1381 | | |
1382 | | /* the search should never move backwards. */ |
1383 | 0 | Assert(state->refpoint <= matchptr); |
1384 | |
|
1385 | 0 | while (state->refpoint < matchptr) |
1386 | 0 | { |
1387 | | /* step to next character. */ |
1388 | 0 | state->refpoint += pg_mblen(state->refpoint); |
1389 | 0 | state->refpos++; |
1390 | | |
1391 | | /* |
1392 | | * If we stepped over the match's start position, then it was a |
1393 | | * false positive, where the byte sequence appeared in the middle |
1394 | | * of a multi-byte character. Skip it, and continue the search at |
1395 | | * the next character boundary. |
1396 | | */ |
1397 | 0 | if (state->refpoint > matchptr) |
1398 | 0 | { |
1399 | 0 | start_ptr = state->refpoint; |
1400 | 0 | goto retry; |
1401 | 0 | } |
1402 | 0 | } |
1403 | 0 | } |
1404 | | |
1405 | 0 | state->last_match = matchptr; |
1406 | 0 | state->last_match_len = state->last_match_len_tmp; |
1407 | 0 | return true; |
1408 | 0 | } |
1409 | | |
1410 | | /* |
1411 | | * Subroutine of text_position_next(). This searches for the raw byte |
1412 | | * sequence, ignoring any multi-byte encoding issues. Returns the first |
1413 | | * match starting at 'start_ptr', or NULL if no match is found. |
1414 | | */ |
1415 | | static char * |
1416 | | text_position_next_internal(char *start_ptr, TextPositionState *state) |
1417 | 0 | { |
1418 | 0 | int haystack_len = state->len1; |
1419 | 0 | int needle_len = state->len2; |
1420 | 0 | int skiptablemask = state->skiptablemask; |
1421 | 0 | const char *haystack = state->str1; |
1422 | 0 | const char *needle = state->str2; |
1423 | 0 | const char *haystack_end = &haystack[haystack_len]; |
1424 | 0 | const char *hptr; |
1425 | |
|
1426 | 0 | Assert(start_ptr >= haystack && start_ptr <= haystack_end); |
1427 | |
|
1428 | 0 | state->last_match_len_tmp = needle_len; |
1429 | |
|
1430 | 0 | if (!state->locale->deterministic) |
1431 | 0 | { |
1432 | | /* |
1433 | | * With a nondeterministic collation, we have to use an unoptimized |
1434 | | * route. We walk through the haystack and see if at each position |
1435 | | * there is a substring of the remaining string that is equal to the |
1436 | | * needle under the given collation. |
1437 | | * |
1438 | | * Note, the found substring could have a different length than the |
1439 | | * needle, including being empty. Callers that want to skip over the |
1440 | | * found string need to read the length of the found substring from |
1441 | | * last_match_len rather than just using the length of their needle. |
1442 | | * |
1443 | | * Most callers will require "greedy" semantics, meaning that we need |
1444 | | * to find the longest such substring, not the shortest. For callers |
1445 | | * that don't need greedy semantics, we can finish on the first match. |
1446 | | */ |
1447 | 0 | const char *result_hptr = NULL; |
1448 | |
|
1449 | 0 | hptr = start_ptr; |
1450 | 0 | while (hptr < haystack_end) |
1451 | 0 | { |
1452 | | /* |
1453 | | * First check the common case that there is a match in the |
1454 | | * haystack of exactly the length of the needle. |
1455 | | */ |
1456 | 0 | if (!state->greedy && |
1457 | 0 | haystack_end - hptr >= needle_len && |
1458 | 0 | pg_strncoll(hptr, needle_len, needle, needle_len, state->locale) == 0) |
1459 | 0 | return (char *) hptr; |
1460 | | |
1461 | | /* |
1462 | | * Else check if any of the possible substrings starting at hptr |
1463 | | * are equal to the needle. |
1464 | | */ |
1465 | 0 | for (const char *test_end = hptr; test_end < haystack_end; test_end += pg_mblen(test_end)) |
1466 | 0 | { |
1467 | 0 | if (pg_strncoll(hptr, (test_end - hptr), needle, needle_len, state->locale) == 0) |
1468 | 0 | { |
1469 | 0 | state->last_match_len_tmp = (test_end - hptr); |
1470 | 0 | result_hptr = hptr; |
1471 | 0 | if (!state->greedy) |
1472 | 0 | break; |
1473 | 0 | } |
1474 | 0 | } |
1475 | 0 | if (result_hptr) |
1476 | 0 | break; |
1477 | | |
1478 | 0 | hptr += pg_mblen(hptr); |
1479 | 0 | } |
1480 | | |
1481 | 0 | return (char *) result_hptr; |
1482 | 0 | } |
1483 | 0 | else if (needle_len == 1) |
1484 | 0 | { |
1485 | | /* No point in using B-M-H for a one-character needle */ |
1486 | 0 | char nchar = *needle; |
1487 | |
|
1488 | 0 | hptr = start_ptr; |
1489 | 0 | while (hptr < haystack_end) |
1490 | 0 | { |
1491 | 0 | if (*hptr == nchar) |
1492 | 0 | return (char *) hptr; |
1493 | 0 | hptr++; |
1494 | 0 | } |
1495 | 0 | } |
1496 | 0 | else |
1497 | 0 | { |
1498 | 0 | const char *needle_last = &needle[needle_len - 1]; |
1499 | | |
1500 | | /* Start at startpos plus the length of the needle */ |
1501 | 0 | hptr = start_ptr + needle_len - 1; |
1502 | 0 | while (hptr < haystack_end) |
1503 | 0 | { |
1504 | | /* Match the needle scanning *backward* */ |
1505 | 0 | const char *nptr; |
1506 | 0 | const char *p; |
1507 | |
|
1508 | 0 | nptr = needle_last; |
1509 | 0 | p = hptr; |
1510 | 0 | while (*nptr == *p) |
1511 | 0 | { |
1512 | | /* Matched it all? If so, return 1-based position */ |
1513 | 0 | if (nptr == needle) |
1514 | 0 | return (char *) p; |
1515 | 0 | nptr--, p--; |
1516 | 0 | } |
1517 | | |
1518 | | /* |
1519 | | * No match, so use the haystack char at hptr to decide how far to |
1520 | | * advance. If the needle had any occurrence of that character |
1521 | | * (or more precisely, one sharing the same skiptable entry) |
1522 | | * before its last character, then we advance far enough to align |
1523 | | * the last such needle character with that haystack position. |
1524 | | * Otherwise we can advance by the whole needle length. |
1525 | | */ |
1526 | 0 | hptr += state->skiptable[(unsigned char) *hptr & skiptablemask]; |
1527 | 0 | } |
1528 | 0 | } |
1529 | | |
1530 | 0 | return 0; /* not found */ |
1531 | 0 | } |
1532 | | |
1533 | | /* |
1534 | | * Return a pointer to the current match. |
1535 | | * |
1536 | | * The returned pointer points into the original haystack string. |
1537 | | */ |
1538 | | static char * |
1539 | | text_position_get_match_ptr(TextPositionState *state) |
1540 | 0 | { |
1541 | 0 | return state->last_match; |
1542 | 0 | } |
1543 | | |
1544 | | /* |
1545 | | * Return the offset of the current match. |
1546 | | * |
1547 | | * The offset is in characters, 1-based. |
1548 | | */ |
1549 | | static int |
1550 | | text_position_get_match_pos(TextPositionState *state) |
1551 | 0 | { |
1552 | | /* Convert the byte position to char position. */ |
1553 | 0 | state->refpos += pg_mbstrlen_with_len(state->refpoint, |
1554 | 0 | state->last_match - state->refpoint); |
1555 | 0 | state->refpoint = state->last_match; |
1556 | 0 | return state->refpos + 1; |
1557 | 0 | } |
1558 | | |
1559 | | /* |
1560 | | * Reset search state to the initial state installed by text_position_setup. |
1561 | | * |
1562 | | * The next call to text_position_next will search from the beginning |
1563 | | * of the string. |
1564 | | */ |
1565 | | static void |
1566 | | text_position_reset(TextPositionState *state) |
1567 | 0 | { |
1568 | 0 | state->last_match = NULL; |
1569 | 0 | state->refpoint = state->str1; |
1570 | 0 | state->refpos = 0; |
1571 | 0 | } |
1572 | | |
1573 | | static void |
1574 | | text_position_cleanup(TextPositionState *state) |
1575 | 0 | { |
1576 | | /* no cleanup needed */ |
1577 | 0 | } |
1578 | | |
1579 | | |
1580 | | static void |
1581 | | check_collation_set(Oid collid) |
1582 | 0 | { |
1583 | 0 | if (!OidIsValid(collid)) |
1584 | 0 | { |
1585 | | /* |
1586 | | * This typically means that the parser could not resolve a conflict |
1587 | | * of implicit collations, so report it that way. |
1588 | | */ |
1589 | 0 | ereport(ERROR, |
1590 | 0 | (errcode(ERRCODE_INDETERMINATE_COLLATION), |
1591 | 0 | errmsg("could not determine which collation to use for string comparison"), |
1592 | 0 | errhint("Use the COLLATE clause to set the collation explicitly."))); |
1593 | 0 | } |
1594 | 0 | } |
1595 | | |
1596 | | /* |
1597 | | * varstr_cmp() |
1598 | | * |
1599 | | * Comparison function for text strings with given lengths, using the |
1600 | | * appropriate locale. Returns an integer less than, equal to, or greater than |
1601 | | * zero, indicating whether arg1 is less than, equal to, or greater than arg2. |
1602 | | * |
1603 | | * Note: many functions that depend on this are marked leakproof; therefore, |
1604 | | * avoid reporting the actual contents of the input when throwing errors. |
1605 | | * All errors herein should be things that can't happen except on corrupt |
1606 | | * data, anyway; otherwise we will have trouble with indexing strings that |
1607 | | * would cause them. |
1608 | | */ |
1609 | | int |
1610 | | varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid) |
1611 | 0 | { |
1612 | 0 | int result; |
1613 | 0 | pg_locale_t mylocale; |
1614 | |
|
1615 | 0 | check_collation_set(collid); |
1616 | |
|
1617 | 0 | mylocale = pg_newlocale_from_collation(collid); |
1618 | |
|
1619 | 0 | if (mylocale->collate_is_c) |
1620 | 0 | { |
1621 | 0 | result = memcmp(arg1, arg2, Min(len1, len2)); |
1622 | 0 | if ((result == 0) && (len1 != len2)) |
1623 | 0 | result = (len1 < len2) ? -1 : 1; |
1624 | 0 | } |
1625 | 0 | else |
1626 | 0 | { |
1627 | | /* |
1628 | | * memcmp() can't tell us which of two unequal strings sorts first, |
1629 | | * but it's a cheap way to tell if they're equal. Testing shows that |
1630 | | * memcmp() followed by strcoll() is only trivially slower than |
1631 | | * strcoll() by itself, so we don't lose much if this doesn't work out |
1632 | | * very often, and if it does - for example, because there are many |
1633 | | * equal strings in the input - then we win big by avoiding expensive |
1634 | | * collation-aware comparisons. |
1635 | | */ |
1636 | 0 | if (len1 == len2 && memcmp(arg1, arg2, len1) == 0) |
1637 | 0 | return 0; |
1638 | | |
1639 | 0 | result = pg_strncoll(arg1, len1, arg2, len2, mylocale); |
1640 | | |
1641 | | /* Break tie if necessary. */ |
1642 | 0 | if (result == 0 && mylocale->deterministic) |
1643 | 0 | { |
1644 | 0 | result = memcmp(arg1, arg2, Min(len1, len2)); |
1645 | 0 | if ((result == 0) && (len1 != len2)) |
1646 | 0 | result = (len1 < len2) ? -1 : 1; |
1647 | 0 | } |
1648 | 0 | } |
1649 | | |
1650 | 0 | return result; |
1651 | 0 | } |
1652 | | |
1653 | | /* text_cmp() |
1654 | | * Internal comparison function for text strings. |
1655 | | * Returns -1, 0 or 1 |
1656 | | */ |
1657 | | static int |
1658 | | text_cmp(text *arg1, text *arg2, Oid collid) |
1659 | 0 | { |
1660 | 0 | char *a1p, |
1661 | 0 | *a2p; |
1662 | 0 | int len1, |
1663 | 0 | len2; |
1664 | |
|
1665 | 0 | a1p = VARDATA_ANY(arg1); |
1666 | 0 | a2p = VARDATA_ANY(arg2); |
1667 | |
|
1668 | 0 | len1 = VARSIZE_ANY_EXHDR(arg1); |
1669 | 0 | len2 = VARSIZE_ANY_EXHDR(arg2); |
1670 | |
|
1671 | 0 | return varstr_cmp(a1p, len1, a2p, len2, collid); |
1672 | 0 | } |
1673 | | |
1674 | | /* |
1675 | | * Comparison functions for text strings. |
1676 | | * |
1677 | | * Note: btree indexes need these routines not to leak memory; therefore, |
1678 | | * be careful to free working copies of toasted datums. Most places don't |
1679 | | * need to be so careful. |
1680 | | */ |
1681 | | |
1682 | | Datum |
1683 | | texteq(PG_FUNCTION_ARGS) |
1684 | 0 | { |
1685 | 0 | Oid collid = PG_GET_COLLATION(); |
1686 | 0 | pg_locale_t mylocale = 0; |
1687 | 0 | bool result; |
1688 | |
|
1689 | 0 | check_collation_set(collid); |
1690 | |
|
1691 | 0 | mylocale = pg_newlocale_from_collation(collid); |
1692 | |
|
1693 | 0 | if (mylocale->deterministic) |
1694 | 0 | { |
1695 | 0 | Datum arg1 = PG_GETARG_DATUM(0); |
1696 | 0 | Datum arg2 = PG_GETARG_DATUM(1); |
1697 | 0 | Size len1, |
1698 | 0 | len2; |
1699 | | |
1700 | | /* |
1701 | | * Since we only care about equality or not-equality, we can avoid all |
1702 | | * the expense of strcoll() here, and just do bitwise comparison. In |
1703 | | * fact, we don't even have to do a bitwise comparison if we can show |
1704 | | * the lengths of the strings are unequal; which might save us from |
1705 | | * having to detoast one or both values. |
1706 | | */ |
1707 | 0 | len1 = toast_raw_datum_size(arg1); |
1708 | 0 | len2 = toast_raw_datum_size(arg2); |
1709 | 0 | if (len1 != len2) |
1710 | 0 | result = false; |
1711 | 0 | else |
1712 | 0 | { |
1713 | 0 | text *targ1 = DatumGetTextPP(arg1); |
1714 | 0 | text *targ2 = DatumGetTextPP(arg2); |
1715 | |
|
1716 | 0 | result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2), |
1717 | 0 | len1 - VARHDRSZ) == 0); |
1718 | |
|
1719 | 0 | PG_FREE_IF_COPY(targ1, 0); |
1720 | 0 | PG_FREE_IF_COPY(targ2, 1); |
1721 | 0 | } |
1722 | 0 | } |
1723 | 0 | else |
1724 | 0 | { |
1725 | 0 | text *arg1 = PG_GETARG_TEXT_PP(0); |
1726 | 0 | text *arg2 = PG_GETARG_TEXT_PP(1); |
1727 | |
|
1728 | 0 | result = (text_cmp(arg1, arg2, collid) == 0); |
1729 | |
|
1730 | 0 | PG_FREE_IF_COPY(arg1, 0); |
1731 | 0 | PG_FREE_IF_COPY(arg2, 1); |
1732 | 0 | } |
1733 | |
|
1734 | 0 | PG_RETURN_BOOL(result); |
1735 | 0 | } |
1736 | | |
1737 | | Datum |
1738 | | textne(PG_FUNCTION_ARGS) |
1739 | 0 | { |
1740 | 0 | Oid collid = PG_GET_COLLATION(); |
1741 | 0 | pg_locale_t mylocale; |
1742 | 0 | bool result; |
1743 | |
|
1744 | 0 | check_collation_set(collid); |
1745 | |
|
1746 | 0 | mylocale = pg_newlocale_from_collation(collid); |
1747 | |
|
1748 | 0 | if (mylocale->deterministic) |
1749 | 0 | { |
1750 | 0 | Datum arg1 = PG_GETARG_DATUM(0); |
1751 | 0 | Datum arg2 = PG_GETARG_DATUM(1); |
1752 | 0 | Size len1, |
1753 | 0 | len2; |
1754 | | |
1755 | | /* See comment in texteq() */ |
1756 | 0 | len1 = toast_raw_datum_size(arg1); |
1757 | 0 | len2 = toast_raw_datum_size(arg2); |
1758 | 0 | if (len1 != len2) |
1759 | 0 | result = true; |
1760 | 0 | else |
1761 | 0 | { |
1762 | 0 | text *targ1 = DatumGetTextPP(arg1); |
1763 | 0 | text *targ2 = DatumGetTextPP(arg2); |
1764 | |
|
1765 | 0 | result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2), |
1766 | 0 | len1 - VARHDRSZ) != 0); |
1767 | |
|
1768 | 0 | PG_FREE_IF_COPY(targ1, 0); |
1769 | 0 | PG_FREE_IF_COPY(targ2, 1); |
1770 | 0 | } |
1771 | 0 | } |
1772 | 0 | else |
1773 | 0 | { |
1774 | 0 | text *arg1 = PG_GETARG_TEXT_PP(0); |
1775 | 0 | text *arg2 = PG_GETARG_TEXT_PP(1); |
1776 | |
|
1777 | 0 | result = (text_cmp(arg1, arg2, collid) != 0); |
1778 | |
|
1779 | 0 | PG_FREE_IF_COPY(arg1, 0); |
1780 | 0 | PG_FREE_IF_COPY(arg2, 1); |
1781 | 0 | } |
1782 | |
|
1783 | 0 | PG_RETURN_BOOL(result); |
1784 | 0 | } |
1785 | | |
1786 | | Datum |
1787 | | text_lt(PG_FUNCTION_ARGS) |
1788 | 0 | { |
1789 | 0 | text *arg1 = PG_GETARG_TEXT_PP(0); |
1790 | 0 | text *arg2 = PG_GETARG_TEXT_PP(1); |
1791 | 0 | bool result; |
1792 | |
|
1793 | 0 | result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0); |
1794 | |
|
1795 | 0 | PG_FREE_IF_COPY(arg1, 0); |
1796 | 0 | PG_FREE_IF_COPY(arg2, 1); |
1797 | |
|
1798 | 0 | PG_RETURN_BOOL(result); |
1799 | 0 | } |
1800 | | |
1801 | | Datum |
1802 | | text_le(PG_FUNCTION_ARGS) |
1803 | 0 | { |
1804 | 0 | text *arg1 = PG_GETARG_TEXT_PP(0); |
1805 | 0 | text *arg2 = PG_GETARG_TEXT_PP(1); |
1806 | 0 | bool result; |
1807 | |
|
1808 | 0 | result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) <= 0); |
1809 | |
|
1810 | 0 | PG_FREE_IF_COPY(arg1, 0); |
1811 | 0 | PG_FREE_IF_COPY(arg2, 1); |
1812 | |
|
1813 | 0 | PG_RETURN_BOOL(result); |
1814 | 0 | } |
1815 | | |
1816 | | Datum |
1817 | | text_gt(PG_FUNCTION_ARGS) |
1818 | 0 | { |
1819 | 0 | text *arg1 = PG_GETARG_TEXT_PP(0); |
1820 | 0 | text *arg2 = PG_GETARG_TEXT_PP(1); |
1821 | 0 | bool result; |
1822 | |
|
1823 | 0 | result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0); |
1824 | |
|
1825 | 0 | PG_FREE_IF_COPY(arg1, 0); |
1826 | 0 | PG_FREE_IF_COPY(arg2, 1); |
1827 | |
|
1828 | 0 | PG_RETURN_BOOL(result); |
1829 | 0 | } |
1830 | | |
1831 | | Datum |
1832 | | text_ge(PG_FUNCTION_ARGS) |
1833 | 0 | { |
1834 | 0 | text *arg1 = PG_GETARG_TEXT_PP(0); |
1835 | 0 | text *arg2 = PG_GETARG_TEXT_PP(1); |
1836 | 0 | bool result; |
1837 | |
|
1838 | 0 | result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) >= 0); |
1839 | |
|
1840 | 0 | PG_FREE_IF_COPY(arg1, 0); |
1841 | 0 | PG_FREE_IF_COPY(arg2, 1); |
1842 | |
|
1843 | 0 | PG_RETURN_BOOL(result); |
1844 | 0 | } |
1845 | | |
1846 | | Datum |
1847 | | text_starts_with(PG_FUNCTION_ARGS) |
1848 | 0 | { |
1849 | 0 | Datum arg1 = PG_GETARG_DATUM(0); |
1850 | 0 | Datum arg2 = PG_GETARG_DATUM(1); |
1851 | 0 | Oid collid = PG_GET_COLLATION(); |
1852 | 0 | pg_locale_t mylocale; |
1853 | 0 | bool result; |
1854 | 0 | Size len1, |
1855 | 0 | len2; |
1856 | |
|
1857 | 0 | check_collation_set(collid); |
1858 | |
|
1859 | 0 | mylocale = pg_newlocale_from_collation(collid); |
1860 | |
|
1861 | 0 | if (!mylocale->deterministic) |
1862 | 0 | ereport(ERROR, |
1863 | 0 | (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), |
1864 | 0 | errmsg("nondeterministic collations are not supported for substring searches"))); |
1865 | | |
1866 | 0 | len1 = toast_raw_datum_size(arg1); |
1867 | 0 | len2 = toast_raw_datum_size(arg2); |
1868 | 0 | if (len2 > len1) |
1869 | 0 | result = false; |
1870 | 0 | else |
1871 | 0 | { |
1872 | 0 | text *targ1 = text_substring(arg1, 1, len2, false); |
1873 | 0 | text *targ2 = DatumGetTextPP(arg2); |
1874 | |
|
1875 | 0 | result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2), |
1876 | 0 | VARSIZE_ANY_EXHDR(targ2)) == 0); |
1877 | |
|
1878 | 0 | PG_FREE_IF_COPY(targ1, 0); |
1879 | 0 | PG_FREE_IF_COPY(targ2, 1); |
1880 | 0 | } |
1881 | |
|
1882 | 0 | PG_RETURN_BOOL(result); |
1883 | 0 | } |
1884 | | |
1885 | | Datum |
1886 | | bttextcmp(PG_FUNCTION_ARGS) |
1887 | 0 | { |
1888 | 0 | text *arg1 = PG_GETARG_TEXT_PP(0); |
1889 | 0 | text *arg2 = PG_GETARG_TEXT_PP(1); |
1890 | 0 | int32 result; |
1891 | |
|
1892 | 0 | result = text_cmp(arg1, arg2, PG_GET_COLLATION()); |
1893 | |
|
1894 | 0 | PG_FREE_IF_COPY(arg1, 0); |
1895 | 0 | PG_FREE_IF_COPY(arg2, 1); |
1896 | |
|
1897 | 0 | PG_RETURN_INT32(result); |
1898 | 0 | } |
1899 | | |
1900 | | Datum |
1901 | | bttextsortsupport(PG_FUNCTION_ARGS) |
1902 | 0 | { |
1903 | 0 | SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0); |
1904 | 0 | Oid collid = ssup->ssup_collation; |
1905 | 0 | MemoryContext oldcontext; |
1906 | |
|
1907 | 0 | oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt); |
1908 | | |
1909 | | /* Use generic string SortSupport */ |
1910 | 0 | varstr_sortsupport(ssup, TEXTOID, collid); |
1911 | |
|
1912 | 0 | MemoryContextSwitchTo(oldcontext); |
1913 | |
|
1914 | 0 | PG_RETURN_VOID(); |
1915 | 0 | } |
1916 | | |
1917 | | /* |
1918 | | * Generic sortsupport interface for character type's operator classes. |
1919 | | * Includes locale support, and support for BpChar semantics (i.e. removing |
1920 | | * trailing spaces before comparison). |
1921 | | * |
1922 | | * Relies on the assumption that text, VarChar, BpChar, and bytea all have the |
1923 | | * same representation. Callers that always use the C collation (e.g. |
1924 | | * non-collatable type callers like bytea) may have NUL bytes in their strings; |
1925 | | * this will not work with any other collation, though. |
1926 | | */ |
1927 | | void |
1928 | | varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid) |
1929 | 0 | { |
1930 | 0 | bool abbreviate = ssup->abbreviate; |
1931 | 0 | bool collate_c = false; |
1932 | 0 | VarStringSortSupport *sss; |
1933 | 0 | pg_locale_t locale; |
1934 | |
|
1935 | 0 | check_collation_set(collid); |
1936 | |
|
1937 | 0 | locale = pg_newlocale_from_collation(collid); |
1938 | | |
1939 | | /* |
1940 | | * If possible, set ssup->comparator to a function which can be used to |
1941 | | * directly compare two datums. If we can do this, we'll avoid the |
1942 | | * overhead of a trip through the fmgr layer for every comparison, which |
1943 | | * can be substantial. |
1944 | | * |
1945 | | * Most typically, we'll set the comparator to varlenafastcmp_locale, |
1946 | | * which uses strcoll() to perform comparisons. We use that for the |
1947 | | * BpChar case too, but type NAME uses namefastcmp_locale. However, if |
1948 | | * LC_COLLATE = C, we can make things quite a bit faster with |
1949 | | * varstrfastcmp_c, bpcharfastcmp_c, or namefastcmp_c, all of which use |
1950 | | * memcmp() rather than strcoll(). |
1951 | | */ |
1952 | 0 | if (locale->collate_is_c) |
1953 | 0 | { |
1954 | 0 | if (typid == BPCHAROID) |
1955 | 0 | ssup->comparator = bpcharfastcmp_c; |
1956 | 0 | else if (typid == NAMEOID) |
1957 | 0 | { |
1958 | 0 | ssup->comparator = namefastcmp_c; |
1959 | | /* Not supporting abbreviation with type NAME, for now */ |
1960 | 0 | abbreviate = false; |
1961 | 0 | } |
1962 | 0 | else |
1963 | 0 | ssup->comparator = varstrfastcmp_c; |
1964 | |
|
1965 | 0 | collate_c = true; |
1966 | 0 | } |
1967 | 0 | else |
1968 | 0 | { |
1969 | | /* |
1970 | | * We use varlenafastcmp_locale except for type NAME. |
1971 | | */ |
1972 | 0 | if (typid == NAMEOID) |
1973 | 0 | { |
1974 | 0 | ssup->comparator = namefastcmp_locale; |
1975 | | /* Not supporting abbreviation with type NAME, for now */ |
1976 | 0 | abbreviate = false; |
1977 | 0 | } |
1978 | 0 | else |
1979 | 0 | ssup->comparator = varlenafastcmp_locale; |
1980 | | |
1981 | | /* |
1982 | | * Unfortunately, it seems that abbreviation for non-C collations is |
1983 | | * broken on many common platforms; see pg_strxfrm_enabled(). |
1984 | | * |
1985 | | * Even apart from the risk of broken locales, it's possible that |
1986 | | * there are platforms where the use of abbreviated keys should be |
1987 | | * disabled at compile time. Having only 4 byte datums could make |
1988 | | * worst-case performance drastically more likely, for example. |
1989 | | * Moreover, macOS's strxfrm() implementation is known to not |
1990 | | * effectively concentrate a significant amount of entropy from the |
1991 | | * original string in earlier transformed blobs. It's possible that |
1992 | | * other supported platforms are similarly encumbered. So, if we ever |
1993 | | * get past disabling this categorically, we may still want or need to |
1994 | | * disable it for particular platforms. |
1995 | | */ |
1996 | 0 | if (!pg_strxfrm_enabled(locale)) |
1997 | 0 | abbreviate = false; |
1998 | 0 | } |
1999 | | |
2000 | | /* |
2001 | | * If we're using abbreviated keys, or if we're using a locale-aware |
2002 | | * comparison, we need to initialize a VarStringSortSupport object. Both |
2003 | | * cases will make use of the temporary buffers we initialize here for |
2004 | | * scratch space (and to detect requirement for BpChar semantics from |
2005 | | * caller), and the abbreviation case requires additional state. |
2006 | | */ |
2007 | 0 | if (abbreviate || !collate_c) |
2008 | 0 | { |
2009 | 0 | sss = palloc(sizeof(VarStringSortSupport)); |
2010 | 0 | sss->buf1 = palloc(TEXTBUFLEN); |
2011 | 0 | sss->buflen1 = TEXTBUFLEN; |
2012 | 0 | sss->buf2 = palloc(TEXTBUFLEN); |
2013 | 0 | sss->buflen2 = TEXTBUFLEN; |
2014 | | /* Start with invalid values */ |
2015 | 0 | sss->last_len1 = -1; |
2016 | 0 | sss->last_len2 = -1; |
2017 | | /* Initialize */ |
2018 | 0 | sss->last_returned = 0; |
2019 | 0 | if (collate_c) |
2020 | 0 | sss->locale = NULL; |
2021 | 0 | else |
2022 | 0 | sss->locale = locale; |
2023 | | |
2024 | | /* |
2025 | | * To avoid somehow confusing a strxfrm() blob and an original string, |
2026 | | * constantly keep track of the variety of data that buf1 and buf2 |
2027 | | * currently contain. |
2028 | | * |
2029 | | * Comparisons may be interleaved with conversion calls. Frequently, |
2030 | | * conversions and comparisons are batched into two distinct phases, |
2031 | | * but the correctness of caching cannot hinge upon this. For |
2032 | | * comparison caching, buffer state is only trusted if cache_blob is |
2033 | | * found set to false, whereas strxfrm() caching only trusts the state |
2034 | | * when cache_blob is found set to true. |
2035 | | * |
2036 | | * Arbitrarily initialize cache_blob to true. |
2037 | | */ |
2038 | 0 | sss->cache_blob = true; |
2039 | 0 | sss->collate_c = collate_c; |
2040 | 0 | sss->typid = typid; |
2041 | 0 | ssup->ssup_extra = sss; |
2042 | | |
2043 | | /* |
2044 | | * If possible, plan to use the abbreviated keys optimization. The |
2045 | | * core code may switch back to authoritative comparator should |
2046 | | * abbreviation be aborted. |
2047 | | */ |
2048 | 0 | if (abbreviate) |
2049 | 0 | { |
2050 | 0 | sss->prop_card = 0.20; |
2051 | 0 | initHyperLogLog(&sss->abbr_card, 10); |
2052 | 0 | initHyperLogLog(&sss->full_card, 10); |
2053 | 0 | ssup->abbrev_full_comparator = ssup->comparator; |
2054 | 0 | ssup->comparator = ssup_datum_unsigned_cmp; |
2055 | 0 | ssup->abbrev_converter = varstr_abbrev_convert; |
2056 | 0 | ssup->abbrev_abort = varstr_abbrev_abort; |
2057 | 0 | } |
2058 | 0 | } |
2059 | 0 | } |
2060 | | |
2061 | | /* |
2062 | | * sortsupport comparison func (for C locale case) |
2063 | | */ |
2064 | | static int |
2065 | | varstrfastcmp_c(Datum x, Datum y, SortSupport ssup) |
2066 | 0 | { |
2067 | 0 | VarString *arg1 = DatumGetVarStringPP(x); |
2068 | 0 | VarString *arg2 = DatumGetVarStringPP(y); |
2069 | 0 | char *a1p, |
2070 | 0 | *a2p; |
2071 | 0 | int len1, |
2072 | 0 | len2, |
2073 | 0 | result; |
2074 | |
|
2075 | 0 | a1p = VARDATA_ANY(arg1); |
2076 | 0 | a2p = VARDATA_ANY(arg2); |
2077 | |
|
2078 | 0 | len1 = VARSIZE_ANY_EXHDR(arg1); |
2079 | 0 | len2 = VARSIZE_ANY_EXHDR(arg2); |
2080 | |
|
2081 | 0 | result = memcmp(a1p, a2p, Min(len1, len2)); |
2082 | 0 | if ((result == 0) && (len1 != len2)) |
2083 | 0 | result = (len1 < len2) ? -1 : 1; |
2084 | | |
2085 | | /* We can't afford to leak memory here. */ |
2086 | 0 | if (PointerGetDatum(arg1) != x) |
2087 | 0 | pfree(arg1); |
2088 | 0 | if (PointerGetDatum(arg2) != y) |
2089 | 0 | pfree(arg2); |
2090 | |
|
2091 | 0 | return result; |
2092 | 0 | } |
2093 | | |
2094 | | /* |
2095 | | * sortsupport comparison func (for BpChar C locale case) |
2096 | | * |
2097 | | * BpChar outsources its sortsupport to this module. Specialization for the |
2098 | | * varstr_sortsupport BpChar case, modeled on |
2099 | | * internal_bpchar_pattern_compare(). |
2100 | | */ |
2101 | | static int |
2102 | | bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup) |
2103 | 0 | { |
2104 | 0 | BpChar *arg1 = DatumGetBpCharPP(x); |
2105 | 0 | BpChar *arg2 = DatumGetBpCharPP(y); |
2106 | 0 | char *a1p, |
2107 | 0 | *a2p; |
2108 | 0 | int len1, |
2109 | 0 | len2, |
2110 | 0 | result; |
2111 | |
|
2112 | 0 | a1p = VARDATA_ANY(arg1); |
2113 | 0 | a2p = VARDATA_ANY(arg2); |
2114 | |
|
2115 | 0 | len1 = bpchartruelen(a1p, VARSIZE_ANY_EXHDR(arg1)); |
2116 | 0 | len2 = bpchartruelen(a2p, VARSIZE_ANY_EXHDR(arg2)); |
2117 | |
|
2118 | 0 | result = memcmp(a1p, a2p, Min(len1, len2)); |
2119 | 0 | if ((result == 0) && (len1 != len2)) |
2120 | 0 | result = (len1 < len2) ? -1 : 1; |
2121 | | |
2122 | | /* We can't afford to leak memory here. */ |
2123 | 0 | if (PointerGetDatum(arg1) != x) |
2124 | 0 | pfree(arg1); |
2125 | 0 | if (PointerGetDatum(arg2) != y) |
2126 | 0 | pfree(arg2); |
2127 | |
|
2128 | 0 | return result; |
2129 | 0 | } |
2130 | | |
2131 | | /* |
2132 | | * sortsupport comparison func (for NAME C locale case) |
2133 | | */ |
2134 | | static int |
2135 | | namefastcmp_c(Datum x, Datum y, SortSupport ssup) |
2136 | 0 | { |
2137 | 0 | Name arg1 = DatumGetName(x); |
2138 | 0 | Name arg2 = DatumGetName(y); |
2139 | |
|
2140 | 0 | return strncmp(NameStr(*arg1), NameStr(*arg2), NAMEDATALEN); |
2141 | 0 | } |
2142 | | |
2143 | | /* |
2144 | | * sortsupport comparison func (for locale case with all varlena types) |
2145 | | */ |
2146 | | static int |
2147 | | varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup) |
2148 | 0 | { |
2149 | 0 | VarString *arg1 = DatumGetVarStringPP(x); |
2150 | 0 | VarString *arg2 = DatumGetVarStringPP(y); |
2151 | 0 | char *a1p, |
2152 | 0 | *a2p; |
2153 | 0 | int len1, |
2154 | 0 | len2, |
2155 | 0 | result; |
2156 | |
|
2157 | 0 | a1p = VARDATA_ANY(arg1); |
2158 | 0 | a2p = VARDATA_ANY(arg2); |
2159 | |
|
2160 | 0 | len1 = VARSIZE_ANY_EXHDR(arg1); |
2161 | 0 | len2 = VARSIZE_ANY_EXHDR(arg2); |
2162 | |
|
2163 | 0 | result = varstrfastcmp_locale(a1p, len1, a2p, len2, ssup); |
2164 | | |
2165 | | /* We can't afford to leak memory here. */ |
2166 | 0 | if (PointerGetDatum(arg1) != x) |
2167 | 0 | pfree(arg1); |
2168 | 0 | if (PointerGetDatum(arg2) != y) |
2169 | 0 | pfree(arg2); |
2170 | |
|
2171 | 0 | return result; |
2172 | 0 | } |
2173 | | |
2174 | | /* |
2175 | | * sortsupport comparison func (for locale case with NAME type) |
2176 | | */ |
2177 | | static int |
2178 | | namefastcmp_locale(Datum x, Datum y, SortSupport ssup) |
2179 | 0 | { |
2180 | 0 | Name arg1 = DatumGetName(x); |
2181 | 0 | Name arg2 = DatumGetName(y); |
2182 | |
|
2183 | 0 | return varstrfastcmp_locale(NameStr(*arg1), strlen(NameStr(*arg1)), |
2184 | 0 | NameStr(*arg2), strlen(NameStr(*arg2)), |
2185 | 0 | ssup); |
2186 | 0 | } |
2187 | | |
2188 | | /* |
2189 | | * sortsupport comparison func for locale cases |
2190 | | */ |
2191 | | static int |
2192 | | varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup) |
2193 | 0 | { |
2194 | 0 | VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra; |
2195 | 0 | int result; |
2196 | 0 | bool arg1_match; |
2197 | | |
2198 | | /* Fast pre-check for equality, as discussed in varstr_cmp() */ |
2199 | 0 | if (len1 == len2 && memcmp(a1p, a2p, len1) == 0) |
2200 | 0 | { |
2201 | | /* |
2202 | | * No change in buf1 or buf2 contents, so avoid changing last_len1 or |
2203 | | * last_len2. Existing contents of buffers might still be used by |
2204 | | * next call. |
2205 | | * |
2206 | | * It's fine to allow the comparison of BpChar padding bytes here, |
2207 | | * even though that implies that the memcmp() will usually be |
2208 | | * performed for BpChar callers (though multibyte characters could |
2209 | | * still prevent that from occurring). The memcmp() is still very |
2210 | | * cheap, and BpChar's funny semantics have us remove trailing spaces |
2211 | | * (not limited to padding), so we need make no distinction between |
2212 | | * padding space characters and "real" space characters. |
2213 | | */ |
2214 | 0 | return 0; |
2215 | 0 | } |
2216 | | |
2217 | 0 | if (sss->typid == BPCHAROID) |
2218 | 0 | { |
2219 | | /* Get true number of bytes, ignoring trailing spaces */ |
2220 | 0 | len1 = bpchartruelen(a1p, len1); |
2221 | 0 | len2 = bpchartruelen(a2p, len2); |
2222 | 0 | } |
2223 | |
|
2224 | 0 | if (len1 >= sss->buflen1) |
2225 | 0 | { |
2226 | 0 | sss->buflen1 = Max(len1 + 1, Min(sss->buflen1 * 2, MaxAllocSize)); |
2227 | 0 | sss->buf1 = repalloc(sss->buf1, sss->buflen1); |
2228 | 0 | } |
2229 | 0 | if (len2 >= sss->buflen2) |
2230 | 0 | { |
2231 | 0 | sss->buflen2 = Max(len2 + 1, Min(sss->buflen2 * 2, MaxAllocSize)); |
2232 | 0 | sss->buf2 = repalloc(sss->buf2, sss->buflen2); |
2233 | 0 | } |
2234 | | |
2235 | | /* |
2236 | | * We're likely to be asked to compare the same strings repeatedly, and |
2237 | | * memcmp() is so much cheaper than strcoll() that it pays to try to cache |
2238 | | * comparisons, even though in general there is no reason to think that |
2239 | | * that will work out (every string datum may be unique). Caching does |
2240 | | * not slow things down measurably when it doesn't work out, and can speed |
2241 | | * things up by rather a lot when it does. In part, this is because the |
2242 | | * memcmp() compares data from cachelines that are needed in L1 cache even |
2243 | | * when the last comparison's result cannot be reused. |
2244 | | */ |
2245 | 0 | arg1_match = true; |
2246 | 0 | if (len1 != sss->last_len1 || memcmp(sss->buf1, a1p, len1) != 0) |
2247 | 0 | { |
2248 | 0 | arg1_match = false; |
2249 | 0 | memcpy(sss->buf1, a1p, len1); |
2250 | 0 | sss->buf1[len1] = '\0'; |
2251 | 0 | sss->last_len1 = len1; |
2252 | 0 | } |
2253 | | |
2254 | | /* |
2255 | | * If we're comparing the same two strings as last time, we can return the |
2256 | | * same answer without calling strcoll() again. This is more likely than |
2257 | | * it seems (at least with moderate to low cardinality sets), because |
2258 | | * quicksort compares the same pivot against many values. |
2259 | | */ |
2260 | 0 | if (len2 != sss->last_len2 || memcmp(sss->buf2, a2p, len2) != 0) |
2261 | 0 | { |
2262 | 0 | memcpy(sss->buf2, a2p, len2); |
2263 | 0 | sss->buf2[len2] = '\0'; |
2264 | 0 | sss->last_len2 = len2; |
2265 | 0 | } |
2266 | 0 | else if (arg1_match && !sss->cache_blob) |
2267 | 0 | { |
2268 | | /* Use result cached following last actual strcoll() call */ |
2269 | 0 | return sss->last_returned; |
2270 | 0 | } |
2271 | | |
2272 | 0 | result = pg_strcoll(sss->buf1, sss->buf2, sss->locale); |
2273 | | |
2274 | | /* Break tie if necessary. */ |
2275 | 0 | if (result == 0 && sss->locale->deterministic) |
2276 | 0 | result = strcmp(sss->buf1, sss->buf2); |
2277 | | |
2278 | | /* Cache result, perhaps saving an expensive strcoll() call next time */ |
2279 | 0 | sss->cache_blob = false; |
2280 | 0 | sss->last_returned = result; |
2281 | 0 | return result; |
2282 | 0 | } |
2283 | | |
2284 | | /* |
2285 | | * Conversion routine for sortsupport. Converts original to abbreviated key |
2286 | | * representation. Our encoding strategy is simple -- pack the first 8 bytes |
2287 | | * of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are |
2288 | | * stored in reverse order), and treat it as an unsigned integer. When the "C" |
2289 | | * locale is used, or in case of bytea, just memcpy() from original instead. |
2290 | | */ |
2291 | | static Datum |
2292 | | varstr_abbrev_convert(Datum original, SortSupport ssup) |
2293 | 0 | { |
2294 | 0 | const size_t max_prefix_bytes = sizeof(Datum); |
2295 | 0 | VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra; |
2296 | 0 | VarString *authoritative = DatumGetVarStringPP(original); |
2297 | 0 | char *authoritative_data = VARDATA_ANY(authoritative); |
2298 | | |
2299 | | /* working state */ |
2300 | 0 | Datum res; |
2301 | 0 | char *pres; |
2302 | 0 | int len; |
2303 | 0 | uint32 hash; |
2304 | |
|
2305 | 0 | pres = (char *) &res; |
2306 | | /* memset(), so any non-overwritten bytes are NUL */ |
2307 | 0 | memset(pres, 0, max_prefix_bytes); |
2308 | 0 | len = VARSIZE_ANY_EXHDR(authoritative); |
2309 | | |
2310 | | /* Get number of bytes, ignoring trailing spaces */ |
2311 | 0 | if (sss->typid == BPCHAROID) |
2312 | 0 | len = bpchartruelen(authoritative_data, len); |
2313 | | |
2314 | | /* |
2315 | | * If we're using the C collation, use memcpy(), rather than strxfrm(), to |
2316 | | * abbreviate keys. The full comparator for the C locale is always |
2317 | | * memcmp(). It would be incorrect to allow bytea callers (callers that |
2318 | | * always force the C collation -- bytea isn't a collatable type, but this |
2319 | | * approach is convenient) to use strxfrm(). This is because bytea |
2320 | | * strings may contain NUL bytes. Besides, this should be faster, too. |
2321 | | * |
2322 | | * More generally, it's okay that bytea callers can have NUL bytes in |
2323 | | * strings because abbreviated cmp need not make a distinction between |
2324 | | * terminating NUL bytes, and NUL bytes representing actual NULs in the |
2325 | | * authoritative representation. Hopefully a comparison at or past one |
2326 | | * abbreviated key's terminating NUL byte will resolve the comparison |
2327 | | * without consulting the authoritative representation; specifically, some |
2328 | | * later non-NUL byte in the longer string can resolve the comparison |
2329 | | * against a subsequent terminating NUL in the shorter string. There will |
2330 | | * usually be what is effectively a "length-wise" resolution there and |
2331 | | * then. |
2332 | | * |
2333 | | * If that doesn't work out -- if all bytes in the longer string |
2334 | | * positioned at or past the offset of the smaller string's (first) |
2335 | | * terminating NUL are actually representative of NUL bytes in the |
2336 | | * authoritative binary string (perhaps with some *terminating* NUL bytes |
2337 | | * towards the end of the longer string iff it happens to still be small) |
2338 | | * -- then an authoritative tie-breaker will happen, and do the right |
2339 | | * thing: explicitly consider string length. |
2340 | | */ |
2341 | 0 | if (sss->collate_c) |
2342 | 0 | memcpy(pres, authoritative_data, Min(len, max_prefix_bytes)); |
2343 | 0 | else |
2344 | 0 | { |
2345 | 0 | Size bsize; |
2346 | | |
2347 | | /* |
2348 | | * We're not using the C collation, so fall back on strxfrm or ICU |
2349 | | * analogs. |
2350 | | */ |
2351 | | |
2352 | | /* By convention, we use buffer 1 to store and NUL-terminate */ |
2353 | 0 | if (len >= sss->buflen1) |
2354 | 0 | { |
2355 | 0 | sss->buflen1 = Max(len + 1, Min(sss->buflen1 * 2, MaxAllocSize)); |
2356 | 0 | sss->buf1 = repalloc(sss->buf1, sss->buflen1); |
2357 | 0 | } |
2358 | | |
2359 | | /* Might be able to reuse strxfrm() blob from last call */ |
2360 | 0 | if (sss->last_len1 == len && sss->cache_blob && |
2361 | 0 | memcmp(sss->buf1, authoritative_data, len) == 0) |
2362 | 0 | { |
2363 | 0 | memcpy(pres, sss->buf2, Min(max_prefix_bytes, sss->last_len2)); |
2364 | | /* No change affecting cardinality, so no hashing required */ |
2365 | 0 | goto done; |
2366 | 0 | } |
2367 | | |
2368 | 0 | memcpy(sss->buf1, authoritative_data, len); |
2369 | | |
2370 | | /* |
2371 | | * pg_strxfrm() and pg_strxfrm_prefix expect NUL-terminated strings. |
2372 | | */ |
2373 | 0 | sss->buf1[len] = '\0'; |
2374 | 0 | sss->last_len1 = len; |
2375 | |
|
2376 | 0 | if (pg_strxfrm_prefix_enabled(sss->locale)) |
2377 | 0 | { |
2378 | 0 | if (sss->buflen2 < max_prefix_bytes) |
2379 | 0 | { |
2380 | 0 | sss->buflen2 = Max(max_prefix_bytes, |
2381 | 0 | Min(sss->buflen2 * 2, MaxAllocSize)); |
2382 | 0 | sss->buf2 = repalloc(sss->buf2, sss->buflen2); |
2383 | 0 | } |
2384 | |
|
2385 | 0 | bsize = pg_strxfrm_prefix(sss->buf2, sss->buf1, |
2386 | 0 | max_prefix_bytes, sss->locale); |
2387 | 0 | sss->last_len2 = bsize; |
2388 | 0 | } |
2389 | 0 | else |
2390 | 0 | { |
2391 | | /* |
2392 | | * Loop: Call pg_strxfrm(), possibly enlarge buffer, and try |
2393 | | * again. The pg_strxfrm() function leaves the result buffer |
2394 | | * content undefined if the result did not fit, so we need to |
2395 | | * retry until everything fits, even though we only need the first |
2396 | | * few bytes in the end. |
2397 | | */ |
2398 | 0 | for (;;) |
2399 | 0 | { |
2400 | 0 | bsize = pg_strxfrm(sss->buf2, sss->buf1, sss->buflen2, |
2401 | 0 | sss->locale); |
2402 | |
|
2403 | 0 | sss->last_len2 = bsize; |
2404 | 0 | if (bsize < sss->buflen2) |
2405 | 0 | break; |
2406 | | |
2407 | | /* |
2408 | | * Grow buffer and retry. |
2409 | | */ |
2410 | 0 | sss->buflen2 = Max(bsize + 1, |
2411 | 0 | Min(sss->buflen2 * 2, MaxAllocSize)); |
2412 | 0 | sss->buf2 = repalloc(sss->buf2, sss->buflen2); |
2413 | 0 | } |
2414 | 0 | } |
2415 | | |
2416 | | /* |
2417 | | * Every Datum byte is always compared. This is safe because the |
2418 | | * strxfrm() blob is itself NUL terminated, leaving no danger of |
2419 | | * misinterpreting any NUL bytes not intended to be interpreted as |
2420 | | * logically representing termination. |
2421 | | * |
2422 | | * (Actually, even if there were NUL bytes in the blob it would be |
2423 | | * okay. See remarks on bytea case above.) |
2424 | | */ |
2425 | 0 | memcpy(pres, sss->buf2, Min(max_prefix_bytes, bsize)); |
2426 | 0 | } |
2427 | | |
2428 | | /* |
2429 | | * Maintain approximate cardinality of both abbreviated keys and original, |
2430 | | * authoritative keys using HyperLogLog. Used as cheap insurance against |
2431 | | * the worst case, where we do many string transformations for no saving |
2432 | | * in full strcoll()-based comparisons. These statistics are used by |
2433 | | * varstr_abbrev_abort(). |
2434 | | * |
2435 | | * First, Hash key proper, or a significant fraction of it. Mix in length |
2436 | | * in order to compensate for cases where differences are past |
2437 | | * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing. |
2438 | | */ |
2439 | 0 | hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data, |
2440 | 0 | Min(len, PG_CACHE_LINE_SIZE))); |
2441 | |
|
2442 | 0 | if (len > PG_CACHE_LINE_SIZE) |
2443 | 0 | hash ^= DatumGetUInt32(hash_uint32((uint32) len)); |
2444 | |
|
2445 | 0 | addHyperLogLog(&sss->full_card, hash); |
2446 | | |
2447 | | /* Hash abbreviated key */ |
2448 | 0 | #if SIZEOF_DATUM == 8 |
2449 | 0 | { |
2450 | 0 | uint32 lohalf, |
2451 | 0 | hihalf; |
2452 | |
|
2453 | 0 | lohalf = (uint32) res; |
2454 | 0 | hihalf = (uint32) (res >> 32); |
2455 | 0 | hash = DatumGetUInt32(hash_uint32(lohalf ^ hihalf)); |
2456 | 0 | } |
2457 | | #else /* SIZEOF_DATUM != 8 */ |
2458 | | hash = DatumGetUInt32(hash_uint32((uint32) res)); |
2459 | | #endif |
2460 | |
|
2461 | 0 | addHyperLogLog(&sss->abbr_card, hash); |
2462 | | |
2463 | | /* Cache result, perhaps saving an expensive strxfrm() call next time */ |
2464 | 0 | sss->cache_blob = true; |
2465 | 0 | done: |
2466 | | |
2467 | | /* |
2468 | | * Byteswap on little-endian machines. |
2469 | | * |
2470 | | * This is needed so that ssup_datum_unsigned_cmp() (an unsigned integer |
2471 | | * 3-way comparator) works correctly on all platforms. If we didn't do |
2472 | | * this, the comparator would have to call memcmp() with a pair of |
2473 | | * pointers to the first byte of each abbreviated key, which is slower. |
2474 | | */ |
2475 | 0 | res = DatumBigEndianToNative(res); |
2476 | | |
2477 | | /* Don't leak memory here */ |
2478 | 0 | if (PointerGetDatum(authoritative) != original) |
2479 | 0 | pfree(authoritative); |
2480 | |
|
2481 | 0 | return res; |
2482 | 0 | } |
2483 | | |
2484 | | /* |
2485 | | * Callback for estimating effectiveness of abbreviated key optimization, using |
2486 | | * heuristic rules. Returns value indicating if the abbreviation optimization |
2487 | | * should be aborted, based on its projected effectiveness. |
2488 | | */ |
2489 | | static bool |
2490 | | varstr_abbrev_abort(int memtupcount, SortSupport ssup) |
2491 | 0 | { |
2492 | 0 | VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra; |
2493 | 0 | double abbrev_distinct, |
2494 | 0 | key_distinct; |
2495 | |
|
2496 | 0 | Assert(ssup->abbreviate); |
2497 | | |
2498 | | /* Have a little patience */ |
2499 | 0 | if (memtupcount < 100) |
2500 | 0 | return false; |
2501 | | |
2502 | 0 | abbrev_distinct = estimateHyperLogLog(&sss->abbr_card); |
2503 | 0 | key_distinct = estimateHyperLogLog(&sss->full_card); |
2504 | | |
2505 | | /* |
2506 | | * Clamp cardinality estimates to at least one distinct value. While |
2507 | | * NULLs are generally disregarded, if only NULL values were seen so far, |
2508 | | * that might misrepresent costs if we failed to clamp. |
2509 | | */ |
2510 | 0 | if (abbrev_distinct <= 1.0) |
2511 | 0 | abbrev_distinct = 1.0; |
2512 | |
|
2513 | 0 | if (key_distinct <= 1.0) |
2514 | 0 | key_distinct = 1.0; |
2515 | | |
2516 | | /* |
2517 | | * In the worst case all abbreviated keys are identical, while at the same |
2518 | | * time there are differences within full key strings not captured in |
2519 | | * abbreviations. |
2520 | | */ |
2521 | 0 | if (trace_sort) |
2522 | 0 | { |
2523 | 0 | double norm_abbrev_card = abbrev_distinct / (double) memtupcount; |
2524 | |
|
2525 | 0 | elog(LOG, "varstr_abbrev: abbrev_distinct after %d: %f " |
2526 | 0 | "(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)", |
2527 | 0 | memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card, |
2528 | 0 | sss->prop_card); |
2529 | 0 | } |
2530 | | |
2531 | | /* |
2532 | | * If the number of distinct abbreviated keys approximately matches the |
2533 | | * number of distinct authoritative original keys, that's reason enough to |
2534 | | * proceed. We can win even with a very low cardinality set if most |
2535 | | * tie-breakers only memcmp(). This is by far the most important |
2536 | | * consideration. |
2537 | | * |
2538 | | * While comparisons that are resolved at the abbreviated key level are |
2539 | | * considerably cheaper than tie-breakers resolved with memcmp(), both of |
2540 | | * those two outcomes are so much cheaper than a full strcoll() once |
2541 | | * sorting is underway that it doesn't seem worth it to weigh abbreviated |
2542 | | * cardinality against the overall size of the set in order to more |
2543 | | * accurately model costs. Assume that an abbreviated comparison, and an |
2544 | | * abbreviated comparison with a cheap memcmp()-based authoritative |
2545 | | * resolution are equivalent. |
2546 | | */ |
2547 | 0 | if (abbrev_distinct > key_distinct * sss->prop_card) |
2548 | 0 | { |
2549 | | /* |
2550 | | * When we have exceeded 10,000 tuples, decay required cardinality |
2551 | | * aggressively for next call. |
2552 | | * |
2553 | | * This is useful because the number of comparisons required on |
2554 | | * average increases at a linearithmic rate, and at roughly 10,000 |
2555 | | * tuples that factor will start to dominate over the linear costs of |
2556 | | * string transformation (this is a conservative estimate). The decay |
2557 | | * rate is chosen to be a little less aggressive than halving -- which |
2558 | | * (since we're called at points at which memtupcount has doubled) |
2559 | | * would never see the cost model actually abort past the first call |
2560 | | * following a decay. This decay rate is mostly a precaution against |
2561 | | * a sudden, violent swing in how well abbreviated cardinality tracks |
2562 | | * full key cardinality. The decay also serves to prevent a marginal |
2563 | | * case from being aborted too late, when too much has already been |
2564 | | * invested in string transformation. |
2565 | | * |
2566 | | * It's possible for sets of several million distinct strings with |
2567 | | * mere tens of thousands of distinct abbreviated keys to still |
2568 | | * benefit very significantly. This will generally occur provided |
2569 | | * each abbreviated key is a proxy for a roughly uniform number of the |
2570 | | * set's full keys. If it isn't so, we hope to catch that early and |
2571 | | * abort. If it isn't caught early, by the time the problem is |
2572 | | * apparent it's probably not worth aborting. |
2573 | | */ |
2574 | 0 | if (memtupcount > 10000) |
2575 | 0 | sss->prop_card *= 0.65; |
2576 | |
|
2577 | 0 | return false; |
2578 | 0 | } |
2579 | | |
2580 | | /* |
2581 | | * Abort abbreviation strategy. |
2582 | | * |
2583 | | * The worst case, where all abbreviated keys are identical while all |
2584 | | * original strings differ will typically only see a regression of about |
2585 | | * 10% in execution time for small to medium sized lists of strings. |
2586 | | * Whereas on modern CPUs where cache stalls are the dominant cost, we can |
2587 | | * often expect very large improvements, particularly with sets of strings |
2588 | | * of moderately high to high abbreviated cardinality. There is little to |
2589 | | * lose but much to gain, which our strategy reflects. |
2590 | | */ |
2591 | 0 | if (trace_sort) |
2592 | 0 | elog(LOG, "varstr_abbrev: aborted abbreviation at %d " |
2593 | 0 | "(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)", |
2594 | 0 | memtupcount, abbrev_distinct, key_distinct, sss->prop_card); |
2595 | | |
2596 | 0 | return true; |
2597 | 0 | } |
2598 | | |
2599 | | /* |
2600 | | * Generic equalimage support function for character type's operator classes. |
2601 | | * Disables the use of deduplication with nondeterministic collations. |
2602 | | */ |
2603 | | Datum |
2604 | | btvarstrequalimage(PG_FUNCTION_ARGS) |
2605 | 0 | { |
2606 | | /* Oid opcintype = PG_GETARG_OID(0); */ |
2607 | 0 | Oid collid = PG_GET_COLLATION(); |
2608 | 0 | pg_locale_t locale; |
2609 | |
|
2610 | 0 | check_collation_set(collid); |
2611 | |
|
2612 | 0 | locale = pg_newlocale_from_collation(collid); |
2613 | |
|
2614 | 0 | PG_RETURN_BOOL(locale->deterministic); |
2615 | 0 | } |
2616 | | |
2617 | | Datum |
2618 | | text_larger(PG_FUNCTION_ARGS) |
2619 | 0 | { |
2620 | 0 | text *arg1 = PG_GETARG_TEXT_PP(0); |
2621 | 0 | text *arg2 = PG_GETARG_TEXT_PP(1); |
2622 | 0 | text *result; |
2623 | |
|
2624 | 0 | result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0) ? arg1 : arg2); |
2625 | |
|
2626 | 0 | PG_RETURN_TEXT_P(result); |
2627 | 0 | } |
2628 | | |
2629 | | Datum |
2630 | | text_smaller(PG_FUNCTION_ARGS) |
2631 | 0 | { |
2632 | 0 | text *arg1 = PG_GETARG_TEXT_PP(0); |
2633 | 0 | text *arg2 = PG_GETARG_TEXT_PP(1); |
2634 | 0 | text *result; |
2635 | |
|
2636 | 0 | result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0) ? arg1 : arg2); |
2637 | |
|
2638 | 0 | PG_RETURN_TEXT_P(result); |
2639 | 0 | } |
2640 | | |
2641 | | |
2642 | | /* |
2643 | | * Cross-type comparison functions for types text and name. |
2644 | | */ |
2645 | | |
2646 | | Datum |
2647 | | nameeqtext(PG_FUNCTION_ARGS) |
2648 | 0 | { |
2649 | 0 | Name arg1 = PG_GETARG_NAME(0); |
2650 | 0 | text *arg2 = PG_GETARG_TEXT_PP(1); |
2651 | 0 | size_t len1 = strlen(NameStr(*arg1)); |
2652 | 0 | size_t len2 = VARSIZE_ANY_EXHDR(arg2); |
2653 | 0 | Oid collid = PG_GET_COLLATION(); |
2654 | 0 | bool result; |
2655 | |
|
2656 | 0 | check_collation_set(collid); |
2657 | |
|
2658 | 0 | if (collid == C_COLLATION_OID) |
2659 | 0 | result = (len1 == len2 && |
2660 | 0 | memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0); |
2661 | 0 | else |
2662 | 0 | result = (varstr_cmp(NameStr(*arg1), len1, |
2663 | 0 | VARDATA_ANY(arg2), len2, |
2664 | 0 | collid) == 0); |
2665 | |
|
2666 | 0 | PG_FREE_IF_COPY(arg2, 1); |
2667 | |
|
2668 | 0 | PG_RETURN_BOOL(result); |
2669 | 0 | } |
2670 | | |
2671 | | Datum |
2672 | | texteqname(PG_FUNCTION_ARGS) |
2673 | 0 | { |
2674 | 0 | text *arg1 = PG_GETARG_TEXT_PP(0); |
2675 | 0 | Name arg2 = PG_GETARG_NAME(1); |
2676 | 0 | size_t len1 = VARSIZE_ANY_EXHDR(arg1); |
2677 | 0 | size_t len2 = strlen(NameStr(*arg2)); |
2678 | 0 | Oid collid = PG_GET_COLLATION(); |
2679 | 0 | bool result; |
2680 | |
|
2681 | 0 | check_collation_set(collid); |
2682 | |
|
2683 | 0 | if (collid == C_COLLATION_OID) |
2684 | 0 | result = (len1 == len2 && |
2685 | 0 | memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0); |
2686 | 0 | else |
2687 | 0 | result = (varstr_cmp(VARDATA_ANY(arg1), len1, |
2688 | 0 | NameStr(*arg2), len2, |
2689 | 0 | collid) == 0); |
2690 | |
|
2691 | 0 | PG_FREE_IF_COPY(arg1, 0); |
2692 | |
|
2693 | 0 | PG_RETURN_BOOL(result); |
2694 | 0 | } |
2695 | | |
2696 | | Datum |
2697 | | namenetext(PG_FUNCTION_ARGS) |
2698 | 0 | { |
2699 | 0 | Name arg1 = PG_GETARG_NAME(0); |
2700 | 0 | text *arg2 = PG_GETARG_TEXT_PP(1); |
2701 | 0 | size_t len1 = strlen(NameStr(*arg1)); |
2702 | 0 | size_t len2 = VARSIZE_ANY_EXHDR(arg2); |
2703 | 0 | Oid collid = PG_GET_COLLATION(); |
2704 | 0 | bool result; |
2705 | |
|
2706 | 0 | check_collation_set(collid); |
2707 | |
|
2708 | 0 | if (collid == C_COLLATION_OID) |
2709 | 0 | result = !(len1 == len2 && |
2710 | 0 | memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0); |
2711 | 0 | else |
2712 | 0 | result = !(varstr_cmp(NameStr(*arg1), len1, |
2713 | 0 | VARDATA_ANY(arg2), len2, |
2714 | 0 | collid) == 0); |
2715 | |
|
2716 | 0 | PG_FREE_IF_COPY(arg2, 1); |
2717 | |
|
2718 | 0 | PG_RETURN_BOOL(result); |
2719 | 0 | } |
2720 | | |
2721 | | Datum |
2722 | | textnename(PG_FUNCTION_ARGS) |
2723 | 0 | { |
2724 | 0 | text *arg1 = PG_GETARG_TEXT_PP(0); |
2725 | 0 | Name arg2 = PG_GETARG_NAME(1); |
2726 | 0 | size_t len1 = VARSIZE_ANY_EXHDR(arg1); |
2727 | 0 | size_t len2 = strlen(NameStr(*arg2)); |
2728 | 0 | Oid collid = PG_GET_COLLATION(); |
2729 | 0 | bool result; |
2730 | |
|
2731 | 0 | check_collation_set(collid); |
2732 | |
|
2733 | 0 | if (collid == C_COLLATION_OID) |
2734 | 0 | result = !(len1 == len2 && |
2735 | 0 | memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0); |
2736 | 0 | else |
2737 | 0 | result = !(varstr_cmp(VARDATA_ANY(arg1), len1, |
2738 | 0 | NameStr(*arg2), len2, |
2739 | 0 | collid) == 0); |
2740 | |
|
2741 | 0 | PG_FREE_IF_COPY(arg1, 0); |
2742 | |
|
2743 | 0 | PG_RETURN_BOOL(result); |
2744 | 0 | } |
2745 | | |
2746 | | Datum |
2747 | | btnametextcmp(PG_FUNCTION_ARGS) |
2748 | 0 | { |
2749 | 0 | Name arg1 = PG_GETARG_NAME(0); |
2750 | 0 | text *arg2 = PG_GETARG_TEXT_PP(1); |
2751 | 0 | int32 result; |
2752 | |
|
2753 | 0 | result = varstr_cmp(NameStr(*arg1), strlen(NameStr(*arg1)), |
2754 | 0 | VARDATA_ANY(arg2), VARSIZE_ANY_EXHDR(arg2), |
2755 | 0 | PG_GET_COLLATION()); |
2756 | |
|
2757 | 0 | PG_FREE_IF_COPY(arg2, 1); |
2758 | |
|
2759 | 0 | PG_RETURN_INT32(result); |
2760 | 0 | } |
2761 | | |
2762 | | Datum |
2763 | | bttextnamecmp(PG_FUNCTION_ARGS) |
2764 | 0 | { |
2765 | 0 | text *arg1 = PG_GETARG_TEXT_PP(0); |
2766 | 0 | Name arg2 = PG_GETARG_NAME(1); |
2767 | 0 | int32 result; |
2768 | |
|
2769 | 0 | result = varstr_cmp(VARDATA_ANY(arg1), VARSIZE_ANY_EXHDR(arg1), |
2770 | 0 | NameStr(*arg2), strlen(NameStr(*arg2)), |
2771 | 0 | PG_GET_COLLATION()); |
2772 | |
|
2773 | 0 | PG_FREE_IF_COPY(arg1, 0); |
2774 | |
|
2775 | 0 | PG_RETURN_INT32(result); |
2776 | 0 | } |
2777 | | |
2778 | | #define CmpCall(cmpfunc) \ |
2779 | | DatumGetInt32(DirectFunctionCall2Coll(cmpfunc, \ |
2780 | | PG_GET_COLLATION(), \ |
2781 | | PG_GETARG_DATUM(0), \ |
2782 | | PG_GETARG_DATUM(1))) |
2783 | | |
2784 | | Datum |
2785 | | namelttext(PG_FUNCTION_ARGS) |
2786 | 0 | { |
2787 | 0 | PG_RETURN_BOOL(CmpCall(btnametextcmp) < 0); |
2788 | 0 | } |
2789 | | |
2790 | | Datum |
2791 | | nameletext(PG_FUNCTION_ARGS) |
2792 | 0 | { |
2793 | 0 | PG_RETURN_BOOL(CmpCall(btnametextcmp) <= 0); |
2794 | 0 | } |
2795 | | |
2796 | | Datum |
2797 | | namegttext(PG_FUNCTION_ARGS) |
2798 | 0 | { |
2799 | 0 | PG_RETURN_BOOL(CmpCall(btnametextcmp) > 0); |
2800 | 0 | } |
2801 | | |
2802 | | Datum |
2803 | | namegetext(PG_FUNCTION_ARGS) |
2804 | 0 | { |
2805 | 0 | PG_RETURN_BOOL(CmpCall(btnametextcmp) >= 0); |
2806 | 0 | } |
2807 | | |
2808 | | Datum |
2809 | | textltname(PG_FUNCTION_ARGS) |
2810 | 0 | { |
2811 | 0 | PG_RETURN_BOOL(CmpCall(bttextnamecmp) < 0); |
2812 | 0 | } |
2813 | | |
2814 | | Datum |
2815 | | textlename(PG_FUNCTION_ARGS) |
2816 | 0 | { |
2817 | 0 | PG_RETURN_BOOL(CmpCall(bttextnamecmp) <= 0); |
2818 | 0 | } |
2819 | | |
2820 | | Datum |
2821 | | textgtname(PG_FUNCTION_ARGS) |
2822 | 0 | { |
2823 | 0 | PG_RETURN_BOOL(CmpCall(bttextnamecmp) > 0); |
2824 | 0 | } |
2825 | | |
2826 | | Datum |
2827 | | textgename(PG_FUNCTION_ARGS) |
2828 | 0 | { |
2829 | 0 | PG_RETURN_BOOL(CmpCall(bttextnamecmp) >= 0); |
2830 | 0 | } |
2831 | | |
2832 | | #undef CmpCall |
2833 | | |
2834 | | |
2835 | | /* |
2836 | | * The following operators support character-by-character comparison |
2837 | | * of text datums, to allow building indexes suitable for LIKE clauses. |
2838 | | * Note that the regular texteq/textne comparison operators, and regular |
2839 | | * support functions 1 and 2 with "C" collation are assumed to be |
2840 | | * compatible with these! |
2841 | | */ |
2842 | | |
2843 | | static int |
2844 | | internal_text_pattern_compare(text *arg1, text *arg2) |
2845 | 0 | { |
2846 | 0 | int result; |
2847 | 0 | int len1, |
2848 | 0 | len2; |
2849 | |
|
2850 | 0 | len1 = VARSIZE_ANY_EXHDR(arg1); |
2851 | 0 | len2 = VARSIZE_ANY_EXHDR(arg2); |
2852 | |
|
2853 | 0 | result = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); |
2854 | 0 | if (result != 0) |
2855 | 0 | return result; |
2856 | 0 | else if (len1 < len2) |
2857 | 0 | return -1; |
2858 | 0 | else if (len1 > len2) |
2859 | 0 | return 1; |
2860 | 0 | else |
2861 | 0 | return 0; |
2862 | 0 | } |
2863 | | |
2864 | | |
2865 | | Datum |
2866 | | text_pattern_lt(PG_FUNCTION_ARGS) |
2867 | 0 | { |
2868 | 0 | text *arg1 = PG_GETARG_TEXT_PP(0); |
2869 | 0 | text *arg2 = PG_GETARG_TEXT_PP(1); |
2870 | 0 | int result; |
2871 | |
|
2872 | 0 | result = internal_text_pattern_compare(arg1, arg2); |
2873 | |
|
2874 | 0 | PG_FREE_IF_COPY(arg1, 0); |
2875 | 0 | PG_FREE_IF_COPY(arg2, 1); |
2876 | |
|
2877 | 0 | PG_RETURN_BOOL(result < 0); |
2878 | 0 | } |
2879 | | |
2880 | | |
2881 | | Datum |
2882 | | text_pattern_le(PG_FUNCTION_ARGS) |
2883 | 0 | { |
2884 | 0 | text *arg1 = PG_GETARG_TEXT_PP(0); |
2885 | 0 | text *arg2 = PG_GETARG_TEXT_PP(1); |
2886 | 0 | int result; |
2887 | |
|
2888 | 0 | result = internal_text_pattern_compare(arg1, arg2); |
2889 | |
|
2890 | 0 | PG_FREE_IF_COPY(arg1, 0); |
2891 | 0 | PG_FREE_IF_COPY(arg2, 1); |
2892 | |
|
2893 | 0 | PG_RETURN_BOOL(result <= 0); |
2894 | 0 | } |
2895 | | |
2896 | | |
2897 | | Datum |
2898 | | text_pattern_ge(PG_FUNCTION_ARGS) |
2899 | 0 | { |
2900 | 0 | text *arg1 = PG_GETARG_TEXT_PP(0); |
2901 | 0 | text *arg2 = PG_GETARG_TEXT_PP(1); |
2902 | 0 | int result; |
2903 | |
|
2904 | 0 | result = internal_text_pattern_compare(arg1, arg2); |
2905 | |
|
2906 | 0 | PG_FREE_IF_COPY(arg1, 0); |
2907 | 0 | PG_FREE_IF_COPY(arg2, 1); |
2908 | |
|
2909 | 0 | PG_RETURN_BOOL(result >= 0); |
2910 | 0 | } |
2911 | | |
2912 | | |
2913 | | Datum |
2914 | | text_pattern_gt(PG_FUNCTION_ARGS) |
2915 | 0 | { |
2916 | 0 | text *arg1 = PG_GETARG_TEXT_PP(0); |
2917 | 0 | text *arg2 = PG_GETARG_TEXT_PP(1); |
2918 | 0 | int result; |
2919 | |
|
2920 | 0 | result = internal_text_pattern_compare(arg1, arg2); |
2921 | |
|
2922 | 0 | PG_FREE_IF_COPY(arg1, 0); |
2923 | 0 | PG_FREE_IF_COPY(arg2, 1); |
2924 | |
|
2925 | 0 | PG_RETURN_BOOL(result > 0); |
2926 | 0 | } |
2927 | | |
2928 | | |
2929 | | Datum |
2930 | | bttext_pattern_cmp(PG_FUNCTION_ARGS) |
2931 | 0 | { |
2932 | 0 | text *arg1 = PG_GETARG_TEXT_PP(0); |
2933 | 0 | text *arg2 = PG_GETARG_TEXT_PP(1); |
2934 | 0 | int result; |
2935 | |
|
2936 | 0 | result = internal_text_pattern_compare(arg1, arg2); |
2937 | |
|
2938 | 0 | PG_FREE_IF_COPY(arg1, 0); |
2939 | 0 | PG_FREE_IF_COPY(arg2, 1); |
2940 | |
|
2941 | 0 | PG_RETURN_INT32(result); |
2942 | 0 | } |
2943 | | |
2944 | | |
2945 | | Datum |
2946 | | bttext_pattern_sortsupport(PG_FUNCTION_ARGS) |
2947 | 0 | { |
2948 | 0 | SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0); |
2949 | 0 | MemoryContext oldcontext; |
2950 | |
|
2951 | 0 | oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt); |
2952 | | |
2953 | | /* Use generic string SortSupport, forcing "C" collation */ |
2954 | 0 | varstr_sortsupport(ssup, TEXTOID, C_COLLATION_OID); |
2955 | |
|
2956 | 0 | MemoryContextSwitchTo(oldcontext); |
2957 | |
|
2958 | 0 | PG_RETURN_VOID(); |
2959 | 0 | } |
2960 | | |
2961 | | |
2962 | | /*------------------------------------------------------------- |
2963 | | * byteaoctetlen |
2964 | | * |
2965 | | * get the number of bytes contained in an instance of type 'bytea' |
2966 | | *------------------------------------------------------------- |
2967 | | */ |
2968 | | Datum |
2969 | | byteaoctetlen(PG_FUNCTION_ARGS) |
2970 | 0 | { |
2971 | 0 | Datum str = PG_GETARG_DATUM(0); |
2972 | | |
2973 | | /* We need not detoast the input at all */ |
2974 | 0 | PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ); |
2975 | 0 | } |
2976 | | |
2977 | | /* |
2978 | | * byteacat - |
2979 | | * takes two bytea* and returns a bytea* that is the concatenation of |
2980 | | * the two. |
2981 | | * |
2982 | | * Cloned from textcat and modified as required. |
2983 | | */ |
2984 | | Datum |
2985 | | byteacat(PG_FUNCTION_ARGS) |
2986 | 0 | { |
2987 | 0 | bytea *t1 = PG_GETARG_BYTEA_PP(0); |
2988 | 0 | bytea *t2 = PG_GETARG_BYTEA_PP(1); |
2989 | |
|
2990 | 0 | PG_RETURN_BYTEA_P(bytea_catenate(t1, t2)); |
2991 | 0 | } |
2992 | | |
2993 | | /* |
2994 | | * bytea_catenate |
2995 | | * Guts of byteacat(), broken out so it can be used by other functions |
2996 | | * |
2997 | | * Arguments can be in short-header form, but not compressed or out-of-line |
2998 | | */ |
2999 | | static bytea * |
3000 | | bytea_catenate(bytea *t1, bytea *t2) |
3001 | 0 | { |
3002 | 0 | bytea *result; |
3003 | 0 | int len1, |
3004 | 0 | len2, |
3005 | 0 | len; |
3006 | 0 | char *ptr; |
3007 | |
|
3008 | 0 | len1 = VARSIZE_ANY_EXHDR(t1); |
3009 | 0 | len2 = VARSIZE_ANY_EXHDR(t2); |
3010 | | |
3011 | | /* paranoia ... probably should throw error instead? */ |
3012 | 0 | if (len1 < 0) |
3013 | 0 | len1 = 0; |
3014 | 0 | if (len2 < 0) |
3015 | 0 | len2 = 0; |
3016 | |
|
3017 | 0 | len = len1 + len2 + VARHDRSZ; |
3018 | 0 | result = (bytea *) palloc(len); |
3019 | | |
3020 | | /* Set size of result string... */ |
3021 | 0 | SET_VARSIZE(result, len); |
3022 | | |
3023 | | /* Fill data field of result string... */ |
3024 | 0 | ptr = VARDATA(result); |
3025 | 0 | if (len1 > 0) |
3026 | 0 | memcpy(ptr, VARDATA_ANY(t1), len1); |
3027 | 0 | if (len2 > 0) |
3028 | 0 | memcpy(ptr + len1, VARDATA_ANY(t2), len2); |
3029 | |
|
3030 | 0 | return result; |
3031 | 0 | } |
3032 | | |
/* Convert a C string to a bytea pointer by running it through byteain(). */
#define PG_STR_GET_BYTEA(str_) \
	DatumGetByteaPP(DirectFunctionCall1(byteain, \
										CStringGetDatum(str_)))
3035 | | |
3036 | | /* |
3037 | | * bytea_substr() |
3038 | | * Return a substring starting at the specified position. |
3039 | | * Cloned from text_substr and modified as required. |
3040 | | * |
3041 | | * Input: |
3042 | | * - string |
3043 | | * - starting position (is one-based) |
3044 | | * - string length (optional) |
3045 | | * |
3046 | | * If the starting position is zero or less, then return from the start of the string |
3047 | | * adjusting the length to be consistent with the "negative start" per SQL. |
3048 | | * If the length is less than zero, an ERROR is thrown. If no third argument |
3049 | | * (length) is provided, the length to the end of the string is assumed. |
3050 | | */ |
3051 | | Datum |
3052 | | bytea_substr(PG_FUNCTION_ARGS) |
3053 | 0 | { |
3054 | 0 | PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0), |
3055 | 0 | PG_GETARG_INT32(1), |
3056 | 0 | PG_GETARG_INT32(2), |
3057 | 0 | false)); |
3058 | 0 | } |
3059 | | |
3060 | | /* |
3061 | | * bytea_substr_no_len - |
3062 | | * Wrapper to avoid opr_sanity failure due to |
3063 | | * one function accepting a different number of args. |
3064 | | */ |
3065 | | Datum |
3066 | | bytea_substr_no_len(PG_FUNCTION_ARGS) |
3067 | 0 | { |
3068 | 0 | PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0), |
3069 | 0 | PG_GETARG_INT32(1), |
3070 | 0 | -1, |
3071 | 0 | true)); |
3072 | 0 | } |
3073 | | |
3074 | | static bytea * |
3075 | | bytea_substring(Datum str, |
3076 | | int S, |
3077 | | int L, |
3078 | | bool length_not_specified) |
3079 | 0 | { |
3080 | 0 | int32 S1; /* adjusted start position */ |
3081 | 0 | int32 L1; /* adjusted substring length */ |
3082 | 0 | int32 E; /* end position */ |
3083 | | |
3084 | | /* |
3085 | | * The logic here should generally match text_substring(). |
3086 | | */ |
3087 | 0 | S1 = Max(S, 1); |
3088 | |
|
3089 | 0 | if (length_not_specified) |
3090 | 0 | { |
3091 | | /* |
3092 | | * Not passed a length - DatumGetByteaPSlice() grabs everything to the |
3093 | | * end of the string if we pass it a negative value for length. |
3094 | | */ |
3095 | 0 | L1 = -1; |
3096 | 0 | } |
3097 | 0 | else if (L < 0) |
3098 | 0 | { |
3099 | | /* SQL99 says to throw an error for E < S, i.e., negative length */ |
3100 | 0 | ereport(ERROR, |
3101 | 0 | (errcode(ERRCODE_SUBSTRING_ERROR), |
3102 | 0 | errmsg("negative substring length not allowed"))); |
3103 | 0 | L1 = -1; /* silence stupider compilers */ |
3104 | 0 | } |
3105 | 0 | else if (pg_add_s32_overflow(S, L, &E)) |
3106 | 0 | { |
3107 | | /* |
3108 | | * L could be large enough for S + L to overflow, in which case the |
3109 | | * substring must run to end of string. |
3110 | | */ |
3111 | 0 | L1 = -1; |
3112 | 0 | } |
3113 | 0 | else |
3114 | 0 | { |
3115 | | /* |
3116 | | * A zero or negative value for the end position can happen if the |
3117 | | * start was negative or one. SQL99 says to return a zero-length |
3118 | | * string. |
3119 | | */ |
3120 | 0 | if (E < 1) |
3121 | 0 | return PG_STR_GET_BYTEA(""); |
3122 | | |
3123 | 0 | L1 = E - S1; |
3124 | 0 | } |
3125 | | |
3126 | | /* |
3127 | | * If the start position is past the end of the string, SQL99 says to |
3128 | | * return a zero-length string -- DatumGetByteaPSlice() will do that for |
3129 | | * us. We need only convert S1 to zero-based starting position. |
3130 | | */ |
3131 | 0 | return DatumGetByteaPSlice(str, S1 - 1, L1); |
3132 | 0 | } |
3133 | | |
3134 | | /* |
3135 | | * byteaoverlay |
3136 | | * Replace specified substring of first string with second |
3137 | | * |
3138 | | * The SQL standard defines OVERLAY() in terms of substring and concatenation. |
3139 | | * This code is a direct implementation of what the standard says. |
3140 | | */ |
3141 | | Datum |
3142 | | byteaoverlay(PG_FUNCTION_ARGS) |
3143 | 0 | { |
3144 | 0 | bytea *t1 = PG_GETARG_BYTEA_PP(0); |
3145 | 0 | bytea *t2 = PG_GETARG_BYTEA_PP(1); |
3146 | 0 | int sp = PG_GETARG_INT32(2); /* substring start position */ |
3147 | 0 | int sl = PG_GETARG_INT32(3); /* substring length */ |
3148 | |
|
3149 | 0 | PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl)); |
3150 | 0 | } |
3151 | | |
3152 | | Datum |
3153 | | byteaoverlay_no_len(PG_FUNCTION_ARGS) |
3154 | 0 | { |
3155 | 0 | bytea *t1 = PG_GETARG_BYTEA_PP(0); |
3156 | 0 | bytea *t2 = PG_GETARG_BYTEA_PP(1); |
3157 | 0 | int sp = PG_GETARG_INT32(2); /* substring start position */ |
3158 | 0 | int sl; |
3159 | |
|
3160 | 0 | sl = VARSIZE_ANY_EXHDR(t2); /* defaults to length(t2) */ |
3161 | 0 | PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl)); |
3162 | 0 | } |
3163 | | |
3164 | | static bytea * |
3165 | | bytea_overlay(bytea *t1, bytea *t2, int sp, int sl) |
3166 | 0 | { |
3167 | 0 | bytea *result; |
3168 | 0 | bytea *s1; |
3169 | 0 | bytea *s2; |
3170 | 0 | int sp_pl_sl; |
3171 | | |
3172 | | /* |
3173 | | * Check for possible integer-overflow cases. For negative sp, throw a |
3174 | | * "substring length" error because that's what should be expected |
3175 | | * according to the spec's definition of OVERLAY(). |
3176 | | */ |
3177 | 0 | if (sp <= 0) |
3178 | 0 | ereport(ERROR, |
3179 | 0 | (errcode(ERRCODE_SUBSTRING_ERROR), |
3180 | 0 | errmsg("negative substring length not allowed"))); |
3181 | 0 | if (pg_add_s32_overflow(sp, sl, &sp_pl_sl)) |
3182 | 0 | ereport(ERROR, |
3183 | 0 | (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), |
3184 | 0 | errmsg("integer out of range"))); |
3185 | | |
3186 | 0 | s1 = bytea_substring(PointerGetDatum(t1), 1, sp - 1, false); |
3187 | 0 | s2 = bytea_substring(PointerGetDatum(t1), sp_pl_sl, -1, true); |
3188 | 0 | result = bytea_catenate(s1, t2); |
3189 | 0 | result = bytea_catenate(result, s2); |
3190 | |
|
3191 | 0 | return result; |
3192 | 0 | } |
3193 | | |
3194 | | /* |
3195 | | * bit_count |
3196 | | */ |
3197 | | Datum |
3198 | | bytea_bit_count(PG_FUNCTION_ARGS) |
3199 | 0 | { |
3200 | 0 | bytea *t1 = PG_GETARG_BYTEA_PP(0); |
3201 | |
|
3202 | 0 | PG_RETURN_INT64(pg_popcount(VARDATA_ANY(t1), VARSIZE_ANY_EXHDR(t1))); |
3203 | 0 | } |
3204 | | |
3205 | | /* |
3206 | | * byteapos - |
3207 | | * Return the position of the specified substring. |
3208 | | * Implements the SQL POSITION() function. |
3209 | | * Cloned from textpos and modified as required. |
3210 | | */ |
3211 | | Datum |
3212 | | byteapos(PG_FUNCTION_ARGS) |
3213 | 0 | { |
3214 | 0 | bytea *t1 = PG_GETARG_BYTEA_PP(0); |
3215 | 0 | bytea *t2 = PG_GETARG_BYTEA_PP(1); |
3216 | 0 | int pos; |
3217 | 0 | int px, |
3218 | 0 | p; |
3219 | 0 | int len1, |
3220 | 0 | len2; |
3221 | 0 | char *p1, |
3222 | 0 | *p2; |
3223 | |
|
3224 | 0 | len1 = VARSIZE_ANY_EXHDR(t1); |
3225 | 0 | len2 = VARSIZE_ANY_EXHDR(t2); |
3226 | |
|
3227 | 0 | if (len2 <= 0) |
3228 | 0 | PG_RETURN_INT32(1); /* result for empty pattern */ |
3229 | | |
3230 | 0 | p1 = VARDATA_ANY(t1); |
3231 | 0 | p2 = VARDATA_ANY(t2); |
3232 | |
|
3233 | 0 | pos = 0; |
3234 | 0 | px = (len1 - len2); |
3235 | 0 | for (p = 0; p <= px; p++) |
3236 | 0 | { |
3237 | 0 | if ((*p2 == *p1) && (memcmp(p1, p2, len2) == 0)) |
3238 | 0 | { |
3239 | 0 | pos = p + 1; |
3240 | 0 | break; |
3241 | 0 | }; |
3242 | 0 | p1++; |
3243 | 0 | }; |
3244 | |
|
3245 | 0 | PG_RETURN_INT32(pos); |
3246 | 0 | } |
3247 | | |
3248 | | /*------------------------------------------------------------- |
3249 | | * byteaGetByte |
3250 | | * |
3251 | | * this routine treats "bytea" as an array of bytes. |
3252 | | * It returns the Nth byte (a number between 0 and 255). |
3253 | | *------------------------------------------------------------- |
3254 | | */ |
3255 | | Datum |
3256 | | byteaGetByte(PG_FUNCTION_ARGS) |
3257 | 0 | { |
3258 | 0 | bytea *v = PG_GETARG_BYTEA_PP(0); |
3259 | 0 | int32 n = PG_GETARG_INT32(1); |
3260 | 0 | int len; |
3261 | 0 | int byte; |
3262 | |
|
3263 | 0 | len = VARSIZE_ANY_EXHDR(v); |
3264 | |
|
3265 | 0 | if (n < 0 || n >= len) |
3266 | 0 | ereport(ERROR, |
3267 | 0 | (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), |
3268 | 0 | errmsg("index %d out of valid range, 0..%d", |
3269 | 0 | n, len - 1))); |
3270 | | |
3271 | 0 | byte = ((unsigned char *) VARDATA_ANY(v))[n]; |
3272 | |
|
3273 | 0 | PG_RETURN_INT32(byte); |
3274 | 0 | } |
3275 | | |
3276 | | /*------------------------------------------------------------- |
3277 | | * byteaGetBit |
3278 | | * |
3279 | | * This routine treats a "bytea" type like an array of bits. |
3280 | | * It returns the value of the Nth bit (0 or 1). |
3281 | | * |
3282 | | *------------------------------------------------------------- |
3283 | | */ |
3284 | | Datum |
3285 | | byteaGetBit(PG_FUNCTION_ARGS) |
3286 | 0 | { |
3287 | 0 | bytea *v = PG_GETARG_BYTEA_PP(0); |
3288 | 0 | int64 n = PG_GETARG_INT64(1); |
3289 | 0 | int byteNo, |
3290 | 0 | bitNo; |
3291 | 0 | int len; |
3292 | 0 | int byte; |
3293 | |
|
3294 | 0 | len = VARSIZE_ANY_EXHDR(v); |
3295 | |
|
3296 | 0 | if (n < 0 || n >= (int64) len * 8) |
3297 | 0 | ereport(ERROR, |
3298 | 0 | (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), |
3299 | 0 | errmsg("index %" PRId64 " out of valid range, 0..%" PRId64, |
3300 | 0 | n, (int64) len * 8 - 1))); |
3301 | | |
3302 | | /* n/8 is now known < len, so safe to cast to int */ |
3303 | 0 | byteNo = (int) (n / 8); |
3304 | 0 | bitNo = (int) (n % 8); |
3305 | |
|
3306 | 0 | byte = ((unsigned char *) VARDATA_ANY(v))[byteNo]; |
3307 | |
|
3308 | 0 | if (byte & (1 << bitNo)) |
3309 | 0 | PG_RETURN_INT32(1); |
3310 | 0 | else |
3311 | 0 | PG_RETURN_INT32(0); |
3312 | 0 | } |
3313 | | |
3314 | | /*------------------------------------------------------------- |
3315 | | * byteaSetByte |
3316 | | * |
3317 | | * Given an instance of type 'bytea' creates a new one with |
3318 | | * the Nth byte set to the given value. |
3319 | | * |
3320 | | *------------------------------------------------------------- |
3321 | | */ |
3322 | | Datum |
3323 | | byteaSetByte(PG_FUNCTION_ARGS) |
3324 | 0 | { |
3325 | 0 | bytea *res = PG_GETARG_BYTEA_P_COPY(0); |
3326 | 0 | int32 n = PG_GETARG_INT32(1); |
3327 | 0 | int32 newByte = PG_GETARG_INT32(2); |
3328 | 0 | int len; |
3329 | |
|
3330 | 0 | len = VARSIZE(res) - VARHDRSZ; |
3331 | |
|
3332 | 0 | if (n < 0 || n >= len) |
3333 | 0 | ereport(ERROR, |
3334 | 0 | (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), |
3335 | 0 | errmsg("index %d out of valid range, 0..%d", |
3336 | 0 | n, len - 1))); |
3337 | | |
3338 | | /* |
3339 | | * Now set the byte. |
3340 | | */ |
3341 | 0 | ((unsigned char *) VARDATA(res))[n] = newByte; |
3342 | |
|
3343 | 0 | PG_RETURN_BYTEA_P(res); |
3344 | 0 | } |
3345 | | |
3346 | | /*------------------------------------------------------------- |
3347 | | * byteaSetBit |
3348 | | * |
3349 | | * Given an instance of type 'bytea' creates a new one with |
3350 | | * the Nth bit set to the given value. |
3351 | | * |
3352 | | *------------------------------------------------------------- |
3353 | | */ |
3354 | | Datum |
3355 | | byteaSetBit(PG_FUNCTION_ARGS) |
3356 | 0 | { |
3357 | 0 | bytea *res = PG_GETARG_BYTEA_P_COPY(0); |
3358 | 0 | int64 n = PG_GETARG_INT64(1); |
3359 | 0 | int32 newBit = PG_GETARG_INT32(2); |
3360 | 0 | int len; |
3361 | 0 | int oldByte, |
3362 | 0 | newByte; |
3363 | 0 | int byteNo, |
3364 | 0 | bitNo; |
3365 | |
|
3366 | 0 | len = VARSIZE(res) - VARHDRSZ; |
3367 | |
|
3368 | 0 | if (n < 0 || n >= (int64) len * 8) |
3369 | 0 | ereport(ERROR, |
3370 | 0 | (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), |
3371 | 0 | errmsg("index %" PRId64 " out of valid range, 0..%" PRId64, |
3372 | 0 | n, (int64) len * 8 - 1))); |
3373 | | |
3374 | | /* n/8 is now known < len, so safe to cast to int */ |
3375 | 0 | byteNo = (int) (n / 8); |
3376 | 0 | bitNo = (int) (n % 8); |
3377 | | |
3378 | | /* |
3379 | | * sanity check! |
3380 | | */ |
3381 | 0 | if (newBit != 0 && newBit != 1) |
3382 | 0 | ereport(ERROR, |
3383 | 0 | (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
3384 | 0 | errmsg("new bit must be 0 or 1"))); |
3385 | | |
3386 | | /* |
3387 | | * Update the byte. |
3388 | | */ |
3389 | 0 | oldByte = ((unsigned char *) VARDATA(res))[byteNo]; |
3390 | |
|
3391 | 0 | if (newBit == 0) |
3392 | 0 | newByte = oldByte & (~(1 << bitNo)); |
3393 | 0 | else |
3394 | 0 | newByte = oldByte | (1 << bitNo); |
3395 | |
|
3396 | 0 | ((unsigned char *) VARDATA(res))[byteNo] = newByte; |
3397 | |
|
3398 | 0 | PG_RETURN_BYTEA_P(res); |
3399 | 0 | } |
3400 | | |
3401 | | /* |
3402 | | * Return reversed bytea |
3403 | | */ |
3404 | | Datum |
3405 | | bytea_reverse(PG_FUNCTION_ARGS) |
3406 | 0 | { |
3407 | 0 | bytea *v = PG_GETARG_BYTEA_PP(0); |
3408 | 0 | const char *p = VARDATA_ANY(v); |
3409 | 0 | int len = VARSIZE_ANY_EXHDR(v); |
3410 | 0 | const char *endp = p + len; |
3411 | 0 | bytea *result = palloc(len + VARHDRSZ); |
3412 | 0 | char *dst = (char *) VARDATA(result) + len; |
3413 | |
|
3414 | 0 | SET_VARSIZE(result, len + VARHDRSZ); |
3415 | |
|
3416 | 0 | while (p < endp) |
3417 | 0 | *(--dst) = *p++; |
3418 | |
|
3419 | 0 | PG_RETURN_BYTEA_P(result); |
3420 | 0 | } |
3421 | | |
3422 | | |
3423 | | /* text_name() |
3424 | | * Converts a text type to a Name type. |
3425 | | */ |
3426 | | Datum |
3427 | | text_name(PG_FUNCTION_ARGS) |
3428 | 0 | { |
3429 | 0 | text *s = PG_GETARG_TEXT_PP(0); |
3430 | 0 | Name result; |
3431 | 0 | int len; |
3432 | |
|
3433 | 0 | len = VARSIZE_ANY_EXHDR(s); |
3434 | | |
3435 | | /* Truncate oversize input */ |
3436 | 0 | if (len >= NAMEDATALEN) |
3437 | 0 | len = pg_mbcliplen(VARDATA_ANY(s), len, NAMEDATALEN - 1); |
3438 | | |
3439 | | /* We use palloc0 here to ensure result is zero-padded */ |
3440 | 0 | result = (Name) palloc0(NAMEDATALEN); |
3441 | 0 | memcpy(NameStr(*result), VARDATA_ANY(s), len); |
3442 | |
|
3443 | 0 | PG_RETURN_NAME(result); |
3444 | 0 | } |
3445 | | |
3446 | | /* name_text() |
3447 | | * Converts a Name type to a text type. |
3448 | | */ |
3449 | | Datum |
3450 | | name_text(PG_FUNCTION_ARGS) |
3451 | 0 | { |
3452 | 0 | Name s = PG_GETARG_NAME(0); |
3453 | |
|
3454 | 0 | PG_RETURN_TEXT_P(cstring_to_text(NameStr(*s))); |
3455 | 0 | } |
3456 | | |
3457 | | |
3458 | | /* |
3459 | | * textToQualifiedNameList - convert a text object to list of names |
3460 | | * |
3461 | | * This implements the input parsing needed by nextval() and other |
3462 | | * functions that take a text parameter representing a qualified name. |
3463 | | * We split the name at dots, downcase if not double-quoted, and |
3464 | | * truncate names if they're too long. |
3465 | | */ |
3466 | | List * |
3467 | | textToQualifiedNameList(text *textval) |
3468 | 0 | { |
3469 | 0 | char *rawname; |
3470 | 0 | List *result = NIL; |
3471 | 0 | List *namelist; |
3472 | 0 | ListCell *l; |
3473 | | |
3474 | | /* Convert to C string (handles possible detoasting). */ |
3475 | | /* Note we rely on being able to modify rawname below. */ |
3476 | 0 | rawname = text_to_cstring(textval); |
3477 | |
|
3478 | 0 | if (!SplitIdentifierString(rawname, '.', &namelist)) |
3479 | 0 | ereport(ERROR, |
3480 | 0 | (errcode(ERRCODE_INVALID_NAME), |
3481 | 0 | errmsg("invalid name syntax"))); |
3482 | | |
3483 | 0 | if (namelist == NIL) |
3484 | 0 | ereport(ERROR, |
3485 | 0 | (errcode(ERRCODE_INVALID_NAME), |
3486 | 0 | errmsg("invalid name syntax"))); |
3487 | | |
3488 | 0 | foreach(l, namelist) |
3489 | 0 | { |
3490 | 0 | char *curname = (char *) lfirst(l); |
3491 | |
|
3492 | 0 | result = lappend(result, makeString(pstrdup(curname))); |
3493 | 0 | } |
3494 | |
|
3495 | 0 | pfree(rawname); |
3496 | 0 | list_free(namelist); |
3497 | |
|
3498 | 0 | return result; |
3499 | 0 | } |
3500 | | |
3501 | | /* |
3502 | | * SplitIdentifierString --- parse a string containing identifiers |
3503 | | * |
3504 | | * This is the guts of textToQualifiedNameList, and is exported for use in |
3505 | | * other situations such as parsing GUC variables. In the GUC case, it's |
3506 | | * important to avoid memory leaks, so the API is designed to minimize the |
3507 | | * amount of stuff that needs to be allocated and freed. |
3508 | | * |
3509 | | * Inputs: |
3510 | | * rawstring: the input string; must be overwritable! On return, it's |
3511 | | * been modified to contain the separated identifiers. |
3512 | | * separator: the separator punctuation expected between identifiers |
3513 | | * (typically '.' or ','). Whitespace may also appear around |
3514 | | * identifiers. |
3515 | | * Outputs: |
3516 | | * namelist: filled with a palloc'd list of pointers to identifiers within |
3517 | | * rawstring. Caller should list_free() this even on error return. |
3518 | | * |
3519 | | * Returns true if okay, false if there is a syntax error in the string. |
3520 | | * |
3521 | | * Note that an empty string is considered okay here, though not in |
3522 | | * textToQualifiedNameList. |
3523 | | */ |
3524 | | bool |
3525 | | SplitIdentifierString(char *rawstring, char separator, |
3526 | | List **namelist) |
3527 | 0 | { |
3528 | 0 | char *nextp = rawstring; |
3529 | 0 | bool done = false; |
3530 | |
|
3531 | 0 | *namelist = NIL; |
3532 | |
|
3533 | 0 | while (scanner_isspace(*nextp)) |
3534 | 0 | nextp++; /* skip leading whitespace */ |
3535 | |
|
3536 | 0 | if (*nextp == '\0') |
3537 | 0 | return true; /* allow empty string */ |
3538 | | |
3539 | | /* At the top of the loop, we are at start of a new identifier. */ |
3540 | 0 | do |
3541 | 0 | { |
3542 | 0 | char *curname; |
3543 | 0 | char *endp; |
3544 | |
|
3545 | 0 | if (*nextp == '"') |
3546 | 0 | { |
3547 | | /* Quoted name --- collapse quote-quote pairs, no downcasing */ |
3548 | 0 | curname = nextp + 1; |
3549 | 0 | for (;;) |
3550 | 0 | { |
3551 | 0 | endp = strchr(nextp + 1, '"'); |
3552 | 0 | if (endp == NULL) |
3553 | 0 | return false; /* mismatched quotes */ |
3554 | 0 | if (endp[1] != '"') |
3555 | 0 | break; /* found end of quoted name */ |
3556 | | /* Collapse adjacent quotes into one quote, and look again */ |
3557 | 0 | memmove(endp, endp + 1, strlen(endp)); |
3558 | 0 | nextp = endp; |
3559 | 0 | } |
3560 | | /* endp now points at the terminating quote */ |
3561 | 0 | nextp = endp + 1; |
3562 | 0 | } |
3563 | 0 | else |
3564 | 0 | { |
3565 | | /* Unquoted name --- extends to separator or whitespace */ |
3566 | 0 | char *downname; |
3567 | 0 | int len; |
3568 | |
|
3569 | 0 | curname = nextp; |
3570 | 0 | while (*nextp && *nextp != separator && |
3571 | 0 | !scanner_isspace(*nextp)) |
3572 | 0 | nextp++; |
3573 | 0 | endp = nextp; |
3574 | 0 | if (curname == nextp) |
3575 | 0 | return false; /* empty unquoted name not allowed */ |
3576 | | |
3577 | | /* |
3578 | | * Downcase the identifier, using same code as main lexer does. |
3579 | | * |
3580 | | * XXX because we want to overwrite the input in-place, we cannot |
3581 | | * support a downcasing transformation that increases the string |
3582 | | * length. This is not a problem given the current implementation |
3583 | | * of downcase_truncate_identifier, but we'll probably have to do |
3584 | | * something about this someday. |
3585 | | */ |
3586 | 0 | len = endp - curname; |
3587 | 0 | downname = downcase_truncate_identifier(curname, len, false); |
3588 | 0 | Assert(strlen(downname) <= len); |
3589 | 0 | strncpy(curname, downname, len); /* strncpy is required here */ |
3590 | 0 | pfree(downname); |
3591 | 0 | } |
3592 | | |
3593 | 0 | while (scanner_isspace(*nextp)) |
3594 | 0 | nextp++; /* skip trailing whitespace */ |
3595 | |
|
3596 | 0 | if (*nextp == separator) |
3597 | 0 | { |
3598 | 0 | nextp++; |
3599 | 0 | while (scanner_isspace(*nextp)) |
3600 | 0 | nextp++; /* skip leading whitespace for next */ |
3601 | | /* we expect another name, so done remains false */ |
3602 | 0 | } |
3603 | 0 | else if (*nextp == '\0') |
3604 | 0 | done = true; |
3605 | 0 | else |
3606 | 0 | return false; /* invalid syntax */ |
3607 | | |
3608 | | /* Now safe to overwrite separator with a null */ |
3609 | 0 | *endp = '\0'; |
3610 | | |
3611 | | /* Truncate name if it's overlength */ |
3612 | 0 | truncate_identifier(curname, strlen(curname), false); |
3613 | | |
3614 | | /* |
3615 | | * Finished isolating current name --- add it to list |
3616 | | */ |
3617 | 0 | *namelist = lappend(*namelist, curname); |
3618 | | |
3619 | | /* Loop back if we didn't reach end of string */ |
3620 | 0 | } while (!done); |
3621 | | |
3622 | 0 | return true; |
3623 | 0 | } |
3624 | | |
3625 | | |
3626 | | /* |
3627 | | * SplitDirectoriesString --- parse a string containing file/directory names |
3628 | | * |
3629 | | * This works fine on file names too; the function name is historical. |
3630 | | * |
3631 | | * This is similar to SplitIdentifierString, except that the parsing |
3632 | | * rules are meant to handle pathnames instead of identifiers: there is |
3633 | | * no downcasing, embedded spaces are allowed, the max length is MAXPGPATH-1, |
3634 | | * and we apply canonicalize_path() to each extracted string. Because of the |
3635 | | * last, the returned strings are separately palloc'd rather than being |
3636 | | * pointers into rawstring --- but we still scribble on rawstring. |
3637 | | * |
3638 | | * Inputs: |
3639 | | * rawstring: the input string; must be modifiable! |
3640 | | * separator: the separator punctuation expected between directories |
3641 | | * (typically ',' or ';'). Whitespace may also appear around |
3642 | | * directories. |
3643 | | * Outputs: |
3644 | | * namelist: filled with a palloc'd list of directory names. |
3645 | | * Caller should list_free_deep() this even on error return. |
3646 | | * |
3647 | | * Returns true if okay, false if there is a syntax error in the string. |
3648 | | * |
3649 | | * Note that an empty string is considered okay here. |
3650 | | */ |
3651 | | bool |
3652 | | SplitDirectoriesString(char *rawstring, char separator, |
3653 | | List **namelist) |
3654 | 0 | { |
3655 | 0 | char *nextp = rawstring; |
3656 | 0 | bool done = false; |
3657 | |
|
3658 | 0 | *namelist = NIL; |
3659 | |
|
3660 | 0 | while (scanner_isspace(*nextp)) |
3661 | 0 | nextp++; /* skip leading whitespace */ |
3662 | |
|
3663 | 0 | if (*nextp == '\0') |
3664 | 0 | return true; /* allow empty string */ |
3665 | | |
3666 | | /* At the top of the loop, we are at start of a new directory. */ |
3667 | 0 | do |
3668 | 0 | { |
3669 | 0 | char *curname; |
3670 | 0 | char *endp; |
3671 | |
|
3672 | 0 | if (*nextp == '"') |
3673 | 0 | { |
3674 | | /* Quoted name --- collapse quote-quote pairs */ |
3675 | 0 | curname = nextp + 1; |
3676 | 0 | for (;;) |
3677 | 0 | { |
3678 | 0 | endp = strchr(nextp + 1, '"'); |
3679 | 0 | if (endp == NULL) |
3680 | 0 | return false; /* mismatched quotes */ |
3681 | 0 | if (endp[1] != '"') |
3682 | 0 | break; /* found end of quoted name */ |
3683 | | /* Collapse adjacent quotes into one quote, and look again */ |
3684 | 0 | memmove(endp, endp + 1, strlen(endp)); |
3685 | 0 | nextp = endp; |
3686 | 0 | } |
3687 | | /* endp now points at the terminating quote */ |
3688 | 0 | nextp = endp + 1; |
3689 | 0 | } |
3690 | 0 | else |
3691 | 0 | { |
3692 | | /* Unquoted name --- extends to separator or end of string */ |
3693 | 0 | curname = endp = nextp; |
3694 | 0 | while (*nextp && *nextp != separator) |
3695 | 0 | { |
3696 | | /* trailing whitespace should not be included in name */ |
3697 | 0 | if (!scanner_isspace(*nextp)) |
3698 | 0 | endp = nextp + 1; |
3699 | 0 | nextp++; |
3700 | 0 | } |
3701 | 0 | if (curname == endp) |
3702 | 0 | return false; /* empty unquoted name not allowed */ |
3703 | 0 | } |
3704 | | |
3705 | 0 | while (scanner_isspace(*nextp)) |
3706 | 0 | nextp++; /* skip trailing whitespace */ |
3707 | |
|
3708 | 0 | if (*nextp == separator) |
3709 | 0 | { |
3710 | 0 | nextp++; |
3711 | 0 | while (scanner_isspace(*nextp)) |
3712 | 0 | nextp++; /* skip leading whitespace for next */ |
3713 | | /* we expect another name, so done remains false */ |
3714 | 0 | } |
3715 | 0 | else if (*nextp == '\0') |
3716 | 0 | done = true; |
3717 | 0 | else |
3718 | 0 | return false; /* invalid syntax */ |
3719 | | |
3720 | | /* Now safe to overwrite separator with a null */ |
3721 | 0 | *endp = '\0'; |
3722 | | |
3723 | | /* Truncate path if it's overlength */ |
3724 | 0 | if (strlen(curname) >= MAXPGPATH) |
3725 | 0 | curname[MAXPGPATH - 1] = '\0'; |
3726 | | |
3727 | | /* |
3728 | | * Finished isolating current name --- add it to list |
3729 | | */ |
3730 | 0 | curname = pstrdup(curname); |
3731 | 0 | canonicalize_path(curname); |
3732 | 0 | *namelist = lappend(*namelist, curname); |
3733 | | |
3734 | | /* Loop back if we didn't reach end of string */ |
3735 | 0 | } while (!done); |
3736 | | |
3737 | 0 | return true; |
3738 | 0 | } |
3739 | | |
3740 | | |
3741 | | /* |
3742 | | * SplitGUCList --- parse a string containing identifiers or file names |
3743 | | * |
3744 | | * This is used to split the value of a GUC_LIST_QUOTE GUC variable, without |
3745 | | * presuming whether the elements will be taken as identifiers or file names. |
3746 | | * We assume the input has already been through flatten_set_variable_args(), |
3747 | | * so that we need never downcase (if appropriate, that was done already). |
3748 | | * Nor do we ever truncate, since we don't know the correct max length. |
3749 | | * We disallow embedded whitespace for simplicity (it shouldn't matter, |
3750 | | * because any embedded whitespace should have led to double-quoting). |
3751 | | * Otherwise the API is identical to SplitIdentifierString. |
3752 | | * |
3753 | | * XXX it's annoying to have so many copies of this string-splitting logic. |
3754 | | * However, it's not clear that having one function with a bunch of option |
3755 | | * flags would be much better. |
3756 | | * |
3757 | | * XXX there is a version of this function in src/bin/pg_dump/dumputils.c. |
3758 | | * Be sure to update that if you have to change this. |
3759 | | * |
3760 | | * Inputs: |
3761 | | * rawstring: the input string; must be overwritable! On return, it's |
3762 | | * been modified to contain the separated identifiers. |
3763 | | * separator: the separator punctuation expected between identifiers |
3764 | | * (typically '.' or ','). Whitespace may also appear around |
3765 | | * identifiers. |
3766 | | * Outputs: |
3767 | | * namelist: filled with a palloc'd list of pointers to identifiers within |
3768 | | * rawstring. Caller should list_free() this even on error return. |
3769 | | * |
3770 | | * Returns true if okay, false if there is a syntax error in the string. |
3771 | | */ |
3772 | | bool |
3773 | | SplitGUCList(char *rawstring, char separator, |
3774 | | List **namelist) |
3775 | 0 | { |
3776 | 0 | char *nextp = rawstring; |
3777 | 0 | bool done = false; |
3778 | |
|
3779 | 0 | *namelist = NIL; |
3780 | |
|
3781 | 0 | while (scanner_isspace(*nextp)) |
3782 | 0 | nextp++; /* skip leading whitespace */ |
3783 | |
|
3784 | 0 | if (*nextp == '\0') |
3785 | 0 | return true; /* allow empty string */ |
3786 | | |
3787 | | /* At the top of the loop, we are at start of a new identifier. */ |
3788 | 0 | do |
3789 | 0 | { |
3790 | 0 | char *curname; |
3791 | 0 | char *endp; |
3792 | |
|
3793 | 0 | if (*nextp == '"') |
3794 | 0 | { |
3795 | | /* Quoted name --- collapse quote-quote pairs */ |
3796 | 0 | curname = nextp + 1; |
3797 | 0 | for (;;) |
3798 | 0 | { |
3799 | 0 | endp = strchr(nextp + 1, '"'); |
3800 | 0 | if (endp == NULL) |
3801 | 0 | return false; /* mismatched quotes */ |
3802 | 0 | if (endp[1] != '"') |
3803 | 0 | break; /* found end of quoted name */ |
3804 | | /* Collapse adjacent quotes into one quote, and look again */ |
3805 | 0 | memmove(endp, endp + 1, strlen(endp)); |
3806 | 0 | nextp = endp; |
3807 | 0 | } |
3808 | | /* endp now points at the terminating quote */ |
3809 | 0 | nextp = endp + 1; |
3810 | 0 | } |
3811 | 0 | else |
3812 | 0 | { |
3813 | | /* Unquoted name --- extends to separator or whitespace */ |
3814 | 0 | curname = nextp; |
3815 | 0 | while (*nextp && *nextp != separator && |
3816 | 0 | !scanner_isspace(*nextp)) |
3817 | 0 | nextp++; |
3818 | 0 | endp = nextp; |
3819 | 0 | if (curname == nextp) |
3820 | 0 | return false; /* empty unquoted name not allowed */ |
3821 | 0 | } |
3822 | | |
3823 | 0 | while (scanner_isspace(*nextp)) |
3824 | 0 | nextp++; /* skip trailing whitespace */ |
3825 | |
|
3826 | 0 | if (*nextp == separator) |
3827 | 0 | { |
3828 | 0 | nextp++; |
3829 | 0 | while (scanner_isspace(*nextp)) |
3830 | 0 | nextp++; /* skip leading whitespace for next */ |
3831 | | /* we expect another name, so done remains false */ |
3832 | 0 | } |
3833 | 0 | else if (*nextp == '\0') |
3834 | 0 | done = true; |
3835 | 0 | else |
3836 | 0 | return false; /* invalid syntax */ |
3837 | | |
3838 | | /* Now safe to overwrite separator with a null */ |
3839 | 0 | *endp = '\0'; |
3840 | | |
3841 | | /* |
3842 | | * Finished isolating current name --- add it to list |
3843 | | */ |
3844 | 0 | *namelist = lappend(*namelist, curname); |
3845 | | |
3846 | | /* Loop back if we didn't reach end of string */ |
3847 | 0 | } while (!done); |
3848 | | |
3849 | 0 | return true; |
3850 | 0 | } |
3851 | | |
3852 | | |
3853 | | /***************************************************************************** |
3854 | | * Comparison Functions used for bytea |
3855 | | * |
3856 | | * Note: btree indexes need these routines not to leak memory; therefore, |
3857 | | * be careful to free working copies of toasted datums. Most places don't |
3858 | | * need to be so careful. |
3859 | | *****************************************************************************/ |
3860 | | |
3861 | | Datum |
3862 | | byteaeq(PG_FUNCTION_ARGS) |
3863 | 0 | { |
3864 | 0 | Datum arg1 = PG_GETARG_DATUM(0); |
3865 | 0 | Datum arg2 = PG_GETARG_DATUM(1); |
3866 | 0 | bool result; |
3867 | 0 | Size len1, |
3868 | 0 | len2; |
3869 | | |
3870 | | /* |
3871 | | * We can use a fast path for unequal lengths, which might save us from |
3872 | | * having to detoast one or both values. |
3873 | | */ |
3874 | 0 | len1 = toast_raw_datum_size(arg1); |
3875 | 0 | len2 = toast_raw_datum_size(arg2); |
3876 | 0 | if (len1 != len2) |
3877 | 0 | result = false; |
3878 | 0 | else |
3879 | 0 | { |
3880 | 0 | bytea *barg1 = DatumGetByteaPP(arg1); |
3881 | 0 | bytea *barg2 = DatumGetByteaPP(arg2); |
3882 | |
|
3883 | 0 | result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2), |
3884 | 0 | len1 - VARHDRSZ) == 0); |
3885 | |
|
3886 | 0 | PG_FREE_IF_COPY(barg1, 0); |
3887 | 0 | PG_FREE_IF_COPY(barg2, 1); |
3888 | 0 | } |
3889 | |
|
3890 | 0 | PG_RETURN_BOOL(result); |
3891 | 0 | } |
3892 | | |
3893 | | Datum |
3894 | | byteane(PG_FUNCTION_ARGS) |
3895 | 0 | { |
3896 | 0 | Datum arg1 = PG_GETARG_DATUM(0); |
3897 | 0 | Datum arg2 = PG_GETARG_DATUM(1); |
3898 | 0 | bool result; |
3899 | 0 | Size len1, |
3900 | 0 | len2; |
3901 | | |
3902 | | /* |
3903 | | * We can use a fast path for unequal lengths, which might save us from |
3904 | | * having to detoast one or both values. |
3905 | | */ |
3906 | 0 | len1 = toast_raw_datum_size(arg1); |
3907 | 0 | len2 = toast_raw_datum_size(arg2); |
3908 | 0 | if (len1 != len2) |
3909 | 0 | result = true; |
3910 | 0 | else |
3911 | 0 | { |
3912 | 0 | bytea *barg1 = DatumGetByteaPP(arg1); |
3913 | 0 | bytea *barg2 = DatumGetByteaPP(arg2); |
3914 | |
|
3915 | 0 | result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2), |
3916 | 0 | len1 - VARHDRSZ) != 0); |
3917 | |
|
3918 | 0 | PG_FREE_IF_COPY(barg1, 0); |
3919 | 0 | PG_FREE_IF_COPY(barg2, 1); |
3920 | 0 | } |
3921 | |
|
3922 | 0 | PG_RETURN_BOOL(result); |
3923 | 0 | } |
3924 | | |
3925 | | Datum |
3926 | | bytealt(PG_FUNCTION_ARGS) |
3927 | 0 | { |
3928 | 0 | bytea *arg1 = PG_GETARG_BYTEA_PP(0); |
3929 | 0 | bytea *arg2 = PG_GETARG_BYTEA_PP(1); |
3930 | 0 | int len1, |
3931 | 0 | len2; |
3932 | 0 | int cmp; |
3933 | |
|
3934 | 0 | len1 = VARSIZE_ANY_EXHDR(arg1); |
3935 | 0 | len2 = VARSIZE_ANY_EXHDR(arg2); |
3936 | |
|
3937 | 0 | cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); |
3938 | |
|
3939 | 0 | PG_FREE_IF_COPY(arg1, 0); |
3940 | 0 | PG_FREE_IF_COPY(arg2, 1); |
3941 | |
|
3942 | 0 | PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 < len2))); |
3943 | 0 | } |
3944 | | |
3945 | | Datum |
3946 | | byteale(PG_FUNCTION_ARGS) |
3947 | 0 | { |
3948 | 0 | bytea *arg1 = PG_GETARG_BYTEA_PP(0); |
3949 | 0 | bytea *arg2 = PG_GETARG_BYTEA_PP(1); |
3950 | 0 | int len1, |
3951 | 0 | len2; |
3952 | 0 | int cmp; |
3953 | |
|
3954 | 0 | len1 = VARSIZE_ANY_EXHDR(arg1); |
3955 | 0 | len2 = VARSIZE_ANY_EXHDR(arg2); |
3956 | |
|
3957 | 0 | cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); |
3958 | |
|
3959 | 0 | PG_FREE_IF_COPY(arg1, 0); |
3960 | 0 | PG_FREE_IF_COPY(arg2, 1); |
3961 | |
|
3962 | 0 | PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 <= len2))); |
3963 | 0 | } |
3964 | | |
3965 | | Datum |
3966 | | byteagt(PG_FUNCTION_ARGS) |
3967 | 0 | { |
3968 | 0 | bytea *arg1 = PG_GETARG_BYTEA_PP(0); |
3969 | 0 | bytea *arg2 = PG_GETARG_BYTEA_PP(1); |
3970 | 0 | int len1, |
3971 | 0 | len2; |
3972 | 0 | int cmp; |
3973 | |
|
3974 | 0 | len1 = VARSIZE_ANY_EXHDR(arg1); |
3975 | 0 | len2 = VARSIZE_ANY_EXHDR(arg2); |
3976 | |
|
3977 | 0 | cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); |
3978 | |
|
3979 | 0 | PG_FREE_IF_COPY(arg1, 0); |
3980 | 0 | PG_FREE_IF_COPY(arg2, 1); |
3981 | |
|
3982 | 0 | PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 > len2))); |
3983 | 0 | } |
3984 | | |
3985 | | Datum |
3986 | | byteage(PG_FUNCTION_ARGS) |
3987 | 0 | { |
3988 | 0 | bytea *arg1 = PG_GETARG_BYTEA_PP(0); |
3989 | 0 | bytea *arg2 = PG_GETARG_BYTEA_PP(1); |
3990 | 0 | int len1, |
3991 | 0 | len2; |
3992 | 0 | int cmp; |
3993 | |
|
3994 | 0 | len1 = VARSIZE_ANY_EXHDR(arg1); |
3995 | 0 | len2 = VARSIZE_ANY_EXHDR(arg2); |
3996 | |
|
3997 | 0 | cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); |
3998 | |
|
3999 | 0 | PG_FREE_IF_COPY(arg1, 0); |
4000 | 0 | PG_FREE_IF_COPY(arg2, 1); |
4001 | |
|
4002 | 0 | PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 >= len2))); |
4003 | 0 | } |
4004 | | |
4005 | | Datum |
4006 | | byteacmp(PG_FUNCTION_ARGS) |
4007 | 0 | { |
4008 | 0 | bytea *arg1 = PG_GETARG_BYTEA_PP(0); |
4009 | 0 | bytea *arg2 = PG_GETARG_BYTEA_PP(1); |
4010 | 0 | int len1, |
4011 | 0 | len2; |
4012 | 0 | int cmp; |
4013 | |
|
4014 | 0 | len1 = VARSIZE_ANY_EXHDR(arg1); |
4015 | 0 | len2 = VARSIZE_ANY_EXHDR(arg2); |
4016 | |
|
4017 | 0 | cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); |
4018 | 0 | if ((cmp == 0) && (len1 != len2)) |
4019 | 0 | cmp = (len1 < len2) ? -1 : 1; |
4020 | |
|
4021 | 0 | PG_FREE_IF_COPY(arg1, 0); |
4022 | 0 | PG_FREE_IF_COPY(arg2, 1); |
4023 | |
|
4024 | 0 | PG_RETURN_INT32(cmp); |
4025 | 0 | } |
4026 | | |
4027 | | Datum |
4028 | | bytea_larger(PG_FUNCTION_ARGS) |
4029 | 0 | { |
4030 | 0 | bytea *arg1 = PG_GETARG_BYTEA_PP(0); |
4031 | 0 | bytea *arg2 = PG_GETARG_BYTEA_PP(1); |
4032 | 0 | bytea *result; |
4033 | 0 | int len1, |
4034 | 0 | len2; |
4035 | 0 | int cmp; |
4036 | |
|
4037 | 0 | len1 = VARSIZE_ANY_EXHDR(arg1); |
4038 | 0 | len2 = VARSIZE_ANY_EXHDR(arg2); |
4039 | |
|
4040 | 0 | cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); |
4041 | 0 | result = ((cmp > 0) || ((cmp == 0) && (len1 > len2)) ? arg1 : arg2); |
4042 | |
|
4043 | 0 | PG_RETURN_BYTEA_P(result); |
4044 | 0 | } |
4045 | | |
4046 | | Datum |
4047 | | bytea_smaller(PG_FUNCTION_ARGS) |
4048 | 0 | { |
4049 | 0 | bytea *arg1 = PG_GETARG_BYTEA_PP(0); |
4050 | 0 | bytea *arg2 = PG_GETARG_BYTEA_PP(1); |
4051 | 0 | bytea *result; |
4052 | 0 | int len1, |
4053 | 0 | len2; |
4054 | 0 | int cmp; |
4055 | |
|
4056 | 0 | len1 = VARSIZE_ANY_EXHDR(arg1); |
4057 | 0 | len2 = VARSIZE_ANY_EXHDR(arg2); |
4058 | |
|
4059 | 0 | cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); |
4060 | 0 | result = ((cmp < 0) || ((cmp == 0) && (len1 < len2)) ? arg1 : arg2); |
4061 | |
|
4062 | 0 | PG_RETURN_BYTEA_P(result); |
4063 | 0 | } |
4064 | | |
4065 | | Datum |
4066 | | bytea_sortsupport(PG_FUNCTION_ARGS) |
4067 | 0 | { |
4068 | 0 | SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0); |
4069 | 0 | MemoryContext oldcontext; |
4070 | |
|
4071 | 0 | oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt); |
4072 | | |
4073 | | /* Use generic string SortSupport, forcing "C" collation */ |
4074 | 0 | varstr_sortsupport(ssup, BYTEAOID, C_COLLATION_OID); |
4075 | |
|
4076 | 0 | MemoryContextSwitchTo(oldcontext); |
4077 | |
|
4078 | 0 | PG_RETURN_VOID(); |
4079 | 0 | } |
4080 | | |
4081 | | /* Cast bytea -> int2 */ |
4082 | | Datum |
4083 | | bytea_int2(PG_FUNCTION_ARGS) |
4084 | 0 | { |
4085 | 0 | bytea *v = PG_GETARG_BYTEA_PP(0); |
4086 | 0 | int len = VARSIZE_ANY_EXHDR(v); |
4087 | 0 | uint16 result; |
4088 | | |
4089 | | /* Check that the byte array is not too long */ |
4090 | 0 | if (len > sizeof(result)) |
4091 | 0 | ereport(ERROR, |
4092 | 0 | errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), |
4093 | 0 | errmsg("smallint out of range")); |
4094 | | |
4095 | | /* Convert it to an integer; most significant bytes come first */ |
4096 | 0 | result = 0; |
4097 | 0 | for (int i = 0; i < len; i++) |
4098 | 0 | { |
4099 | 0 | result <<= BITS_PER_BYTE; |
4100 | 0 | result |= ((unsigned char *) VARDATA_ANY(v))[i]; |
4101 | 0 | } |
4102 | |
|
4103 | 0 | PG_RETURN_INT16(result); |
4104 | 0 | } |
4105 | | |
4106 | | /* Cast bytea -> int4 */ |
4107 | | Datum |
4108 | | bytea_int4(PG_FUNCTION_ARGS) |
4109 | 0 | { |
4110 | 0 | bytea *v = PG_GETARG_BYTEA_PP(0); |
4111 | 0 | int len = VARSIZE_ANY_EXHDR(v); |
4112 | 0 | uint32 result; |
4113 | | |
4114 | | /* Check that the byte array is not too long */ |
4115 | 0 | if (len > sizeof(result)) |
4116 | 0 | ereport(ERROR, |
4117 | 0 | errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), |
4118 | 0 | errmsg("integer out of range")); |
4119 | | |
4120 | | /* Convert it to an integer; most significant bytes come first */ |
4121 | 0 | result = 0; |
4122 | 0 | for (int i = 0; i < len; i++) |
4123 | 0 | { |
4124 | 0 | result <<= BITS_PER_BYTE; |
4125 | 0 | result |= ((unsigned char *) VARDATA_ANY(v))[i]; |
4126 | 0 | } |
4127 | |
|
4128 | 0 | PG_RETURN_INT32(result); |
4129 | 0 | } |
4130 | | |
4131 | | /* Cast bytea -> int8 */ |
4132 | | Datum |
4133 | | bytea_int8(PG_FUNCTION_ARGS) |
4134 | 0 | { |
4135 | 0 | bytea *v = PG_GETARG_BYTEA_PP(0); |
4136 | 0 | int len = VARSIZE_ANY_EXHDR(v); |
4137 | 0 | uint64 result; |
4138 | | |
4139 | | /* Check that the byte array is not too long */ |
4140 | 0 | if (len > sizeof(result)) |
4141 | 0 | ereport(ERROR, |
4142 | 0 | errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), |
4143 | 0 | errmsg("bigint out of range")); |
4144 | | |
4145 | | /* Convert it to an integer; most significant bytes come first */ |
4146 | 0 | result = 0; |
4147 | 0 | for (int i = 0; i < len; i++) |
4148 | 0 | { |
4149 | 0 | result <<= BITS_PER_BYTE; |
4150 | 0 | result |= ((unsigned char *) VARDATA_ANY(v))[i]; |
4151 | 0 | } |
4152 | |
|
4153 | 0 | PG_RETURN_INT64(result); |
4154 | 0 | } |
4155 | | |
4156 | | /* Cast int2 -> bytea; can just use int2send() */ |
4157 | | Datum |
4158 | | int2_bytea(PG_FUNCTION_ARGS) |
4159 | 0 | { |
4160 | 0 | return int2send(fcinfo); |
4161 | 0 | } |
4162 | | |
4163 | | /* Cast int4 -> bytea; can just use int4send() */ |
4164 | | Datum |
4165 | | int4_bytea(PG_FUNCTION_ARGS) |
4166 | 0 | { |
4167 | 0 | return int4send(fcinfo); |
4168 | 0 | } |
4169 | | |
4170 | | /* Cast int8 -> bytea; can just use int8send() */ |
4171 | | Datum |
4172 | | int8_bytea(PG_FUNCTION_ARGS) |
4173 | 0 | { |
4174 | 0 | return int8send(fcinfo); |
4175 | 0 | } |
4176 | | |
4177 | | /* |
4178 | | * appendStringInfoText |
4179 | | * |
4180 | | * Append a text to str. |
4181 | | * Like appendStringInfoString(str, text_to_cstring(t)) but faster. |
4182 | | */ |
4183 | | static void |
4184 | | appendStringInfoText(StringInfo str, const text *t) |
4185 | 0 | { |
4186 | 0 | appendBinaryStringInfo(str, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t)); |
4187 | 0 | } |
4188 | | |
4189 | | /* |
4190 | | * replace_text |
4191 | | * replace all occurrences of 'old_sub_str' in 'orig_str' |
4192 | | * with 'new_sub_str' to form 'new_str' |
4193 | | * |
4194 | | * returns 'orig_str' if 'old_sub_str' == '' or 'orig_str' == '' |
4195 | | * otherwise returns 'new_str' |
4196 | | */ |
4197 | | Datum |
4198 | | replace_text(PG_FUNCTION_ARGS) |
4199 | 0 | { |
4200 | 0 | text *src_text = PG_GETARG_TEXT_PP(0); |
4201 | 0 | text *from_sub_text = PG_GETARG_TEXT_PP(1); |
4202 | 0 | text *to_sub_text = PG_GETARG_TEXT_PP(2); |
4203 | 0 | int src_text_len; |
4204 | 0 | int from_sub_text_len; |
4205 | 0 | TextPositionState state; |
4206 | 0 | text *ret_text; |
4207 | 0 | int chunk_len; |
4208 | 0 | char *curr_ptr; |
4209 | 0 | char *start_ptr; |
4210 | 0 | StringInfoData str; |
4211 | 0 | bool found; |
4212 | |
|
4213 | 0 | src_text_len = VARSIZE_ANY_EXHDR(src_text); |
4214 | 0 | from_sub_text_len = VARSIZE_ANY_EXHDR(from_sub_text); |
4215 | | |
4216 | | /* Return unmodified source string if empty source or pattern */ |
4217 | 0 | if (src_text_len < 1 || from_sub_text_len < 1) |
4218 | 0 | { |
4219 | 0 | PG_RETURN_TEXT_P(src_text); |
4220 | 0 | } |
4221 | | |
4222 | 0 | text_position_setup(src_text, from_sub_text, PG_GET_COLLATION(), &state); |
4223 | |
|
4224 | 0 | found = text_position_next(&state); |
4225 | | |
4226 | | /* When the from_sub_text is not found, there is nothing to do. */ |
4227 | 0 | if (!found) |
4228 | 0 | { |
4229 | 0 | text_position_cleanup(&state); |
4230 | 0 | PG_RETURN_TEXT_P(src_text); |
4231 | 0 | } |
4232 | 0 | curr_ptr = text_position_get_match_ptr(&state); |
4233 | 0 | start_ptr = VARDATA_ANY(src_text); |
4234 | |
|
4235 | 0 | initStringInfo(&str); |
4236 | |
|
4237 | 0 | do |
4238 | 0 | { |
4239 | 0 | CHECK_FOR_INTERRUPTS(); |
4240 | | |
4241 | | /* copy the data skipped over by last text_position_next() */ |
4242 | 0 | chunk_len = curr_ptr - start_ptr; |
4243 | 0 | appendBinaryStringInfo(&str, start_ptr, chunk_len); |
4244 | |
|
4245 | 0 | appendStringInfoText(&str, to_sub_text); |
4246 | |
|
4247 | 0 | start_ptr = curr_ptr + state.last_match_len; |
4248 | |
|
4249 | 0 | found = text_position_next(&state); |
4250 | 0 | if (found) |
4251 | 0 | curr_ptr = text_position_get_match_ptr(&state); |
4252 | 0 | } |
4253 | 0 | while (found); |
4254 | | |
4255 | | /* copy trailing data */ |
4256 | 0 | chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr; |
4257 | 0 | appendBinaryStringInfo(&str, start_ptr, chunk_len); |
4258 | |
|
4259 | 0 | text_position_cleanup(&state); |
4260 | |
|
4261 | 0 | ret_text = cstring_to_text_with_len(str.data, str.len); |
4262 | 0 | pfree(str.data); |
4263 | |
|
4264 | 0 | PG_RETURN_TEXT_P(ret_text); |
4265 | 0 | } |
4266 | | |
4267 | | /* |
4268 | | * check_replace_text_has_escape |
4269 | | * |
4270 | | * Returns 0 if text contains no backslashes that need processing. |
4271 | | * Returns 1 if text contains backslashes, but not regexp submatch specifiers. |
4272 | | * Returns 2 if text contains regexp submatch specifiers (\1 .. \9). |
4273 | | */ |
4274 | | static int |
4275 | | check_replace_text_has_escape(const text *replace_text) |
4276 | 0 | { |
4277 | 0 | int result = 0; |
4278 | 0 | const char *p = VARDATA_ANY(replace_text); |
4279 | 0 | const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text); |
4280 | |
|
4281 | 0 | while (p < p_end) |
4282 | 0 | { |
4283 | | /* Find next escape char, if any. */ |
4284 | 0 | p = memchr(p, '\\', p_end - p); |
4285 | 0 | if (p == NULL) |
4286 | 0 | break; |
4287 | 0 | p++; |
4288 | | /* Note: a backslash at the end doesn't require extra processing. */ |
4289 | 0 | if (p < p_end) |
4290 | 0 | { |
4291 | 0 | if (*p >= '1' && *p <= '9') |
4292 | 0 | return 2; /* Found a submatch specifier, so done */ |
4293 | 0 | result = 1; /* Found some other sequence, keep looking */ |
4294 | 0 | p++; |
4295 | 0 | } |
4296 | 0 | } |
4297 | 0 | return result; |
4298 | 0 | } |
4299 | | |
4300 | | /* |
4301 | | * appendStringInfoRegexpSubstr |
4302 | | * |
4303 | | * Append replace_text to str, substituting regexp back references for |
4304 | | * \n escapes. start_ptr is the start of the match in the source string, |
4305 | | * at logical character position data_pos. |
4306 | | */ |
4307 | | static void |
4308 | | appendStringInfoRegexpSubstr(StringInfo str, text *replace_text, |
4309 | | regmatch_t *pmatch, |
4310 | | char *start_ptr, int data_pos) |
4311 | 0 | { |
4312 | 0 | const char *p = VARDATA_ANY(replace_text); |
4313 | 0 | const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text); |
4314 | |
|
4315 | 0 | while (p < p_end) |
4316 | 0 | { |
4317 | 0 | const char *chunk_start = p; |
4318 | 0 | int so; |
4319 | 0 | int eo; |
4320 | | |
4321 | | /* Find next escape char, if any. */ |
4322 | 0 | p = memchr(p, '\\', p_end - p); |
4323 | 0 | if (p == NULL) |
4324 | 0 | p = p_end; |
4325 | | |
4326 | | /* Copy the text we just scanned over, if any. */ |
4327 | 0 | if (p > chunk_start) |
4328 | 0 | appendBinaryStringInfo(str, chunk_start, p - chunk_start); |
4329 | | |
4330 | | /* Done if at end of string, else advance over escape char. */ |
4331 | 0 | if (p >= p_end) |
4332 | 0 | break; |
4333 | 0 | p++; |
4334 | |
|
4335 | 0 | if (p >= p_end) |
4336 | 0 | { |
4337 | | /* Escape at very end of input. Treat same as unexpected char */ |
4338 | 0 | appendStringInfoChar(str, '\\'); |
4339 | 0 | break; |
4340 | 0 | } |
4341 | | |
4342 | 0 | if (*p >= '1' && *p <= '9') |
4343 | 0 | { |
4344 | | /* Use the back reference of regexp. */ |
4345 | 0 | int idx = *p - '0'; |
4346 | |
|
4347 | 0 | so = pmatch[idx].rm_so; |
4348 | 0 | eo = pmatch[idx].rm_eo; |
4349 | 0 | p++; |
4350 | 0 | } |
4351 | 0 | else if (*p == '&') |
4352 | 0 | { |
4353 | | /* Use the entire matched string. */ |
4354 | 0 | so = pmatch[0].rm_so; |
4355 | 0 | eo = pmatch[0].rm_eo; |
4356 | 0 | p++; |
4357 | 0 | } |
4358 | 0 | else if (*p == '\\') |
4359 | 0 | { |
4360 | | /* \\ means transfer one \ to output. */ |
4361 | 0 | appendStringInfoChar(str, '\\'); |
4362 | 0 | p++; |
4363 | 0 | continue; |
4364 | 0 | } |
4365 | 0 | else |
4366 | 0 | { |
4367 | | /* |
4368 | | * If escape char is not followed by any expected char, just treat |
4369 | | * it as ordinary data to copy. (XXX would it be better to throw |
4370 | | * an error?) |
4371 | | */ |
4372 | 0 | appendStringInfoChar(str, '\\'); |
4373 | 0 | continue; |
4374 | 0 | } |
4375 | | |
4376 | 0 | if (so >= 0 && eo >= 0) |
4377 | 0 | { |
4378 | | /* |
4379 | | * Copy the text that is back reference of regexp. Note so and eo |
4380 | | * are counted in characters not bytes. |
4381 | | */ |
4382 | 0 | char *chunk_start; |
4383 | 0 | int chunk_len; |
4384 | |
|
4385 | 0 | Assert(so >= data_pos); |
4386 | 0 | chunk_start = start_ptr; |
4387 | 0 | chunk_start += charlen_to_bytelen(chunk_start, so - data_pos); |
4388 | 0 | chunk_len = charlen_to_bytelen(chunk_start, eo - so); |
4389 | 0 | appendBinaryStringInfo(str, chunk_start, chunk_len); |
4390 | 0 | } |
4391 | 0 | } |
4392 | 0 | } |
4393 | | |
4394 | | /* |
4395 | | * replace_text_regexp |
4396 | | * |
4397 | | * replace substring(s) in src_text that match pattern with replace_text. |
4398 | | * The replace_text can contain backslash markers to substitute |
4399 | | * (parts of) the matched text. |
4400 | | * |
4401 | | * cflags: regexp compile flags. |
4402 | | * collation: collation to use. |
4403 | | * search_start: the character (not byte) offset in src_text at which to |
4404 | | * begin searching. |
4405 | | * n: if 0, replace all matches; if > 0, replace only the N'th match. |
4406 | | */ |
4407 | | text * |
4408 | | replace_text_regexp(text *src_text, text *pattern_text, |
4409 | | text *replace_text, |
4410 | | int cflags, Oid collation, |
4411 | | int search_start, int n) |
4412 | 0 | { |
4413 | 0 | text *ret_text; |
4414 | 0 | regex_t *re; |
4415 | 0 | int src_text_len = VARSIZE_ANY_EXHDR(src_text); |
4416 | 0 | int nmatches = 0; |
4417 | 0 | StringInfoData buf; |
4418 | 0 | regmatch_t pmatch[10]; /* main match, plus \1 to \9 */ |
4419 | 0 | int nmatch = lengthof(pmatch); |
4420 | 0 | pg_wchar *data; |
4421 | 0 | size_t data_len; |
4422 | 0 | int data_pos; |
4423 | 0 | char *start_ptr; |
4424 | 0 | int escape_status; |
4425 | |
|
4426 | 0 | initStringInfo(&buf); |
4427 | | |
4428 | | /* Convert data string to wide characters. */ |
4429 | 0 | data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar)); |
4430 | 0 | data_len = pg_mb2wchar_with_len(VARDATA_ANY(src_text), data, src_text_len); |
4431 | | |
4432 | | /* Check whether replace_text has escapes, especially regexp submatches. */ |
4433 | 0 | escape_status = check_replace_text_has_escape(replace_text); |
4434 | | |
4435 | | /* If no regexp submatches, we can use REG_NOSUB. */ |
4436 | 0 | if (escape_status < 2) |
4437 | 0 | { |
4438 | 0 | cflags |= REG_NOSUB; |
4439 | | /* Also tell pg_regexec we only want the whole-match location. */ |
4440 | 0 | nmatch = 1; |
4441 | 0 | } |
4442 | | |
4443 | | /* Prepare the regexp. */ |
4444 | 0 | re = RE_compile_and_cache(pattern_text, cflags, collation); |
4445 | | |
4446 | | /* start_ptr points to the data_pos'th character of src_text */ |
4447 | 0 | start_ptr = (char *) VARDATA_ANY(src_text); |
4448 | 0 | data_pos = 0; |
4449 | |
|
4450 | 0 | while (search_start <= data_len) |
4451 | 0 | { |
4452 | 0 | int regexec_result; |
4453 | |
|
4454 | 0 | CHECK_FOR_INTERRUPTS(); |
4455 | |
|
4456 | 0 | regexec_result = pg_regexec(re, |
4457 | 0 | data, |
4458 | 0 | data_len, |
4459 | 0 | search_start, |
4460 | 0 | NULL, /* no details */ |
4461 | 0 | nmatch, |
4462 | 0 | pmatch, |
4463 | 0 | 0); |
4464 | |
|
4465 | 0 | if (regexec_result == REG_NOMATCH) |
4466 | 0 | break; |
4467 | | |
4468 | 0 | if (regexec_result != REG_OKAY) |
4469 | 0 | { |
4470 | 0 | char errMsg[100]; |
4471 | |
|
4472 | 0 | pg_regerror(regexec_result, re, errMsg, sizeof(errMsg)); |
4473 | 0 | ereport(ERROR, |
4474 | 0 | (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION), |
4475 | 0 | errmsg("regular expression failed: %s", errMsg))); |
4476 | 0 | } |
4477 | | |
4478 | | /* |
4479 | | * Count matches, and decide whether to replace this match. |
4480 | | */ |
4481 | 0 | nmatches++; |
4482 | 0 | if (n > 0 && nmatches != n) |
4483 | 0 | { |
4484 | | /* |
4485 | | * No, so advance search_start, but not start_ptr/data_pos. (Thus, |
4486 | | * we treat the matched text as if it weren't matched, and copy it |
4487 | | * to the output later.) |
4488 | | */ |
4489 | 0 | search_start = pmatch[0].rm_eo; |
4490 | 0 | if (pmatch[0].rm_so == pmatch[0].rm_eo) |
4491 | 0 | search_start++; |
4492 | 0 | continue; |
4493 | 0 | } |
4494 | | |
4495 | | /* |
4496 | | * Copy the text to the left of the match position. Note we are given |
4497 | | * character not byte indexes. |
4498 | | */ |
4499 | 0 | if (pmatch[0].rm_so - data_pos > 0) |
4500 | 0 | { |
4501 | 0 | int chunk_len; |
4502 | |
|
4503 | 0 | chunk_len = charlen_to_bytelen(start_ptr, |
4504 | 0 | pmatch[0].rm_so - data_pos); |
4505 | 0 | appendBinaryStringInfo(&buf, start_ptr, chunk_len); |
4506 | | |
4507 | | /* |
4508 | | * Advance start_ptr over that text, to avoid multiple rescans of |
4509 | | * it if the replace_text contains multiple back-references. |
4510 | | */ |
4511 | 0 | start_ptr += chunk_len; |
4512 | 0 | data_pos = pmatch[0].rm_so; |
4513 | 0 | } |
4514 | | |
4515 | | /* |
4516 | | * Copy the replace_text, processing escapes if any are present. |
4517 | | */ |
4518 | 0 | if (escape_status > 0) |
4519 | 0 | appendStringInfoRegexpSubstr(&buf, replace_text, pmatch, |
4520 | 0 | start_ptr, data_pos); |
4521 | 0 | else |
4522 | 0 | appendStringInfoText(&buf, replace_text); |
4523 | | |
4524 | | /* Advance start_ptr and data_pos over the matched text. */ |
4525 | 0 | start_ptr += charlen_to_bytelen(start_ptr, |
4526 | 0 | pmatch[0].rm_eo - data_pos); |
4527 | 0 | data_pos = pmatch[0].rm_eo; |
4528 | | |
4529 | | /* |
4530 | | * If we only want to replace one occurrence, we're done. |
4531 | | */ |
4532 | 0 | if (n > 0) |
4533 | 0 | break; |
4534 | | |
4535 | | /* |
4536 | | * Advance search position. Normally we start the next search at the |
4537 | | * end of the previous match; but if the match was of zero length, we |
4538 | | * have to advance by one character, or we'd just find the same match |
4539 | | * again. |
4540 | | */ |
4541 | 0 | search_start = data_pos; |
4542 | 0 | if (pmatch[0].rm_so == pmatch[0].rm_eo) |
4543 | 0 | search_start++; |
4544 | 0 | } |
4545 | | |
4546 | | /* |
4547 | | * Copy the text to the right of the last match. |
4548 | | */ |
4549 | 0 | if (data_pos < data_len) |
4550 | 0 | { |
4551 | 0 | int chunk_len; |
4552 | |
|
4553 | 0 | chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr; |
4554 | 0 | appendBinaryStringInfo(&buf, start_ptr, chunk_len); |
4555 | 0 | } |
4556 | |
|
4557 | 0 | ret_text = cstring_to_text_with_len(buf.data, buf.len); |
4558 | 0 | pfree(buf.data); |
4559 | 0 | pfree(data); |
4560 | |
|
4561 | 0 | return ret_text; |
4562 | 0 | } |
4563 | | |
4564 | | /* |
4565 | | * split_part |
4566 | | * parse input string based on provided field separator |
4567 | | * return N'th item (1 based, negative counts from end) |
4568 | | */ |
4569 | | Datum |
4570 | | split_part(PG_FUNCTION_ARGS) |
4571 | 0 | { |
4572 | 0 | text *inputstring = PG_GETARG_TEXT_PP(0); |
4573 | 0 | text *fldsep = PG_GETARG_TEXT_PP(1); |
4574 | 0 | int fldnum = PG_GETARG_INT32(2); |
4575 | 0 | int inputstring_len; |
4576 | 0 | int fldsep_len; |
4577 | 0 | TextPositionState state; |
4578 | 0 | char *start_ptr; |
4579 | 0 | char *end_ptr; |
4580 | 0 | text *result_text; |
4581 | 0 | bool found; |
4582 | | |
4583 | | /* field number is 1 based */ |
4584 | 0 | if (fldnum == 0) |
4585 | 0 | ereport(ERROR, |
4586 | 0 | (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
4587 | 0 | errmsg("field position must not be zero"))); |
4588 | | |
4589 | 0 | inputstring_len = VARSIZE_ANY_EXHDR(inputstring); |
4590 | 0 | fldsep_len = VARSIZE_ANY_EXHDR(fldsep); |
4591 | | |
4592 | | /* return empty string for empty input string */ |
4593 | 0 | if (inputstring_len < 1) |
4594 | 0 | PG_RETURN_TEXT_P(cstring_to_text("")); |
4595 | | |
4596 | | /* handle empty field separator */ |
4597 | 0 | if (fldsep_len < 1) |
4598 | 0 | { |
4599 | | /* if first or last field, return input string, else empty string */ |
4600 | 0 | if (fldnum == 1 || fldnum == -1) |
4601 | 0 | PG_RETURN_TEXT_P(inputstring); |
4602 | 0 | else |
4603 | 0 | PG_RETURN_TEXT_P(cstring_to_text("")); |
4604 | 0 | } |
4605 | | |
4606 | | /* find the first field separator */ |
4607 | 0 | text_position_setup(inputstring, fldsep, PG_GET_COLLATION(), &state); |
4608 | |
|
4609 | 0 | found = text_position_next(&state); |
4610 | | |
4611 | | /* special case if fldsep not found at all */ |
4612 | 0 | if (!found) |
4613 | 0 | { |
4614 | 0 | text_position_cleanup(&state); |
4615 | | /* if first or last field, return input string, else empty string */ |
4616 | 0 | if (fldnum == 1 || fldnum == -1) |
4617 | 0 | PG_RETURN_TEXT_P(inputstring); |
4618 | 0 | else |
4619 | 0 | PG_RETURN_TEXT_P(cstring_to_text("")); |
4620 | 0 | } |
4621 | | |
4622 | | /* |
4623 | | * take care of a negative field number (i.e. count from the right) by |
4624 | | * converting to a positive field number; we need total number of fields |
4625 | | */ |
4626 | 0 | if (fldnum < 0) |
4627 | 0 | { |
4628 | | /* we found a fldsep, so there are at least two fields */ |
4629 | 0 | int numfields = 2; |
4630 | |
|
4631 | 0 | while (text_position_next(&state)) |
4632 | 0 | numfields++; |
4633 | | |
4634 | | /* special case of last field does not require an extra pass */ |
4635 | 0 | if (fldnum == -1) |
4636 | 0 | { |
4637 | 0 | start_ptr = text_position_get_match_ptr(&state) + state.last_match_len; |
4638 | 0 | end_ptr = VARDATA_ANY(inputstring) + inputstring_len; |
4639 | 0 | text_position_cleanup(&state); |
4640 | 0 | PG_RETURN_TEXT_P(cstring_to_text_with_len(start_ptr, |
4641 | 0 | end_ptr - start_ptr)); |
4642 | 0 | } |
4643 | | |
4644 | | /* else, convert fldnum to positive notation */ |
4645 | 0 | fldnum += numfields + 1; |
4646 | | |
4647 | | /* if nonexistent field, return empty string */ |
4648 | 0 | if (fldnum <= 0) |
4649 | 0 | { |
4650 | 0 | text_position_cleanup(&state); |
4651 | 0 | PG_RETURN_TEXT_P(cstring_to_text("")); |
4652 | 0 | } |
4653 | | |
4654 | | /* reset to pointing at first match, but now with positive fldnum */ |
4655 | 0 | text_position_reset(&state); |
4656 | 0 | found = text_position_next(&state); |
4657 | 0 | Assert(found); |
4658 | 0 | } |
4659 | | |
4660 | | /* identify bounds of first field */ |
4661 | 0 | start_ptr = VARDATA_ANY(inputstring); |
4662 | 0 | end_ptr = text_position_get_match_ptr(&state); |
4663 | |
|
4664 | 0 | while (found && --fldnum > 0) |
4665 | 0 | { |
4666 | | /* identify bounds of next field */ |
4667 | 0 | start_ptr = end_ptr + state.last_match_len; |
4668 | 0 | found = text_position_next(&state); |
4669 | 0 | if (found) |
4670 | 0 | end_ptr = text_position_get_match_ptr(&state); |
4671 | 0 | } |
4672 | |
|
4673 | 0 | text_position_cleanup(&state); |
4674 | |
|
4675 | 0 | if (fldnum > 0) |
4676 | 0 | { |
4677 | | /* N'th field separator not found */ |
4678 | | /* if last field requested, return it, else empty string */ |
4679 | 0 | if (fldnum == 1) |
4680 | 0 | { |
4681 | 0 | int last_len = start_ptr - VARDATA_ANY(inputstring); |
4682 | |
|
4683 | 0 | result_text = cstring_to_text_with_len(start_ptr, |
4684 | 0 | inputstring_len - last_len); |
4685 | 0 | } |
4686 | 0 | else |
4687 | 0 | result_text = cstring_to_text(""); |
4688 | 0 | } |
4689 | 0 | else |
4690 | 0 | { |
4691 | | /* non-last field requested */ |
4692 | 0 | result_text = cstring_to_text_with_len(start_ptr, end_ptr - start_ptr); |
4693 | 0 | } |
4694 | |
|
4695 | 0 | PG_RETURN_TEXT_P(result_text); |
4696 | 0 | } |
4697 | | |
4698 | | /* |
4699 | | * Convenience function to return true when two text params are equal. |
4700 | | */ |
4701 | | static bool |
4702 | | text_isequal(text *txt1, text *txt2, Oid collid) |
4703 | 0 | { |
4704 | 0 | return DatumGetBool(DirectFunctionCall2Coll(texteq, |
4705 | 0 | collid, |
4706 | 0 | PointerGetDatum(txt1), |
4707 | 0 | PointerGetDatum(txt2))); |
4708 | 0 | } |
4709 | | |
4710 | | /* |
4711 | | * text_to_array |
4712 | | * parse input string and return text array of elements, |
4713 | | * based on provided field separator |
4714 | | */ |
4715 | | Datum |
4716 | | text_to_array(PG_FUNCTION_ARGS) |
4717 | 0 | { |
4718 | 0 | SplitTextOutputData tstate; |
4719 | | |
4720 | | /* For array output, tstate should start as all zeroes */ |
4721 | 0 | memset(&tstate, 0, sizeof(tstate)); |
4722 | |
|
4723 | 0 | if (!split_text(fcinfo, &tstate)) |
4724 | 0 | PG_RETURN_NULL(); |
4725 | | |
4726 | 0 | if (tstate.astate == NULL) |
4727 | 0 | PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID)); |
4728 | | |
4729 | 0 | PG_RETURN_DATUM(makeArrayResult(tstate.astate, |
4730 | 0 | CurrentMemoryContext)); |
4731 | 0 | } |
4732 | | |
4733 | | /* |
4734 | | * text_to_array_null |
4735 | | * parse input string and return text array of elements, |
4736 | | * based on provided field separator and null string |
4737 | | * |
4738 | | * This is a separate entry point only to prevent the regression tests from |
4739 | | * complaining about different argument sets for the same internal function. |
4740 | | */ |
4741 | | Datum |
4742 | | text_to_array_null(PG_FUNCTION_ARGS) |
4743 | 0 | { |
4744 | 0 | return text_to_array(fcinfo); |
4745 | 0 | } |
4746 | | |
4747 | | /* |
4748 | | * text_to_table |
4749 | | * parse input string and return table of elements, |
4750 | | * based on provided field separator |
4751 | | */ |
4752 | | Datum |
4753 | | text_to_table(PG_FUNCTION_ARGS) |
4754 | 0 | { |
4755 | 0 | ReturnSetInfo *rsi = (ReturnSetInfo *) fcinfo->resultinfo; |
4756 | 0 | SplitTextOutputData tstate; |
4757 | |
|
4758 | 0 | tstate.astate = NULL; |
4759 | 0 | InitMaterializedSRF(fcinfo, MAT_SRF_USE_EXPECTED_DESC); |
4760 | 0 | tstate.tupstore = rsi->setResult; |
4761 | 0 | tstate.tupdesc = rsi->setDesc; |
4762 | |
|
4763 | 0 | (void) split_text(fcinfo, &tstate); |
4764 | |
|
4765 | 0 | return (Datum) 0; |
4766 | 0 | } |
4767 | | |
4768 | | /* |
4769 | | * text_to_table_null |
4770 | | * parse input string and return table of elements, |
4771 | | * based on provided field separator and null string |
4772 | | * |
4773 | | * This is a separate entry point only to prevent the regression tests from |
4774 | | * complaining about different argument sets for the same internal function. |
4775 | | */ |
4776 | | Datum |
4777 | | text_to_table_null(PG_FUNCTION_ARGS) |
4778 | 0 | { |
4779 | 0 | return text_to_table(fcinfo); |
4780 | 0 | } |
4781 | | |
4782 | | /* |
4783 | | * Common code for text_to_array, text_to_array_null, text_to_table |
4784 | | * and text_to_table_null functions. |
4785 | | * |
4786 | | * These are not strict so we have to test for null inputs explicitly. |
4787 | | * Returns false if result is to be null, else returns true. |
4788 | | * |
4789 | | * Note that if the result is valid but empty (zero elements), we return |
4790 | | * without changing *tstate --- caller must handle that case, too. |
4791 | | */ |
4792 | | static bool |
4793 | | split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate) |
4794 | 0 | { |
4795 | 0 | text *inputstring; |
4796 | 0 | text *fldsep; |
4797 | 0 | text *null_string; |
4798 | 0 | Oid collation = PG_GET_COLLATION(); |
4799 | 0 | int inputstring_len; |
4800 | 0 | int fldsep_len; |
4801 | 0 | char *start_ptr; |
4802 | 0 | text *result_text; |
4803 | | |
4804 | | /* when input string is NULL, then result is NULL too */ |
4805 | 0 | if (PG_ARGISNULL(0)) |
4806 | 0 | return false; |
4807 | | |
4808 | 0 | inputstring = PG_GETARG_TEXT_PP(0); |
4809 | | |
4810 | | /* fldsep can be NULL */ |
4811 | 0 | if (!PG_ARGISNULL(1)) |
4812 | 0 | fldsep = PG_GETARG_TEXT_PP(1); |
4813 | 0 | else |
4814 | 0 | fldsep = NULL; |
4815 | | |
4816 | | /* null_string can be NULL or omitted */ |
4817 | 0 | if (PG_NARGS() > 2 && !PG_ARGISNULL(2)) |
4818 | 0 | null_string = PG_GETARG_TEXT_PP(2); |
4819 | 0 | else |
4820 | 0 | null_string = NULL; |
4821 | |
|
4822 | 0 | if (fldsep != NULL) |
4823 | 0 | { |
4824 | | /* |
4825 | | * Normal case with non-null fldsep. Use the text_position machinery |
4826 | | * to search for occurrences of fldsep. |
4827 | | */ |
4828 | 0 | TextPositionState state; |
4829 | |
|
4830 | 0 | inputstring_len = VARSIZE_ANY_EXHDR(inputstring); |
4831 | 0 | fldsep_len = VARSIZE_ANY_EXHDR(fldsep); |
4832 | | |
4833 | | /* return empty set for empty input string */ |
4834 | 0 | if (inputstring_len < 1) |
4835 | 0 | return true; |
4836 | | |
4837 | | /* empty field separator: return input string as a one-element set */ |
4838 | 0 | if (fldsep_len < 1) |
4839 | 0 | { |
4840 | 0 | split_text_accum_result(tstate, inputstring, |
4841 | 0 | null_string, collation); |
4842 | 0 | return true; |
4843 | 0 | } |
4844 | | |
4845 | 0 | text_position_setup(inputstring, fldsep, collation, &state); |
4846 | |
|
4847 | 0 | start_ptr = VARDATA_ANY(inputstring); |
4848 | |
|
4849 | 0 | for (;;) |
4850 | 0 | { |
4851 | 0 | bool found; |
4852 | 0 | char *end_ptr; |
4853 | 0 | int chunk_len; |
4854 | |
|
4855 | 0 | CHECK_FOR_INTERRUPTS(); |
4856 | |
|
4857 | 0 | found = text_position_next(&state); |
4858 | 0 | if (!found) |
4859 | 0 | { |
4860 | | /* fetch last field */ |
4861 | 0 | chunk_len = ((char *) inputstring + VARSIZE_ANY(inputstring)) - start_ptr; |
4862 | 0 | end_ptr = NULL; /* not used, but some compilers complain */ |
4863 | 0 | } |
4864 | 0 | else |
4865 | 0 | { |
4866 | | /* fetch non-last field */ |
4867 | 0 | end_ptr = text_position_get_match_ptr(&state); |
4868 | 0 | chunk_len = end_ptr - start_ptr; |
4869 | 0 | } |
4870 | | |
4871 | | /* build a temp text datum to pass to split_text_accum_result */ |
4872 | 0 | result_text = cstring_to_text_with_len(start_ptr, chunk_len); |
4873 | | |
4874 | | /* stash away this field */ |
4875 | 0 | split_text_accum_result(tstate, result_text, |
4876 | 0 | null_string, collation); |
4877 | |
|
4878 | 0 | pfree(result_text); |
4879 | |
|
4880 | 0 | if (!found) |
4881 | 0 | break; |
4882 | | |
4883 | 0 | start_ptr = end_ptr + state.last_match_len; |
4884 | 0 | } |
4885 | |
|
4886 | 0 | text_position_cleanup(&state); |
4887 | 0 | } |
4888 | 0 | else |
4889 | 0 | { |
4890 | | /* |
4891 | | * When fldsep is NULL, each character in the input string becomes a |
4892 | | * separate element in the result set. The separator is effectively |
4893 | | * the space between characters. |
4894 | | */ |
4895 | 0 | inputstring_len = VARSIZE_ANY_EXHDR(inputstring); |
4896 | |
|
4897 | 0 | start_ptr = VARDATA_ANY(inputstring); |
4898 | |
|
4899 | 0 | while (inputstring_len > 0) |
4900 | 0 | { |
4901 | 0 | int chunk_len = pg_mblen(start_ptr); |
4902 | |
|
4903 | 0 | CHECK_FOR_INTERRUPTS(); |
4904 | | |
4905 | | /* build a temp text datum to pass to split_text_accum_result */ |
4906 | 0 | result_text = cstring_to_text_with_len(start_ptr, chunk_len); |
4907 | | |
4908 | | /* stash away this field */ |
4909 | 0 | split_text_accum_result(tstate, result_text, |
4910 | 0 | null_string, collation); |
4911 | |
|
4912 | 0 | pfree(result_text); |
4913 | |
|
4914 | 0 | start_ptr += chunk_len; |
4915 | 0 | inputstring_len -= chunk_len; |
4916 | 0 | } |
4917 | 0 | } |
4918 | | |
4919 | 0 | return true; |
4920 | 0 | } |
4921 | | |
4922 | | /* |
4923 | | * Add text item to result set (table or array). |
4924 | | * |
4925 | | * This is also responsible for checking to see if the item matches |
4926 | | * the null_string, in which case we should emit NULL instead. |
4927 | | */ |
4928 | | static void |
4929 | | split_text_accum_result(SplitTextOutputData *tstate, |
4930 | | text *field_value, |
4931 | | text *null_string, |
4932 | | Oid collation) |
4933 | 0 | { |
4934 | 0 | bool is_null = false; |
4935 | |
|
4936 | 0 | if (null_string && text_isequal(field_value, null_string, collation)) |
4937 | 0 | is_null = true; |
4938 | |
|
4939 | 0 | if (tstate->tupstore) |
4940 | 0 | { |
4941 | 0 | Datum values[1]; |
4942 | 0 | bool nulls[1]; |
4943 | |
|
4944 | 0 | values[0] = PointerGetDatum(field_value); |
4945 | 0 | nulls[0] = is_null; |
4946 | |
|
4947 | 0 | tuplestore_putvalues(tstate->tupstore, |
4948 | 0 | tstate->tupdesc, |
4949 | 0 | values, |
4950 | 0 | nulls); |
4951 | 0 | } |
4952 | 0 | else |
4953 | 0 | { |
4954 | 0 | tstate->astate = accumArrayResult(tstate->astate, |
4955 | 0 | PointerGetDatum(field_value), |
4956 | 0 | is_null, |
4957 | 0 | TEXTOID, |
4958 | 0 | CurrentMemoryContext); |
4959 | 0 | } |
4960 | 0 | } |
4961 | | |
4962 | | /* |
4963 | | * array_to_text |
4964 | | * concatenate Cstring representation of input array elements |
4965 | | * using provided field separator |
4966 | | */ |
4967 | | Datum |
4968 | | array_to_text(PG_FUNCTION_ARGS) |
4969 | 0 | { |
4970 | 0 | ArrayType *v = PG_GETARG_ARRAYTYPE_P(0); |
4971 | 0 | char *fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1)); |
4972 | |
|
4973 | 0 | PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, NULL)); |
4974 | 0 | } |
4975 | | |
4976 | | /* |
4977 | | * array_to_text_null |
4978 | | * concatenate Cstring representation of input array elements |
4979 | | * using provided field separator and null string |
4980 | | * |
4981 | | * This version is not strict so we have to test for null inputs explicitly. |
4982 | | */ |
4983 | | Datum |
4984 | | array_to_text_null(PG_FUNCTION_ARGS) |
4985 | 0 | { |
4986 | 0 | ArrayType *v; |
4987 | 0 | char *fldsep; |
4988 | 0 | char *null_string; |
4989 | | |
4990 | | /* returns NULL when first or second parameter is NULL */ |
4991 | 0 | if (PG_ARGISNULL(0) || PG_ARGISNULL(1)) |
4992 | 0 | PG_RETURN_NULL(); |
4993 | | |
4994 | 0 | v = PG_GETARG_ARRAYTYPE_P(0); |
4995 | 0 | fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1)); |
4996 | | |
4997 | | /* NULL null string is passed through as a null pointer */ |
4998 | 0 | if (!PG_ARGISNULL(2)) |
4999 | 0 | null_string = text_to_cstring(PG_GETARG_TEXT_PP(2)); |
5000 | 0 | else |
5001 | 0 | null_string = NULL; |
5002 | |
|
5003 | 0 | PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, null_string)); |
5004 | 0 | } |
5005 | | |
5006 | | /* |
5007 | | * common code for array_to_text and array_to_text_null functions |
5008 | | */ |
5009 | | static text * |
5010 | | array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v, |
5011 | | const char *fldsep, const char *null_string) |
5012 | 0 | { |
5013 | 0 | text *result; |
5014 | 0 | int nitems, |
5015 | 0 | *dims, |
5016 | 0 | ndims; |
5017 | 0 | Oid element_type; |
5018 | 0 | int typlen; |
5019 | 0 | bool typbyval; |
5020 | 0 | char typalign; |
5021 | 0 | StringInfoData buf; |
5022 | 0 | bool printed = false; |
5023 | 0 | char *p; |
5024 | 0 | bits8 *bitmap; |
5025 | 0 | int bitmask; |
5026 | 0 | int i; |
5027 | 0 | ArrayMetaState *my_extra; |
5028 | |
|
5029 | 0 | ndims = ARR_NDIM(v); |
5030 | 0 | dims = ARR_DIMS(v); |
5031 | 0 | nitems = ArrayGetNItems(ndims, dims); |
5032 | | |
5033 | | /* if there are no elements, return an empty string */ |
5034 | 0 | if (nitems == 0) |
5035 | 0 | return cstring_to_text_with_len("", 0); |
5036 | | |
5037 | 0 | element_type = ARR_ELEMTYPE(v); |
5038 | 0 | initStringInfo(&buf); |
5039 | | |
5040 | | /* |
5041 | | * We arrange to look up info about element type, including its output |
5042 | | * conversion proc, only once per series of calls, assuming the element |
5043 | | * type doesn't change underneath us. |
5044 | | */ |
5045 | 0 | my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra; |
5046 | 0 | if (my_extra == NULL) |
5047 | 0 | { |
5048 | 0 | fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt, |
5049 | 0 | sizeof(ArrayMetaState)); |
5050 | 0 | my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra; |
5051 | 0 | my_extra->element_type = ~element_type; |
5052 | 0 | } |
5053 | |
|
5054 | 0 | if (my_extra->element_type != element_type) |
5055 | 0 | { |
5056 | | /* |
5057 | | * Get info about element type, including its output conversion proc |
5058 | | */ |
5059 | 0 | get_type_io_data(element_type, IOFunc_output, |
5060 | 0 | &my_extra->typlen, &my_extra->typbyval, |
5061 | 0 | &my_extra->typalign, &my_extra->typdelim, |
5062 | 0 | &my_extra->typioparam, &my_extra->typiofunc); |
5063 | 0 | fmgr_info_cxt(my_extra->typiofunc, &my_extra->proc, |
5064 | 0 | fcinfo->flinfo->fn_mcxt); |
5065 | 0 | my_extra->element_type = element_type; |
5066 | 0 | } |
5067 | 0 | typlen = my_extra->typlen; |
5068 | 0 | typbyval = my_extra->typbyval; |
5069 | 0 | typalign = my_extra->typalign; |
5070 | |
|
5071 | 0 | p = ARR_DATA_PTR(v); |
5072 | 0 | bitmap = ARR_NULLBITMAP(v); |
5073 | 0 | bitmask = 1; |
5074 | |
|
5075 | 0 | for (i = 0; i < nitems; i++) |
5076 | 0 | { |
5077 | 0 | Datum itemvalue; |
5078 | 0 | char *value; |
5079 | | |
5080 | | /* Get source element, checking for NULL */ |
5081 | 0 | if (bitmap && (*bitmap & bitmask) == 0) |
5082 | 0 | { |
5083 | | /* if null_string is NULL, we just ignore null elements */ |
5084 | 0 | if (null_string != NULL) |
5085 | 0 | { |
5086 | 0 | if (printed) |
5087 | 0 | appendStringInfo(&buf, "%s%s", fldsep, null_string); |
5088 | 0 | else |
5089 | 0 | appendStringInfoString(&buf, null_string); |
5090 | 0 | printed = true; |
5091 | 0 | } |
5092 | 0 | } |
5093 | 0 | else |
5094 | 0 | { |
5095 | 0 | itemvalue = fetch_att(p, typbyval, typlen); |
5096 | |
|
5097 | 0 | value = OutputFunctionCall(&my_extra->proc, itemvalue); |
5098 | |
|
5099 | 0 | if (printed) |
5100 | 0 | appendStringInfo(&buf, "%s%s", fldsep, value); |
5101 | 0 | else |
5102 | 0 | appendStringInfoString(&buf, value); |
5103 | 0 | printed = true; |
5104 | |
|
5105 | 0 | p = att_addlength_pointer(p, typlen, p); |
5106 | 0 | p = (char *) att_align_nominal(p, typalign); |
5107 | 0 | } |
5108 | | |
5109 | | /* advance bitmap pointer if any */ |
5110 | 0 | if (bitmap) |
5111 | 0 | { |
5112 | 0 | bitmask <<= 1; |
5113 | 0 | if (bitmask == 0x100) |
5114 | 0 | { |
5115 | 0 | bitmap++; |
5116 | 0 | bitmask = 1; |
5117 | 0 | } |
5118 | 0 | } |
5119 | 0 | } |
5120 | |
|
5121 | 0 | result = cstring_to_text_with_len(buf.data, buf.len); |
5122 | 0 | pfree(buf.data); |
5123 | |
|
5124 | 0 | return result; |
5125 | 0 | } |
5126 | | |
5127 | | /* |
5128 | | * Workhorse for to_bin, to_oct, and to_hex. Note that base must be > 1 and <= |
5129 | | * 16. |
5130 | | */ |
5131 | | static inline text * |
5132 | | convert_to_base(uint64 value, int base) |
5133 | 0 | { |
5134 | 0 | const char *digits = "0123456789abcdef"; |
5135 | | |
5136 | | /* We size the buffer for to_bin's longest possible return value. */ |
5137 | 0 | char buf[sizeof(uint64) * BITS_PER_BYTE]; |
5138 | 0 | char *const end = buf + sizeof(buf); |
5139 | 0 | char *ptr = end; |
5140 | |
|
5141 | 0 | Assert(base > 1); |
5142 | 0 | Assert(base <= 16); |
5143 | |
|
5144 | 0 | do |
5145 | 0 | { |
5146 | 0 | *--ptr = digits[value % base]; |
5147 | 0 | value /= base; |
5148 | 0 | } while (ptr > buf && value); |
5149 | |
|
5150 | 0 | return cstring_to_text_with_len(ptr, end - ptr); |
5151 | 0 | } |
5152 | | |
5153 | | /* |
5154 | | * Convert an integer to a string containing a base-2 (binary) representation |
5155 | | * of the number. |
5156 | | */ |
5157 | | Datum |
5158 | | to_bin32(PG_FUNCTION_ARGS) |
5159 | 0 | { |
5160 | 0 | uint64 value = (uint32) PG_GETARG_INT32(0); |
5161 | |
|
5162 | 0 | PG_RETURN_TEXT_P(convert_to_base(value, 2)); |
5163 | 0 | } |
5164 | | Datum |
5165 | | to_bin64(PG_FUNCTION_ARGS) |
5166 | 0 | { |
5167 | 0 | uint64 value = (uint64) PG_GETARG_INT64(0); |
5168 | |
|
5169 | 0 | PG_RETURN_TEXT_P(convert_to_base(value, 2)); |
5170 | 0 | } |
5171 | | |
5172 | | /* |
5173 | | * Convert an integer to a string containing a base-8 (oct) representation of |
5174 | | * the number. |
5175 | | */ |
5176 | | Datum |
5177 | | to_oct32(PG_FUNCTION_ARGS) |
5178 | 0 | { |
5179 | 0 | uint64 value = (uint32) PG_GETARG_INT32(0); |
5180 | |
|
5181 | 0 | PG_RETURN_TEXT_P(convert_to_base(value, 8)); |
5182 | 0 | } |
5183 | | Datum |
5184 | | to_oct64(PG_FUNCTION_ARGS) |
5185 | 0 | { |
5186 | 0 | uint64 value = (uint64) PG_GETARG_INT64(0); |
5187 | |
|
5188 | 0 | PG_RETURN_TEXT_P(convert_to_base(value, 8)); |
5189 | 0 | } |
5190 | | |
5191 | | /* |
5192 | | * Convert an integer to a string containing a base-16 (hex) representation of |
5193 | | * the number. |
5194 | | */ |
5195 | | Datum |
5196 | | to_hex32(PG_FUNCTION_ARGS) |
5197 | 0 | { |
5198 | 0 | uint64 value = (uint32) PG_GETARG_INT32(0); |
5199 | |
|
5200 | 0 | PG_RETURN_TEXT_P(convert_to_base(value, 16)); |
5201 | 0 | } |
5202 | | Datum |
5203 | | to_hex64(PG_FUNCTION_ARGS) |
5204 | 0 | { |
5205 | 0 | uint64 value = (uint64) PG_GETARG_INT64(0); |
5206 | |
|
5207 | 0 | PG_RETURN_TEXT_P(convert_to_base(value, 16)); |
5208 | 0 | } |
5209 | | |
5210 | | /* |
5211 | | * Return the size of a datum, possibly compressed |
5212 | | * |
5213 | | * Works on any data type |
5214 | | */ |
5215 | | Datum |
5216 | | pg_column_size(PG_FUNCTION_ARGS) |
5217 | 0 | { |
5218 | 0 | Datum value = PG_GETARG_DATUM(0); |
5219 | 0 | int32 result; |
5220 | 0 | int typlen; |
5221 | | |
5222 | | /* On first call, get the input type's typlen, and save at *fn_extra */ |
5223 | 0 | if (fcinfo->flinfo->fn_extra == NULL) |
5224 | 0 | { |
5225 | | /* Lookup the datatype of the supplied argument */ |
5226 | 0 | Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0); |
5227 | |
|
5228 | 0 | typlen = get_typlen(argtypeid); |
5229 | 0 | if (typlen == 0) /* should not happen */ |
5230 | 0 | elog(ERROR, "cache lookup failed for type %u", argtypeid); |
5231 | | |
5232 | 0 | fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt, |
5233 | 0 | sizeof(int)); |
5234 | 0 | *((int *) fcinfo->flinfo->fn_extra) = typlen; |
5235 | 0 | } |
5236 | 0 | else |
5237 | 0 | typlen = *((int *) fcinfo->flinfo->fn_extra); |
5238 | | |
5239 | 0 | if (typlen == -1) |
5240 | 0 | { |
5241 | | /* varlena type, possibly toasted */ |
5242 | 0 | result = toast_datum_size(value); |
5243 | 0 | } |
5244 | 0 | else if (typlen == -2) |
5245 | 0 | { |
5246 | | /* cstring */ |
5247 | 0 | result = strlen(DatumGetCString(value)) + 1; |
5248 | 0 | } |
5249 | 0 | else |
5250 | 0 | { |
5251 | | /* ordinary fixed-width type */ |
5252 | 0 | result = typlen; |
5253 | 0 | } |
5254 | |
|
5255 | 0 | PG_RETURN_INT32(result); |
5256 | 0 | } |
5257 | | |
5258 | | /* |
5259 | | * Return the compression method stored in the compressed attribute. Return |
5260 | | * NULL for non varlena type or uncompressed data. |
5261 | | */ |
5262 | | Datum |
5263 | | pg_column_compression(PG_FUNCTION_ARGS) |
5264 | 0 | { |
5265 | 0 | int typlen; |
5266 | 0 | char *result; |
5267 | 0 | ToastCompressionId cmid; |
5268 | | |
5269 | | /* On first call, get the input type's typlen, and save at *fn_extra */ |
5270 | 0 | if (fcinfo->flinfo->fn_extra == NULL) |
5271 | 0 | { |
5272 | | /* Lookup the datatype of the supplied argument */ |
5273 | 0 | Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0); |
5274 | |
|
5275 | 0 | typlen = get_typlen(argtypeid); |
5276 | 0 | if (typlen == 0) /* should not happen */ |
5277 | 0 | elog(ERROR, "cache lookup failed for type %u", argtypeid); |
5278 | | |
5279 | 0 | fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt, |
5280 | 0 | sizeof(int)); |
5281 | 0 | *((int *) fcinfo->flinfo->fn_extra) = typlen; |
5282 | 0 | } |
5283 | 0 | else |
5284 | 0 | typlen = *((int *) fcinfo->flinfo->fn_extra); |
5285 | | |
5286 | 0 | if (typlen != -1) |
5287 | 0 | PG_RETURN_NULL(); |
5288 | | |
5289 | | /* get the compression method id stored in the compressed varlena */ |
5290 | 0 | cmid = toast_get_compression_id((struct varlena *) |
5291 | 0 | DatumGetPointer(PG_GETARG_DATUM(0))); |
5292 | 0 | if (cmid == TOAST_INVALID_COMPRESSION_ID) |
5293 | 0 | PG_RETURN_NULL(); |
5294 | | |
5295 | | /* convert compression method id to compression method name */ |
5296 | 0 | switch (cmid) |
5297 | 0 | { |
5298 | 0 | case TOAST_PGLZ_COMPRESSION_ID: |
5299 | 0 | result = "pglz"; |
5300 | 0 | break; |
5301 | 0 | case TOAST_LZ4_COMPRESSION_ID: |
5302 | 0 | result = "lz4"; |
5303 | 0 | break; |
5304 | 0 | default: |
5305 | 0 | elog(ERROR, "invalid compression method id %d", cmid); |
5306 | 0 | } |
5307 | | |
5308 | 0 | PG_RETURN_TEXT_P(cstring_to_text(result)); |
5309 | 0 | } |
5310 | | |
5311 | | /* |
5312 | | * Return the chunk_id of the on-disk TOASTed value. Return NULL if the value |
5313 | | * is un-TOASTed or not on-disk. |
5314 | | */ |
5315 | | Datum |
5316 | | pg_column_toast_chunk_id(PG_FUNCTION_ARGS) |
5317 | 0 | { |
5318 | 0 | int typlen; |
5319 | 0 | struct varlena *attr; |
5320 | 0 | struct varatt_external toast_pointer; |
5321 | | |
5322 | | /* On first call, get the input type's typlen, and save at *fn_extra */ |
5323 | 0 | if (fcinfo->flinfo->fn_extra == NULL) |
5324 | 0 | { |
5325 | | /* Lookup the datatype of the supplied argument */ |
5326 | 0 | Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0); |
5327 | |
|
5328 | 0 | typlen = get_typlen(argtypeid); |
5329 | 0 | if (typlen == 0) /* should not happen */ |
5330 | 0 | elog(ERROR, "cache lookup failed for type %u", argtypeid); |
5331 | | |
5332 | 0 | fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt, |
5333 | 0 | sizeof(int)); |
5334 | 0 | *((int *) fcinfo->flinfo->fn_extra) = typlen; |
5335 | 0 | } |
5336 | 0 | else |
5337 | 0 | typlen = *((int *) fcinfo->flinfo->fn_extra); |
5338 | | |
5339 | 0 | if (typlen != -1) |
5340 | 0 | PG_RETURN_NULL(); |
5341 | | |
5342 | 0 | attr = (struct varlena *) DatumGetPointer(PG_GETARG_DATUM(0)); |
5343 | |
|
5344 | 0 | if (!VARATT_IS_EXTERNAL_ONDISK(attr)) |
5345 | 0 | PG_RETURN_NULL(); |
5346 | | |
5347 | 0 | VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr); |
5348 | |
|
5349 | 0 | PG_RETURN_OID(toast_pointer.va_valueid); |
5350 | 0 | } |
5351 | | |
5352 | | /* |
5353 | | * string_agg - Concatenates values and returns string. |
5354 | | * |
5355 | | * Syntax: string_agg(value text, delimiter text) RETURNS text |
5356 | | * |
5357 | | * Note: Any NULL values are ignored. The first-call delimiter isn't |
5358 | | * actually used at all, and on subsequent calls the delimiter precedes |
5359 | | * the associated value. |
5360 | | */ |
5361 | | |
5362 | | /* subroutine to initialize state */ |
5363 | | static StringInfo |
5364 | | makeStringAggState(FunctionCallInfo fcinfo) |
5365 | 0 | { |
5366 | 0 | StringInfo state; |
5367 | 0 | MemoryContext aggcontext; |
5368 | 0 | MemoryContext oldcontext; |
5369 | |
|
5370 | 0 | if (!AggCheckCallContext(fcinfo, &aggcontext)) |
5371 | 0 | { |
5372 | | /* cannot be called directly because of internal-type argument */ |
5373 | 0 | elog(ERROR, "string_agg_transfn called in non-aggregate context"); |
5374 | 0 | } |
5375 | | |
5376 | | /* |
5377 | | * Create state in aggregate context. It'll stay there across subsequent |
5378 | | * calls. |
5379 | | */ |
5380 | 0 | oldcontext = MemoryContextSwitchTo(aggcontext); |
5381 | 0 | state = makeStringInfo(); |
5382 | 0 | MemoryContextSwitchTo(oldcontext); |
5383 | |
|
5384 | 0 | return state; |
5385 | 0 | } |
5386 | | |
5387 | | Datum |
5388 | | string_agg_transfn(PG_FUNCTION_ARGS) |
5389 | 0 | { |
5390 | 0 | StringInfo state; |
5391 | |
|
5392 | 0 | state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0); |
5393 | | |
5394 | | /* Append the value unless null, preceding it with the delimiter. */ |
5395 | 0 | if (!PG_ARGISNULL(1)) |
5396 | 0 | { |
5397 | 0 | text *value = PG_GETARG_TEXT_PP(1); |
5398 | 0 | bool isfirst = false; |
5399 | | |
5400 | | /* |
5401 | | * You might think we can just throw away the first delimiter, however |
5402 | | * we must keep it as we may be a parallel worker doing partial |
5403 | | * aggregation building a state to send to the main process. We need |
5404 | | * to keep the delimiter of every aggregation so that the combine |
5405 | | * function can properly join up the strings of two separately |
5406 | | * partially aggregated results. The first delimiter is only stripped |
5407 | | * off in the final function. To know how much to strip off the front |
5408 | | * of the string, we store the length of the first delimiter in the |
5409 | | * StringInfo's cursor field, which we don't otherwise need here. |
5410 | | */ |
5411 | 0 | if (state == NULL) |
5412 | 0 | { |
5413 | 0 | state = makeStringAggState(fcinfo); |
5414 | 0 | isfirst = true; |
5415 | 0 | } |
5416 | |
|
5417 | 0 | if (!PG_ARGISNULL(2)) |
5418 | 0 | { |
5419 | 0 | text *delim = PG_GETARG_TEXT_PP(2); |
5420 | |
|
5421 | 0 | appendStringInfoText(state, delim); |
5422 | 0 | if (isfirst) |
5423 | 0 | state->cursor = VARSIZE_ANY_EXHDR(delim); |
5424 | 0 | } |
5425 | |
|
5426 | 0 | appendStringInfoText(state, value); |
5427 | 0 | } |
5428 | | |
5429 | | /* |
5430 | | * The transition type for string_agg() is declared to be "internal", |
5431 | | * which is a pass-by-value type the same size as a pointer. |
5432 | | */ |
5433 | 0 | if (state) |
5434 | 0 | PG_RETURN_POINTER(state); |
5435 | 0 | PG_RETURN_NULL(); |
5436 | 0 | } |
5437 | | |
5438 | | /* |
5439 | | * string_agg_combine |
5440 | | * Aggregate combine function for string_agg(text) and string_agg(bytea) |
5441 | | */ |
5442 | | Datum |
5443 | | string_agg_combine(PG_FUNCTION_ARGS) |
5444 | 0 | { |
5445 | 0 | StringInfo state1; |
5446 | 0 | StringInfo state2; |
5447 | 0 | MemoryContext agg_context; |
5448 | |
|
5449 | 0 | if (!AggCheckCallContext(fcinfo, &agg_context)) |
5450 | 0 | elog(ERROR, "aggregate function called in non-aggregate context"); |
5451 | | |
5452 | 0 | state1 = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0); |
5453 | 0 | state2 = PG_ARGISNULL(1) ? NULL : (StringInfo) PG_GETARG_POINTER(1); |
5454 | |
|
5455 | 0 | if (state2 == NULL) |
5456 | 0 | { |
5457 | | /* |
5458 | | * NULL state2 is easy, just return state1, which we know is already |
5459 | | * in the agg_context |
5460 | | */ |
5461 | 0 | if (state1 == NULL) |
5462 | 0 | PG_RETURN_NULL(); |
5463 | 0 | PG_RETURN_POINTER(state1); |
5464 | 0 | } |
5465 | | |
5466 | 0 | if (state1 == NULL) |
5467 | 0 | { |
5468 | | /* We must copy state2's data into the agg_context */ |
5469 | 0 | MemoryContext old_context; |
5470 | |
|
5471 | 0 | old_context = MemoryContextSwitchTo(agg_context); |
5472 | 0 | state1 = makeStringAggState(fcinfo); |
5473 | 0 | appendBinaryStringInfo(state1, state2->data, state2->len); |
5474 | 0 | state1->cursor = state2->cursor; |
5475 | 0 | MemoryContextSwitchTo(old_context); |
5476 | 0 | } |
5477 | 0 | else if (state2->len > 0) |
5478 | 0 | { |
5479 | | /* Combine ... state1->cursor does not change in this case */ |
5480 | 0 | appendBinaryStringInfo(state1, state2->data, state2->len); |
5481 | 0 | } |
5482 | |
|
5483 | 0 | PG_RETURN_POINTER(state1); |
5484 | 0 | } |
5485 | | |
5486 | | /* |
5487 | | * string_agg_serialize |
5488 | | * Aggregate serialize function for string_agg(text) and string_agg(bytea) |
5489 | | * |
5490 | | * This is strict, so we need not handle NULL input |
5491 | | */ |
5492 | | Datum |
5493 | | string_agg_serialize(PG_FUNCTION_ARGS) |
5494 | 0 | { |
5495 | 0 | StringInfo state; |
5496 | 0 | StringInfoData buf; |
5497 | 0 | bytea *result; |
5498 | | |
5499 | | /* cannot be called directly because of internal-type argument */ |
5500 | 0 | Assert(AggCheckCallContext(fcinfo, NULL)); |
5501 | |
|
5502 | 0 | state = (StringInfo) PG_GETARG_POINTER(0); |
5503 | |
|
5504 | 0 | pq_begintypsend(&buf); |
5505 | | |
5506 | | /* cursor */ |
5507 | 0 | pq_sendint(&buf, state->cursor, 4); |
5508 | | |
5509 | | /* data */ |
5510 | 0 | pq_sendbytes(&buf, state->data, state->len); |
5511 | |
|
5512 | 0 | result = pq_endtypsend(&buf); |
5513 | |
|
5514 | 0 | PG_RETURN_BYTEA_P(result); |
5515 | 0 | } |
5516 | | |
5517 | | /* |
5518 | | * string_agg_deserialize |
5519 | | * Aggregate deserial function for string_agg(text) and string_agg(bytea) |
5520 | | * |
5521 | | * This is strict, so we need not handle NULL input |
5522 | | */ |
5523 | | Datum |
5524 | | string_agg_deserialize(PG_FUNCTION_ARGS) |
5525 | 0 | { |
5526 | 0 | bytea *sstate; |
5527 | 0 | StringInfo result; |
5528 | 0 | StringInfoData buf; |
5529 | 0 | char *data; |
5530 | 0 | int datalen; |
5531 | | |
5532 | | /* cannot be called directly because of internal-type argument */ |
5533 | 0 | Assert(AggCheckCallContext(fcinfo, NULL)); |
5534 | |
|
5535 | 0 | sstate = PG_GETARG_BYTEA_PP(0); |
5536 | | |
5537 | | /* |
5538 | | * Initialize a StringInfo so that we can "receive" it using the standard |
5539 | | * recv-function infrastructure. |
5540 | | */ |
5541 | 0 | initReadOnlyStringInfo(&buf, VARDATA_ANY(sstate), |
5542 | 0 | VARSIZE_ANY_EXHDR(sstate)); |
5543 | |
|
5544 | 0 | result = makeStringAggState(fcinfo); |
5545 | | |
5546 | | /* cursor */ |
5547 | 0 | result->cursor = pq_getmsgint(&buf, 4); |
5548 | | |
5549 | | /* data */ |
5550 | 0 | datalen = VARSIZE_ANY_EXHDR(sstate) - 4; |
5551 | 0 | data = (char *) pq_getmsgbytes(&buf, datalen); |
5552 | 0 | appendBinaryStringInfo(result, data, datalen); |
5553 | |
|
5554 | 0 | pq_getmsgend(&buf); |
5555 | |
|
5556 | 0 | PG_RETURN_POINTER(result); |
5557 | 0 | } |
5558 | | |
5559 | | Datum |
5560 | | string_agg_finalfn(PG_FUNCTION_ARGS) |
5561 | 0 | { |
5562 | 0 | StringInfo state; |
5563 | | |
5564 | | /* cannot be called directly because of internal-type argument */ |
5565 | 0 | Assert(AggCheckCallContext(fcinfo, NULL)); |
5566 | |
|
5567 | 0 | state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0); |
5568 | |
|
5569 | 0 | if (state != NULL) |
5570 | 0 | { |
5571 | | /* As per comment in transfn, strip data before the cursor position */ |
5572 | 0 | PG_RETURN_TEXT_P(cstring_to_text_with_len(&state->data[state->cursor], |
5573 | 0 | state->len - state->cursor)); |
5574 | 0 | } |
5575 | 0 | else |
5576 | 0 | PG_RETURN_NULL(); |
5577 | 0 | } |
5578 | | |
5579 | | /* |
5580 | | * Prepare cache with fmgr info for the output functions of the datatypes of |
5581 | | * the arguments of a concat-like function, beginning with argument "argidx". |
5582 | | * (Arguments before that will have corresponding slots in the resulting |
5583 | | * FmgrInfo array, but we don't fill those slots.) |
5584 | | */ |
5585 | | static FmgrInfo * |
5586 | | build_concat_foutcache(FunctionCallInfo fcinfo, int argidx) |
5587 | 0 | { |
5588 | 0 | FmgrInfo *foutcache; |
5589 | 0 | int i; |
5590 | | |
5591 | | /* We keep the info in fn_mcxt so it survives across calls */ |
5592 | 0 | foutcache = (FmgrInfo *) MemoryContextAlloc(fcinfo->flinfo->fn_mcxt, |
5593 | 0 | PG_NARGS() * sizeof(FmgrInfo)); |
5594 | |
|
5595 | 0 | for (i = argidx; i < PG_NARGS(); i++) |
5596 | 0 | { |
5597 | 0 | Oid valtype; |
5598 | 0 | Oid typOutput; |
5599 | 0 | bool typIsVarlena; |
5600 | |
|
5601 | 0 | valtype = get_fn_expr_argtype(fcinfo->flinfo, i); |
5602 | 0 | if (!OidIsValid(valtype)) |
5603 | 0 | elog(ERROR, "could not determine data type of concat() input"); |
5604 | | |
5605 | 0 | getTypeOutputInfo(valtype, &typOutput, &typIsVarlena); |
5606 | 0 | fmgr_info_cxt(typOutput, &foutcache[i], fcinfo->flinfo->fn_mcxt); |
5607 | 0 | } |
5608 | | |
5609 | 0 | fcinfo->flinfo->fn_extra = foutcache; |
5610 | |
|
5611 | 0 | return foutcache; |
5612 | 0 | } |
5613 | | |
5614 | | /* |
5615 | | * Implementation of both concat() and concat_ws(). |
5616 | | * |
5617 | | * sepstr is the separator string to place between values. |
5618 | | * argidx identifies the first argument to concatenate (counting from zero); |
5619 | | * note that this must be constant across any one series of calls. |
5620 | | * |
5621 | | * Returns NULL if result should be NULL, else text value. |
5622 | | */ |
5623 | | static text * |
5624 | | concat_internal(const char *sepstr, int argidx, |
5625 | | FunctionCallInfo fcinfo) |
5626 | 0 | { |
5627 | 0 | text *result; |
5628 | 0 | StringInfoData str; |
5629 | 0 | FmgrInfo *foutcache; |
5630 | 0 | bool first_arg = true; |
5631 | 0 | int i; |
5632 | | |
5633 | | /* |
5634 | | * concat(VARIADIC some-array) is essentially equivalent to |
5635 | | * array_to_text(), ie concat the array elements with the given separator. |
5636 | | * So we just pass the case off to that code. |
5637 | | */ |
5638 | 0 | if (get_fn_expr_variadic(fcinfo->flinfo)) |
5639 | 0 | { |
5640 | 0 | ArrayType *arr; |
5641 | | |
5642 | | /* Should have just the one argument */ |
5643 | 0 | Assert(argidx == PG_NARGS() - 1); |
5644 | | |
5645 | | /* concat(VARIADIC NULL) is defined as NULL */ |
5646 | 0 | if (PG_ARGISNULL(argidx)) |
5647 | 0 | return NULL; |
5648 | | |
5649 | | /* |
5650 | | * Non-null argument had better be an array. We assume that any call |
5651 | | * context that could let get_fn_expr_variadic return true will have |
5652 | | * checked that a VARIADIC-labeled parameter actually is an array. So |
5653 | | * it should be okay to just Assert that it's an array rather than |
5654 | | * doing a full-fledged error check. |
5655 | | */ |
5656 | 0 | Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, argidx)))); |
5657 | | |
5658 | | /* OK, safe to fetch the array value */ |
5659 | 0 | arr = PG_GETARG_ARRAYTYPE_P(argidx); |
5660 | | |
5661 | | /* |
5662 | | * And serialize the array. We tell array_to_text to ignore null |
5663 | | * elements, which matches the behavior of the loop below. |
5664 | | */ |
5665 | 0 | return array_to_text_internal(fcinfo, arr, sepstr, NULL); |
5666 | 0 | } |
5667 | | |
5668 | | /* Normal case without explicit VARIADIC marker */ |
5669 | 0 | initStringInfo(&str); |
5670 | | |
5671 | | /* Get output function info, building it if first time through */ |
5672 | 0 | foutcache = (FmgrInfo *) fcinfo->flinfo->fn_extra; |
5673 | 0 | if (foutcache == NULL) |
5674 | 0 | foutcache = build_concat_foutcache(fcinfo, argidx); |
5675 | |
|
5676 | 0 | for (i = argidx; i < PG_NARGS(); i++) |
5677 | 0 | { |
5678 | 0 | if (!PG_ARGISNULL(i)) |
5679 | 0 | { |
5680 | 0 | Datum value = PG_GETARG_DATUM(i); |
5681 | | |
5682 | | /* add separator if appropriate */ |
5683 | 0 | if (first_arg) |
5684 | 0 | first_arg = false; |
5685 | 0 | else |
5686 | 0 | appendStringInfoString(&str, sepstr); |
5687 | | |
5688 | | /* call the appropriate type output function, append the result */ |
5689 | 0 | appendStringInfoString(&str, |
5690 | 0 | OutputFunctionCall(&foutcache[i], value)); |
5691 | 0 | } |
5692 | 0 | } |
5693 | |
|
5694 | 0 | result = cstring_to_text_with_len(str.data, str.len); |
5695 | 0 | pfree(str.data); |
5696 | |
|
5697 | 0 | return result; |
5698 | 0 | } |
5699 | | |
5700 | | /* |
5701 | | * Concatenate all arguments. NULL arguments are ignored. |
5702 | | */ |
5703 | | Datum |
5704 | | text_concat(PG_FUNCTION_ARGS) |
5705 | 0 | { |
5706 | 0 | text *result; |
5707 | |
|
5708 | 0 | result = concat_internal("", 0, fcinfo); |
5709 | 0 | if (result == NULL) |
5710 | 0 | PG_RETURN_NULL(); |
5711 | 0 | PG_RETURN_TEXT_P(result); |
5712 | 0 | } |
5713 | | |
5714 | | /* |
5715 | | * Concatenate all but first argument value with separators. The first |
5716 | | * parameter is used as the separator. NULL arguments are ignored. |
5717 | | */ |
5718 | | Datum |
5719 | | text_concat_ws(PG_FUNCTION_ARGS) |
5720 | 0 | { |
5721 | 0 | char *sep; |
5722 | 0 | text *result; |
5723 | | |
5724 | | /* return NULL when separator is NULL */ |
5725 | 0 | if (PG_ARGISNULL(0)) |
5726 | 0 | PG_RETURN_NULL(); |
5727 | 0 | sep = text_to_cstring(PG_GETARG_TEXT_PP(0)); |
5728 | |
|
5729 | 0 | result = concat_internal(sep, 1, fcinfo); |
5730 | 0 | if (result == NULL) |
5731 | 0 | PG_RETURN_NULL(); |
5732 | 0 | PG_RETURN_TEXT_P(result); |
5733 | 0 | } |
5734 | | |
5735 | | /* |
5736 | | * Return first n characters in the string. When n is negative, |
5737 | | * return all but last |n| characters. |
5738 | | */ |
5739 | | Datum |
5740 | | text_left(PG_FUNCTION_ARGS) |
5741 | 0 | { |
5742 | 0 | int n = PG_GETARG_INT32(1); |
5743 | |
|
5744 | 0 | if (n < 0) |
5745 | 0 | { |
5746 | 0 | text *str = PG_GETARG_TEXT_PP(0); |
5747 | 0 | const char *p = VARDATA_ANY(str); |
5748 | 0 | int len = VARSIZE_ANY_EXHDR(str); |
5749 | 0 | int rlen; |
5750 | |
|
5751 | 0 | n = pg_mbstrlen_with_len(p, len) + n; |
5752 | 0 | rlen = pg_mbcharcliplen(p, len, n); |
5753 | 0 | PG_RETURN_TEXT_P(cstring_to_text_with_len(p, rlen)); |
5754 | 0 | } |
5755 | 0 | else |
5756 | 0 | PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0), 1, n, false)); |
5757 | 0 | } |
5758 | | |
5759 | | /* |
5760 | | * Return last n characters in the string. When n is negative, |
5761 | | * return all but first |n| characters. |
5762 | | */ |
5763 | | Datum |
5764 | | text_right(PG_FUNCTION_ARGS) |
5765 | 0 | { |
5766 | 0 | text *str = PG_GETARG_TEXT_PP(0); |
5767 | 0 | const char *p = VARDATA_ANY(str); |
5768 | 0 | int len = VARSIZE_ANY_EXHDR(str); |
5769 | 0 | int n = PG_GETARG_INT32(1); |
5770 | 0 | int off; |
5771 | |
|
5772 | 0 | if (n < 0) |
5773 | 0 | n = -n; |
5774 | 0 | else |
5775 | 0 | n = pg_mbstrlen_with_len(p, len) - n; |
5776 | 0 | off = pg_mbcharcliplen(p, len, n); |
5777 | |
|
5778 | 0 | PG_RETURN_TEXT_P(cstring_to_text_with_len(p + off, len - off)); |
5779 | 0 | } |
5780 | | |
5781 | | /* |
5782 | | * Return reversed string |
5783 | | */ |
5784 | | Datum |
5785 | | text_reverse(PG_FUNCTION_ARGS) |
5786 | 0 | { |
5787 | 0 | text *str = PG_GETARG_TEXT_PP(0); |
5788 | 0 | const char *p = VARDATA_ANY(str); |
5789 | 0 | int len = VARSIZE_ANY_EXHDR(str); |
5790 | 0 | const char *endp = p + len; |
5791 | 0 | text *result; |
5792 | 0 | char *dst; |
5793 | |
|
5794 | 0 | result = palloc(len + VARHDRSZ); |
5795 | 0 | dst = (char *) VARDATA(result) + len; |
5796 | 0 | SET_VARSIZE(result, len + VARHDRSZ); |
5797 | |
|
5798 | 0 | if (pg_database_encoding_max_length() > 1) |
5799 | 0 | { |
5800 | | /* multibyte version */ |
5801 | 0 | while (p < endp) |
5802 | 0 | { |
5803 | 0 | int sz; |
5804 | |
|
5805 | 0 | sz = pg_mblen(p); |
5806 | 0 | dst -= sz; |
5807 | 0 | memcpy(dst, p, sz); |
5808 | 0 | p += sz; |
5809 | 0 | } |
5810 | 0 | } |
5811 | 0 | else |
5812 | 0 | { |
5813 | | /* single byte version */ |
5814 | 0 | while (p < endp) |
5815 | 0 | *(--dst) = *p++; |
5816 | 0 | } |
5817 | |
|
5818 | 0 | PG_RETURN_TEXT_P(result); |
5819 | 0 | } |
5820 | | |
5821 | | |
/*
 * Support macros for text_format()
 */

/* Bit in the "flags" bitmask produced by text_format_parse_format() */
#define TEXT_FORMAT_FLAG_MINUS	0x0001	/* is minus flag present? */

/*
 * Step "ptr" forward by one character within a format string.  If that
 * reaches or passes "end_ptr", the format specifier being parsed has no type
 * character, so an error is raised.  On normal completion at least one
 * character therefore remains available at "ptr".
 *
 * Note that "ptr" is modified in place, so it must be an lvalue.
 */
#define ADVANCE_PARSE_POINTER(ptr,end_ptr) \
	do { \
		if (++(ptr) >= (end_ptr)) \
			ereport(ERROR, \
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
					 errmsg("unterminated format() type specifier"), \
					 errhint("For a single \"%%\" use \"%%%%\"."))); \
	} while (0)
5835 | | |
5836 | | /* |
5837 | | * Returns a formatted string |
5838 | | */ |
5839 | | Datum |
5840 | | text_format(PG_FUNCTION_ARGS) |
5841 | 0 | { |
5842 | 0 | text *fmt; |
5843 | 0 | StringInfoData str; |
5844 | 0 | const char *cp; |
5845 | 0 | const char *start_ptr; |
5846 | 0 | const char *end_ptr; |
5847 | 0 | text *result; |
5848 | 0 | int arg; |
5849 | 0 | bool funcvariadic; |
5850 | 0 | int nargs; |
5851 | 0 | Datum *elements = NULL; |
5852 | 0 | bool *nulls = NULL; |
5853 | 0 | Oid element_type = InvalidOid; |
5854 | 0 | Oid prev_type = InvalidOid; |
5855 | 0 | Oid prev_width_type = InvalidOid; |
5856 | 0 | FmgrInfo typoutputfinfo; |
5857 | 0 | FmgrInfo typoutputinfo_width; |
5858 | | |
5859 | | /* When format string is null, immediately return null */ |
5860 | 0 | if (PG_ARGISNULL(0)) |
5861 | 0 | PG_RETURN_NULL(); |
5862 | | |
5863 | | /* If argument is marked VARIADIC, expand array into elements */ |
5864 | 0 | if (get_fn_expr_variadic(fcinfo->flinfo)) |
5865 | 0 | { |
5866 | 0 | ArrayType *arr; |
5867 | 0 | int16 elmlen; |
5868 | 0 | bool elmbyval; |
5869 | 0 | char elmalign; |
5870 | 0 | int nitems; |
5871 | | |
5872 | | /* Should have just the one argument */ |
5873 | 0 | Assert(PG_NARGS() == 2); |
5874 | | |
5875 | | /* If argument is NULL, we treat it as zero-length array */ |
5876 | 0 | if (PG_ARGISNULL(1)) |
5877 | 0 | nitems = 0; |
5878 | 0 | else |
5879 | 0 | { |
5880 | | /* |
5881 | | * Non-null argument had better be an array. We assume that any |
5882 | | * call context that could let get_fn_expr_variadic return true |
5883 | | * will have checked that a VARIADIC-labeled parameter actually is |
5884 | | * an array. So it should be okay to just Assert that it's an |
5885 | | * array rather than doing a full-fledged error check. |
5886 | | */ |
5887 | 0 | Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, 1)))); |
5888 | | |
5889 | | /* OK, safe to fetch the array value */ |
5890 | 0 | arr = PG_GETARG_ARRAYTYPE_P(1); |
5891 | | |
5892 | | /* Get info about array element type */ |
5893 | 0 | element_type = ARR_ELEMTYPE(arr); |
5894 | 0 | get_typlenbyvalalign(element_type, |
5895 | 0 | &elmlen, &elmbyval, &elmalign); |
5896 | | |
5897 | | /* Extract all array elements */ |
5898 | 0 | deconstruct_array(arr, element_type, elmlen, elmbyval, elmalign, |
5899 | 0 | &elements, &nulls, &nitems); |
5900 | 0 | } |
5901 | |
|
5902 | 0 | nargs = nitems + 1; |
5903 | 0 | funcvariadic = true; |
5904 | 0 | } |
5905 | 0 | else |
5906 | 0 | { |
5907 | | /* Non-variadic case, we'll process the arguments individually */ |
5908 | 0 | nargs = PG_NARGS(); |
5909 | 0 | funcvariadic = false; |
5910 | 0 | } |
5911 | | |
5912 | | /* Setup for main loop. */ |
5913 | 0 | fmt = PG_GETARG_TEXT_PP(0); |
5914 | 0 | start_ptr = VARDATA_ANY(fmt); |
5915 | 0 | end_ptr = start_ptr + VARSIZE_ANY_EXHDR(fmt); |
5916 | 0 | initStringInfo(&str); |
5917 | 0 | arg = 1; /* next argument position to print */ |
5918 | | |
5919 | | /* Scan format string, looking for conversion specifiers. */ |
5920 | 0 | for (cp = start_ptr; cp < end_ptr; cp++) |
5921 | 0 | { |
5922 | 0 | int argpos; |
5923 | 0 | int widthpos; |
5924 | 0 | int flags; |
5925 | 0 | int width; |
5926 | 0 | Datum value; |
5927 | 0 | bool isNull; |
5928 | 0 | Oid typid; |
5929 | | |
5930 | | /* |
5931 | | * If it's not the start of a conversion specifier, just copy it to |
5932 | | * the output buffer. |
5933 | | */ |
5934 | 0 | if (*cp != '%') |
5935 | 0 | { |
5936 | 0 | appendStringInfoCharMacro(&str, *cp); |
5937 | 0 | continue; |
5938 | 0 | } |
5939 | | |
5940 | 0 | ADVANCE_PARSE_POINTER(cp, end_ptr); |
5941 | | |
5942 | | /* Easy case: %% outputs a single % */ |
5943 | 0 | if (*cp == '%') |
5944 | 0 | { |
5945 | 0 | appendStringInfoCharMacro(&str, *cp); |
5946 | 0 | continue; |
5947 | 0 | } |
5948 | | |
5949 | | /* Parse the optional portions of the format specifier */ |
5950 | 0 | cp = text_format_parse_format(cp, end_ptr, |
5951 | 0 | &argpos, &widthpos, |
5952 | 0 | &flags, &width); |
5953 | | |
5954 | | /* |
5955 | | * Next we should see the main conversion specifier. Whether or not |
5956 | | * an argument position was present, it's known that at least one |
5957 | | * character remains in the string at this point. Experience suggests |
5958 | | * that it's worth checking that that character is one of the expected |
5959 | | * ones before we try to fetch arguments, so as to produce the least |
5960 | | * confusing response to a mis-formatted specifier. |
5961 | | */ |
5962 | 0 | if (strchr("sIL", *cp) == NULL) |
5963 | 0 | ereport(ERROR, |
5964 | 0 | (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
5965 | 0 | errmsg("unrecognized format() type specifier \"%.*s\"", |
5966 | 0 | pg_mblen(cp), cp), |
5967 | 0 | errhint("For a single \"%%\" use \"%%%%\"."))); |
5968 | | |
5969 | | /* If indirect width was specified, get its value */ |
5970 | 0 | if (widthpos >= 0) |
5971 | 0 | { |
5972 | | /* Collect the specified or next argument position */ |
5973 | 0 | if (widthpos > 0) |
5974 | 0 | arg = widthpos; |
5975 | 0 | if (arg >= nargs) |
5976 | 0 | ereport(ERROR, |
5977 | 0 | (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
5978 | 0 | errmsg("too few arguments for format()"))); |
5979 | | |
5980 | | /* Get the value and type of the selected argument */ |
5981 | 0 | if (!funcvariadic) |
5982 | 0 | { |
5983 | 0 | value = PG_GETARG_DATUM(arg); |
5984 | 0 | isNull = PG_ARGISNULL(arg); |
5985 | 0 | typid = get_fn_expr_argtype(fcinfo->flinfo, arg); |
5986 | 0 | } |
5987 | 0 | else |
5988 | 0 | { |
5989 | 0 | value = elements[arg - 1]; |
5990 | 0 | isNull = nulls[arg - 1]; |
5991 | 0 | typid = element_type; |
5992 | 0 | } |
5993 | 0 | if (!OidIsValid(typid)) |
5994 | 0 | elog(ERROR, "could not determine data type of format() input"); |
5995 | | |
5996 | 0 | arg++; |
5997 | | |
5998 | | /* We can treat NULL width the same as zero */ |
5999 | 0 | if (isNull) |
6000 | 0 | width = 0; |
6001 | 0 | else if (typid == INT4OID) |
6002 | 0 | width = DatumGetInt32(value); |
6003 | 0 | else if (typid == INT2OID) |
6004 | 0 | width = DatumGetInt16(value); |
6005 | 0 | else |
6006 | 0 | { |
6007 | | /* For less-usual datatypes, convert to text then to int */ |
6008 | 0 | char *str; |
6009 | |
|
6010 | 0 | if (typid != prev_width_type) |
6011 | 0 | { |
6012 | 0 | Oid typoutputfunc; |
6013 | 0 | bool typIsVarlena; |
6014 | |
|
6015 | 0 | getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena); |
6016 | 0 | fmgr_info(typoutputfunc, &typoutputinfo_width); |
6017 | 0 | prev_width_type = typid; |
6018 | 0 | } |
6019 | |
|
6020 | 0 | str = OutputFunctionCall(&typoutputinfo_width, value); |
6021 | | |
6022 | | /* pg_strtoint32 will complain about bad data or overflow */ |
6023 | 0 | width = pg_strtoint32(str); |
6024 | |
|
6025 | 0 | pfree(str); |
6026 | 0 | } |
6027 | 0 | } |
6028 | | |
6029 | | /* Collect the specified or next argument position */ |
6030 | 0 | if (argpos > 0) |
6031 | 0 | arg = argpos; |
6032 | 0 | if (arg >= nargs) |
6033 | 0 | ereport(ERROR, |
6034 | 0 | (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
6035 | 0 | errmsg("too few arguments for format()"))); |
6036 | | |
6037 | | /* Get the value and type of the selected argument */ |
6038 | 0 | if (!funcvariadic) |
6039 | 0 | { |
6040 | 0 | value = PG_GETARG_DATUM(arg); |
6041 | 0 | isNull = PG_ARGISNULL(arg); |
6042 | 0 | typid = get_fn_expr_argtype(fcinfo->flinfo, arg); |
6043 | 0 | } |
6044 | 0 | else |
6045 | 0 | { |
6046 | 0 | value = elements[arg - 1]; |
6047 | 0 | isNull = nulls[arg - 1]; |
6048 | 0 | typid = element_type; |
6049 | 0 | } |
6050 | 0 | if (!OidIsValid(typid)) |
6051 | 0 | elog(ERROR, "could not determine data type of format() input"); |
6052 | | |
6053 | 0 | arg++; |
6054 | | |
6055 | | /* |
6056 | | * Get the appropriate typOutput function, reusing previous one if |
6057 | | * same type as previous argument. That's particularly useful in the |
6058 | | * variadic-array case, but often saves work even for ordinary calls. |
6059 | | */ |
6060 | 0 | if (typid != prev_type) |
6061 | 0 | { |
6062 | 0 | Oid typoutputfunc; |
6063 | 0 | bool typIsVarlena; |
6064 | |
|
6065 | 0 | getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena); |
6066 | 0 | fmgr_info(typoutputfunc, &typoutputfinfo); |
6067 | 0 | prev_type = typid; |
6068 | 0 | } |
6069 | | |
6070 | | /* |
6071 | | * And now we can format the value. |
6072 | | */ |
6073 | 0 | switch (*cp) |
6074 | 0 | { |
6075 | 0 | case 's': |
6076 | 0 | case 'I': |
6077 | 0 | case 'L': |
6078 | 0 | text_format_string_conversion(&str, *cp, &typoutputfinfo, |
6079 | 0 | value, isNull, |
6080 | 0 | flags, width); |
6081 | 0 | break; |
6082 | 0 | default: |
6083 | | /* should not get here, because of previous check */ |
6084 | 0 | ereport(ERROR, |
6085 | 0 | (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
6086 | 0 | errmsg("unrecognized format() type specifier \"%.*s\"", |
6087 | 0 | pg_mblen(cp), cp), |
6088 | 0 | errhint("For a single \"%%\" use \"%%%%\"."))); |
6089 | 0 | break; |
6090 | 0 | } |
6091 | 0 | } |
6092 | | |
6093 | | /* Don't need deconstruct_array results anymore. */ |
6094 | 0 | if (elements != NULL) |
6095 | 0 | pfree(elements); |
6096 | 0 | if (nulls != NULL) |
6097 | 0 | pfree(nulls); |
6098 | | |
6099 | | /* Generate results. */ |
6100 | 0 | result = cstring_to_text_with_len(str.data, str.len); |
6101 | 0 | pfree(str.data); |
6102 | |
|
6103 | 0 | PG_RETURN_TEXT_P(result); |
6104 | 0 | } |
6105 | | |
6106 | | /* |
6107 | | * Parse contiguous digits as a decimal number. |
6108 | | * |
6109 | | * Returns true if some digits could be parsed. |
6110 | | * The value is returned into *value, and *ptr is advanced to the next |
6111 | | * character to be parsed. |
6112 | | * |
6113 | | * Note parsing invariant: at least one character is known available before |
6114 | | * string end (end_ptr) at entry, and this is still true at exit. |
6115 | | */ |
6116 | | static bool |
6117 | | text_format_parse_digits(const char **ptr, const char *end_ptr, int *value) |
6118 | 0 | { |
6119 | 0 | bool found = false; |
6120 | 0 | const char *cp = *ptr; |
6121 | 0 | int val = 0; |
6122 | |
|
6123 | 0 | while (*cp >= '0' && *cp <= '9') |
6124 | 0 | { |
6125 | 0 | int8 digit = (*cp - '0'); |
6126 | |
|
6127 | 0 | if (unlikely(pg_mul_s32_overflow(val, 10, &val)) || |
6128 | 0 | unlikely(pg_add_s32_overflow(val, digit, &val))) |
6129 | 0 | ereport(ERROR, |
6130 | 0 | (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), |
6131 | 0 | errmsg("number is out of range"))); |
6132 | 0 | ADVANCE_PARSE_POINTER(cp, end_ptr); |
6133 | 0 | found = true; |
6134 | 0 | } |
6135 | | |
6136 | 0 | *ptr = cp; |
6137 | 0 | *value = val; |
6138 | |
|
6139 | 0 | return found; |
6140 | 0 | } |
6141 | | |
6142 | | /* |
6143 | | * Parse a format specifier (generally following the SUS printf spec). |
6144 | | * |
6145 | | * We have already advanced over the initial '%', and we are looking for |
6146 | | * [argpos][flags][width]type (but the type character is not consumed here). |
6147 | | * |
6148 | | * Inputs are start_ptr (the position after '%') and end_ptr (string end + 1). |
6149 | | * Output parameters: |
6150 | | * argpos: argument position for value to be printed. -1 means unspecified. |
6151 | | * widthpos: argument position for width. Zero means the argument position |
6152 | | * was unspecified (ie, take the next arg) and -1 means no width |
6153 | | * argument (width was omitted or specified as a constant). |
6154 | | * flags: bitmask of flags. |
6155 | | * width: directly-specified width value. Zero means the width was omitted |
6156 | | * (note it's not necessary to distinguish this case from an explicit |
6157 | | * zero width value). |
6158 | | * |
6159 | | * The function result is the next character position to be parsed, ie, the |
6160 | | * location where the type character is/should be. |
6161 | | * |
6162 | | * Note parsing invariant: at least one character is known available before |
6163 | | * string end (end_ptr) at entry, and this is still true at exit. |
6164 | | */ |
6165 | | static const char * |
6166 | | text_format_parse_format(const char *start_ptr, const char *end_ptr, |
6167 | | int *argpos, int *widthpos, |
6168 | | int *flags, int *width) |
6169 | 0 | { |
6170 | 0 | const char *cp = start_ptr; |
6171 | 0 | int n; |
6172 | | |
6173 | | /* set defaults for output parameters */ |
6174 | 0 | *argpos = -1; |
6175 | 0 | *widthpos = -1; |
6176 | 0 | *flags = 0; |
6177 | 0 | *width = 0; |
6178 | | |
6179 | | /* try to identify first number */ |
6180 | 0 | if (text_format_parse_digits(&cp, end_ptr, &n)) |
6181 | 0 | { |
6182 | 0 | if (*cp != '$') |
6183 | 0 | { |
6184 | | /* Must be just a width and a type, so we're done */ |
6185 | 0 | *width = n; |
6186 | 0 | return cp; |
6187 | 0 | } |
6188 | | /* The number was argument position */ |
6189 | 0 | *argpos = n; |
6190 | | /* Explicit 0 for argument index is immediately refused */ |
6191 | 0 | if (n == 0) |
6192 | 0 | ereport(ERROR, |
6193 | 0 | (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
6194 | 0 | errmsg("format specifies argument 0, but arguments are numbered from 1"))); |
6195 | 0 | ADVANCE_PARSE_POINTER(cp, end_ptr); |
6196 | 0 | } |
6197 | | |
6198 | | /* Handle flags (only minus is supported now) */ |
6199 | 0 | while (*cp == '-') |
6200 | 0 | { |
6201 | 0 | *flags |= TEXT_FORMAT_FLAG_MINUS; |
6202 | 0 | ADVANCE_PARSE_POINTER(cp, end_ptr); |
6203 | 0 | } |
6204 | | |
6205 | 0 | if (*cp == '*') |
6206 | 0 | { |
6207 | | /* Handle indirect width */ |
6208 | 0 | ADVANCE_PARSE_POINTER(cp, end_ptr); |
6209 | 0 | if (text_format_parse_digits(&cp, end_ptr, &n)) |
6210 | 0 | { |
6211 | | /* number in this position must be closed by $ */ |
6212 | 0 | if (*cp != '$') |
6213 | 0 | ereport(ERROR, |
6214 | 0 | (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
6215 | 0 | errmsg("width argument position must be ended by \"$\""))); |
6216 | | /* The number was width argument position */ |
6217 | 0 | *widthpos = n; |
6218 | | /* Explicit 0 for argument index is immediately refused */ |
6219 | 0 | if (n == 0) |
6220 | 0 | ereport(ERROR, |
6221 | 0 | (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
6222 | 0 | errmsg("format specifies argument 0, but arguments are numbered from 1"))); |
6223 | 0 | ADVANCE_PARSE_POINTER(cp, end_ptr); |
6224 | 0 | } |
6225 | 0 | else |
6226 | 0 | *widthpos = 0; /* width's argument position is unspecified */ |
6227 | 0 | } |
6228 | 0 | else |
6229 | 0 | { |
6230 | | /* Check for direct width specification */ |
6231 | 0 | if (text_format_parse_digits(&cp, end_ptr, &n)) |
6232 | 0 | *width = n; |
6233 | 0 | } |
6234 | | |
6235 | | /* cp should now be pointing at type character */ |
6236 | 0 | return cp; |
6237 | 0 | } |
6238 | | |
6239 | | /* |
6240 | | * Format a %s, %I, or %L conversion |
6241 | | */ |
6242 | | static void |
6243 | | text_format_string_conversion(StringInfo buf, char conversion, |
6244 | | FmgrInfo *typOutputInfo, |
6245 | | Datum value, bool isNull, |
6246 | | int flags, int width) |
6247 | 0 | { |
6248 | 0 | char *str; |
6249 | | |
6250 | | /* Handle NULL arguments before trying to stringify the value. */ |
6251 | 0 | if (isNull) |
6252 | 0 | { |
6253 | 0 | if (conversion == 's') |
6254 | 0 | text_format_append_string(buf, "", flags, width); |
6255 | 0 | else if (conversion == 'L') |
6256 | 0 | text_format_append_string(buf, "NULL", flags, width); |
6257 | 0 | else if (conversion == 'I') |
6258 | 0 | ereport(ERROR, |
6259 | 0 | (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), |
6260 | 0 | errmsg("null values cannot be formatted as an SQL identifier"))); |
6261 | 0 | return; |
6262 | 0 | } |
6263 | | |
6264 | | /* Stringify. */ |
6265 | 0 | str = OutputFunctionCall(typOutputInfo, value); |
6266 | | |
6267 | | /* Escape. */ |
6268 | 0 | if (conversion == 'I') |
6269 | 0 | { |
6270 | | /* quote_identifier may or may not allocate a new string. */ |
6271 | 0 | text_format_append_string(buf, quote_identifier(str), flags, width); |
6272 | 0 | } |
6273 | 0 | else if (conversion == 'L') |
6274 | 0 | { |
6275 | 0 | char *qstr = quote_literal_cstr(str); |
6276 | |
|
6277 | 0 | text_format_append_string(buf, qstr, flags, width); |
6278 | | /* quote_literal_cstr() always allocates a new string */ |
6279 | 0 | pfree(qstr); |
6280 | 0 | } |
6281 | 0 | else |
6282 | 0 | text_format_append_string(buf, str, flags, width); |
6283 | | |
6284 | | /* Cleanup. */ |
6285 | 0 | pfree(str); |
6286 | 0 | } |
6287 | | |
6288 | | /* |
6289 | | * Append str to buf, padding as directed by flags/width |
6290 | | */ |
6291 | | static void |
6292 | | text_format_append_string(StringInfo buf, const char *str, |
6293 | | int flags, int width) |
6294 | 0 | { |
6295 | 0 | bool align_to_left = false; |
6296 | 0 | int len; |
6297 | | |
6298 | | /* fast path for typical easy case */ |
6299 | 0 | if (width == 0) |
6300 | 0 | { |
6301 | 0 | appendStringInfoString(buf, str); |
6302 | 0 | return; |
6303 | 0 | } |
6304 | | |
6305 | 0 | if (width < 0) |
6306 | 0 | { |
6307 | | /* Negative width: implicit '-' flag, then take absolute value */ |
6308 | 0 | align_to_left = true; |
6309 | | /* -INT_MIN is undefined */ |
6310 | 0 | if (width <= INT_MIN) |
6311 | 0 | ereport(ERROR, |
6312 | 0 | (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), |
6313 | 0 | errmsg("number is out of range"))); |
6314 | 0 | width = -width; |
6315 | 0 | } |
6316 | 0 | else if (flags & TEXT_FORMAT_FLAG_MINUS) |
6317 | 0 | align_to_left = true; |
6318 | | |
6319 | 0 | len = pg_mbstrlen(str); |
6320 | 0 | if (align_to_left) |
6321 | 0 | { |
6322 | | /* left justify */ |
6323 | 0 | appendStringInfoString(buf, str); |
6324 | 0 | if (len < width) |
6325 | 0 | appendStringInfoSpaces(buf, width - len); |
6326 | 0 | } |
6327 | 0 | else |
6328 | 0 | { |
6329 | | /* right justify */ |
6330 | 0 | if (len < width) |
6331 | 0 | appendStringInfoSpaces(buf, width - len); |
6332 | 0 | appendStringInfoString(buf, str); |
6333 | 0 | } |
6334 | 0 | } |
6335 | | |
6336 | | /* |
6337 | | * text_format_nv - nonvariadic wrapper for text_format function. |
6338 | | * |
6339 | | * note: this wrapper is necessary to pass the sanity check in opr_sanity, |
6340 | | * which checks that all built-in functions that share the implementing C |
6341 | | * function take the same number of arguments. |
6342 | | */ |
6343 | | Datum |
6344 | | text_format_nv(PG_FUNCTION_ARGS) |
6345 | 0 | { |
6346 | 0 | return text_format(fcinfo); |
6347 | 0 | } |
6348 | | |
/*
 * Helper for the Levenshtein distance functions: report whether the first
 * len bytes of s1 and s2 are identical.  The bytes are compared from the
 * last one backwards.  Faster than memcmp() for this use case.
 *
 * A len of zero (or less) compares nothing and yields true.
 */
static inline bool
rest_of_char_same(const char *s1, const char *s2, int len)
{
	for (int remaining = len; remaining > 0; remaining--)
	{
		if (s1[remaining - 1] != s2[remaining - 1])
			return false;
	}

	return true;
}
6364 | | |
6365 | | /* Expand each Levenshtein distance variant */ |
6366 | | #include "levenshtein.c" |
6367 | | #define LEVENSHTEIN_LESS_EQUAL |
6368 | | #include "levenshtein.c" |
6369 | | |
6370 | | |
6371 | | /* |
6372 | | * The following *ClosestMatch() functions can be used to determine whether a |
6373 | | * user-provided string resembles any known valid values, which is useful for |
6374 | | * providing hints in log messages, among other things. Use these functions |
6375 | | * like so: |
6376 | | * |
6377 | | * initClosestMatch(&state, source_string, max_distance); |
6378 | | * |
6379 | | * for (int i = 0; i < num_valid_strings; i++) |
6380 | | * updateClosestMatch(&state, valid_strings[i]); |
6381 | | * |
6382 | | * closestMatch = getClosestMatch(&state); |
6383 | | */ |
6384 | | |
6385 | | /* |
6386 | | * Initialize the given state with the source string and maximum Levenshtein |
6387 | | * distance to consider. |
6388 | | */ |
6389 | | void |
6390 | | initClosestMatch(ClosestMatchState *state, const char *source, int max_d) |
6391 | 0 | { |
6392 | 0 | Assert(state); |
6393 | 0 | Assert(max_d >= 0); |
6394 | |
|
6395 | 0 | state->source = source; |
6396 | 0 | state->min_d = -1; |
6397 | 0 | state->max_d = max_d; |
6398 | 0 | state->match = NULL; |
6399 | 0 | } |
6400 | | |
6401 | | /* |
6402 | | * If the candidate string is a closer match than the current one saved (or |
6403 | | * there is no match saved), save it as the closest match. |
6404 | | * |
6405 | | * If the source or candidate string is NULL, empty, or too long, this function |
6406 | | * takes no action. Likewise, if the Levenshtein distance exceeds the maximum |
6407 | | * allowed or more than half the characters are different, no action is taken. |
6408 | | */ |
6409 | | void |
6410 | | updateClosestMatch(ClosestMatchState *state, const char *candidate) |
6411 | 0 | { |
6412 | 0 | int dist; |
6413 | |
|
6414 | 0 | Assert(state); |
6415 | |
|
6416 | 0 | if (state->source == NULL || state->source[0] == '\0' || |
6417 | 0 | candidate == NULL || candidate[0] == '\0') |
6418 | 0 | return; |
6419 | | |
6420 | | /* |
6421 | | * To avoid ERROR-ing, we check the lengths here instead of setting |
6422 | | * 'trusted' to false in the call to varstr_levenshtein_less_equal(). |
6423 | | */ |
6424 | 0 | if (strlen(state->source) > MAX_LEVENSHTEIN_STRLEN || |
6425 | 0 | strlen(candidate) > MAX_LEVENSHTEIN_STRLEN) |
6426 | 0 | return; |
6427 | | |
6428 | 0 | dist = varstr_levenshtein_less_equal(state->source, strlen(state->source), |
6429 | 0 | candidate, strlen(candidate), 1, 1, 1, |
6430 | 0 | state->max_d, true); |
6431 | 0 | if (dist <= state->max_d && |
6432 | 0 | dist <= strlen(state->source) / 2 && |
6433 | 0 | (state->min_d == -1 || dist < state->min_d)) |
6434 | 0 | { |
6435 | 0 | state->min_d = dist; |
6436 | 0 | state->match = candidate; |
6437 | 0 | } |
6438 | 0 | } |
6439 | | |
6440 | | /* |
6441 | | * Return the closest match. If no suitable candidates were provided via |
6442 | | * updateClosestMatch(), return NULL. |
6443 | | */ |
6444 | | const char * |
6445 | | getClosestMatch(ClosestMatchState *state) |
6446 | 0 | { |
6447 | 0 | Assert(state); |
6448 | |
|
6449 | 0 | return state->match; |
6450 | 0 | } |
6451 | | |
6452 | | |
6453 | | /* |
6454 | | * Unicode support |
6455 | | */ |
6456 | | |
6457 | | static UnicodeNormalizationForm |
6458 | | unicode_norm_form_from_string(const char *formstr) |
6459 | 0 | { |
6460 | 0 | UnicodeNormalizationForm form = -1; |
6461 | | |
6462 | | /* |
6463 | | * Might as well check this while we're here. |
6464 | | */ |
6465 | 0 | if (GetDatabaseEncoding() != PG_UTF8) |
6466 | 0 | ereport(ERROR, |
6467 | 0 | (errcode(ERRCODE_SYNTAX_ERROR), |
6468 | 0 | errmsg("Unicode normalization can only be performed if server encoding is UTF8"))); |
6469 | | |
6470 | 0 | if (pg_strcasecmp(formstr, "NFC") == 0) |
6471 | 0 | form = UNICODE_NFC; |
6472 | 0 | else if (pg_strcasecmp(formstr, "NFD") == 0) |
6473 | 0 | form = UNICODE_NFD; |
6474 | 0 | else if (pg_strcasecmp(formstr, "NFKC") == 0) |
6475 | 0 | form = UNICODE_NFKC; |
6476 | 0 | else if (pg_strcasecmp(formstr, "NFKD") == 0) |
6477 | 0 | form = UNICODE_NFKD; |
6478 | 0 | else |
6479 | 0 | ereport(ERROR, |
6480 | 0 | (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
6481 | 0 | errmsg("invalid normalization form: %s", formstr))); |
6482 | | |
6483 | 0 | return form; |
6484 | 0 | } |
6485 | | |
6486 | | /* |
6487 | | * Returns version of Unicode used by Postgres in "major.minor" format (the |
6488 | | * same format as the Unicode version reported by ICU). The third component |
6489 | | * ("update version") never involves additions to the character repertoire and |
6490 | | * is unimportant for most purposes. |
6491 | | * |
6492 | | * See: https://unicode.org/versions/ |
6493 | | */ |
6494 | | Datum |
6495 | | unicode_version(PG_FUNCTION_ARGS) |
6496 | 0 | { |
6497 | 0 | PG_RETURN_TEXT_P(cstring_to_text(PG_UNICODE_VERSION)); |
6498 | 0 | } |
6499 | | |
6500 | | /* |
6501 | | * Returns version of Unicode used by ICU, if enabled; otherwise NULL. |
6502 | | */ |
6503 | | Datum |
6504 | | icu_unicode_version(PG_FUNCTION_ARGS) |
6505 | 0 | { |
6506 | 0 | #ifdef USE_ICU |
6507 | 0 | PG_RETURN_TEXT_P(cstring_to_text(U_UNICODE_VERSION)); |
6508 | | #else |
6509 | | PG_RETURN_NULL(); |
6510 | | #endif |
6511 | 0 | } |
6512 | | |
6513 | | /* |
6514 | | * Check whether the string contains only assigned Unicode code |
6515 | | * points. Requires that the database encoding is UTF-8. |
6516 | | */ |
6517 | | Datum |
6518 | | unicode_assigned(PG_FUNCTION_ARGS) |
6519 | 0 | { |
6520 | 0 | text *input = PG_GETARG_TEXT_PP(0); |
6521 | 0 | unsigned char *p; |
6522 | 0 | int size; |
6523 | |
|
6524 | 0 | if (GetDatabaseEncoding() != PG_UTF8) |
6525 | 0 | ereport(ERROR, |
6526 | 0 | (errmsg("Unicode categorization can only be performed if server encoding is UTF8"))); |
6527 | | |
6528 | | /* convert to pg_wchar */ |
6529 | 0 | size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input)); |
6530 | 0 | p = (unsigned char *) VARDATA_ANY(input); |
6531 | 0 | for (int i = 0; i < size; i++) |
6532 | 0 | { |
6533 | 0 | pg_wchar uchar = utf8_to_unicode(p); |
6534 | 0 | int category = unicode_category(uchar); |
6535 | |
|
6536 | 0 | if (category == PG_U_UNASSIGNED) |
6537 | 0 | PG_RETURN_BOOL(false); |
6538 | | |
6539 | 0 | p += pg_utf_mblen(p); |
6540 | 0 | } |
6541 | | |
6542 | 0 | PG_RETURN_BOOL(true); |
6543 | 0 | } |
6544 | | |
6545 | | Datum |
6546 | | unicode_normalize_func(PG_FUNCTION_ARGS) |
6547 | 0 | { |
6548 | 0 | text *input = PG_GETARG_TEXT_PP(0); |
6549 | 0 | char *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1)); |
6550 | 0 | UnicodeNormalizationForm form; |
6551 | 0 | int size; |
6552 | 0 | pg_wchar *input_chars; |
6553 | 0 | pg_wchar *output_chars; |
6554 | 0 | unsigned char *p; |
6555 | 0 | text *result; |
6556 | 0 | int i; |
6557 | |
|
6558 | 0 | form = unicode_norm_form_from_string(formstr); |
6559 | | |
6560 | | /* convert to pg_wchar */ |
6561 | 0 | size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input)); |
6562 | 0 | input_chars = palloc((size + 1) * sizeof(pg_wchar)); |
6563 | 0 | p = (unsigned char *) VARDATA_ANY(input); |
6564 | 0 | for (i = 0; i < size; i++) |
6565 | 0 | { |
6566 | 0 | input_chars[i] = utf8_to_unicode(p); |
6567 | 0 | p += pg_utf_mblen(p); |
6568 | 0 | } |
6569 | 0 | input_chars[i] = (pg_wchar) '\0'; |
6570 | 0 | Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input)); |
6571 | | |
6572 | | /* action */ |
6573 | 0 | output_chars = unicode_normalize(form, input_chars); |
6574 | | |
6575 | | /* convert back to UTF-8 string */ |
6576 | 0 | size = 0; |
6577 | 0 | for (pg_wchar *wp = output_chars; *wp; wp++) |
6578 | 0 | { |
6579 | 0 | unsigned char buf[4]; |
6580 | |
|
6581 | 0 | unicode_to_utf8(*wp, buf); |
6582 | 0 | size += pg_utf_mblen(buf); |
6583 | 0 | } |
6584 | |
|
6585 | 0 | result = palloc(size + VARHDRSZ); |
6586 | 0 | SET_VARSIZE(result, size + VARHDRSZ); |
6587 | |
|
6588 | 0 | p = (unsigned char *) VARDATA_ANY(result); |
6589 | 0 | for (pg_wchar *wp = output_chars; *wp; wp++) |
6590 | 0 | { |
6591 | 0 | unicode_to_utf8(*wp, p); |
6592 | 0 | p += pg_utf_mblen(p); |
6593 | 0 | } |
6594 | 0 | Assert((char *) p == (char *) result + size + VARHDRSZ); |
6595 | |
|
6596 | 0 | PG_RETURN_TEXT_P(result); |
6597 | 0 | } |
6598 | | |
6599 | | /* |
6600 | | * Check whether the string is in the specified Unicode normalization form. |
6601 | | * |
6602 | | * This is done by converting the string to the specified normal form and then |
6603 | | * comparing that to the original string. To speed that up, we also apply the |
6604 | | * "quick check" algorithm specified in UAX #15, which can give a yes or no |
6605 | | * answer for many strings by just scanning the string once. |
6606 | | * |
6607 | | * This function should generally be optimized for the case where the string |
6608 | | * is in fact normalized. In that case, we'll end up looking at the entire |
6609 | | * string, so it's probably not worth doing any incremental conversion etc. |
6610 | | */ |
6611 | | Datum |
6612 | | unicode_is_normalized(PG_FUNCTION_ARGS) |
6613 | 0 | { |
6614 | 0 | text *input = PG_GETARG_TEXT_PP(0); |
6615 | 0 | char *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1)); |
6616 | 0 | UnicodeNormalizationForm form; |
6617 | 0 | int size; |
6618 | 0 | pg_wchar *input_chars; |
6619 | 0 | pg_wchar *output_chars; |
6620 | 0 | unsigned char *p; |
6621 | 0 | int i; |
6622 | 0 | UnicodeNormalizationQC quickcheck; |
6623 | 0 | int output_size; |
6624 | 0 | bool result; |
6625 | |
|
6626 | 0 | form = unicode_norm_form_from_string(formstr); |
6627 | | |
6628 | | /* convert to pg_wchar */ |
6629 | 0 | size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input)); |
6630 | 0 | input_chars = palloc((size + 1) * sizeof(pg_wchar)); |
6631 | 0 | p = (unsigned char *) VARDATA_ANY(input); |
6632 | 0 | for (i = 0; i < size; i++) |
6633 | 0 | { |
6634 | 0 | input_chars[i] = utf8_to_unicode(p); |
6635 | 0 | p += pg_utf_mblen(p); |
6636 | 0 | } |
6637 | 0 | input_chars[i] = (pg_wchar) '\0'; |
6638 | 0 | Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input)); |
6639 | | |
6640 | | /* quick check (see UAX #15) */ |
6641 | 0 | quickcheck = unicode_is_normalized_quickcheck(form, input_chars); |
6642 | 0 | if (quickcheck == UNICODE_NORM_QC_YES) |
6643 | 0 | PG_RETURN_BOOL(true); |
6644 | 0 | else if (quickcheck == UNICODE_NORM_QC_NO) |
6645 | 0 | PG_RETURN_BOOL(false); |
6646 | | |
6647 | | /* normalize and compare with original */ |
6648 | 0 | output_chars = unicode_normalize(form, input_chars); |
6649 | |
|
6650 | 0 | output_size = 0; |
6651 | 0 | for (pg_wchar *wp = output_chars; *wp; wp++) |
6652 | 0 | output_size++; |
6653 | |
|
6654 | 0 | result = (size == output_size) && |
6655 | 0 | (memcmp(input_chars, output_chars, size * sizeof(pg_wchar)) == 0); |
6656 | |
|
6657 | 0 | PG_RETURN_BOOL(result); |
6658 | 0 | } |
6659 | | |
/*
 * Report whether each of the first n bytes of instr is a hexadecimal digit.
 * Scanning stops at the first byte that is not; n == 0 yields true.
 */
static bool
isxdigits_n(const char *instr, size_t n)
{
	while (n-- > 0)
	{
		/* cast keeps high-bit bytes from reaching isxdigit() as negatives */
		if (!isxdigit((unsigned char) *instr++))
			return false;
	}

	return true;
}
6672 | | |
6673 | | static unsigned int |
6674 | | hexval(unsigned char c) |
6675 | 0 | { |
6676 | 0 | if (c >= '0' && c <= '9') |
6677 | 0 | return c - '0'; |
6678 | 0 | if (c >= 'a' && c <= 'f') |
6679 | 0 | return c - 'a' + 0xA; |
6680 | 0 | if (c >= 'A' && c <= 'F') |
6681 | 0 | return c - 'A' + 0xA; |
6682 | 0 | elog(ERROR, "invalid hexadecimal digit"); |
6683 | 0 | return 0; /* not reached */ |
6684 | 0 | } |
6685 | | |
/*
 * Convert the first n bytes of instr, taken as hexadecimal digits with the
 * most significant digit first, into a number.  hexval() raises an error for
 * any byte that is not a hexadecimal digit.
 */
static unsigned int
hexval_n(const char *instr, size_t n)
{
	unsigned int value = 0;

	/* Each further digit shifts the accumulated value up by one nibble. */
	for (size_t k = 0; k < n; k++)
		value = (value << 4) | hexval((unsigned char) instr[k]);

	return value;
}
6699 | | |
6700 | | /* |
6701 | | * Replaces Unicode escape sequences by Unicode characters |
6702 | | */ |
6703 | | Datum |
6704 | | unistr(PG_FUNCTION_ARGS) |
6705 | 0 | { |
6706 | 0 | text *input_text = PG_GETARG_TEXT_PP(0); |
6707 | 0 | char *instr; |
6708 | 0 | int len; |
6709 | 0 | StringInfoData str; |
6710 | 0 | text *result; |
6711 | 0 | pg_wchar pair_first = 0; |
6712 | 0 | char cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1]; |
6713 | |
|
6714 | 0 | instr = VARDATA_ANY(input_text); |
6715 | 0 | len = VARSIZE_ANY_EXHDR(input_text); |
6716 | |
|
6717 | 0 | initStringInfo(&str); |
6718 | |
|
6719 | 0 | while (len > 0) |
6720 | 0 | { |
6721 | 0 | if (instr[0] == '\\') |
6722 | 0 | { |
6723 | 0 | if (len >= 2 && |
6724 | 0 | instr[1] == '\\') |
6725 | 0 | { |
6726 | 0 | if (pair_first) |
6727 | 0 | goto invalid_pair; |
6728 | 0 | appendStringInfoChar(&str, '\\'); |
6729 | 0 | instr += 2; |
6730 | 0 | len -= 2; |
6731 | 0 | } |
6732 | 0 | else if ((len >= 5 && isxdigits_n(instr + 1, 4)) || |
6733 | 0 | (len >= 6 && instr[1] == 'u' && isxdigits_n(instr + 2, 4))) |
6734 | 0 | { |
6735 | 0 | pg_wchar unicode; |
6736 | 0 | int offset = instr[1] == 'u' ? 2 : 1; |
6737 | |
|
6738 | 0 | unicode = hexval_n(instr + offset, 4); |
6739 | |
|
6740 | 0 | if (!is_valid_unicode_codepoint(unicode)) |
6741 | 0 | ereport(ERROR, |
6742 | 0 | errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
6743 | 0 | errmsg("invalid Unicode code point: %04X", unicode)); |
6744 | | |
6745 | 0 | if (pair_first) |
6746 | 0 | { |
6747 | 0 | if (is_utf16_surrogate_second(unicode)) |
6748 | 0 | { |
6749 | 0 | unicode = surrogate_pair_to_codepoint(pair_first, unicode); |
6750 | 0 | pair_first = 0; |
6751 | 0 | } |
6752 | 0 | else |
6753 | 0 | goto invalid_pair; |
6754 | 0 | } |
6755 | 0 | else if (is_utf16_surrogate_second(unicode)) |
6756 | 0 | goto invalid_pair; |
6757 | | |
6758 | 0 | if (is_utf16_surrogate_first(unicode)) |
6759 | 0 | pair_first = unicode; |
6760 | 0 | else |
6761 | 0 | { |
6762 | 0 | pg_unicode_to_server(unicode, (unsigned char *) cbuf); |
6763 | 0 | appendStringInfoString(&str, cbuf); |
6764 | 0 | } |
6765 | |
|
6766 | 0 | instr += 4 + offset; |
6767 | 0 | len -= 4 + offset; |
6768 | 0 | } |
6769 | 0 | else if (len >= 8 && instr[1] == '+' && isxdigits_n(instr + 2, 6)) |
6770 | 0 | { |
6771 | 0 | pg_wchar unicode; |
6772 | |
|
6773 | 0 | unicode = hexval_n(instr + 2, 6); |
6774 | |
|
6775 | 0 | if (!is_valid_unicode_codepoint(unicode)) |
6776 | 0 | ereport(ERROR, |
6777 | 0 | errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
6778 | 0 | errmsg("invalid Unicode code point: %04X", unicode)); |
6779 | | |
6780 | 0 | if (pair_first) |
6781 | 0 | { |
6782 | 0 | if (is_utf16_surrogate_second(unicode)) |
6783 | 0 | { |
6784 | 0 | unicode = surrogate_pair_to_codepoint(pair_first, unicode); |
6785 | 0 | pair_first = 0; |
6786 | 0 | } |
6787 | 0 | else |
6788 | 0 | goto invalid_pair; |
6789 | 0 | } |
6790 | 0 | else if (is_utf16_surrogate_second(unicode)) |
6791 | 0 | goto invalid_pair; |
6792 | | |
6793 | 0 | if (is_utf16_surrogate_first(unicode)) |
6794 | 0 | pair_first = unicode; |
6795 | 0 | else |
6796 | 0 | { |
6797 | 0 | pg_unicode_to_server(unicode, (unsigned char *) cbuf); |
6798 | 0 | appendStringInfoString(&str, cbuf); |
6799 | 0 | } |
6800 | |
|
6801 | 0 | instr += 8; |
6802 | 0 | len -= 8; |
6803 | 0 | } |
6804 | 0 | else if (len >= 10 && instr[1] == 'U' && isxdigits_n(instr + 2, 8)) |
6805 | 0 | { |
6806 | 0 | pg_wchar unicode; |
6807 | |
|
6808 | 0 | unicode = hexval_n(instr + 2, 8); |
6809 | |
|
6810 | 0 | if (!is_valid_unicode_codepoint(unicode)) |
6811 | 0 | ereport(ERROR, |
6812 | 0 | errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
6813 | 0 | errmsg("invalid Unicode code point: %04X", unicode)); |
6814 | | |
6815 | 0 | if (pair_first) |
6816 | 0 | { |
6817 | 0 | if (is_utf16_surrogate_second(unicode)) |
6818 | 0 | { |
6819 | 0 | unicode = surrogate_pair_to_codepoint(pair_first, unicode); |
6820 | 0 | pair_first = 0; |
6821 | 0 | } |
6822 | 0 | else |
6823 | 0 | goto invalid_pair; |
6824 | 0 | } |
6825 | 0 | else if (is_utf16_surrogate_second(unicode)) |
6826 | 0 | goto invalid_pair; |
6827 | | |
6828 | 0 | if (is_utf16_surrogate_first(unicode)) |
6829 | 0 | pair_first = unicode; |
6830 | 0 | else |
6831 | 0 | { |
6832 | 0 | pg_unicode_to_server(unicode, (unsigned char *) cbuf); |
6833 | 0 | appendStringInfoString(&str, cbuf); |
6834 | 0 | } |
6835 | |
|
6836 | 0 | instr += 10; |
6837 | 0 | len -= 10; |
6838 | 0 | } |
6839 | 0 | else |
6840 | 0 | ereport(ERROR, |
6841 | 0 | (errcode(ERRCODE_SYNTAX_ERROR), |
6842 | 0 | errmsg("invalid Unicode escape"), |
6843 | 0 | errhint("Unicode escapes must be \\XXXX, \\+XXXXXX, \\uXXXX, or \\UXXXXXXXX."))); |
6844 | 0 | } |
6845 | 0 | else |
6846 | 0 | { |
6847 | 0 | if (pair_first) |
6848 | 0 | goto invalid_pair; |
6849 | | |
6850 | 0 | appendStringInfoChar(&str, *instr++); |
6851 | 0 | len--; |
6852 | 0 | } |
6853 | 0 | } |
6854 | | |
6855 | | /* unfinished surrogate pair? */ |
6856 | 0 | if (pair_first) |
6857 | 0 | goto invalid_pair; |
6858 | | |
6859 | 0 | result = cstring_to_text_with_len(str.data, str.len); |
6860 | 0 | pfree(str.data); |
6861 | |
|
6862 | 0 | PG_RETURN_TEXT_P(result); |
6863 | | |
6864 | 0 | invalid_pair: |
6865 | 0 | ereport(ERROR, |
6866 | 0 | (errcode(ERRCODE_SYNTAX_ERROR), |
6867 | 0 | errmsg("invalid Unicode surrogate pair"))); |
6868 | 0 | PG_RETURN_NULL(); /* keep compiler quiet */ |
6869 | 0 | } |