Coverage Report

Created: 2025-06-15 06:31

/src/postgres/src/backend/regex/regc_pg_locale.c
Line
Count
Source (jump to first uncovered line)
1
/*-------------------------------------------------------------------------
2
 *
3
 * regc_pg_locale.c
4
 *    ctype functions adapted to work on pg_wchar (a/k/a chr),
5
 *    and functions to cache the results of wholesale ctype probing.
6
 *
7
 * This file is #included by regcomp.c; it's not meant to compile standalone.
8
 *
9
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
10
 * Portions Copyright (c) 1994, Regents of the University of California
11
 *
12
 * IDENTIFICATION
13
 *    src/backend/regex/regc_pg_locale.c
14
 *
15
 *-------------------------------------------------------------------------
16
 */
17
18
#include "catalog/pg_collation.h"
19
#include "common/unicode_case.h"
20
#include "common/unicode_category.h"
21
#include "utils/pg_locale.h"
22
23
/*
24
 * For the libc provider, to provide as much functionality as possible on a
25
 * variety of platforms without going so far as to implement everything from
26
 * scratch, we use several implementation strategies depending on the
27
 * situation:
28
 *
29
 * 1. In C/POSIX collations, we use hard-wired code.  We can't depend on
30
 * the <ctype.h> functions since those will obey LC_CTYPE.  Note that these
31
 * collations don't give a fig about multibyte characters.
32
 *
33
 * 2. When working in UTF8 encoding, we use the <wctype.h> functions.
34
 * This assumes that every platform uses Unicode codepoints directly
35
 * as the wchar_t representation of Unicode.  (XXX: ICU makes this assumption
36
 * even for non-UTF8 encodings, which may be a problem.)  On some platforms
37
 * wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF.
38
 *
39
 * 3. In all other encodings, we use the <ctype.h> functions for pg_wchar
40
 * values up to 255, and punt for values above that.  This is 100% correct
41
 * only in single-byte encodings such as LATINn.  However, non-Unicode
42
 * multibyte encodings are mostly Far Eastern character sets for which the
43
 * properties being tested here aren't very relevant for higher code values
44
 * anyway.  The difficulty with using the <wctype.h> functions with
45
 * non-Unicode multibyte encodings is that we can have no certainty that
46
 * the platform's wchar_t representation matches what we do in pg_wchar
47
 * conversions.
48
 *
49
 * As a special case, in the "default" collation, (2) and (3) force ASCII
50
 * letters to follow ASCII upcase/downcase rules, while in a non-default
51
 * collation we just let the library functions do what they will.  The case
52
 * where this matters is treatment of I/i in Turkish, and the behavior is
53
 * meant to match the upper()/lower() SQL functions.
54
 *
55
 * We store the active collation setting in static variables.  In principle
56
 * it could be passed down to here via the regex library's "struct vars" data
57
 * structure; but that would require somewhat invasive changes in the regex
58
 * library, and right now there's no real benefit to be gained from that.
59
 *
60
 * NB: the coding here assumes pg_wchar is an unsigned type.
61
 */
62
63
/*
 * Implementation strategies for the character-classification and
 * case-mapping functions in this file.  Exactly one is active at a time.
 */
typedef enum
{
	/* C locale (encoding independent) */
	PG_REGEX_STRATEGY_C = 0,

	/* built-in Unicode semantics */
	PG_REGEX_STRATEGY_BUILTIN,

	/* Use locale_t <wctype.h> functions */
	PG_REGEX_STRATEGY_LIBC_WIDE,

	/* Use locale_t <ctype.h> functions */
	PG_REGEX_STRATEGY_LIBC_1BYTE,

	/* Use ICU uchar.h functions */
	PG_REGEX_STRATEGY_ICU,
} PG_Locale_Strategy;
71
72
/* Strategy chosen by the most recent pg_set_regex_collation() call */
static PG_Locale_Strategy pg_regex_strategy;
73
/*
 * Locale object stored by pg_set_regex_collation(); that function stores a
 * zero (null) value here whenever it selects PG_REGEX_STRATEGY_C.
 */
static pg_locale_t pg_regex_locale;
74
75
/*
 * Hard-wired character properties for C locale
 *
 * Each PG_ISxxx flag is one bit; pg_char_properties[] holds the flag set for
 * every 7-bit ASCII code.  Codes above 127 are not in the table.
 */
#define PG_ISDIGIT	0x01
#define PG_ISALPHA	0x02
#define PG_ISALNUM	(PG_ISDIGIT | PG_ISALPHA)
#define PG_ISUPPER	0x04
#define PG_ISLOWER	0x08
#define PG_ISGRAPH	0x10
#define PG_ISPRINT	0x20
#define PG_ISPUNCT	0x40
#define PG_ISSPACE	0x80

/* Shorthand for the flag combinations used below; undefined after the table */
#define PG_CP_PUNCT (PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT)
#define PG_CP_DIGIT (PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT)
#define PG_CP_UPPER (PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT)
#define PG_CP_LOWER (PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT)

static const unsigned char pg_char_properties[128] = {
	/* 0x00-0x08: NUL, ^A .. ^H (9 codes) */
	0, 0, 0, 0, 0, 0, 0, 0, 0,

	/* 0x09-0x0D: ^I ^J ^K ^L ^M (5 codes) */
	PG_ISSPACE, PG_ISSPACE, PG_ISSPACE, PG_ISSPACE, PG_ISSPACE,

	/* 0x0E-0x1F: ^N .. ^_ (18 codes) */
	0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0,

	/* 0x20: space */
	PG_ISPRINT | PG_ISSPACE,

	/* 0x21-0x2F: ! through / (15 codes) */
	PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT,
	PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT,
	PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT,

	/* 0x30-0x39: 0 through 9 (10 codes) */
	PG_CP_DIGIT, PG_CP_DIGIT, PG_CP_DIGIT, PG_CP_DIGIT, PG_CP_DIGIT,
	PG_CP_DIGIT, PG_CP_DIGIT, PG_CP_DIGIT, PG_CP_DIGIT, PG_CP_DIGIT,

	/* 0x3A-0x40: colon through @ (7 codes) */
	PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT,
	PG_CP_PUNCT, PG_CP_PUNCT,

	/* 0x41-0x5A: A through Z (26 codes) */
	PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER,
	PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER,
	PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER,
	PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER,
	PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER,
	PG_CP_UPPER,

	/* 0x5B-0x60: left bracket through backquote (6 codes) */
	PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT,
	PG_CP_PUNCT,

	/* 0x61-0x7A: a through z (26 codes) */
	PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER,
	PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER,
	PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER,
	PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER,
	PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER,
	PG_CP_LOWER,

	/* 0x7B-0x7E: left brace through tilde (4 codes) */
	PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT,

	/* 0x7F: DEL */
	0
};

#undef PG_CP_PUNCT
#undef PG_CP_DIGIT
#undef PG_CP_UPPER
#undef PG_CP_LOWER
218
219
220
/*
221
 * pg_set_regex_collation: set collation for these functions to obey
222
 *
223
 * This is called when beginning compilation or execution of a regexp.
224
 * Since there's no need for reentrancy of regexp operations, it's okay
225
 * to store the results in static variables.
226
 */
227
void
228
pg_set_regex_collation(Oid collation)
229
0
{
230
0
  pg_locale_t locale = 0;
231
0
  PG_Locale_Strategy strategy;
232
233
0
  if (!OidIsValid(collation))
234
0
  {
235
    /*
236
     * This typically means that the parser could not resolve a conflict
237
     * of implicit collations, so report it that way.
238
     */
239
0
    ereport(ERROR,
240
0
        (errcode(ERRCODE_INDETERMINATE_COLLATION),
241
0
         errmsg("could not determine which collation to use for regular expression"),
242
0
         errhint("Use the COLLATE clause to set the collation explicitly.")));
243
0
  }
244
245
0
  if (collation == C_COLLATION_OID)
246
0
  {
247
    /*
248
     * Some callers expect regexes to work for C_COLLATION_OID before
249
     * catalog access is available, so we can't call
250
     * pg_newlocale_from_collation().
251
     */
252
0
    strategy = PG_REGEX_STRATEGY_C;
253
0
    locale = 0;
254
0
  }
255
0
  else
256
0
  {
257
0
    locale = pg_newlocale_from_collation(collation);
258
259
0
    if (!locale->deterministic)
260
0
      ereport(ERROR,
261
0
          (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
262
0
           errmsg("nondeterministic collations are not supported for regular expressions")));
263
264
0
    if (locale->ctype_is_c)
265
0
    {
266
      /*
267
       * C/POSIX collations use this path regardless of database
268
       * encoding
269
       */
270
0
      strategy = PG_REGEX_STRATEGY_C;
271
0
      locale = 0;
272
0
    }
273
0
    else if (locale->provider == COLLPROVIDER_BUILTIN)
274
0
    {
275
0
      Assert(GetDatabaseEncoding() == PG_UTF8);
276
0
      strategy = PG_REGEX_STRATEGY_BUILTIN;
277
0
    }
278
0
#ifdef USE_ICU
279
0
    else if (locale->provider == COLLPROVIDER_ICU)
280
0
    {
281
0
      strategy = PG_REGEX_STRATEGY_ICU;
282
0
    }
283
0
#endif
284
0
    else
285
0
    {
286
0
      Assert(locale->provider == COLLPROVIDER_LIBC);
287
0
      if (GetDatabaseEncoding() == PG_UTF8)
288
0
        strategy = PG_REGEX_STRATEGY_LIBC_WIDE;
289
0
      else
290
0
        strategy = PG_REGEX_STRATEGY_LIBC_1BYTE;
291
0
    }
292
0
  }
293
294
0
  pg_regex_strategy = strategy;
295
0
  pg_regex_locale = locale;
296
0
}
297
298
static int
299
pg_wc_isdigit(pg_wchar c)
300
0
{
301
0
  switch (pg_regex_strategy)
302
0
  {
303
0
    case PG_REGEX_STRATEGY_C:
304
0
      return (c <= (pg_wchar) 127 &&
305
0
          (pg_char_properties[c] & PG_ISDIGIT));
306
0
    case PG_REGEX_STRATEGY_BUILTIN:
307
0
      return pg_u_isdigit(c, !pg_regex_locale->info.builtin.casemap_full);
308
0
    case PG_REGEX_STRATEGY_LIBC_WIDE:
309
0
      if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
310
0
        return iswdigit_l((wint_t) c, pg_regex_locale->info.lt);
311
      /* FALL THRU */
312
0
    case PG_REGEX_STRATEGY_LIBC_1BYTE:
313
0
      return (c <= (pg_wchar) UCHAR_MAX &&
314
0
          isdigit_l((unsigned char) c, pg_regex_locale->info.lt));
315
0
      break;
316
0
    case PG_REGEX_STRATEGY_ICU:
317
0
#ifdef USE_ICU
318
0
      return u_isdigit(c);
319
0
#endif
320
0
      break;
321
0
  }
322
0
  return 0;         /* can't get here, but keep compiler quiet */
323
0
}
324
325
static int
326
pg_wc_isalpha(pg_wchar c)
327
0
{
328
0
  switch (pg_regex_strategy)
329
0
  {
330
0
    case PG_REGEX_STRATEGY_C:
331
0
      return (c <= (pg_wchar) 127 &&
332
0
          (pg_char_properties[c] & PG_ISALPHA));
333
0
    case PG_REGEX_STRATEGY_BUILTIN:
334
0
      return pg_u_isalpha(c);
335
0
    case PG_REGEX_STRATEGY_LIBC_WIDE:
336
0
      if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
337
0
        return iswalpha_l((wint_t) c, pg_regex_locale->info.lt);
338
      /* FALL THRU */
339
0
    case PG_REGEX_STRATEGY_LIBC_1BYTE:
340
0
      return (c <= (pg_wchar) UCHAR_MAX &&
341
0
          isalpha_l((unsigned char) c, pg_regex_locale->info.lt));
342
0
      break;
343
0
    case PG_REGEX_STRATEGY_ICU:
344
0
#ifdef USE_ICU
345
0
      return u_isalpha(c);
346
0
#endif
347
0
      break;
348
0
  }
349
0
  return 0;         /* can't get here, but keep compiler quiet */
350
0
}
351
352
static int
353
pg_wc_isalnum(pg_wchar c)
354
0
{
355
0
  switch (pg_regex_strategy)
356
0
  {
357
0
    case PG_REGEX_STRATEGY_C:
358
0
      return (c <= (pg_wchar) 127 &&
359
0
          (pg_char_properties[c] & PG_ISALNUM));
360
0
    case PG_REGEX_STRATEGY_BUILTIN:
361
0
      return pg_u_isalnum(c, !pg_regex_locale->info.builtin.casemap_full);
362
0
    case PG_REGEX_STRATEGY_LIBC_WIDE:
363
0
      if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
364
0
        return iswalnum_l((wint_t) c, pg_regex_locale->info.lt);
365
      /* FALL THRU */
366
0
    case PG_REGEX_STRATEGY_LIBC_1BYTE:
367
0
      return (c <= (pg_wchar) UCHAR_MAX &&
368
0
          isalnum_l((unsigned char) c, pg_regex_locale->info.lt));
369
0
      break;
370
0
    case PG_REGEX_STRATEGY_ICU:
371
0
#ifdef USE_ICU
372
0
      return u_isalnum(c);
373
0
#endif
374
0
      break;
375
0
  }
376
0
  return 0;         /* can't get here, but keep compiler quiet */
377
0
}
378
379
static int
380
pg_wc_isword(pg_wchar c)
381
0
{
382
  /* We define word characters as alnum class plus underscore */
383
0
  if (c == CHR('_'))
384
0
    return 1;
385
0
  return pg_wc_isalnum(c);
386
0
}
387
388
static int
389
pg_wc_isupper(pg_wchar c)
390
0
{
391
0
  switch (pg_regex_strategy)
392
0
  {
393
0
    case PG_REGEX_STRATEGY_C:
394
0
      return (c <= (pg_wchar) 127 &&
395
0
          (pg_char_properties[c] & PG_ISUPPER));
396
0
    case PG_REGEX_STRATEGY_BUILTIN:
397
0
      return pg_u_isupper(c);
398
0
    case PG_REGEX_STRATEGY_LIBC_WIDE:
399
0
      if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
400
0
        return iswupper_l((wint_t) c, pg_regex_locale->info.lt);
401
      /* FALL THRU */
402
0
    case PG_REGEX_STRATEGY_LIBC_1BYTE:
403
0
      return (c <= (pg_wchar) UCHAR_MAX &&
404
0
          isupper_l((unsigned char) c, pg_regex_locale->info.lt));
405
0
      break;
406
0
    case PG_REGEX_STRATEGY_ICU:
407
0
#ifdef USE_ICU
408
0
      return u_isupper(c);
409
0
#endif
410
0
      break;
411
0
  }
412
0
  return 0;         /* can't get here, but keep compiler quiet */
413
0
}
414
415
static int
416
pg_wc_islower(pg_wchar c)
417
0
{
418
0
  switch (pg_regex_strategy)
419
0
  {
420
0
    case PG_REGEX_STRATEGY_C:
421
0
      return (c <= (pg_wchar) 127 &&
422
0
          (pg_char_properties[c] & PG_ISLOWER));
423
0
    case PG_REGEX_STRATEGY_BUILTIN:
424
0
      return pg_u_islower(c);
425
0
    case PG_REGEX_STRATEGY_LIBC_WIDE:
426
0
      if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
427
0
        return iswlower_l((wint_t) c, pg_regex_locale->info.lt);
428
      /* FALL THRU */
429
0
    case PG_REGEX_STRATEGY_LIBC_1BYTE:
430
0
      return (c <= (pg_wchar) UCHAR_MAX &&
431
0
          islower_l((unsigned char) c, pg_regex_locale->info.lt));
432
0
      break;
433
0
    case PG_REGEX_STRATEGY_ICU:
434
0
#ifdef USE_ICU
435
0
      return u_islower(c);
436
0
#endif
437
0
      break;
438
0
  }
439
0
  return 0;         /* can't get here, but keep compiler quiet */
440
0
}
441
442
static int
443
pg_wc_isgraph(pg_wchar c)
444
0
{
445
0
  switch (pg_regex_strategy)
446
0
  {
447
0
    case PG_REGEX_STRATEGY_C:
448
0
      return (c <= (pg_wchar) 127 &&
449
0
          (pg_char_properties[c] & PG_ISGRAPH));
450
0
    case PG_REGEX_STRATEGY_BUILTIN:
451
0
      return pg_u_isgraph(c);
452
0
    case PG_REGEX_STRATEGY_LIBC_WIDE:
453
0
      if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
454
0
        return iswgraph_l((wint_t) c, pg_regex_locale->info.lt);
455
      /* FALL THRU */
456
0
    case PG_REGEX_STRATEGY_LIBC_1BYTE:
457
0
      return (c <= (pg_wchar) UCHAR_MAX &&
458
0
          isgraph_l((unsigned char) c, pg_regex_locale->info.lt));
459
0
      break;
460
0
    case PG_REGEX_STRATEGY_ICU:
461
0
#ifdef USE_ICU
462
0
      return u_isgraph(c);
463
0
#endif
464
0
      break;
465
0
  }
466
0
  return 0;         /* can't get here, but keep compiler quiet */
467
0
}
468
469
static int
470
pg_wc_isprint(pg_wchar c)
471
0
{
472
0
  switch (pg_regex_strategy)
473
0
  {
474
0
    case PG_REGEX_STRATEGY_C:
475
0
      return (c <= (pg_wchar) 127 &&
476
0
          (pg_char_properties[c] & PG_ISPRINT));
477
0
    case PG_REGEX_STRATEGY_BUILTIN:
478
0
      return pg_u_isprint(c);
479
0
    case PG_REGEX_STRATEGY_LIBC_WIDE:
480
0
      if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
481
0
        return iswprint_l((wint_t) c, pg_regex_locale->info.lt);
482
      /* FALL THRU */
483
0
    case PG_REGEX_STRATEGY_LIBC_1BYTE:
484
0
      return (c <= (pg_wchar) UCHAR_MAX &&
485
0
          isprint_l((unsigned char) c, pg_regex_locale->info.lt));
486
0
      break;
487
0
    case PG_REGEX_STRATEGY_ICU:
488
0
#ifdef USE_ICU
489
0
      return u_isprint(c);
490
0
#endif
491
0
      break;
492
0
  }
493
0
  return 0;         /* can't get here, but keep compiler quiet */
494
0
}
495
496
static int
497
pg_wc_ispunct(pg_wchar c)
498
0
{
499
0
  switch (pg_regex_strategy)
500
0
  {
501
0
    case PG_REGEX_STRATEGY_C:
502
0
      return (c <= (pg_wchar) 127 &&
503
0
          (pg_char_properties[c] & PG_ISPUNCT));
504
0
    case PG_REGEX_STRATEGY_BUILTIN:
505
0
      return pg_u_ispunct(c, !pg_regex_locale->info.builtin.casemap_full);
506
0
    case PG_REGEX_STRATEGY_LIBC_WIDE:
507
0
      if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
508
0
        return iswpunct_l((wint_t) c, pg_regex_locale->info.lt);
509
      /* FALL THRU */
510
0
    case PG_REGEX_STRATEGY_LIBC_1BYTE:
511
0
      return (c <= (pg_wchar) UCHAR_MAX &&
512
0
          ispunct_l((unsigned char) c, pg_regex_locale->info.lt));
513
0
      break;
514
0
    case PG_REGEX_STRATEGY_ICU:
515
0
#ifdef USE_ICU
516
0
      return u_ispunct(c);
517
0
#endif
518
0
      break;
519
0
  }
520
0
  return 0;         /* can't get here, but keep compiler quiet */
521
0
}
522
523
static int
524
pg_wc_isspace(pg_wchar c)
525
0
{
526
0
  switch (pg_regex_strategy)
527
0
  {
528
0
    case PG_REGEX_STRATEGY_C:
529
0
      return (c <= (pg_wchar) 127 &&
530
0
          (pg_char_properties[c] & PG_ISSPACE));
531
0
    case PG_REGEX_STRATEGY_BUILTIN:
532
0
      return pg_u_isspace(c);
533
0
    case PG_REGEX_STRATEGY_LIBC_WIDE:
534
0
      if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
535
0
        return iswspace_l((wint_t) c, pg_regex_locale->info.lt);
536
      /* FALL THRU */
537
0
    case PG_REGEX_STRATEGY_LIBC_1BYTE:
538
0
      return (c <= (pg_wchar) UCHAR_MAX &&
539
0
          isspace_l((unsigned char) c, pg_regex_locale->info.lt));
540
0
      break;
541
0
    case PG_REGEX_STRATEGY_ICU:
542
0
#ifdef USE_ICU
543
0
      return u_isspace(c);
544
0
#endif
545
0
      break;
546
0
  }
547
0
  return 0;         /* can't get here, but keep compiler quiet */
548
0
}
549
550
static pg_wchar
551
pg_wc_toupper(pg_wchar c)
552
0
{
553
0
  switch (pg_regex_strategy)
554
0
  {
555
0
    case PG_REGEX_STRATEGY_C:
556
0
      if (c <= (pg_wchar) 127)
557
0
        return pg_ascii_toupper((unsigned char) c);
558
0
      return c;
559
0
    case PG_REGEX_STRATEGY_BUILTIN:
560
0
      return unicode_uppercase_simple(c);
561
0
    case PG_REGEX_STRATEGY_LIBC_WIDE:
562
      /* force C behavior for ASCII characters, per comments above */
563
0
      if (pg_regex_locale->is_default && c <= (pg_wchar) 127)
564
0
        return pg_ascii_toupper((unsigned char) c);
565
0
      if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
566
0
        return towupper_l((wint_t) c, pg_regex_locale->info.lt);
567
      /* FALL THRU */
568
0
    case PG_REGEX_STRATEGY_LIBC_1BYTE:
569
      /* force C behavior for ASCII characters, per comments above */
570
0
      if (pg_regex_locale->is_default && c <= (pg_wchar) 127)
571
0
        return pg_ascii_toupper((unsigned char) c);
572
0
      if (c <= (pg_wchar) UCHAR_MAX)
573
0
        return toupper_l((unsigned char) c, pg_regex_locale->info.lt);
574
0
      return c;
575
0
    case PG_REGEX_STRATEGY_ICU:
576
0
#ifdef USE_ICU
577
0
      return u_toupper(c);
578
0
#endif
579
0
      break;
580
0
  }
581
0
  return 0;         /* can't get here, but keep compiler quiet */
582
0
}
583
584
static pg_wchar
585
pg_wc_tolower(pg_wchar c)
586
0
{
587
0
  switch (pg_regex_strategy)
588
0
  {
589
0
    case PG_REGEX_STRATEGY_C:
590
0
      if (c <= (pg_wchar) 127)
591
0
        return pg_ascii_tolower((unsigned char) c);
592
0
      return c;
593
0
    case PG_REGEX_STRATEGY_BUILTIN:
594
0
      return unicode_lowercase_simple(c);
595
0
    case PG_REGEX_STRATEGY_LIBC_WIDE:
596
      /* force C behavior for ASCII characters, per comments above */
597
0
      if (pg_regex_locale->is_default && c <= (pg_wchar) 127)
598
0
        return pg_ascii_tolower((unsigned char) c);
599
0
      if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
600
0
        return towlower_l((wint_t) c, pg_regex_locale->info.lt);
601
      /* FALL THRU */
602
0
    case PG_REGEX_STRATEGY_LIBC_1BYTE:
603
      /* force C behavior for ASCII characters, per comments above */
604
0
      if (pg_regex_locale->is_default && c <= (pg_wchar) 127)
605
0
        return pg_ascii_tolower((unsigned char) c);
606
0
      if (c <= (pg_wchar) UCHAR_MAX)
607
0
        return tolower_l((unsigned char) c, pg_regex_locale->info.lt);
608
0
      return c;
609
0
    case PG_REGEX_STRATEGY_ICU:
610
0
#ifdef USE_ICU
611
0
      return u_tolower(c);
612
0
#endif
613
0
      break;
614
0
  }
615
0
  return 0;         /* can't get here, but keep compiler quiet */
616
0
}
617
618
619
/*
620
 * These functions cache the results of probing libc's ctype behavior for
621
 * all character codes of interest in a given encoding/collation.  The
622
 * result is provided as a "struct cvec", but notice that the representation
623
 * is a touch different from a cvec created by regc_cvec.c: we allocate the
624
 * chrs[] and ranges[] arrays separately from the struct so that we can
625
 * realloc them larger at need.  This is okay since the cvecs made here
626
 * should never be freed by freecvec().
627
 *
628
 * We use malloc not palloc since we mustn't lose control on out-of-memory;
629
 * the main regex code expects us to return a failure indication instead.
630
 */
631
632
/*
 * Signature of a character-class probe function (pg_wc_isalpha or a
 * sibling): takes a chr code and returns nonzero if it is in the class.
 */
typedef int (*pg_wc_probefunc) (pg_wchar c);
633
634
typedef struct pg_ctype_cache
635
{
636
  pg_wc_probefunc probefunc;  /* pg_wc_isalpha or a sibling */
637
  pg_locale_t locale;     /* locale this entry is for */
638
  struct cvec cv;       /* cache entry contents */
639
  struct pg_ctype_cache *next;  /* chain link */
640
} pg_ctype_cache;
641
642
/*
 * Head of the chain of cache entries; pg_ctype_get_cache() searches it and
 * pushes each newly built entry onto the front.
 */
static pg_ctype_cache *pg_ctype_cache_list = NULL;
643
644
/*
645
 * Add a chr or range to pcc->cv; return false if run out of memory
646
 */
647
static bool
648
store_match(pg_ctype_cache *pcc, pg_wchar chr1, int nchrs)
649
0
{
650
0
  chr      *newchrs;
651
652
0
  if (nchrs > 1)
653
0
  {
654
0
    if (pcc->cv.nranges >= pcc->cv.rangespace)
655
0
    {
656
0
      pcc->cv.rangespace *= 2;
657
0
      newchrs = (chr *) realloc(pcc->cv.ranges,
658
0
                    pcc->cv.rangespace * sizeof(chr) * 2);
659
0
      if (newchrs == NULL)
660
0
        return false;
661
0
      pcc->cv.ranges = newchrs;
662
0
    }
663
0
    pcc->cv.ranges[pcc->cv.nranges * 2] = chr1;
664
0
    pcc->cv.ranges[pcc->cv.nranges * 2 + 1] = chr1 + nchrs - 1;
665
0
    pcc->cv.nranges++;
666
0
  }
667
0
  else
668
0
  {
669
0
    assert(nchrs == 1);
670
0
    if (pcc->cv.nchrs >= pcc->cv.chrspace)
671
0
    {
672
0
      pcc->cv.chrspace *= 2;
673
0
      newchrs = (chr *) realloc(pcc->cv.chrs,
674
0
                    pcc->cv.chrspace * sizeof(chr));
675
0
      if (newchrs == NULL)
676
0
        return false;
677
0
      pcc->cv.chrs = newchrs;
678
0
    }
679
0
    pcc->cv.chrs[pcc->cv.nchrs++] = chr1;
680
0
  }
681
0
  return true;
682
0
}
683
684
/*
685
 * Given a probe function (e.g., pg_wc_isalpha) get a struct cvec for all
686
 * chrs satisfying the probe function.  The active collation is the one
687
 * previously set by pg_set_regex_collation.  Return NULL if out of memory.
688
 *
689
 * Note that the result must not be freed or modified by caller.
690
 */
691
static struct cvec *
692
pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode)
693
0
{
694
0
  pg_ctype_cache *pcc;
695
0
  pg_wchar  max_chr;
696
0
  pg_wchar  cur_chr;
697
0
  int     nmatches;
698
0
  chr      *newchrs;
699
700
  /*
701
   * Do we already have the answer cached?
702
   */
703
0
  for (pcc = pg_ctype_cache_list; pcc != NULL; pcc = pcc->next)
704
0
  {
705
0
    if (pcc->probefunc == probefunc &&
706
0
      pcc->locale == pg_regex_locale)
707
0
      return &pcc->cv;
708
0
  }
709
710
  /*
711
   * Nope, so initialize some workspace ...
712
   */
713
0
  pcc = (pg_ctype_cache *) malloc(sizeof(pg_ctype_cache));
714
0
  if (pcc == NULL)
715
0
    return NULL;
716
0
  pcc->probefunc = probefunc;
717
0
  pcc->locale = pg_regex_locale;
718
0
  pcc->cv.nchrs = 0;
719
0
  pcc->cv.chrspace = 128;
720
0
  pcc->cv.chrs = (chr *) malloc(pcc->cv.chrspace * sizeof(chr));
721
0
  pcc->cv.nranges = 0;
722
0
  pcc->cv.rangespace = 64;
723
0
  pcc->cv.ranges = (chr *) malloc(pcc->cv.rangespace * sizeof(chr) * 2);
724
0
  if (pcc->cv.chrs == NULL || pcc->cv.ranges == NULL)
725
0
    goto out_of_memory;
726
0
  pcc->cv.cclasscode = cclasscode;
727
728
  /*
729
   * Decide how many character codes we ought to look through.  In general
730
   * we don't go past MAX_SIMPLE_CHR; chr codes above that are handled at
731
   * runtime using the "high colormap" mechanism.  However, in C locale
732
   * there's no need to go further than 127, and if we only have a 1-byte
733
   * <ctype.h> API there's no need to go further than that can handle.
734
   *
735
   * If it's not MAX_SIMPLE_CHR that's constraining the search, mark the
736
   * output cvec as not having any locale-dependent behavior, since there
737
   * will be no need to do any run-time locale checks.  (The #if's here
738
   * would always be true for production values of MAX_SIMPLE_CHR, but it's
739
   * useful to allow it to be small for testing purposes.)
740
   */
741
0
  switch (pg_regex_strategy)
742
0
  {
743
0
    case PG_REGEX_STRATEGY_C:
744
0
#if MAX_SIMPLE_CHR >= 127
745
0
      max_chr = (pg_wchar) 127;
746
0
      pcc->cv.cclasscode = -1;
747
#else
748
      max_chr = (pg_wchar) MAX_SIMPLE_CHR;
749
#endif
750
0
      break;
751
0
    case PG_REGEX_STRATEGY_BUILTIN:
752
0
      max_chr = (pg_wchar) MAX_SIMPLE_CHR;
753
0
      break;
754
0
    case PG_REGEX_STRATEGY_LIBC_WIDE:
755
0
      max_chr = (pg_wchar) MAX_SIMPLE_CHR;
756
0
      break;
757
0
    case PG_REGEX_STRATEGY_LIBC_1BYTE:
758
0
#if MAX_SIMPLE_CHR >= UCHAR_MAX
759
0
      max_chr = (pg_wchar) UCHAR_MAX;
760
0
      pcc->cv.cclasscode = -1;
761
#else
762
      max_chr = (pg_wchar) MAX_SIMPLE_CHR;
763
#endif
764
0
      break;
765
0
    case PG_REGEX_STRATEGY_ICU:
766
0
      max_chr = (pg_wchar) MAX_SIMPLE_CHR;
767
0
      break;
768
0
    default:
769
0
      Assert(false);
770
0
      max_chr = 0;    /* can't get here, but keep compiler quiet */
771
0
      break;
772
0
  }
773
774
  /*
775
   * And scan 'em ...
776
   */
777
0
  nmatches = 0;       /* number of consecutive matches */
778
779
0
  for (cur_chr = 0; cur_chr <= max_chr; cur_chr++)
780
0
  {
781
0
    if ((*probefunc) (cur_chr))
782
0
      nmatches++;
783
0
    else if (nmatches > 0)
784
0
    {
785
0
      if (!store_match(pcc, cur_chr - nmatches, nmatches))
786
0
        goto out_of_memory;
787
0
      nmatches = 0;
788
0
    }
789
0
  }
790
791
0
  if (nmatches > 0)
792
0
    if (!store_match(pcc, cur_chr - nmatches, nmatches))
793
0
      goto out_of_memory;
794
795
  /*
796
   * We might have allocated more memory than needed, if so free it
797
   */
798
0
  if (pcc->cv.nchrs == 0)
799
0
  {
800
0
    free(pcc->cv.chrs);
801
0
    pcc->cv.chrs = NULL;
802
0
    pcc->cv.chrspace = 0;
803
0
  }
804
0
  else if (pcc->cv.nchrs < pcc->cv.chrspace)
805
0
  {
806
0
    newchrs = (chr *) realloc(pcc->cv.chrs,
807
0
                  pcc->cv.nchrs * sizeof(chr));
808
0
    if (newchrs == NULL)
809
0
      goto out_of_memory;
810
0
    pcc->cv.chrs = newchrs;
811
0
    pcc->cv.chrspace = pcc->cv.nchrs;
812
0
  }
813
0
  if (pcc->cv.nranges == 0)
814
0
  {
815
0
    free(pcc->cv.ranges);
816
0
    pcc->cv.ranges = NULL;
817
0
    pcc->cv.rangespace = 0;
818
0
  }
819
0
  else if (pcc->cv.nranges < pcc->cv.rangespace)
820
0
  {
821
0
    newchrs = (chr *) realloc(pcc->cv.ranges,
822
0
                  pcc->cv.nranges * sizeof(chr) * 2);
823
0
    if (newchrs == NULL)
824
0
      goto out_of_memory;
825
0
    pcc->cv.ranges = newchrs;
826
0
    pcc->cv.rangespace = pcc->cv.nranges;
827
0
  }
828
829
  /*
830
   * Success, link it into cache chain
831
   */
832
0
  pcc->next = pg_ctype_cache_list;
833
0
  pg_ctype_cache_list = pcc;
834
835
0
  return &pcc->cv;
836
837
  /*
838
   * Failure, clean up
839
   */
840
0
out_of_memory:
841
0
  free(pcc->cv.chrs);
842
0
  free(pcc->cv.ranges);
843
0
  free(pcc);
844
845
0
  return NULL;
846
0
}