Coverage Report

Created: 2025-06-13 06:43

/src/php-src/ext/pcre/pcre2lib/pcre2_xclass.c
Line
Count
Source (jump to first uncovered line)
1
/*************************************************
2
*      Perl-Compatible Regular Expressions       *
3
*************************************************/
4
5
/* PCRE is a library of functions to support regular expressions whose syntax
6
and semantics are as close as possible to those of the Perl 5 language.
7
8
                       Written by Philip Hazel
9
     Original API code Copyright (c) 1997-2012 University of Cambridge
10
          New API code Copyright (c) 2016-2024 University of Cambridge
11
12
-----------------------------------------------------------------------------
13
Redistribution and use in source and binary forms, with or without
14
modification, are permitted provided that the following conditions are met:
15
16
    * Redistributions of source code must retain the above copyright notice,
17
      this list of conditions and the following disclaimer.
18
19
    * Redistributions in binary form must reproduce the above copyright
20
      notice, this list of conditions and the following disclaimer in the
21
      documentation and/or other materials provided with the distribution.
22
23
    * Neither the name of the University of Cambridge nor the names of its
24
      contributors may be used to endorse or promote products derived from
25
      this software without specific prior written permission.
26
27
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37
POSSIBILITY OF SUCH DAMAGE.
38
-----------------------------------------------------------------------------
39
*/
40
41
/* This module contains two internal functions that are used to match
42
OP_XCLASS and OP_ECLASS. It is used by pcre2_auto_possessify() and by both
43
pcre2_match() and pcre2_dfa_match(). */
44
45
46
#ifdef HAVE_CONFIG_H
47
#include "config.h"
48
#endif
49
50
51
#include "pcre2_internal.h"
52
53
/*************************************************
54
*       Match character against an XCLASS        *
55
*************************************************/
56
57
/* This function is called to match a character against an extended class that
58
might contain codepoints above 255 and/or Unicode properties.
59
60
Arguments:
61
  c           the character
62
  data        points to the flag code unit of the XCLASS data
63
  char_lists_end  end of the character lists; list data is found at an offset before it
  utf         TRUE if in UTF mode
64
65
Returns:      TRUE if character matches, else FALSE
66
*/
67
68
BOOL
69
PRIV(xclass)(uint32_t c, PCRE2_SPTR data, const uint8_t *char_lists_end, BOOL utf)
70
861k
{
71
/* Update PRIV(update_classbits) when this function is changed. */
72
861k
PCRE2_UCHAR t;
73
861k
BOOL not_negated = (*data & XCL_NOT) == 0;
74
861k
uint32_t type, max_index, min_index, value;
75
861k
const uint8_t *next_char;
76
77
861k
#if PCRE2_CODE_UNIT_WIDTH == 8
78
/* In 8 bit mode, this must always be TRUE. Help the compiler to know that. */
79
861k
utf = TRUE;
80
861k
#endif
81
82
/* Code points < 256 are matched against a bitmap, if one is present. */
83
84
861k
if ((*data++ & XCL_MAP) != 0)
85
861k
  {
86
861k
  if (c < 256)
87
858k
    return (((const uint8_t *)data)[c/8] & (1u << (c&7))) != 0;
88
  /* Skip bitmap. */
89
3.40k
  data += 32 / sizeof(PCRE2_UCHAR);
90
3.40k
  }
91
92
/* Match against the list of Unicode properties. We won't ever
93
encounter XCL_PROP or XCL_NOTPROP when Unicode support is not compiled in. */
94
3.40k
#ifdef SUPPORT_UNICODE
95
3.40k
if (*data == XCL_PROP || *data == XCL_NOTPROP)
96
637
  {
97
  /* The UCD record is the same for all properties. */
98
637
  const ucd_record *prop = GET_UCD(c);
99
100
637
  do
101
637
    {
102
637
    int chartype;
103
637
    BOOL isprop = (*data++) == XCL_PROP;
104
637
    BOOL ok;
105
106
637
    switch(*data)
107
637
      {
108
0
      case PT_LAMP:
109
0
      chartype = prop->chartype;
110
0
      if ((chartype == ucp_Lu || chartype == ucp_Ll ||
111
0
           chartype == ucp_Lt) == isprop) return not_negated;
112
0
      break;
113
114
0
      case PT_GC:
115
0
      if ((data[1] == PRIV(ucp_gentype)[prop->chartype]) == isprop)
116
0
        return not_negated;
117
0
      break;
118
119
5
      case PT_PC:
120
5
      if ((data[1] == prop->chartype) == isprop) return not_negated;
121
5
      break;
122
123
5
      case PT_SC:
124
0
      if ((data[1] == prop->script) == isprop) return not_negated;
125
0
      break;
126
127
0
      case PT_SCX:
128
0
      ok = (data[1] == prop->script ||
129
0
            MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), data[1]) != 0);
130
0
      if (ok == isprop) return not_negated;
131
0
      break;
132
133
0
      case PT_ALNUM:
134
0
      chartype = prop->chartype;
135
0
      if ((PRIV(ucp_gentype)[chartype] == ucp_L ||
136
0
           PRIV(ucp_gentype)[chartype] == ucp_N) == isprop)
137
0
        return not_negated;
138
0
      break;
139
140
      /* Perl space used to exclude VT, but from Perl 5.18 it is included,
141
      which means that Perl space and POSIX space are now identical. PCRE
142
      was changed at release 8.34. */
143
144
0
      case PT_SPACE:    /* Perl space */
145
0
      case PT_PXSPACE:  /* POSIX space */
146
0
      switch(c)
147
0
        {
148
0
        HSPACE_CASES:
149
0
        VSPACE_CASES:
150
0
        if (isprop) return not_negated;
151
0
        break;
152
153
0
        default:
154
0
        if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == isprop)
155
0
          return not_negated;
156
0
        break;
157
0
        }
158
0
      break;
159
160
632
      case PT_WORD:
161
632
      chartype = prop->chartype;
162
632
      if ((PRIV(ucp_gentype)[chartype] == ucp_L ||
163
632
           PRIV(ucp_gentype)[chartype] == ucp_N ||
164
632
           chartype == ucp_Mn || chartype == ucp_Pc) == isprop)
165
629
        return not_negated;
166
3
      break;
167
168
3
      case PT_UCNC:
169
0
      if (c < 0xa0)
170
0
        {
171
0
        if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
172
0
             c == CHAR_GRAVE_ACCENT) == isprop)
173
0
          return not_negated;
174
0
        }
175
0
      else
176
0
        {
177
0
        if ((c < 0xd800 || c > 0xdfff) == isprop)
178
0
          return not_negated;
179
0
        }
180
0
      break;
181
182
0
      case PT_BIDICL:
183
0
      if ((UCD_BIDICLASS_PROP(prop) == data[1]) == isprop)
184
0
        return not_negated;
185
0
      break;
186
187
0
      case PT_BOOL:
188
0
      ok = MAPBIT(PRIV(ucd_boolprop_sets) +
189
0
        UCD_BPROPS_PROP(prop), data[1]) != 0;
190
0
      if (ok == isprop) return not_negated;
191
0
      break;
192
193
      /* The following three properties can occur only in an XCLASS, as there
194
      is no \p or \P coding for them. */
195
196
      /* Graphic character. Implement this as not Z (space or separator) and
197
      not C (other), except for Cf (format) with a few exceptions. This seems
198
      to be what Perl does. The exceptional characters are:
199
200
      U+061C           Arabic Letter Mark
201
      U+180E           Mongolian Vowel Separator
202
      U+2066 - U+2069  Various "isolate"s
203
      */
204
205
0
      case PT_PXGRAPH:
206
0
      chartype = prop->chartype;
207
0
      if ((PRIV(ucp_gentype)[chartype] != ucp_Z &&
208
0
            (PRIV(ucp_gentype)[chartype] != ucp_C ||
209
0
              (chartype == ucp_Cf &&
210
0
                c != 0x061c && c != 0x180e && (c < 0x2066 || c > 0x2069))
211
0
         )) == isprop)
212
0
        return not_negated;
213
0
      break;
214
215
      /* Printable character: same as graphic, with the addition of Zs, i.e.
216
      not Zl and not Zp, and U+180E. */
217
218
0
      case PT_PXPRINT:
219
0
      chartype = prop->chartype;
220
0
      if ((chartype != ucp_Zl &&
221
0
           chartype != ucp_Zp &&
222
0
            (PRIV(ucp_gentype)[chartype] != ucp_C ||
223
0
              (chartype == ucp_Cf &&
224
0
                c != 0x061c && (c < 0x2066 || c > 0x2069))
225
0
         )) == isprop)
226
0
        return not_negated;
227
0
      break;
228
229
      /* Punctuation: all Unicode punctuation, plus ASCII characters that
230
      Unicode treats as symbols rather than punctuation, for Perl
231
      compatibility (these are $+<=>^`|~). */
232
233
0
      case PT_PXPUNCT:
234
0
      chartype = prop->chartype;
235
0
      if ((PRIV(ucp_gentype)[chartype] == ucp_P ||
236
0
            (c < 128 && PRIV(ucp_gentype)[chartype] == ucp_S)) == isprop)
237
0
        return not_negated;
238
0
      break;
239
240
      /* Perl has two sets of hex digits */
241
242
0
      case PT_PXXDIGIT:
243
0
      if (((c >= CHAR_0 && c <= CHAR_9) ||
244
0
           (c >= CHAR_A && c <= CHAR_F) ||
245
0
           (c >= CHAR_a && c <= CHAR_f) ||
246
0
           (c >= 0xff10 && c <= 0xff19) ||  /* Fullwidth digits */
247
0
           (c >= 0xff21 && c <= 0xff26) ||  /* Fullwidth letters */
248
0
           (c >= 0xff41 && c <= 0xff46)) == isprop)
249
0
        return not_negated;
250
0
      break;
251
252
      /* This should never occur, but compilers may mutter if there is no
253
      default. */
254
255
0
      default:
256
0
      PCRE2_DEBUG_UNREACHABLE();
257
0
      return FALSE;
258
637
      }
259
260
8
    data += 2;
261
8
    }
262
637
  while (*data == XCL_PROP || *data == XCL_NOTPROP);
263
637
  }
264
#else
265
  (void)utf;  /* Avoid compiler warning */
266
#endif  /* SUPPORT_UNICODE */
267
268
/* Match against large chars or ranges that end with a large char. */
269
2.78k
if (*data < XCL_LIST)
270
2.78k
  {
271
4.68k
  while ((t = *data++) != XCL_END)
272
3.93k
    {
273
3.93k
    uint32_t x, y;
274
275
3.93k
#ifdef SUPPORT_UNICODE
276
3.93k
    if (utf)
277
3.93k
      {
278
3.93k
      GETCHARINC(x, data); /* macro generates multiple statements */
279
3.93k
      }
280
0
    else
281
0
#endif
282
0
      x = *data++;
283
284
3.93k
    if (t == XCL_SINGLE)
285
3.85k
      {
286
      /* Since character ranges follow the properties, and they are
287
      sorted, early return is possible for all characters <= x. */
288
3.85k
      if (c <= x) return (c == x) ? not_negated : !not_negated;
289
1.90k
      continue;
290
3.85k
      }
291
292
84
    PCRE2_ASSERT(t == XCL_RANGE);
293
84
#ifdef SUPPORT_UNICODE
294
84
    if (utf)
295
84
      {
296
84
      GETCHARINC(y, data); /* macro generates multiple statements */
297
84
      }
298
0
    else
299
0
#endif
300
0
      y = *data++;
301
302
    /* Since character ranges follow the properties, and they are
303
    sorted, early return is possible for all characters <= y. */
304
84
    if (c <= y) return (c >= x) ? not_negated : !not_negated;
305
84
    }
306
307
747
  return !not_negated;   /* char did not match */
308
2.78k
  }
309
310
0
#if PCRE2_CODE_UNIT_WIDTH == 8
311
0
type = (uint32_t)(data[0] << 8) | data[1];
312
0
data += 2;
313
#else
314
type = data[0];
315
data++;
316
#endif  /* CODE_UNIT_WIDTH */
317
318
/* Locate the character list: it starts (GET(data, 0) << 1) bytes before char_lists_end. */
319
0
next_char = char_lists_end - (GET(data, 0) << 1);
320
0
type &= XCL_TYPE_MASK;
321
322
/* Alignment check. */
323
0
PCRE2_ASSERT(((uintptr_t)next_char & 0x1) == 0);
324
325
0
if (c >= XCL_CHAR_LIST_HIGH_16_START)
326
0
  {
327
0
  max_index = type & XCL_ITEM_COUNT_MASK;
328
0
  if (max_index == XCL_ITEM_COUNT_MASK)
329
0
    {
330
0
    max_index = *(const uint16_t*)next_char;
331
0
    PCRE2_ASSERT(max_index >= XCL_ITEM_COUNT_MASK);
332
0
    next_char += 2;
333
0
    }
334
335
0
  next_char += max_index << 1;
336
0
  type >>= XCL_TYPE_BIT_LEN;
337
0
  }
338
339
0
if (c < XCL_CHAR_LIST_LOW_32_START)
340
0
  {
341
0
  max_index = type & XCL_ITEM_COUNT_MASK;
342
343
0
  c = (uint16_t)((c << XCL_CHAR_SHIFT) | XCL_CHAR_END);
344
345
0
  if (max_index == XCL_ITEM_COUNT_MASK)
346
0
    {
347
0
    max_index = *(const uint16_t*)next_char;
348
0
    PCRE2_ASSERT(max_index >= XCL_ITEM_COUNT_MASK);
349
0
    next_char += 2;
350
0
    }
351
352
0
  if (max_index == 0 || c < *(const uint16_t*)next_char)
353
0
    return ((type & XCL_BEGIN_WITH_RANGE) != 0) == not_negated;
354
355
0
  min_index = 0;
356
0
  value = ((const uint16_t*)next_char)[--max_index];
357
0
  if (c >= value)
358
0
    return (value == c || (value & XCL_CHAR_END) == 0) == not_negated;
359
360
0
  max_index--;
361
362
  /* Binary search of a range. */
363
0
  while (TRUE)
364
0
    {
365
0
    uint32_t mid_index = (min_index + max_index) >> 1;
366
0
    value = ((const uint16_t*)next_char)[mid_index];
367
368
0
    if (c < value)
369
0
      max_index = mid_index - 1;
370
0
    else if (((const uint16_t*)next_char)[mid_index + 1] <= c)
371
0
      min_index = mid_index + 1;
372
0
    else
373
0
      return (value == c || (value & XCL_CHAR_END) == 0) == not_negated;
374
0
    }
375
0
  }
376
377
/* Skip the 16 bit ranges. */
378
0
max_index = type & XCL_ITEM_COUNT_MASK;
379
0
if (max_index == XCL_ITEM_COUNT_MASK)
380
0
  {
381
0
  max_index = *(const uint16_t*)next_char;
382
0
  PCRE2_ASSERT(max_index >= XCL_ITEM_COUNT_MASK);
383
0
  next_char += 2;
384
0
  }
385
386
0
next_char += (max_index << 1);
387
0
type >>= XCL_TYPE_BIT_LEN;
388
389
/* Alignment check. */
390
0
PCRE2_ASSERT(((uintptr_t)next_char & 0x3) == 0);
391
392
0
max_index = type & XCL_ITEM_COUNT_MASK;
393
394
#if PCRE2_CODE_UNIT_WIDTH == 32
395
if (c >= XCL_CHAR_LIST_HIGH_32_START)
396
  {
397
  if (max_index == XCL_ITEM_COUNT_MASK)
398
    {
399
    max_index = *(const uint32_t*)next_char;
400
    PCRE2_ASSERT(max_index >= XCL_ITEM_COUNT_MASK);
401
    next_char += 4;
402
    }
403
404
  next_char += max_index << 2;
405
  type >>= XCL_TYPE_BIT_LEN;
406
  max_index = type & XCL_ITEM_COUNT_MASK;
407
  }
408
#endif
409
410
0
c = (uint32_t)((c << XCL_CHAR_SHIFT) | XCL_CHAR_END);
411
412
0
if (max_index == XCL_ITEM_COUNT_MASK)
413
0
  {
414
0
  max_index = *(const uint32_t*)next_char;
415
0
  next_char += 4;
416
0
  }
417
418
0
if (max_index == 0 || c < *(const uint32_t*)next_char)
419
0
  return ((type & XCL_BEGIN_WITH_RANGE) != 0) == not_negated;
420
421
0
min_index = 0;
422
0
value = ((const uint32_t*)next_char)[--max_index];
423
0
if (c >= value)
424
0
  return (value == c || (value & XCL_CHAR_END) == 0) == not_negated;
425
426
0
max_index--;
427
428
/* Binary search of a range. */
429
0
while (TRUE)
430
0
  {
431
0
  uint32_t mid_index = (min_index + max_index) >> 1;
432
0
  value = ((const uint32_t*)next_char)[mid_index];
433
434
0
  if (c < value)
435
0
    max_index = mid_index - 1;
436
0
  else if (((const uint32_t*)next_char)[mid_index + 1] <= c)
437
0
    min_index = mid_index + 1;
438
0
  else
439
0
    return (value == c || (value & XCL_CHAR_END) == 0) == not_negated;
440
0
  }
441
0
}
442
443
444
445
/*************************************************
446
*       Match character against an ECLASS        *
447
*************************************************/
448
449
/* This function is called to match a character against an extended class
450
used for describing characters using boolean operations on sets.
451
452
Arguments:
453
  c           the character
454
  data_start  points to the start of the ECLASS data
455
  data_end    points one-past-the-last of the ECLASS data
456
  char_lists_end  passed through to the XCLASS matcher for each nested XCLASS
  utf         TRUE if in UTF mode
457
458
Returns:      TRUE if character matches, else FALSE
459
*/
460
461
BOOL
462
PRIV(eclass)(uint32_t c, PCRE2_SPTR data_start, PCRE2_SPTR data_end,
463
  const uint8_t *char_lists_end, BOOL utf)
464
0
{
465
0
PCRE2_SPTR ptr = data_start;
466
0
PCRE2_UCHAR flags;
467
0
uint32_t stack = 0;
468
0
int stack_depth = 0;
469
470
0
PCRE2_ASSERT(data_start < data_end);
471
0
flags = *ptr++;
472
0
PCRE2_ASSERT((flags & ECL_MAP) == 0 ||
473
0
             (data_end - ptr) >= 32 / (int)sizeof(PCRE2_UCHAR));
474
475
/* Code points < 256 are matched against a bitmap, if one is present.
476
Otherwise all codepoints are checked later. */
477
478
0
if ((flags & ECL_MAP) != 0)
479
0
  {
480
0
  if (c < 256)
481
0
    return (((const uint8_t *)ptr)[c/8] & (1u << (c&7))) != 0;
482
483
  /* Skip the bitmap. */
484
0
  ptr += 32 / sizeof(PCRE2_UCHAR);
485
0
  }
486
487
/* Do a little loop, until we reach the end of the ECLASS. */
488
0
while (ptr < data_end)
489
0
  {
490
0
  switch (*ptr)
491
0
    {
492
0
    case ECL_AND:
493
0
    ++ptr;
494
0
    stack = (stack >> 1) & (stack | ~(uint32_t)1u);
495
0
    PCRE2_ASSERT(stack_depth >= 2);
496
0
    --stack_depth;
497
0
    break;
498
499
0
    case ECL_OR:
500
0
    ++ptr;
501
0
    stack = (stack >> 1) | (stack & (uint32_t)1u);
502
0
    PCRE2_ASSERT(stack_depth >= 2);
503
0
    --stack_depth;
504
0
    break;
505
506
0
    case ECL_XOR:
507
0
    ++ptr;
508
0
    stack = (stack >> 1) ^ (stack & (uint32_t)1u);
509
0
    PCRE2_ASSERT(stack_depth >= 2);
510
0
    --stack_depth;
511
0
    break;
512
513
0
    case ECL_NOT:
514
0
    ++ptr;
515
0
    stack ^= (uint32_t)1u;
516
0
    PCRE2_ASSERT(stack_depth >= 1);
517
0
    break;
518
519
0
    case ECL_XCLASS:
520
0
      {
521
0
      uint32_t matched = PRIV(xclass)(c, ptr + 1 + LINK_SIZE, char_lists_end, utf);
522
523
0
      ptr += GET(ptr, 1);
524
0
      stack = (stack << 1) | matched;
525
0
      ++stack_depth;
526
0
      break;
527
0
      }
528
529
    /* This should never occur, but compilers may mutter if there is no
530
    default. */
531
532
0
    default:
533
0
    PCRE2_DEBUG_UNREACHABLE();
534
0
    return FALSE;
535
0
    }
536
0
  }
537
538
0
PCRE2_ASSERT(stack_depth == 1);
539
0
(void)stack_depth;  /* Ignore unused variable, if assertions are disabled. */
540
541
/* The final bit left on the stack now holds the match result. */
542
0
return (stack & 1u) != 0;
543
0
}
544
545
/* End of pcre2_xclass.c */