Coverage Report

Created: 2025-07-23 06:33

/src/php-src/ext/pcre/pcre2lib/pcre2_xclass.c
Line
Count
Source (jump to first uncovered line)
1
/*************************************************
2
*      Perl-Compatible Regular Expressions       *
3
*************************************************/
4
5
/* PCRE is a library of functions to support regular expressions whose syntax
6
and semantics are as close as possible to those of the Perl 5 language.
7
8
                       Written by Philip Hazel
9
     Original API code Copyright (c) 1997-2012 University of Cambridge
10
          New API code Copyright (c) 2016-2024 University of Cambridge
11
12
-----------------------------------------------------------------------------
13
Redistribution and use in source and binary forms, with or without
14
modification, are permitted provided that the following conditions are met:
15
16
    * Redistributions of source code must retain the above copyright notice,
17
      this list of conditions and the following disclaimer.
18
19
    * Redistributions in binary form must reproduce the above copyright
20
      notice, this list of conditions and the following disclaimer in the
21
      documentation and/or other materials provided with the distribution.
22
23
    * Neither the name of the University of Cambridge nor the names of its
24
      contributors may be used to endorse or promote products derived from
25
      this software without specific prior written permission.
26
27
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37
POSSIBILITY OF SUCH DAMAGE.
38
-----------------------------------------------------------------------------
39
*/
40
41
/* This module contains two internal functions that are used to match
42
OP_XCLASS and OP_ECLASS. It is used by pcre2_auto_possessify() and by both
43
pcre2_match() and pcre2_dfa_match(). */
44
45
46
#ifdef HAVE_CONFIG_H
47
#include "config.h"
48
#endif
49
50
51
#include "pcre2_internal.h"
52
53
/*************************************************
54
*       Match character against an XCLASS        *
55
*************************************************/
56
57
/* This function is called to match a character against an extended class that
58
might contain codepoints above 255 and/or Unicode properties.
59
60
Arguments:
61
  c           the character
62
  data        points to the flag code unit of the XCLASS data
63
  utf         TRUE if in UTF mode
64
65
Returns:      TRUE if character matches, else FALSE
66
*/
67
68
BOOL
69
PRIV(xclass)(uint32_t c, PCRE2_SPTR data, const uint8_t *char_lists_end, BOOL utf)
70
410k
{
71
/* Update PRIV(update_classbits) when this function is changed. */
72
410k
PCRE2_UCHAR t;
73
410k
BOOL not_negated = (*data & XCL_NOT) == 0;
74
410k
uint32_t type, max_index, min_index, value;
75
410k
const uint8_t *next_char;
76
77
410k
#if PCRE2_CODE_UNIT_WIDTH == 8
78
/* In 8 bit mode, this must always be TRUE. Help the compiler to know that. */
79
410k
utf = TRUE;
80
410k
#endif
81
82
/* Code points < 256 are matched against a bitmap, if one is present. */
83
84
410k
if ((*data++ & XCL_MAP) != 0)
85
410k
  {
86
410k
  if (c < 256)
87
406k
    return (((const uint8_t *)data)[c/8] & (1u << (c&7))) != 0;
88
  /* Skip bitmap. */
89
4.81k
  data += 32 / sizeof(PCRE2_UCHAR);
90
4.81k
  }
91
92
/* Match against the list of Unicode properties. We won't ever
93
encounter XCL_PROP or XCL_NOTPROP when UTF support is not compiled. */
94
4.81k
#ifdef SUPPORT_UNICODE
95
4.81k
if (*data == XCL_PROP || *data == XCL_NOTPROP)
96
1.60k
  {
97
  /* The UCD record is the same for all properties. */
98
1.60k
  const ucd_record *prop = GET_UCD(c);
99
100
1.60k
  do
101
1.60k
    {
102
1.60k
    int chartype;
103
1.60k
    BOOL isprop = (*data++) == XCL_PROP;
104
1.60k
    BOOL ok;
105
106
1.60k
    switch(*data)
107
1.60k
      {
108
0
      case PT_LAMP:
109
0
      chartype = prop->chartype;
110
0
      if ((chartype == ucp_Lu || chartype == ucp_Ll ||
111
0
           chartype == ucp_Lt) == isprop) return not_negated;
112
0
      break;
113
114
0
      case PT_GC:
115
0
      if ((data[1] == PRIV(ucp_gentype)[prop->chartype]) == isprop)
116
0
        return not_negated;
117
0
      break;
118
119
53
      case PT_PC:
120
53
      if ((data[1] == prop->chartype) == isprop) return not_negated;
121
53
      break;
122
123
53
      case PT_SC:
124
0
      if ((data[1] == prop->script) == isprop) return not_negated;
125
0
      break;
126
127
0
      case PT_SCX:
128
0
      ok = (data[1] == prop->script ||
129
0
            MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), data[1]) != 0);
130
0
      if (ok == isprop) return not_negated;
131
0
      break;
132
133
0
      case PT_ALNUM:
134
0
      chartype = prop->chartype;
135
0
      if ((PRIV(ucp_gentype)[chartype] == ucp_L ||
136
0
           PRIV(ucp_gentype)[chartype] == ucp_N) == isprop)
137
0
        return not_negated;
138
0
      break;
139
140
      /* Perl space used to exclude VT, but from Perl 5.18 it is included,
141
      which means that Perl space and POSIX space are now identical. PCRE
142
      was changed at release 8.34. */
143
144
0
      case PT_SPACE:    /* Perl space */
145
0
      case PT_PXSPACE:  /* POSIX space */
146
0
      switch(c)
147
0
        {
148
0
        HSPACE_CASES:
149
0
        VSPACE_CASES:
150
0
        if (isprop) return not_negated;
151
0
        break;
152
153
0
        default:
154
0
        if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == isprop)
155
0
          return not_negated;
156
0
        break;
157
0
        }
158
0
      break;
159
160
1.54k
      case PT_WORD:
161
1.54k
      chartype = prop->chartype;
162
1.54k
      if ((PRIV(ucp_gentype)[chartype] == ucp_L ||
163
1.54k
           PRIV(ucp_gentype)[chartype] == ucp_N ||
164
1.54k
           chartype == ucp_Mn || chartype == ucp_Pc) == isprop)
165
1.54k
        return not_negated;
166
0
      break;
167
168
0
      case PT_UCNC:
169
0
      if (c < 0xa0)
170
0
        {
171
0
        if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
172
0
             c == CHAR_GRAVE_ACCENT) == isprop)
173
0
          return not_negated;
174
0
        }
175
0
      else
176
0
        {
177
0
        if ((c < 0xd800 || c > 0xdfff) == isprop)
178
0
          return not_negated;
179
0
        }
180
0
      break;
181
182
0
      case PT_BIDICL:
183
0
      if ((UCD_BIDICLASS_PROP(prop) == data[1]) == isprop)
184
0
        return not_negated;
185
0
      break;
186
187
0
      case PT_BOOL:
188
0
      ok = MAPBIT(PRIV(ucd_boolprop_sets) +
189
0
        UCD_BPROPS_PROP(prop), data[1]) != 0;
190
0
      if (ok == isprop) return not_negated;
191
0
      break;
192
193
      /* The following three properties can occur only in an XCLASS, as there
194
      is no \p or \P coding for them. */
195
196
      /* Graphic character. Implement this as not Z (space or separator) and
197
      not C (other), except for Cf (format) with a few exceptions. This seems
198
      to be what Perl does. The exceptional characters are:
199
200
      U+061C           Arabic Letter Mark
201
      U+180E           Mongolian Vowel Separator
202
      U+2066 - U+2069  Various "isolate"s
203
      */
204
205
0
      case PT_PXGRAPH:
206
0
      chartype = prop->chartype;
207
0
      if ((PRIV(ucp_gentype)[chartype] != ucp_Z &&
208
0
            (PRIV(ucp_gentype)[chartype] != ucp_C ||
209
0
              (chartype == ucp_Cf &&
210
0
                c != 0x061c && c != 0x180e && (c < 0x2066 || c > 0x2069))
211
0
         )) == isprop)
212
0
        return not_negated;
213
0
      break;
214
215
      /* Printable character: same as graphic, with the addition of Zs, i.e.
216
      not Zl and not Zp, and U+180E. */
217
218
0
      case PT_PXPRINT:
219
0
      chartype = prop->chartype;
220
0
      if ((chartype != ucp_Zl &&
221
0
           chartype != ucp_Zp &&
222
0
            (PRIV(ucp_gentype)[chartype] != ucp_C ||
223
0
              (chartype == ucp_Cf &&
224
0
                c != 0x061c && (c < 0x2066 || c > 0x2069))
225
0
         )) == isprop)
226
0
        return not_negated;
227
0
      break;
228
229
      /* Punctuation: all Unicode punctuation, plus ASCII characters that
230
      Unicode treats as symbols rather than punctuation, for Perl
231
      compatibility (these are $+<=>^`|~). */
232
233
0
      case PT_PXPUNCT:
234
0
      chartype = prop->chartype;
235
0
      if ((PRIV(ucp_gentype)[chartype] == ucp_P ||
236
0
            (c < 128 && PRIV(ucp_gentype)[chartype] == ucp_S)) == isprop)
237
0
        return not_negated;
238
0
      break;
239
240
      /* Perl has two sets of hex digits */
241
242
0
      case PT_PXXDIGIT:
243
0
      if (((c >= CHAR_0 && c <= CHAR_9) ||
244
0
           (c >= CHAR_A && c <= CHAR_F) ||
245
0
           (c >= CHAR_a && c <= CHAR_f) ||
246
0
           (c >= 0xff10 && c <= 0xff19) ||  /* Fullwidth digits */
247
0
           (c >= 0xff21 && c <= 0xff26) ||  /* Fullwidth letters */
248
0
           (c >= 0xff41 && c <= 0xff46)) == isprop)
249
0
        return not_negated;
250
0
      break;
251
252
      /* This should never occur, but compilers may mutter if there is no
253
      default. */
254
255
0
      default:
256
0
      PCRE2_DEBUG_UNREACHABLE();
257
0
      return FALSE;
258
1.60k
      }
259
260
53
    data += 2;
261
53
    }
262
1.60k
  while (*data == XCL_PROP || *data == XCL_NOTPROP);
263
1.60k
  }
264
#else
265
  (void)utf;  /* Avoid compiler warning */
266
#endif  /* SUPPORT_UNICODE */
267
268
/* Match against large chars or ranges that end with a large char. */
269
3.26k
if (*data < XCL_LIST)
270
3.26k
  {
271
5.50k
  while ((t = *data++) != XCL_END)
272
4.94k
    {
273
4.94k
    uint32_t x, y;
274
275
4.94k
#ifdef SUPPORT_UNICODE
276
4.94k
    if (utf)
277
4.94k
      {
278
4.94k
      GETCHARINC(x, data); /* macro generates multiple statements */
279
4.94k
      }
280
0
    else
281
0
#endif
282
0
      x = *data++;
283
284
4.94k
    if (t == XCL_SINGLE)
285
4.94k
      {
286
      /* Since character ranges follow the properties, and they are
287
      sorted, early return is possible for all characters <= x. */
288
4.94k
      if (c <= x) return (c == x) ? not_negated : !not_negated;
289
2.23k
      continue;
290
4.94k
      }
291
292
0
    PCRE2_ASSERT(t == XCL_RANGE);
293
0
#ifdef SUPPORT_UNICODE
294
0
    if (utf)
295
0
      {
296
0
      GETCHARINC(y, data); /* macro generates multiple statements */
297
0
      }
298
0
    else
299
0
#endif
300
0
      y = *data++;
301
302
    /* Since character ranges follow the properties, and they are
303
    sorted, early return is possible for all characters <= y. */
304
0
    if (c <= y) return (c >= x) ? not_negated : !not_negated;
305
0
    }
306
307
560
  return !not_negated;   /* char did not match */
308
3.26k
  }
309
310
0
#if PCRE2_CODE_UNIT_WIDTH == 8
311
0
type = (uint32_t)(data[0] << 8) | data[1];
312
0
data += 2;
313
#else
314
type = data[0];
315
data++;
316
#endif  /* CODE_UNIT_WIDTH */
317
318
/* Align characters. */
319
0
next_char = char_lists_end - (GET(data, 0) << 1);
320
0
type &= XCL_TYPE_MASK;
321
322
/* Alignment check. */
323
0
PCRE2_ASSERT(((uintptr_t)next_char & 0x1) == 0);
324
325
0
if (c >= XCL_CHAR_LIST_HIGH_16_START)
326
0
  {
327
0
  max_index = type & XCL_ITEM_COUNT_MASK;
328
0
  if (max_index == XCL_ITEM_COUNT_MASK)
329
0
    {
330
0
    max_index = *(const uint16_t*)next_char;
331
0
    PCRE2_ASSERT(max_index >= XCL_ITEM_COUNT_MASK);
332
0
    next_char += 2;
333
0
    }
334
335
0
  next_char += max_index << 1;
336
0
  type >>= XCL_TYPE_BIT_LEN;
337
0
  }
338
339
0
if (c < XCL_CHAR_LIST_LOW_32_START)
340
0
  {
341
0
  max_index = type & XCL_ITEM_COUNT_MASK;
342
343
0
  c = (uint16_t)((c << XCL_CHAR_SHIFT) | XCL_CHAR_END);
344
345
0
  if (max_index == XCL_ITEM_COUNT_MASK)
346
0
    {
347
0
    max_index = *(const uint16_t*)next_char;
348
0
    PCRE2_ASSERT(max_index >= XCL_ITEM_COUNT_MASK);
349
0
    next_char += 2;
350
0
    }
351
352
0
  if (max_index == 0 || c < *(const uint16_t*)next_char)
353
0
    return ((type & XCL_BEGIN_WITH_RANGE) != 0) == not_negated;
354
355
0
  min_index = 0;
356
0
  value = ((const uint16_t*)next_char)[--max_index];
357
0
  if (c >= value)
358
0
    return (value == c || (value & XCL_CHAR_END) == 0) == not_negated;
359
360
0
  max_index--;
361
362
  /* Binary search of a range. */
363
0
  while (TRUE)
364
0
    {
365
0
    uint32_t mid_index = (min_index + max_index) >> 1;
366
0
    value = ((const uint16_t*)next_char)[mid_index];
367
368
0
    if (c < value)
369
0
      max_index = mid_index - 1;
370
0
    else if (((const uint16_t*)next_char)[mid_index + 1] <= c)
371
0
      min_index = mid_index + 1;
372
0
    else
373
0
      return (value == c || (value & XCL_CHAR_END) == 0) == not_negated;
374
0
    }
375
0
  }
376
377
/* Skip the 16 bit ranges. */
378
0
max_index = type & XCL_ITEM_COUNT_MASK;
379
0
if (max_index == XCL_ITEM_COUNT_MASK)
380
0
  {
381
0
  max_index = *(const uint16_t*)next_char;
382
0
  PCRE2_ASSERT(max_index >= XCL_ITEM_COUNT_MASK);
383
0
  next_char += 2;
384
0
  }
385
386
0
next_char += (max_index << 1);
387
0
type >>= XCL_TYPE_BIT_LEN;
388
389
/* Alignment check. */
390
0
PCRE2_ASSERT(((uintptr_t)next_char & 0x3) == 0);
391
392
0
max_index = type & XCL_ITEM_COUNT_MASK;
393
394
#if PCRE2_CODE_UNIT_WIDTH == 32
395
if (c >= XCL_CHAR_LIST_HIGH_32_START)
396
  {
397
  if (max_index == XCL_ITEM_COUNT_MASK)
398
    {
399
    max_index = *(const uint32_t*)next_char;
400
    PCRE2_ASSERT(max_index >= XCL_ITEM_COUNT_MASK);
401
    next_char += 4;
402
    }
403
404
  next_char += max_index << 2;
405
  type >>= XCL_TYPE_BIT_LEN;
406
  max_index = type & XCL_ITEM_COUNT_MASK;
407
  }
408
#endif
409
410
0
c = (uint32_t)((c << XCL_CHAR_SHIFT) | XCL_CHAR_END);
411
412
0
if (max_index == XCL_ITEM_COUNT_MASK)
413
0
  {
414
0
  max_index = *(const uint32_t*)next_char;
415
0
  next_char += 4;
416
0
  }
417
418
0
if (max_index == 0 || c < *(const uint32_t*)next_char)
419
0
  return ((type & XCL_BEGIN_WITH_RANGE) != 0) == not_negated;
420
421
0
min_index = 0;
422
0
value = ((const uint32_t*)next_char)[--max_index];
423
0
if (c >= value)
424
0
  return (value == c || (value & XCL_CHAR_END) == 0) == not_negated;
425
426
0
max_index--;
427
428
/* Binary search of a range. */
429
0
while (TRUE)
430
0
  {
431
0
  uint32_t mid_index = (min_index + max_index) >> 1;
432
0
  value = ((const uint32_t*)next_char)[mid_index];
433
434
0
  if (c < value)
435
0
    max_index = mid_index - 1;
436
0
  else if (((const uint32_t*)next_char)[mid_index + 1] <= c)
437
0
    min_index = mid_index + 1;
438
0
  else
439
0
    return (value == c || (value & XCL_CHAR_END) == 0) == not_negated;
440
0
  }
441
0
}
442
443
444
445
/*************************************************
446
*       Match character against an ECLASS        *
447
*************************************************/
448
449
/* This function is called to match a character against an extended class
450
used for describing characters using boolean operations on sets.
451
452
Arguments:
453
  c           the character
454
  data_start  points to the start of the ECLASS data
455
  data_end    points one-past-the-last of the ECLASS data
456
  utf         TRUE if in UTF mode
457
458
Returns:      TRUE if character matches, else FALSE
459
*/
460
461
BOOL
462
PRIV(eclass)(uint32_t c, PCRE2_SPTR data_start, PCRE2_SPTR data_end,
463
  const uint8_t *char_lists_end, BOOL utf)
464
0
{
465
0
PCRE2_SPTR ptr = data_start;
466
0
PCRE2_UCHAR flags;
467
0
uint32_t stack = 0;
468
0
int stack_depth = 0;
469
470
0
PCRE2_ASSERT(data_start < data_end);
471
0
flags = *ptr++;
472
0
PCRE2_ASSERT((flags & ECL_MAP) == 0 ||
473
0
             (data_end - ptr) >= 32 / (int)sizeof(PCRE2_UCHAR));
474
475
/* Code points < 256 are matched against a bitmap, if one is present.
476
Otherwise all codepoints are checked later. */
477
478
0
if ((flags & ECL_MAP) != 0)
479
0
  {
480
0
  if (c < 256)
481
0
    return (((const uint8_t *)ptr)[c/8] & (1u << (c&7))) != 0;
482
483
  /* Skip the bitmap. */
484
0
  ptr += 32 / sizeof(PCRE2_UCHAR);
485
0
  }
486
487
/* Do a little loop, until we reach the end of the ECLASS. */
488
0
while (ptr < data_end)
489
0
  {
490
0
  switch (*ptr)
491
0
    {
492
0
    case ECL_AND:
493
0
    ++ptr;
494
0
    stack = (stack >> 1) & (stack | ~(uint32_t)1u);
495
0
    PCRE2_ASSERT(stack_depth >= 2);
496
0
    --stack_depth;
497
0
    break;
498
499
0
    case ECL_OR:
500
0
    ++ptr;
501
0
    stack = (stack >> 1) | (stack & (uint32_t)1u);
502
0
    PCRE2_ASSERT(stack_depth >= 2);
503
0
    --stack_depth;
504
0
    break;
505
506
0
    case ECL_XOR:
507
0
    ++ptr;
508
0
    stack = (stack >> 1) ^ (stack & (uint32_t)1u);
509
0
    PCRE2_ASSERT(stack_depth >= 2);
510
0
    --stack_depth;
511
0
    break;
512
513
0
    case ECL_NOT:
514
0
    ++ptr;
515
0
    stack ^= (uint32_t)1u;
516
0
    PCRE2_ASSERT(stack_depth >= 1);
517
0
    break;
518
519
0
    case ECL_XCLASS:
520
0
      {
521
0
      uint32_t matched = PRIV(xclass)(c, ptr + 1 + LINK_SIZE, char_lists_end, utf);
522
523
0
      ptr += GET(ptr, 1);
524
0
      stack = (stack << 1) | matched;
525
0
      ++stack_depth;
526
0
      break;
527
0
      }
528
529
    /* This should never occur, but compilers may mutter if there is no
530
    default. */
531
532
0
    default:
533
0
    PCRE2_DEBUG_UNREACHABLE();
534
0
    return FALSE;
535
0
    }
536
0
  }
537
538
0
PCRE2_ASSERT(stack_depth == 1);
539
0
(void)stack_depth;  /* Ignore unused variable, if assertions are disabled. */
540
541
/* The final bit left on the stack now holds the match result. */
542
0
return (stack & 1u) != 0;
543
0
}
544
545
/* End of pcre2_xclass.c */