Coverage Report

Created: 2025-08-28 06:38

/src/pcre2/src/pcre2_compile_class.c
Line
Count
Source (jump to first uncovered line)
1
/*************************************************
2
*      Perl-Compatible Regular Expressions       *
3
*************************************************/
4
5
/* PCRE is a library of functions to support regular expressions whose syntax
6
and semantics are as close as possible to those of the Perl 5 language.
7
8
                       Written by Philip Hazel
9
     Original API code Copyright (c) 1997-2012 University of Cambridge
10
          New API code Copyright (c) 2016-2024 University of Cambridge
11
12
-----------------------------------------------------------------------------
13
Redistribution and use in source and binary forms, with or without
14
modification, are permitted provided that the following conditions are met:
15
16
    * Redistributions of source code must retain the above copyright notice,
17
      this list of conditions and the following disclaimer.
18
19
    * Redistributions in binary form must reproduce the above copyright
20
      notice, this list of conditions and the following disclaimer in the
21
      documentation and/or other materials provided with the distribution.
22
23
    * Neither the name of the University of Cambridge nor the names of its
24
      contributors may be used to endorse or promote products derived from
25
      this software without specific prior written permission.
26
27
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37
POSSIBILITY OF SUCH DAMAGE.
38
-----------------------------------------------------------------------------
39
*/
40
41
42
#include "pcre2_compile.h"
43
44
45
46
typedef struct {
47
  /* Option bits for eclass. */
48
  uint32_t options;
49
  uint32_t xoptions;
50
  /* Rarely used members. */
51
  int *errorcodeptr;
52
  compile_block *cb;
53
  /* Bitmap is needed. */
54
  BOOL needs_bitmap;
55
} eclass_context;
56
57
/* Checks the allowed tokens at the end of a class structure in debug mode.
58
When a new token is not processed by all loops, and the token is equals to
59
a) one of the cases here:
60
   the compiler will complain about a duplicated case value.
61
b) none of the cases here:
62
   the loop without the handler will stop with an assertion failure. */
63
64
#ifdef PCRE2_DEBUG
65
#define CLASS_END_CASES(meta) \
66
  default: \
67
  PCRE2_ASSERT((meta) <= META_END); \
68
  /* Fall through */ \
69
  case META_CLASS: \
70
  case META_CLASS_NOT: \
71
  case META_CLASS_EMPTY: \
72
  case META_CLASS_EMPTY_NOT: \
73
  case META_CLASS_END: \
74
  case META_ECLASS_AND: \
75
  case META_ECLASS_OR: \
76
  case META_ECLASS_SUB: \
77
  case META_ECLASS_XOR: \
78
  case META_ECLASS_NOT:
79
#else
80
#define CLASS_END_CASES(meta) \
81
4.80M
  default:
82
#endif
83
84
#ifdef SUPPORT_WIDE_CHARS
85
86
/* Heapsort algorithm. */
87
88
static void do_heapify(uint32_t *buffer, size_t size, size_t i)
89
2.08M
{
90
2.08M
size_t max;
91
2.08M
size_t left;
92
2.08M
size_t right;
93
2.08M
uint32_t tmp1, tmp2;
94
95
11.6M
while (TRUE)
96
11.6M
  {
97
11.6M
  max = i;
98
11.6M
  left = (i << 1) + 2;
99
11.6M
  right = left + 2;
100
101
11.6M
  if (left < size && buffer[left] > buffer[max]) max = left;
102
11.6M
  if (right < size && buffer[right] > buffer[max]) max = right;
103
11.6M
  if (i == max) return;
104
105
  /* Swap items. */
106
9.51M
  tmp1 = buffer[i];
107
9.51M
  tmp2 = buffer[i + 1];
108
9.51M
  buffer[i] = buffer[max];
109
9.51M
  buffer[i + 1] = buffer[max + 1];
110
9.51M
  buffer[max] = tmp1;
111
9.51M
  buffer[max + 1] = tmp2;
112
9.51M
  i = max;
113
9.51M
  }
114
2.08M
}
115
116
#ifdef SUPPORT_UNICODE
117
118
39.0k
#define PARSE_CLASS_UTF               0x1
119
1.14M
#define PARSE_CLASS_CASELESS_UTF      0x2
120
2.58M
#define PARSE_CLASS_RESTRICTED_UTF    0x4
121
6.49M
#define PARSE_CLASS_TURKISH_UTF       0x8
122
123
/* Get the range of nocase characters which includes the
124
'c' character passed as argument, or directly follows 'c'. */
125
126
static const uint32_t*
127
get_nocase_range(uint32_t c)
128
798k
{
129
798k
uint32_t left = 0;
130
798k
uint32_t right = PRIV(ucd_nocase_ranges_size);
131
798k
uint32_t middle;
132
133
798k
if (c > MAX_UTF_CODE_POINT) return PRIV(ucd_nocase_ranges) + right;
134
135
4.08M
while (TRUE)
136
4.08M
  {
137
  /* Range end of the middle element. */
138
4.08M
  middle = ((left + right) >> 1) | 0x1;
139
140
4.08M
  if (PRIV(ucd_nocase_ranges)[middle] <= c)
141
16.6k
    left = middle + 1;
142
4.06M
  else if (middle > 1 && PRIV(ucd_nocase_ranges)[middle - 2] > c)
143
3.26M
    right = middle - 1;
144
798k
  else
145
798k
    return PRIV(ucd_nocase_ranges) + (middle - 1);
146
4.08M
  }
147
798k
}
148
149
/* Get the list of othercase characters, which belongs to the passed range.
150
Create ranges from these characters, and append them to the buffer argument. */
151
152
static size_t
153
utf_caseless_extend(uint32_t start, uint32_t end, uint32_t options,
154
  uint32_t *buffer)
155
798k
{
156
798k
uint32_t new_start = start;
157
798k
uint32_t new_end = end;
158
798k
uint32_t c = start;
159
798k
const uint32_t *list;
160
798k
uint32_t tmp[3];
161
798k
size_t result = 2;
162
798k
const uint32_t *skip_range = get_nocase_range(c);
163
798k
uint32_t skip_start = skip_range[0];
164
165
798k
#if PCRE2_CODE_UNIT_WIDTH == 8
166
798k
PCRE2_ASSERT(options & PARSE_CLASS_UTF);
167
798k
#endif
168
169
#if PCRE2_CODE_UNIT_WIDTH == 32
170
if (end > MAX_UTF_CODE_POINT) end = MAX_UTF_CODE_POINT;
171
#endif
172
173
3.05M
while (c <= end)
174
2.26M
  {
175
2.26M
  uint32_t co;
176
177
2.26M
  if (c > skip_start)
178
94.4k
    {
179
94.4k
    c = skip_range[1];
180
94.4k
    skip_range += 2;
181
94.4k
    skip_start = skip_range[0];
182
94.4k
    continue;
183
94.4k
    }
184
185
  /* Compute caseless set. */
186
187
2.16M
  if ((options & (PARSE_CLASS_TURKISH_UTF|PARSE_CLASS_RESTRICTED_UTF)) ==
188
2.16M
        PARSE_CLASS_TURKISH_UTF &&
189
2.16M
      UCD_ANY_I(c))
190
0
    {
191
0
    co = PRIV(ucd_turkish_dotted_i_caseset) + (UCD_DOTTED_I(c)? 0 : 3);
192
0
    }
193
2.16M
  else if ((co = UCD_CASESET(c)) != 0 &&
194
2.16M
           (options & PARSE_CLASS_RESTRICTED_UTF) != 0 &&
195
2.16M
           PRIV(ucd_caseless_sets)[co] < 128)
196
372
    {
197
372
    co = 0;  /* Ignore the caseless set if it's restricted. */
198
372
    }
199
200
2.16M
  if (co != 0)
201
417k
    list = PRIV(ucd_caseless_sets) + co;
202
1.74M
  else
203
1.74M
    {
204
1.74M
    co = UCD_OTHERCASE(c);
205
1.74M
    list = tmp;
206
1.74M
    tmp[0] = c;
207
1.74M
    tmp[1] = NOTACHAR;
208
209
1.74M
    if (co != c)
210
1.60M
      {
211
1.60M
      tmp[1] = co;
212
1.60M
      tmp[2] = NOTACHAR;
213
1.60M
      }
214
1.74M
    }
215
2.16M
  c++;
216
217
  /* Add characters. */
218
2.16M
  do
219
4.61M
    {
220
#if PCRE2_CODE_UNIT_WIDTH == 16
221
    if (!(options & PARSE_CLASS_UTF) && *list > 0xffff) continue;
222
#endif
223
224
4.61M
    if (*list < new_start)
225
724k
      {
226
724k
      if (*list + 1 == new_start)
227
722
        {
228
722
        new_start--;
229
722
        continue;
230
722
        }
231
724k
      }
232
3.88M
    else if (*list > new_end)
233
461k
      {
234
461k
      if (*list - 1 == new_end)
235
15.9k
        {
236
15.9k
        new_end++;
237
15.9k
        continue;
238
15.9k
        }
239
461k
      }
240
3.42M
    else continue;
241
242
1.16M
    result += 2;
243
1.16M
    if (buffer != NULL)
244
584k
      {
245
584k
      buffer[0] = *list;
246
584k
      buffer[1] = *list;
247
584k
      buffer += 2;
248
584k
      }
249
1.16M
    }
250
4.61M
  while (*(++list) != NOTACHAR);
251
2.16M
  }
252
253
798k
  if (buffer != NULL)
254
399k
    {
255
399k
    buffer[0] = new_start;
256
399k
    buffer[1] = new_end;
257
399k
    buffer += 2;
258
399k
    (void)buffer;
259
399k
    }
260
798k
  return result;
261
798k
}
262
263
#endif
264
265
/* Add a character list to a buffer. */
266
267
static size_t
268
append_char_list(const uint32_t *p, uint32_t *buffer)
269
41.5k
{
270
41.5k
const uint32_t *n;
271
41.5k
size_t result = 0;
272
273
402k
while (*p != NOTACHAR)
274
361k
  {
275
361k
  n = p;
276
764k
  while (n[0] == n[1] - 1) n++;
277
278
361k
  PCRE2_ASSERT(*p < 0xffff);
279
280
361k
  if (buffer != NULL)
281
180k
    {
282
180k
    buffer[0] = *p;
283
180k
    buffer[1] = *n;
284
180k
    buffer += 2;
285
180k
    }
286
287
361k
  result += 2;
288
361k
  p = n + 1;
289
361k
  }
290
291
41.5k
  return result;
292
41.5k
}
293
294
static uint32_t
295
get_highest_char(uint32_t options)
296
9.21k
{
297
9.21k
(void)options; /* Avoid compiler warning. */
298
299
9.21k
#if PCRE2_CODE_UNIT_WIDTH == 8
300
9.21k
return MAX_UTF_CODE_POINT;
301
#else
302
#ifdef SUPPORT_UNICODE
303
return GET_MAX_CHAR_VALUE((options & PARSE_CLASS_UTF) != 0);
304
#else
305
return MAX_UCHAR_VALUE;
306
#endif
307
#endif
308
9.21k
}
309
310
/* Add a negated character list to a buffer. */
311
static size_t
312
append_negated_char_list(const uint32_t *p, uint32_t options, uint32_t *buffer)
313
15.0k
{
314
15.0k
const uint32_t *n;
315
15.0k
uint32_t start = 0;
316
15.0k
size_t result = 2;
317
318
15.0k
PCRE2_ASSERT(*p > 0);
319
320
139k
while (*p != NOTACHAR)
321
124k
  {
322
124k
  n = p;
323
264k
  while (n[0] == n[1] - 1) n++;
324
325
124k
  PCRE2_ASSERT(*p < 0xffff);
326
327
124k
  if (buffer != NULL)
328
62.4k
    {
329
62.4k
    buffer[0] = start;
330
62.4k
    buffer[1] = *p - 1;
331
62.4k
    buffer += 2;
332
62.4k
    }
333
334
124k
  result += 2;
335
124k
  start = *n + 1;
336
124k
  p = n + 1;
337
124k
  }
338
339
15.0k
  if (buffer != NULL)
340
7.50k
    {
341
7.50k
    buffer[0] = start;
342
7.50k
    buffer[1] = get_highest_char(options);
343
7.50k
    buffer += 2;
344
7.50k
    (void)buffer;
345
7.50k
    }
346
347
15.0k
  return result;
348
15.0k
}
349
350
static uint32_t *
351
append_non_ascii_range(uint32_t options, uint32_t *buffer)
352
2.90k
{
353
2.90k
  if (buffer == NULL) return NULL;
354
355
1.45k
  buffer[0] = 0x100;
356
1.45k
  buffer[1] = get_highest_char(options);
357
1.45k
  return buffer + 2;
358
2.90k
}
359
360
static size_t
361
parse_class(uint32_t *ptr, uint32_t options, uint32_t *buffer)
362
77.0k
{
363
77.0k
size_t total_size = 0;
364
77.0k
size_t size;
365
77.0k
uint32_t meta_arg;
366
77.0k
uint32_t start_char;
367
368
1.28M
while (TRUE)
369
1.28M
  {
370
1.28M
  switch (META_CODE(*ptr))
371
1.28M
    {
372
72.4k
    case META_ESCAPE:
373
72.4k
      meta_arg = META_DATA(*ptr);
374
72.4k
      switch (meta_arg)
375
72.4k
        {
376
784
        case ESC_D:
377
2.00k
        case ESC_W:
378
2.77k
        case ESC_S:
379
2.77k
        buffer = append_non_ascii_range(options, buffer);
380
2.77k
        total_size += 2;
381
2.77k
        break;
382
383
39.4k
        case ESC_h:
384
39.4k
        size = append_char_list(PRIV(hspace_list), buffer);
385
39.4k
        total_size += size;
386
39.4k
        if (buffer != NULL) buffer += size;
387
39.4k
        break;
388
389
13.3k
        case ESC_H:
390
13.3k
        size = append_negated_char_list(PRIV(hspace_list), options, buffer);
391
13.3k
        total_size += size;
392
13.3k
        if (buffer != NULL) buffer += size;
393
13.3k
        break;
394
395
2.09k
        case ESC_v:
396
2.09k
        size = append_char_list(PRIV(vspace_list), buffer);
397
2.09k
        total_size += size;
398
2.09k
        if (buffer != NULL) buffer += size;
399
2.09k
        break;
400
401
1.68k
        case ESC_V:
402
1.68k
        size = append_negated_char_list(PRIV(vspace_list), options, buffer);
403
1.68k
        total_size += size;
404
1.68k
        if (buffer != NULL) buffer += size;
405
1.68k
        break;
406
407
6.79k
        case ESC_p:
408
11.9k
        case ESC_P:
409
11.9k
        ptr++;
410
11.9k
        if (meta_arg == ESC_p && (*ptr >> 16) == PT_ANY)
411
518
          {
412
518
          if (buffer != NULL)
413
259
            {
414
259
            buffer[0] = 0;
415
259
            buffer[1] = get_highest_char(options);
416
259
            buffer += 2;
417
259
            }
418
518
          total_size += 2;
419
518
          }
420
11.9k
        break;
421
72.4k
        }
422
72.4k
      ptr++;
423
72.4k
      continue;
424
132
    case META_POSIX_NEG:
425
132
      buffer = append_non_ascii_range(options, buffer);
426
132
      total_size += 2;
427
132
      ptr += 2;
428
132
      continue;
429
945
    case META_POSIX:
430
945
      ptr += 2;
431
945
      continue;
432
0
    case META_BIGVALUE:
433
      /* Character literal */
434
0
      ptr++;
435
0
      break;
436
1.20M
    CLASS_END_CASES(*ptr)
437
1.20M
      if (*ptr >= META_END) return total_size;
438
1.13M
      break;
439
1.28M
    }
440
441
1.13M
    start_char = *ptr;
442
443
1.13M
    if (ptr[1] == META_RANGE_LITERAL || ptr[1] == META_RANGE_ESCAPED)
444
6.87k
      {
445
6.87k
      ptr += 2;
446
6.87k
      PCRE2_ASSERT(*ptr < META_END || *ptr == META_BIGVALUE);
447
448
6.87k
      if (*ptr == META_BIGVALUE) ptr++;
449
450
#ifdef EBCDIC
451
#error "Missing EBCDIC support"
452
#endif
453
6.87k
      }
454
455
1.13M
#ifdef SUPPORT_UNICODE
456
1.13M
    if (options & PARSE_CLASS_CASELESS_UTF)
457
798k
      {
458
798k
      size = utf_caseless_extend(start_char, *ptr++, options, buffer);
459
798k
      if (buffer != NULL) buffer += size;
460
798k
      total_size += size;
461
798k
      continue;
462
798k
      }
463
332k
#endif
464
465
332k
    if (buffer != NULL)
466
166k
      {
467
166k
      buffer[0] = start_char;
468
166k
      buffer[1] = *ptr;
469
166k
      buffer += 2;
470
166k
      }
471
472
332k
    ptr++;
473
332k
    total_size += 2;
474
332k
  }
475
476
0
  return total_size;
477
77.0k
}
478
479
/* Extra uint32_t values for storing the lengths of range lists in
480
the worst case. Two uint32_t lengths and a range end for a range
481
starting before 255 */
482
38.0k
#define CHAR_LIST_EXTRA_SIZE 3
483
484
/* Starting character values for each character list. */
485
486
static const uint32_t char_list_starts[] = {
487
#if PCRE2_CODE_UNIT_WIDTH == 32
488
  XCL_CHAR_LIST_HIGH_32_START,
489
#endif
490
#if PCRE2_CODE_UNIT_WIDTH == 32 || defined SUPPORT_UNICODE
491
  XCL_CHAR_LIST_LOW_32_START,
492
#endif
493
  XCL_CHAR_LIST_HIGH_16_START,
494
  /* Must be terminated by XCL_CHAR_LIST_LOW_16_START,
495
  which also represents the end of the bitset. */
496
  XCL_CHAR_LIST_LOW_16_START,
497
};
498
499
static class_ranges *
500
compile_optimize_class(uint32_t *start_ptr, uint32_t options,
501
  uint32_t xoptions, compile_block *cb)
502
39.0k
{
503
39.0k
class_ranges* cranges;
504
39.0k
uint32_t *ptr;
505
39.0k
uint32_t *buffer;
506
39.0k
uint32_t *dst;
507
39.0k
uint32_t class_options = 0;
508
39.0k
size_t range_list_size = 0, total_size, i;
509
39.0k
uint32_t tmp1, tmp2;
510
39.0k
const uint32_t *char_list_next;
511
39.0k
uint16_t *next_char;
512
39.0k
uint32_t char_list_start, char_list_end;
513
39.0k
uint32_t range_start, range_end;
514
515
39.0k
#ifdef SUPPORT_UNICODE
516
39.0k
if (options & PCRE2_UTF)
517
39.0k
  class_options |= PARSE_CLASS_UTF;
518
519
39.0k
if ((options & PCRE2_CASELESS) && (options & (PCRE2_UTF|PCRE2_UCP)))
520
13.4k
  class_options |= PARSE_CLASS_CASELESS_UTF;
521
522
39.0k
if (xoptions & PCRE2_EXTRA_CASELESS_RESTRICT)
523
309
  class_options |= PARSE_CLASS_RESTRICTED_UTF;
524
525
39.0k
if (xoptions & PCRE2_EXTRA_TURKISH_CASING)
526
0
  class_options |= PARSE_CLASS_TURKISH_UTF;
527
#else
528
(void)options;   /* Avoid compiler warning. */
529
(void)xoptions;  /* Avoid compiler warning. */
530
#endif
531
532
/* Compute required space for the range. */
533
534
39.0k
range_list_size = parse_class(start_ptr, class_options, NULL);
535
39.0k
PCRE2_ASSERT((range_list_size & 0x1) == 0);
536
537
/* Allocate buffer. The total_size also represents the end of the buffer. */
538
539
39.0k
total_size = range_list_size +
540
39.0k
   ((range_list_size >= 2) ? CHAR_LIST_EXTRA_SIZE : 0);
541
542
39.0k
cranges = cb->cx->memctl.malloc(
543
39.0k
  sizeof(class_ranges) + total_size * sizeof(uint32_t),
544
39.0k
  cb->cx->memctl.memory_data);
545
546
39.0k
if (cranges == NULL) return NULL;
547
548
39.0k
cranges->header.next = NULL;
549
#ifdef PCRE2_DEBUG
550
cranges->header.type = CDATA_CRANGE;
551
#endif
552
39.0k
cranges->range_list_size = (uint16_t)range_list_size;
553
39.0k
cranges->char_lists_types = 0;
554
39.0k
cranges->char_lists_size = 0;
555
39.0k
cranges->char_lists_start = 0;
556
557
39.0k
if (range_list_size == 0) return cranges;
558
559
38.0k
buffer = (uint32_t*)(cranges + 1);
560
38.0k
parse_class(start_ptr, class_options, buffer);
561
562
/* Using <= instead of == to help static analysis. */
563
38.0k
if (range_list_size <= 2) return cranges;
564
565
/* In-place sorting of ranges. */
566
567
31.6k
i = (((range_list_size >> 2) - 1) << 1);
568
688k
while (TRUE)
569
688k
  {
570
688k
  do_heapify(buffer, range_list_size, i);
571
688k
  if (i == 0) break;
572
657k
  i -= 2;
573
657k
  }
574
575
31.6k
i = range_list_size - 2;
576
1.39M
while (TRUE)
577
1.39M
  {
578
1.39M
  tmp1 = buffer[i];
579
1.39M
  tmp2 = buffer[i + 1];
580
1.39M
  buffer[i] = buffer[0];
581
1.39M
  buffer[i + 1] = buffer[1];
582
1.39M
  buffer[0] = tmp1;
583
1.39M
  buffer[1] = tmp2;
584
585
1.39M
  do_heapify(buffer, i, 0);
586
1.39M
  if (i == 0) break;
587
1.36M
  i -= 2;
588
1.36M
  }
589
590
/* Merge ranges whenever possible. */
591
31.6k
dst = buffer;
592
31.6k
ptr = buffer + 2;
593
31.6k
range_list_size -= 2;
594
595
/* The second condition is a very rare corner case, where the end of the last
596
range is the maximum character. This range cannot be extended further. */
597
598
1.39M
while (range_list_size > 0 && dst[1] != ~(uint32_t)0)
599
1.36M
  {
600
1.36M
  if (dst[1] + 1 < ptr[0])
601
202k
    {
602
202k
    dst += 2;
603
202k
    dst[0] = ptr[0];
604
202k
    dst[1] = ptr[1];
605
202k
    }
606
1.16M
  else if (dst[1] < ptr[1]) dst[1] = ptr[1];
607
608
1.36M
  ptr += 2;
609
1.36M
  range_list_size -= 2;
610
1.36M
  }
611
612
31.6k
PCRE2_ASSERT(dst[1] <= get_highest_char(class_options));
613
614
/* When the number of ranges are less than six,
615
they are not converted to range lists. */
616
617
31.6k
ptr = buffer;
618
153k
while (ptr < dst && ptr[1] < 0x100) ptr += 2;
619
31.6k
if (dst - ptr < (2 * (6 - 1)))
620
17.8k
  {
621
17.8k
  cranges->range_list_size = (uint16_t)(dst + 2 - buffer);
622
17.8k
  return cranges;
623
17.8k
  }
624
625
/* Compute character lists structures. */
626
627
13.7k
char_list_next = char_list_starts;
628
13.7k
char_list_start = *char_list_next++;
629
#if PCRE2_CODE_UNIT_WIDTH == 32
630
char_list_end = XCL_CHAR_LIST_HIGH_32_END;
631
#elif defined SUPPORT_UNICODE
632
13.7k
char_list_end = XCL_CHAR_LIST_LOW_32_END;
633
#else
634
char_list_end = XCL_CHAR_LIST_HIGH_16_END;
635
#endif
636
13.7k
next_char = (uint16_t*)(buffer + total_size);
637
638
13.7k
tmp1 = 0;
639
13.7k
tmp2 = ((sizeof(char_list_starts) / sizeof(uint32_t)) - 1) * XCL_TYPE_BIT_LEN;
640
13.7k
PCRE2_ASSERT(tmp2 <= 3 * XCL_TYPE_BIT_LEN && tmp2 >= XCL_TYPE_BIT_LEN);
641
13.7k
range_start = dst[0];
642
13.7k
range_end = dst[1];
643
644
130k
while (TRUE)
645
130k
  {
646
130k
  if (range_start >= char_list_start)
647
89.1k
    {
648
89.1k
    if (range_start == range_end || range_end < char_list_end)
649
86.7k
      {
650
86.7k
      tmp1++;
651
86.7k
      next_char--;
652
653
86.7k
      if (char_list_start < XCL_CHAR_LIST_LOW_32_START)
654
84.4k
        *next_char = (uint16_t)((range_end << XCL_CHAR_SHIFT) | XCL_CHAR_END);
655
2.25k
      else
656
2.25k
        *(uint32_t*)(--next_char) =
657
2.25k
          (range_end << XCL_CHAR_SHIFT) | XCL_CHAR_END;
658
86.7k
      }
659
660
89.1k
    if (range_start < range_end)
661
26.8k
      {
662
26.8k
      if (range_start > char_list_start)
663
26.7k
        {
664
26.7k
        tmp1++;
665
26.7k
        next_char--;
666
667
26.7k
        if (char_list_start < XCL_CHAR_LIST_LOW_32_START)
668
26.6k
          *next_char = (uint16_t)(range_start << XCL_CHAR_SHIFT);
669
121
        else
670
121
          *(uint32_t*)(--next_char) = (range_start << XCL_CHAR_SHIFT);
671
26.7k
        }
672
66
      else
673
66
        cranges->char_lists_types |= XCL_BEGIN_WITH_RANGE << tmp2;
674
26.8k
      }
675
676
89.1k
    PCRE2_ASSERT((uint32_t*)next_char >= dst + 2);
677
678
89.1k
    if (dst > buffer)
679
89.0k
      {
680
89.0k
      dst -= 2;
681
89.0k
      range_start = dst[0];
682
89.0k
      range_end = dst[1];
683
89.0k
      continue;
684
89.0k
      }
685
686
132
    range_start = 0;
687
132
    range_end = 0;
688
132
    }
689
690
41.0k
  if (range_end >= char_list_start)
691
7.19k
    {
692
7.19k
    PCRE2_ASSERT(range_start < char_list_start);
693
694
7.19k
    if (range_end < char_list_end)
695
4.81k
      {
696
4.81k
      tmp1++;
697
4.81k
      next_char--;
698
699
4.81k
      if (char_list_start < XCL_CHAR_LIST_LOW_32_START)
700
2.40k
        *next_char = (uint16_t)((range_end << XCL_CHAR_SHIFT) | XCL_CHAR_END);
701
2.40k
      else
702
2.40k
        *(uint32_t*)(--next_char) =
703
2.40k
          (range_end << XCL_CHAR_SHIFT) | XCL_CHAR_END;
704
705
4.81k
      PCRE2_ASSERT((uint32_t*)next_char >= dst + 2);
706
4.81k
      }
707
708
7.19k
    cranges->char_lists_types |= XCL_BEGIN_WITH_RANGE << tmp2;
709
7.19k
    }
710
711
41.0k
  if (tmp1 >= XCL_ITEM_COUNT_MASK)
712
13.9k
    {
713
13.9k
    cranges->char_lists_types |= XCL_ITEM_COUNT_MASK << tmp2;
714
13.9k
    next_char--;
715
716
13.9k
    if (char_list_start < XCL_CHAR_LIST_LOW_32_START)
717
13.7k
      *next_char = (uint16_t)tmp1;
718
184
    else
719
184
      *(uint32_t*)(--next_char) = tmp1;
720
13.9k
    }
721
27.1k
  else
722
27.1k
    cranges->char_lists_types |= tmp1 << tmp2;
723
724
41.0k
  if (range_start < XCL_CHAR_LIST_LOW_16_START) break;
725
726
27.3k
  PCRE2_ASSERT(tmp2 >= XCL_TYPE_BIT_LEN);
727
27.3k
  char_list_end = char_list_start - 1;
728
27.3k
  char_list_start = *char_list_next++;
729
27.3k
  tmp1 = 0;
730
27.3k
  tmp2 -= XCL_TYPE_BIT_LEN;
731
27.3k
  }
732
733
13.7k
if (dst[0] < XCL_CHAR_LIST_LOW_16_START) dst += 2;
734
13.7k
PCRE2_ASSERT((uint16_t*)dst <= next_char);
735
736
13.7k
cranges->char_lists_size =
737
13.7k
  (size_t)((uint8_t*)(buffer + total_size) - (uint8_t*)next_char);
738
13.7k
cranges->char_lists_start = (size_t)((uint8_t*)next_char - (uint8_t*)buffer);
739
13.7k
cranges->range_list_size = (uint16_t)(dst - buffer);
740
13.7k
return cranges;
741
31.6k
}
742
743
#endif /* SUPPORT_WIDE_CHARS */
744
745
#ifdef SUPPORT_UNICODE
746
747
void PRIV(update_classbits)(uint32_t ptype, uint32_t pdata, BOOL negated,
748
  uint8_t *classbits)
749
40.4k
{
750
/* Update PRIV(xclass) when this function is changed. */
751
40.4k
int c, chartype;
752
40.4k
const ucd_record *prop;
753
40.4k
uint32_t gentype;
754
40.4k
BOOL set_bit;
755
756
40.4k
if (ptype == PT_ANY)
757
0
  {
758
0
  if (!negated) memset(classbits, 0xff, 32);
759
0
  return;
760
0
  }
761
762
10.3M
for (c = 0; c < 256; c++)
763
10.3M
  {
764
10.3M
  prop = GET_UCD(c);
765
10.3M
  set_bit = FALSE;
766
10.3M
  (void)set_bit;
767
768
10.3M
  switch (ptype)
769
10.3M
    {
770
553k
    case PT_LAMP:
771
553k
    chartype = prop->chartype;
772
553k
    set_bit = (chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt);
773
553k
    break;
774
775
834k
    case PT_GC:
776
834k
    set_bit = (PRIV(ucp_gentype)[prop->chartype] == pdata);
777
834k
    break;
778
779
1.41M
    case PT_PC:
780
1.41M
    set_bit = (prop->chartype == pdata);
781
1.41M
    break;
782
783
27.6k
    case PT_SC:
784
27.6k
    set_bit = (prop->script == pdata);
785
27.6k
    break;
786
787
108k
    case PT_SCX:
788
108k
    set_bit = (prop->script == pdata ||
789
108k
      MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), pdata) != 0);
790
108k
    break;
791
792
188k
    case PT_ALNUM:
793
188k
    gentype = PRIV(ucp_gentype)[prop->chartype];
794
188k
    set_bit = (gentype == ucp_L || gentype == ucp_N);
795
188k
    break;
796
797
1.90M
    case PT_SPACE:    /* Perl space */
798
1.99M
    case PT_PXSPACE:  /* POSIX space */
799
1.99M
    switch(c)
800
1.99M
      {
801
46.7k
      HSPACE_BYTE_CASES:
802
62.2k
      VSPACE_BYTE_CASES:
803
62.2k
      set_bit = TRUE;
804
62.2k
      break;
805
806
1.93M
      default:
807
1.93M
      set_bit = (PRIV(ucp_gentype)[prop->chartype] == ucp_Z);
808
1.93M
      break;
809
1.99M
      }
810
1.99M
    break;
811
812
2.88M
    case PT_WORD:
813
2.88M
    chartype = prop->chartype;
814
2.88M
    gentype = PRIV(ucp_gentype)[chartype];
815
2.88M
    set_bit = (gentype == ucp_L || gentype == ucp_N ||
816
2.88M
               chartype == ucp_Mn || chartype == ucp_Pc);
817
2.88M
    break;
818
819
495k
    case PT_UCNC:
820
495k
    set_bit = (c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
821
495k
               c == CHAR_GRAVE_ACCENT || c >= 0xa0);
822
495k
    break;
823
824
95.2k
    case PT_BIDICL:
825
95.2k
    set_bit = (UCD_BIDICLASS_PROP(prop) == pdata);
826
95.2k
    break;
827
828
40.9k
    case PT_BOOL:
829
40.9k
    set_bit = MAPBIT(PRIV(ucd_boolprop_sets) +
830
40.9k
                     UCD_BPROPS_PROP(prop), pdata) != 0;
831
40.9k
    break;
832
833
234k
    case PT_PXGRAPH:
834
234k
    chartype = prop->chartype;
835
234k
    gentype = PRIV(ucp_gentype)[chartype];
836
234k
    set_bit = (gentype != ucp_Z && (gentype != ucp_C || chartype == ucp_Cf));
837
234k
    break;
838
839
180k
    case PT_PXPRINT:
840
180k
    chartype = prop->chartype;
841
180k
    set_bit = (chartype != ucp_Zl && chartype != ucp_Zp &&
842
180k
       (PRIV(ucp_gentype)[chartype] != ucp_C || chartype == ucp_Cf));
843
180k
    break;
844
845
511k
    case PT_PXPUNCT:
846
511k
    gentype = PRIV(ucp_gentype)[prop->chartype];
847
511k
    set_bit = (gentype == ucp_P || (c < 128 && gentype == ucp_S));
848
511k
    break;
849
850
782k
    default:
851
782k
    PCRE2_ASSERT(ptype == PT_PXXDIGIT);
852
782k
    set_bit = (c >= CHAR_0 && c <= CHAR_9) ||
853
782k
              (c >= CHAR_A && c <= CHAR_F) ||
854
782k
              (c >= CHAR_a && c <= CHAR_f);
855
782k
    break;
856
10.3M
    }
857
858
10.3M
  if (negated) set_bit = !set_bit;
859
10.3M
  if (set_bit) *classbits |= (uint8_t)(1 << (c & 0x7));
860
10.3M
  if ((c & 0x7) == 0x7) classbits++;
861
10.3M
  }
862
40.4k
}
863
864
#endif /* SUPPORT_UNICODE */
865
866
867
868
#ifdef SUPPORT_WIDE_CHARS
869
870
/*************************************************
871
*           XClass related properties            *
872
*************************************************/
873
874
/* XClass needs to be generated. */
875
276k
#define XCLASS_REQUIRED 0x1
876
/* XClass has 8 bit character. */
877
2.60M
#define XCLASS_HAS_8BIT_CHARS 0x2
878
/* XClass has properties. */
879
94.8k
#define XCLASS_HAS_PROPS 0x4
880
/* XClass has character lists. */
881
131k
#define XCLASS_HAS_CHAR_LISTS 0x8
882
/* XClass matches to all >= 256 characters. */
883
84.5k
#define XCLASS_HIGH_ANY 0x10
884
885
#endif
886
887
888
/*************************************************
889
*   Internal entry point for add range to class  *
890
*************************************************/
891
892
/* This function sets the overall range for characters < 256.
893
It also handles non-utf case folding.
894
895
Arguments:
896
  options       the options bits
897
  xoptions      the extra options bits
898
  cb            compile data
899
  start         start of range character
900
  end           end of range character
901
902
Returns:        cb->classbits is updated
903
*/
904
905
static void
906
add_to_class(uint32_t options, uint32_t xoptions, compile_block *cb,
907
  uint32_t start, uint32_t end)
908
2.65M
{
909
2.65M
uint8_t *classbits = cb->classbits.classbits;
910
2.65M
uint32_t c, byte_start, byte_end;
911
2.65M
uint32_t classbits_end = (end <= 0xff ? end : 0xff);
912
913
#ifndef SUPPORT_UNICODE
914
(void)xoptions; /* Avoid compiler warning. */
915
#endif
916
917
/* If caseless matching is required, scan the range and process alternate
918
cases. In Unicode, there are 8-bit characters that have alternate cases that
919
are greater than 255 and vice-versa (though these may be ignored if caseless
920
restriction is in force). Sometimes we can just extend the original range. */
921
922
2.65M
if ((options & PCRE2_CASELESS) != 0)
923
361k
  {
924
361k
#ifdef SUPPORT_UNICODE
925
  /* UTF mode. This branch is taken if we don't support wide characters (e.g.
926
  8-bit library, without UTF), but we do treat those characters as Unicode
927
  (if UCP flag is set). In this case, we only need to expand the character class
928
  set to include the case pairs which are in the 0-255 codepoint range. */
929
361k
  if ((options & (PCRE2_UTF|PCRE2_UCP)) != 0)
930
270k
    {
931
270k
      BOOL turkish_i = (xoptions & (PCRE2_EXTRA_TURKISH_CASING|PCRE2_EXTRA_CASELESS_RESTRICT)) ==
932
270k
        PCRE2_EXTRA_TURKISH_CASING;
933
270k
      if (start < 128)
934
202k
        {
935
202k
        uint32_t lo_end = (classbits_end < 127 ? classbits_end : 127);
936
456k
        for (c = start; c <= lo_end; c++)
937
253k
          {
938
253k
          if (turkish_i && UCD_ANY_I(c)) continue;
939
253k
          SETBIT(classbits, cb->fcc[c]);
940
253k
          }
941
202k
        }
942
270k
      if (classbits_end >= 128)
943
67.6k
        {
944
67.6k
        uint32_t hi_start = (start > 128 ? start : 128);
945
158k
        for (c = hi_start; c <= classbits_end; c++)
946
90.4k
          {
947
90.4k
          uint32_t co = UCD_OTHERCASE(c);
948
90.4k
          if (co <= 0xff) SETBIT(classbits, co);
949
90.4k
          }
950
67.6k
        }
951
270k
    }
952
953
91.1k
  else
954
91.1k
#endif  /* SUPPORT_UNICODE */
955
956
  /* Not UTF mode */
957
91.1k
    {
958
207k
    for (c = start; c <= classbits_end; c++)
959
115k
      SETBIT(classbits, cb->fcc[c]);
960
91.1k
    }
961
361k
  }
962
963
/* Use the bitmap for characters < 256. Otherwise use extra data. */
964
965
2.65M
byte_start = (start + 7) >> 3;
966
2.65M
byte_end = (classbits_end + 1) >> 3;
967
968
2.65M
if (byte_start >= byte_end)
969
2.57M
  {
970
5.17M
  for (c = start; c <= classbits_end; c++)
971
    /* Regardless of start, c will always be <= 255. */
972
2.60M
    SETBIT(classbits, c);
973
2.57M
  return;
974
2.57M
  }
975
976
755k
for (c = byte_start; c < byte_end; c++)
977
669k
  classbits[c] = 0xff;
978
979
85.7k
byte_start <<= 3;
980
85.7k
byte_end <<= 3;
981
982
441k
for (c = start; c < byte_start; c++)
983
355k
  SETBIT(classbits, c);
984
985
159k
for (c = byte_end; c <= classbits_end; c++)
986
73.4k
  SETBIT(classbits, c);
987
85.7k
}
988
989
990
#if PCRE2_CODE_UNIT_WIDTH == 8
991
/*************************************************
992
*   Internal entry point for add list to class   *
993
*************************************************/
994
995
/* This function is used for adding a list of horizontal or vertical whitespace
996
characters to a class. The list must be in order so that ranges of characters
997
can be detected and handled appropriately. This function sets the overall range
998
so that the internal functions can try to avoid duplication when handling
999
case-independence.
1000
1001
Arguments:
1002
  options       the options bits
1003
  xoptions      the extra options bits
1004
  cb            contains pointers to tables etc.
1005
  p             points to row of 32-bit values, terminated by NOTACHAR
1006
1007
Returns:        cb->classbits is updated
1008
*/
1009
1010
static void
1011
add_list_to_class(uint32_t options, uint32_t xoptions, compile_block *cb,
1012
  const uint32_t *p)
1013
14.2k
{
1014
54.4k
while (p[0] < 256)
1015
40.1k
  {
1016
40.1k
  unsigned int n = 0;
1017
1018
48.1k
  while(p[n+1] == p[0] + n + 1) n++;
1019
40.1k
  add_to_class(options, xoptions, cb, p[0], p[n]);
1020
1021
40.1k
  p += n + 1;
1022
40.1k
  }
1023
14.2k
}
1024
1025
1026
1027
/*************************************************
1028
*    Add characters not in a list to a class     *
1029
*************************************************/
1030
1031
/* This function is used for adding the complement of a list of horizontal or
1032
vertical whitespace to a class. The list must be in order.
1033
1034
Arguments:
1035
  options       the options bits
1036
  xoptions      the extra options bits
1037
  cb            contains pointers to tables etc.
1038
  p             points to row of 32-bit values, terminated by NOTACHAR
1039
1040
Returns:        cb->classbits is updated
1041
*/
1042
1043
static void
1044
add_not_list_to_class(uint32_t options, uint32_t xoptions, compile_block *cb,
1045
  const uint32_t *p)
1046
15.9k
{
1047
15.9k
if (p[0] > 0)
1048
15.9k
  add_to_class(options, xoptions, cb, 0, p[0] - 1);
1049
61.5k
while (p[0] < 256)
1050
45.6k
  {
1051
52.0k
  while (p[1] == p[0] + 1) p++;
1052
45.6k
  add_to_class(options, xoptions, cb, p[0] + 1, (p[1] > 255) ? 255 : p[1] - 1);
1053
45.6k
  p++;
1054
45.6k
  }
1055
15.9k
}
1056
#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
1057
1058
1059
1060
/*************************************************
1061
*  Main entry-point to compile a character class *
1062
*************************************************/
1063
1064
/* This function consumes a "leaf", which is a set of characters that will
1065
become a single OP_CLASS OP_NCLASS, OP_XCLASS, or OP_ALLANY. */
1066
1067
uint32_t *
1068
PRIV(compile_class_not_nested)(uint32_t options, uint32_t xoptions,
1069
  uint32_t *start_ptr, PCRE2_UCHAR **pcode, BOOL negate_class, BOOL* has_bitmap,
1070
  int *errorcodeptr, compile_block *cb, PCRE2_SIZE *lengthptr)
1071
199k
{
1072
199k
uint32_t *pptr = start_ptr;
1073
199k
PCRE2_UCHAR *code = *pcode;
1074
199k
BOOL should_flip_negation;
1075
199k
const uint8_t *cbits = cb->cbits;
1076
/* Some functions such as add_to_class() or eclass processing
1077
expects that the bitset is stored in cb->classbits.classbits. */
1078
199k
uint8_t *const classbits = cb->classbits.classbits;
1079
1080
199k
#ifdef SUPPORT_UNICODE
1081
199k
BOOL utf = (options & PCRE2_UTF) != 0;
1082
#else  /* No Unicode support */
1083
BOOL utf = FALSE;
1084
#endif
1085
1086
/* Helper variables for OP_XCLASS opcode (for characters > 255). */
1087
1088
199k
#ifdef SUPPORT_WIDE_CHARS
1089
199k
uint32_t xclass_props;
1090
199k
PCRE2_UCHAR *class_uchardata;
1091
199k
class_ranges* cranges;
1092
#else
1093
(void)has_bitmap;    /* Avoid compiler warning. */
1094
(void)errorcodeptr;  /* Avoid compiler warning. */
1095
(void)lengthptr;     /* Avoid compiler warning. */
1096
#endif
1097
1098
/* If an XClass contains a negative special such as \S, we need to flip the
1099
negation flag at the end, so that support for characters > 255 works correctly
1100
(they are all included in the class). An XClass may need to insert specific
1101
matching or non-matching code for wide characters.
1102
*/
1103
1104
199k
should_flip_negation = FALSE;
1105
1106
/* XClass will be used when characters > 255 might match. */
1107
1108
199k
#ifdef SUPPORT_WIDE_CHARS
1109
199k
xclass_props = 0;
1110
1111
199k
#if PCRE2_CODE_UNIT_WIDTH == 8
1112
199k
cranges = NULL;
1113
1114
199k
if (utf)
1115
60.2k
#endif
1116
60.2k
  {
1117
60.2k
  if (lengthptr != NULL)
1118
39.0k
    {
1119
39.0k
    cranges = compile_optimize_class(pptr, options, xoptions, cb);
1120
1121
39.0k
    if (cranges == NULL)
1122
0
      {
1123
0
      *errorcodeptr = ERR21;
1124
0
      return NULL;
1125
0
      }
1126
1127
    /* Caching the pre-processed character ranges. */
1128
39.0k
    if (cb->last_data != NULL)
1129
35.3k
      cb->last_data->next = &cranges->header;
1130
3.68k
    else
1131
3.68k
      cb->first_data = &cranges->header;
1132
1133
39.0k
    cb->last_data = &cranges->header;
1134
39.0k
    }
1135
21.1k
  else
1136
21.1k
    {
1137
    /* Reuse the pre-processed character ranges. */
1138
21.1k
    cranges = (class_ranges*)cb->first_data;
1139
21.1k
    PCRE2_ASSERT(cranges != NULL && cranges->header.type == CDATA_CRANGE);
1140
21.1k
    cb->first_data = cranges->header.next;
1141
21.1k
    }
1142
1143
60.2k
  if (cranges->range_list_size > 0)
1144
57.9k
    {
1145
57.9k
    const uint32_t *ranges = (const uint32_t*)(cranges + 1);
1146
1147
57.9k
    if (ranges[0] <= 255)
1148
53.5k
      xclass_props |= XCLASS_HAS_8BIT_CHARS;
1149
1150
57.9k
    if (ranges[cranges->range_list_size - 1] == GET_MAX_CHAR_VALUE(utf) &&
1151
57.9k
        ranges[cranges->range_list_size - 2] <= 256)
1152
1.79k
      xclass_props |= XCLASS_HIGH_ANY;
1153
57.9k
    }
1154
60.2k
  }
1155
1156
199k
class_uchardata = code + LINK_SIZE + 2;   /* For XCLASS items */
1157
199k
#endif /* SUPPORT_WIDE_CHARS */
1158
1159
/* Initialize the 256-bit (32-byte) bit map to all zeros. We build the map
1160
in a temporary bit of memory, in case the class contains fewer than two
1161
8-bit characters because in that case the compiled code doesn't use the bit
1162
map. */
1163
1164
199k
memset(classbits, 0, 32);
1165
1166
/* Process items until end_ptr is reached. */
1167
1168
3.76M
while (TRUE)
1169
3.76M
  {
1170
3.76M
  uint32_t meta = *(pptr++);
1171
3.76M
  BOOL local_negate;
1172
3.76M
  int posix_class;
1173
3.76M
  int taboffset, tabopt;
1174
3.76M
  class_bits_storage pbits;
1175
3.76M
  uint32_t escape, c;
1176
1177
  /* Handle POSIX classes such as [:alpha:] etc. */
1178
3.76M
  switch (META_CODE(meta))
1179
3.76M
    {
1180
14.1k
    case META_POSIX:
1181
16.0k
    case META_POSIX_NEG:
1182
1183
16.0k
    local_negate = (meta == META_POSIX_NEG);
1184
16.0k
    posix_class = *(pptr++);
1185
1186
16.0k
    if (local_negate) should_flip_negation = TRUE;  /* Note negative special */
1187
1188
    /* If matching is caseless, upper and lower are converted to alpha.
1189
    This relies on the fact that the class table starts with alpha,
1190
    lower, upper as the first 3 entries. */
1191
1192
16.0k
    if ((options & PCRE2_CASELESS) != 0 && posix_class <= 2)
1193
634
      posix_class = 0;
1194
1195
    /* When PCRE2_UCP is set, some of the POSIX classes are converted to
1196
    different escape sequences that use Unicode properties \p or \P.
1197
    Others that are not available via \p or \P have to generate
1198
    XCL_PROP/XCL_NOTPROP directly, which is done here. */
1199
1200
16.0k
#ifdef SUPPORT_UNICODE
1201
    /* TODO This entire block of code here appears to be unreachable!? I simply
1202
    can't see how it can be hit, given that the frontend parser doesn't emit
1203
    META_POSIX for GRAPH/PRINT/PUNCT when UCP is set. */
1204
16.0k
    if ((options & PCRE2_UCP) != 0 &&
1205
16.0k
        (xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0)
1206
1.18k
      {
1207
1.18k
      uint32_t ptype;
1208
1209
1.18k
      switch(posix_class)
1210
1.18k
        {
1211
0
        case PC_GRAPH:
1212
0
        case PC_PRINT:
1213
0
        case PC_PUNCT:
1214
0
        ptype = (posix_class == PC_GRAPH)? PT_PXGRAPH :
1215
0
                (posix_class == PC_PRINT)? PT_PXPRINT : PT_PXPUNCT;
1216
1217
0
        PRIV(update_classbits)(ptype, 0, local_negate, classbits);
1218
1219
0
        if ((xclass_props & XCLASS_HIGH_ANY) == 0)
1220
0
          {
1221
0
          if (lengthptr != NULL)
1222
0
            *lengthptr += 3;
1223
0
          else
1224
0
            {
1225
0
            *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
1226
0
            *class_uchardata++ = (PCRE2_UCHAR)ptype;
1227
0
            *class_uchardata++ = 0;
1228
0
            }
1229
0
          xclass_props |= XCLASS_REQUIRED | XCLASS_HAS_PROPS;
1230
0
          }
1231
0
        continue;
1232
1233
        /* For the other POSIX classes (ex: ascii) we are going to
1234
        fall through to the non-UCP case and build a bit map for
1235
        characters with code points less than 256. However, if we are in
1236
        a negated POSIX class, characters with code points greater than
1237
        255 must either all match or all not match, depending on whether
1238
        the whole class is not or is negated. For example, for
1239
        [[:^ascii:]... they must all match, whereas for [^[:^ascii:]...
1240
        they must not.
1241
1242
        In the special case where there are no xclass items, this is
1243
        automatically handled by the use of OP_CLASS or OP_NCLASS, but an
1244
        explicit range is needed for OP_XCLASS. Setting a flag here
1245
        causes the range to be generated later when it is known that
1246
        OP_XCLASS is required. In the 8-bit library this is relevant only in
1247
        utf mode, since no wide characters can exist otherwise. */
1248
1249
1.18k
        default:
1250
1.18k
        break;
1251
1.18k
        }
1252
1.18k
      }
1253
16.0k
#endif  /* SUPPORT_UNICODE */
1254
1255
    /* In the non-UCP case, or when UCP makes no difference, we build the
1256
    bit map for the POSIX class in a chunk of local store because we may
1257
    be adding and subtracting from it, and we don't want to subtract bits
1258
    that may be in the main map already. At the end we or the result into
1259
    the bit map that is being built. */
1260
1261
16.0k
    posix_class *= 3;
1262
1263
    /* Copy in the first table (always present) */
1264
1265
16.0k
    memcpy(pbits.classbits, cbits + PRIV(posix_class_maps)[posix_class], 32);
1266
1267
    /* If there is a second table, add or remove it as required. */
1268
1269
16.0k
    taboffset = PRIV(posix_class_maps)[posix_class + 1];
1270
16.0k
    tabopt = PRIV(posix_class_maps)[posix_class + 2];
1271
1272
16.0k
    if (taboffset >= 0)
1273
5.17k
      {
1274
5.17k
      if (tabopt >= 0)
1275
108k
        for (int i = 0; i < 32; i++)
1276
104k
          pbits.classbits[i] |= cbits[i + taboffset];
1277
1.90k
      else
1278
62.7k
        for (int i = 0; i < 32; i++)
1279
60.8k
          pbits.classbits[i] &= (uint8_t)(~cbits[i + taboffset]);
1280
5.17k
      }
1281
1282
    /* Now see if we need to remove any special characters. An option
1283
    value of 1 removes vertical space and 2 removes underscore. */
1284
1285
16.0k
    if (tabopt < 0) tabopt = -tabopt;
1286
#ifdef EBCDIC
1287
      {
1288
      uint8_t posix_vertical[4] = { CHAR_LF, CHAR_VT, CHAR_FF, CHAR_CR };
1289
      uint8_t posix_underscore = CHAR_UNDERSCORE;
1290
      uint8_t *chars = NULL;
1291
      int n = 0;
1292
1293
      if (tabopt == 1) { chars = posix_vertical; n = 4; }
1294
      else if (tabopt == 2) { chars = &posix_underscore; n = 1; }
1295
1296
      for (; n > 0; ++chars, --n)
1297
        pbits.classbits[*chars/8] &= ~(1u << (*chars&7));
1298
      }
1299
#else
1300
16.0k
    if (tabopt == 1) pbits.classbits[1] &= ~0x3c;
1301
15.1k
    else if (tabopt == 2) pbits.classbits[11] &= 0x7f;
1302
16.0k
#endif
1303
1304
    /* Add the POSIX table or its complement into the main table that is
1305
    being built and we are done. */
1306
1307
16.0k
      {
1308
16.0k
      uint32_t *classwords = cb->classbits.classwords;
1309
1310
16.0k
      if (local_negate)
1311
16.9k
        for (int i = 0; i < 8; i++)
1312
15.0k
          classwords[i] |= (uint32_t)(~pbits.classwords[i]);
1313
14.1k
      else
1314
127k
        for (int i = 0; i < 8; i++)
1315
113k
          classwords[i] |= pbits.classwords[i];
1316
16.0k
      }
1317
1318
16.0k
#ifdef SUPPORT_WIDE_CHARS
1319
    /* Every class contains at least one < 256 character. */
1320
16.0k
    xclass_props |= XCLASS_HAS_8BIT_CHARS;
1321
16.0k
#endif
1322
16.0k
    continue;               /* End of POSIX handling */
1323
1324
    /* Other than POSIX classes, the only items we should encounter are
1325
    \d-type escapes and literal characters (possibly as ranges). */
1326
0
    case META_BIGVALUE:
1327
0
    meta = *(pptr++);
1328
0
    break;
1329
1330
151k
    case META_ESCAPE:
1331
151k
    escape = META_DATA(meta);
1332
1333
151k
    switch(escape)
1334
151k
      {
1335
4.32k
      case ESC_d:
1336
142k
      for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_digit];
1337
4.32k
      break;
1338
1339
7.65k
      case ESC_D:
1340
7.65k
      should_flip_negation = TRUE;
1341
252k
      for (int i = 0; i < 32; i++)
1342
244k
        classbits[i] |= (uint8_t)(~cbits[i+cbit_digit]);
1343
7.65k
      break;
1344
1345
4.38k
      case ESC_w:
1346
144k
      for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_word];
1347
4.38k
      break;
1348
1349
9.29k
      case ESC_W:
1350
9.29k
      should_flip_negation = TRUE;
1351
306k
      for (int i = 0; i < 32; i++)
1352
297k
        classbits[i] |= (uint8_t)(~cbits[i+cbit_word]);
1353
9.29k
      break;
1354
1355
      /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
1356
      5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
1357
      previously set by something earlier in the character class.
1358
      Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
1359
      we could just adjust the appropriate bit. From PCRE 8.34 we no
1360
      longer treat \s and \S specially. */
1361
1362
3.33k
      case ESC_s:
1363
110k
      for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_space];
1364
3.33k
      break;
1365
1366
7.37k
      case ESC_S:
1367
7.37k
      should_flip_negation = TRUE;
1368
243k
      for (int i = 0; i < 32; i++)
1369
236k
        classbits[i] |= (uint8_t)(~cbits[i+cbit_space]);
1370
7.37k
      break;
1371
1372
      /* When adding the horizontal or vertical space lists to a class, or
1373
      their complements, disable PCRE2_CASELESS, because it justs wastes
1374
      time, and in the "not-x" UTF cases can create unwanted duplicates in
1375
      the XCLASS list (provoked by characters that have more than one other
1376
      case and by both cases being in the same "not-x" sublist). */
1377
1378
38.7k
      case ESC_h:
1379
38.7k
#if PCRE2_CODE_UNIT_WIDTH == 8
1380
38.7k
#ifdef SUPPORT_UNICODE
1381
38.7k
      if (cranges != NULL) break;
1382
11.5k
#endif
1383
11.5k
      add_list_to_class(options & ~PCRE2_CASELESS, xoptions,
1384
11.5k
        cb, PRIV(hspace_list));
1385
#else
1386
      PCRE2_ASSERT(cranges != NULL);
1387
#endif
1388
11.5k
      break;
1389
1390
25.7k
      case ESC_H:
1391
25.7k
#if PCRE2_CODE_UNIT_WIDTH == 8
1392
25.7k
#ifdef SUPPORT_UNICODE
1393
25.7k
      if (cranges != NULL) break;
1394
13.7k
#endif
1395
13.7k
      add_not_list_to_class(options & ~PCRE2_CASELESS, xoptions,
1396
13.7k
        cb, PRIV(hspace_list));
1397
#else
1398
      PCRE2_ASSERT(cranges != NULL);
1399
#endif
1400
13.7k
      break;
1401
1402
4.73k
      case ESC_v:
1403
4.73k
#if PCRE2_CODE_UNIT_WIDTH == 8
1404
4.73k
#ifdef SUPPORT_UNICODE
1405
4.73k
      if (cranges != NULL) break;
1406
2.68k
#endif
1407
2.68k
      add_list_to_class(options & ~PCRE2_CASELESS, xoptions,
1408
2.68k
        cb, PRIV(vspace_list));
1409
#else
1410
      PCRE2_ASSERT(cranges != NULL);
1411
#endif
1412
2.68k
      break;
1413
1414
3.64k
      case ESC_V:
1415
3.64k
#if PCRE2_CODE_UNIT_WIDTH == 8
1416
3.64k
#ifdef SUPPORT_UNICODE
1417
3.64k
      if (cranges != NULL) break;
1418
2.14k
#endif
1419
2.14k
      add_not_list_to_class(options & ~PCRE2_CASELESS, xoptions,
1420
2.14k
        cb, PRIV(vspace_list));
1421
#else
1422
      PCRE2_ASSERT(cranges != NULL);
1423
#endif
1424
2.14k
      break;
1425
1426
      /* If Unicode is not supported, \P and \p are not allowed and are
1427
      faulted at parse time, so will never appear here. */
1428
1429
0
#ifdef SUPPORT_UNICODE
1430
22.3k
      case ESC_p:
1431
41.8k
      case ESC_P:
1432
41.8k
        {
1433
41.8k
        uint32_t ptype = *pptr >> 16;
1434
41.8k
        uint32_t pdata = *(pptr++) & 0xffff;
1435
1436
        /* The "Any" is processed by PRIV(update_classbits)(). */
1437
41.8k
        if (ptype == PT_ANY)
1438
1.39k
          {
1439
1.39k
#if PCRE2_CODE_UNIT_WIDTH == 8
1440
1.39k
          if (!utf && escape == ESC_p) memset(classbits, 0xff, 32);
1441
1.39k
#endif
1442
1.39k
          continue;
1443
1.39k
          }
1444
1445
40.4k
        PRIV(update_classbits)(ptype, pdata, (escape == ESC_P), classbits);
1446
1447
40.4k
        if ((xclass_props & XCLASS_HIGH_ANY) == 0)
1448
37.8k
          {
1449
37.8k
          if (lengthptr != NULL)
1450
23.8k
            *lengthptr += 3;
1451
13.9k
          else
1452
13.9k
            {
1453
13.9k
            *class_uchardata++ = (escape == ESC_p)? XCL_PROP : XCL_NOTPROP;
1454
13.9k
            *class_uchardata++ = ptype;
1455
13.9k
            *class_uchardata++ = pdata;
1456
13.9k
            }
1457
37.8k
          xclass_props |= XCLASS_REQUIRED | XCLASS_HAS_PROPS;
1458
37.8k
          }
1459
40.4k
        }
1460
0
      continue;
1461
151k
#endif
1462
151k
      }
1463
1464
109k
#ifdef SUPPORT_WIDE_CHARS
1465
    /* Every non-property class contains at least one < 256 character. */
1466
109k
    xclass_props |= XCLASS_HAS_8BIT_CHARS;
1467
109k
#endif
1468
    /* End handling \d-type escapes */
1469
109k
    continue;
1470
1471
3.59M
    CLASS_END_CASES(meta)
1472
    /* Literals. */
1473
3.59M
    if (meta < META_END) break;
1474
    /* Non-literals: end of class contents. */
1475
199k
    goto END_PROCESSING;
1476
3.76M
    }
1477
1478
  /* A literal character may be followed by a range meta. At parse time
1479
  there are checks for out-of-order characters, for ranges where the two
1480
  characters are equal, and for hyphens that cannot indicate a range. At
1481
  this point, therefore, no checking is needed. */
1482
1483
3.39M
  c = meta;
1484
1485
  /* Remember if \r or \n were explicitly used */
1486
1487
3.39M
  if (c == CHAR_CR || c == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
1488
1489
  /* Process a character range */
1490
1491
3.39M
  if (*pptr == META_RANGE_LITERAL || *pptr == META_RANGE_ESCAPED)
1492
16.8k
    {
1493
16.8k
    uint32_t d;
1494
1495
#ifdef EBCDIC
1496
    BOOL range_is_literal = (*pptr == META_RANGE_LITERAL);
1497
#endif
1498
16.8k
    ++pptr;
1499
16.8k
    d = *(pptr++);
1500
16.8k
    if (d == META_BIGVALUE) d = *(pptr++);
1501
1502
    /* Remember an explicit \r or \n, and add the range to the class. */
1503
1504
16.8k
    if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
1505
1506
16.8k
#if PCRE2_CODE_UNIT_WIDTH == 8
1507
16.8k
#ifdef SUPPORT_UNICODE
1508
16.8k
    if (cranges != NULL) continue;
1509
10.6k
    xclass_props |= XCLASS_HAS_8BIT_CHARS;
1510
10.6k
#endif
1511
1512
    /* In an EBCDIC environment, Perl treats alphabetic ranges specially
1513
    because there are holes in the encoding, and simply using the range
1514
    A-Z (for example) would include the characters in the holes. This
1515
    applies only to literal ranges; [\xC1-\xE9] is different to [A-Z]. */
1516
1517
#ifdef EBCDIC
1518
    if (range_is_literal &&
1519
         (cb->ctypes[c] & ctype_letter) != 0 &&
1520
         (cb->ctypes[d] & ctype_letter) != 0 &&
1521
         (c <= CHAR_z) == (d <= CHAR_z))
1522
      {
1523
      uint32_t uc = (d <= CHAR_z)? 0 : 64;
1524
      uint32_t C = c - uc;
1525
      uint32_t D = d - uc;
1526
1527
      if (C <= CHAR_i)
1528
        {
1529
        add_to_class(options, xoptions, cb, C + uc,
1530
          ((D < CHAR_i)? D : CHAR_i) + uc);
1531
        C = CHAR_j;
1532
        }
1533
1534
      if (C <= D && C <= CHAR_r)
1535
        {
1536
        add_to_class(options, xoptions, cb, C + uc,
1537
          ((D < CHAR_r)? D : CHAR_r) + uc);
1538
        C = CHAR_s;
1539
        }
1540
1541
      if (C <= D)
1542
        add_to_class(options, xoptions, cb, C + uc, D + uc);
1543
      }
1544
    else
1545
#endif
1546
    /* Not an EBCDIC special range */
1547
1548
10.6k
    add_to_class(options, xoptions, cb, c, d);
1549
#else
1550
    PCRE2_ASSERT(cranges != NULL);
1551
#endif
1552
10.6k
    continue;
1553
16.8k
    }  /* End of range handling */
1554
1555
  /* Character ranges are ignored when class_ranges is present. */
1556
3.38M
#if PCRE2_CODE_UNIT_WIDTH == 8
1557
3.38M
#ifdef SUPPORT_UNICODE
1558
3.38M
  if (cranges != NULL) continue;
1559
2.33M
  xclass_props |= XCLASS_HAS_8BIT_CHARS;
1560
2.33M
#endif
1561
  /* Handle a single character. */
1562
1563
2.33M
  add_to_class(options, xoptions, cb, meta, meta);
1564
#else
1565
  PCRE2_ASSERT(cranges != NULL);
1566
#endif
1567
2.33M
  }   /* End of main class-processing loop */
1568
1569
199k
END_PROCESSING:
1570
1571
199k
#ifdef SUPPORT_WIDE_CHARS
1572
199k
PCRE2_ASSERT((xclass_props & XCLASS_HAS_PROPS) == 0 ||
1573
199k
             (xclass_props & XCLASS_HIGH_ANY) == 0);
1574
1575
199k
if (cranges != NULL)
1576
60.2k
  {
1577
60.2k
  uint32_t *range = (uint32_t*)(cranges + 1);
1578
60.2k
  uint32_t *end = range + cranges->range_list_size;
1579
1580
265k
  while (range < end && range[0] < 256)
1581
209k
    {
1582
209k
    PCRE2_ASSERT((xclass_props & XCLASS_HAS_8BIT_CHARS) != 0);
1583
    /* Add range to bitset. If we are in UTF or UCP mode, then clear the
1584
    caseless bit, because the cranges handle caselessness (only) in this
1585
    condition; see the condition for PARSE_CLASS_CASELESS_UTF in
1586
    compile_optimize_class(). */
1587
209k
    add_to_class(((options & (PCRE2_UTF|PCRE2_UCP)) != 0)?
1588
209k
        (options & ~PCRE2_CASELESS) : options, xoptions, cb, range[0], range[1]);
1589
1590
209k
    if (range[1] > 255) break;
1591
205k
    range += 2;
1592
205k
    }
1593
1594
60.2k
  if (cranges->char_lists_size > 0)
1595
17.8k
    {
1596
    /* The cranges structure is still used and freed later. */
1597
17.8k
    PCRE2_ASSERT((xclass_props & XCLASS_HIGH_ANY) == 0);
1598
17.8k
    xclass_props |= XCLASS_REQUIRED | XCLASS_HAS_CHAR_LISTS;
1599
17.8k
    }
1600
42.3k
  else
1601
42.3k
    {
1602
42.3k
    if ((xclass_props & XCLASS_HIGH_ANY) != 0)
1603
1.79k
      {
1604
1.79k
      PCRE2_ASSERT(range + 2 == end && range[0] <= 256 &&
1605
1.79k
        range[1] >= GET_MAX_CHAR_VALUE(utf));
1606
1.79k
      should_flip_negation = TRUE;
1607
1.79k
      range = end;
1608
1.79k
      }
1609
1610
63.6k
    while (range < end)
1611
21.2k
      {
1612
21.2k
      uint32_t range_start = range[0];
1613
21.2k
      uint32_t range_end = range[1];
1614
1615
21.2k
      range += 2;
1616
21.2k
      xclass_props |= XCLASS_REQUIRED;
1617
1618
21.2k
      if (range_start < 256) range_start = 256;
1619
1620
21.2k
      if (lengthptr != NULL)
1621
12.7k
        {
1622
12.7k
#ifdef SUPPORT_UNICODE
1623
12.7k
        if (utf)
1624
12.7k
          {
1625
12.7k
          *lengthptr += 1;
1626
1627
12.7k
          if (range_start < range_end)
1628
1.98k
            *lengthptr += PRIV(ord2utf)(range_start, class_uchardata);
1629
1630
12.7k
          *lengthptr += PRIV(ord2utf)(range_end, class_uchardata);
1631
12.7k
          continue;
1632
12.7k
          }
1633
0
#endif  /* SUPPORT_UNICODE */
1634
1635
0
        *lengthptr += range_start < range_end ? 3 : 2;
1636
0
        continue;
1637
12.7k
        }
1638
1639
8.56k
#ifdef SUPPORT_UNICODE
1640
8.56k
      if (utf)
1641
8.56k
        {
1642
8.56k
        if (range_start < range_end)
1643
1.30k
          {
1644
1.30k
          *class_uchardata++ = XCL_RANGE;
1645
1.30k
          class_uchardata += PRIV(ord2utf)(range_start, class_uchardata);
1646
1.30k
          }
1647
7.26k
        else
1648
7.26k
          *class_uchardata++ = XCL_SINGLE;
1649
1650
8.56k
        class_uchardata += PRIV(ord2utf)(range_end, class_uchardata);
1651
8.56k
        continue;
1652
8.56k
        }
1653
8.56k
#endif  /* SUPPORT_UNICODE */
1654
1655
      /* Without UTF support, character values are constrained
1656
      by the bit length, and can only be > 256 for 16-bit and
1657
      32-bit libraries. */
1658
#if PCRE2_CODE_UNIT_WIDTH != 8
1659
      if (range_start < range_end)
1660
        {
1661
        *class_uchardata++ = XCL_RANGE;
1662
        *class_uchardata++ = range_start;
1663
        }
1664
      else
1665
        *class_uchardata++ = XCL_SINGLE;
1666
1667
      *class_uchardata++ = range_end;
1668
#endif  /* PCRE2_CODE_UNIT_WIDTH == 8 */
1669
8.56k
      }
1670
1671
42.3k
    if (lengthptr == NULL)
1672
17.0k
      cb->cx->memctl.free(cranges, cb->cx->memctl.memory_data);
1673
42.3k
    }
1674
60.2k
  }
1675
199k
#endif /* SUPPORT_WIDE_CHARS */
1676
1677
/* If there are characters with values > 255, or Unicode property settings
1678
(\p or \P), we have to compile an extended class, with its own opcode,
1679
unless there were no property settings and there was a negated special such
1680
as \S in the class, and PCRE2_UCP is not set, because in that case all
1681
characters > 255 are in or not in the class, so any that were explicitly
1682
given as well can be ignored.
1683
1684
In the UCP case, if certain negated POSIX classes (ex: [:^ascii:]) were
1685
were present in a class, we either have to match or not match all wide
1686
characters (depending on whether the whole class is or is not negated).
1687
This requirement is indicated by match_all_or_no_wide_chars being true.
1688
We do this by including an explicit range, which works in both cases.
1689
This applies only in UTF and 16-bit and 32-bit non-UTF modes, since there
1690
cannot be any wide characters in 8-bit non-UTF mode.
1691
1692
When there *are* properties in a positive UTF-8 or any 16-bit or 32_bit
1693
class where \S etc is present without PCRE2_UCP, causing an extended class
1694
to be compiled, we make sure that all characters > 255 are included by
1695
forcing match_all_or_no_wide_chars to be true.
1696
1697
If, when generating an xclass, there are no characters < 256, we can omit
1698
the bitmap in the actual compiled code. */
1699
1700
199k
#ifdef SUPPORT_WIDE_CHARS  /* Defined for 16/32 bits, or 8-bit with Unicode */
1701
199k
if ((xclass_props & XCLASS_REQUIRED) != 0)
1702
57.0k
  {
1703
57.0k
  PCRE2_UCHAR *previous = code;
1704
1705
57.0k
  if ((xclass_props & XCLASS_HAS_CHAR_LISTS) == 0)
1706
39.1k
    *class_uchardata++ = XCL_END;    /* Marks the end of extra data */
1707
57.0k
  *code++ = OP_XCLASS;
1708
57.0k
  code += LINK_SIZE;
1709
57.0k
  *code = negate_class? XCL_NOT:0;
1710
57.0k
  if ((xclass_props & XCLASS_HAS_PROPS) != 0) *code |= XCL_HASPROP;
1711
1712
  /* If the map is required, move up the extra data to make room for it;
1713
  otherwise just move the code pointer to the end of the extra data. */
1714
1715
57.0k
  if ((xclass_props & XCLASS_HAS_8BIT_CHARS) != 0 || has_bitmap != NULL)
1716
49.6k
    {
1717
49.6k
    if (negate_class)
1718
10.6k
      {
1719
10.6k
      uint32_t *classwords = cb->classbits.classwords;
1720
95.5k
      for (int i = 0; i < 8; i++) classwords[i] = ~classwords[i];
1721
10.6k
      }
1722
1723
49.6k
    if (has_bitmap == NULL)
1724
24.3k
      {
1725
24.3k
      *code++ |= XCL_MAP;
1726
24.3k
      (void)memmove(code + (32 / sizeof(PCRE2_UCHAR)), code,
1727
24.3k
        CU2BYTES(class_uchardata - code));
1728
24.3k
      memcpy(code, classbits, 32);
1729
24.3k
      code = class_uchardata + (32 / sizeof(PCRE2_UCHAR));
1730
24.3k
      }
1731
25.3k
    else
1732
25.3k
      {
1733
25.3k
      code = class_uchardata;
1734
25.3k
      if ((xclass_props & XCLASS_HAS_8BIT_CHARS) != 0)
1735
19.7k
        *has_bitmap = TRUE;
1736
25.3k
      }
1737
49.6k
    }
1738
7.36k
  else code = class_uchardata;
1739
1740
57.0k
  if ((xclass_props & XCLASS_HAS_CHAR_LISTS) != 0)
1741
17.8k
    {
1742
    /* Char lists size is an even number, because all items are 16 or 32
1743
    bit values. The character list data is always aligned to 32 bits. */
1744
17.8k
    size_t char_lists_size = cranges->char_lists_size;
1745
17.8k
    PCRE2_ASSERT((char_lists_size & 0x1) == 0 &&
1746
17.8k
                 (cb->char_lists_size & 0x3) == 0);
1747
1748
17.8k
    if (lengthptr != NULL)
1749
13.7k
      {
1750
13.7k
      char_lists_size = CLIST_ALIGN_TO(char_lists_size, sizeof(uint32_t));
1751
1752
13.7k
#if PCRE2_CODE_UNIT_WIDTH == 8
1753
13.7k
      *lengthptr += 2 + LINK_SIZE;
1754
#else
1755
      *lengthptr += 1 + LINK_SIZE;
1756
#endif
1757
1758
13.7k
      cb->char_lists_size += char_lists_size;
1759
1760
13.7k
      char_lists_size /= sizeof(PCRE2_UCHAR);
1761
1762
      /* Storage space for character lists is included
1763
      in the maximum pattern size. */
1764
13.7k
      if (*lengthptr > MAX_PATTERN_SIZE ||
1765
13.7k
          MAX_PATTERN_SIZE - *lengthptr < char_lists_size)
1766
18
        {
1767
18
        *errorcodeptr = ERR20;   /* Pattern is too large */
1768
18
        return NULL;
1769
18
        }
1770
13.7k
      }
1771
4.12k
    else
1772
4.12k
      {
1773
4.12k
      uint8_t *data;
1774
1775
4.12k
      PCRE2_ASSERT(cranges->char_lists_types <= XCL_TYPE_MASK);
1776
4.12k
#if PCRE2_CODE_UNIT_WIDTH == 8
1777
      /* Encode as high / low bytes. */
1778
4.12k
      code[0] = (uint8_t)(XCL_LIST |
1779
4.12k
        (cranges->char_lists_types >> 8));
1780
4.12k
      code[1] = (uint8_t)cranges->char_lists_types;
1781
4.12k
      code += 2;
1782
#else
1783
      *code++ = (PCRE2_UCHAR)(XCL_LIST | cranges->char_lists_types);
1784
#endif
1785
1786
      /* Character lists are stored in backwards direction from
1787
      byte code start. The non-dfa/dfa matchers can access these
1788
      lists using the byte code start stored in match blocks.
1789
      Each list is aligned to 32 bit with an optional unused
1790
      16 bit value at the beginning of the character list. */
1791
1792
4.12k
      cb->char_lists_size += char_lists_size;
1793
4.12k
      data = (uint8_t*)cb->start_code - cb->char_lists_size;
1794
1795
4.12k
      memcpy(data, (uint8_t*)(cranges + 1) + cranges->char_lists_start,
1796
4.12k
        char_lists_size);
1797
1798
      /* Since character lists total size is less than MAX_PATTERN_SIZE,
1799
      their starting offset fits into a value which size is LINK_SIZE. */
1800
1801
4.12k
      char_lists_size = cb->char_lists_size;
1802
4.12k
      PUT(code, 0, (uint32_t)(char_lists_size >> 1));
1803
4.12k
      code += LINK_SIZE;
1804
1805
#if defined PCRE2_DEBUG || defined SUPPORT_VALGRIND
1806
      if ((char_lists_size & 0x2) != 0)
1807
        {
1808
        /* In debug the unused 16 bit value is set
1809
        to a fixed value and marked unused. */
1810
        ((uint16_t*)data)[-1] = 0x5555;
1811
#ifdef SUPPORT_VALGRIND
1812
        VALGRIND_MAKE_MEM_NOACCESS(data - 2, 2);
1813
#endif
1814
        }
1815
#endif
1816
1817
4.12k
      cb->char_lists_size =
1818
4.12k
        CLIST_ALIGN_TO(char_lists_size, sizeof(uint32_t));
1819
1820
4.12k
      cb->cx->memctl.free(cranges, cb->cx->memctl.memory_data);
1821
4.12k
      }
1822
17.8k
    }
1823
1824
  /* Now fill in the complete length of the item */
1825
1826
57.0k
  PUT(previous, 1, (int)(code - previous));
1827
57.0k
  goto DONE;   /* End of class handling */
1828
57.0k
  }
1829
142k
#endif  /* SUPPORT_WIDE_CHARS */
1830
1831
/* If there are no characters > 255, or they are all to be included or
1832
excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
1833
whole class was negated and whether there were negative specials such as \S
1834
(non-UCP) in the class. Then copy the 32-byte map into the code vector,
1835
negating it if necessary. */
1836
1837
142k
if (negate_class)
1838
55.4k
  {
1839
55.4k
  uint32_t *classwords = cb->classbits.classwords;
1840
1841
498k
  for (int i = 0; i < 8; i++) classwords[i] = ~classwords[i];
1842
55.4k
  }
1843
1844
142k
if ((SELECT_VALUE8(!utf, 0) || negate_class != should_flip_negation) &&
1845
142k
    cb->classbits.classwords[0] == ~(uint32_t)0)
1846
39.6k
  {
1847
39.6k
  const uint32_t *classwords = cb->classbits.classwords;
1848
39.6k
  int i;
1849
1850
151k
  for (i = 0; i < 8; i++)
1851
150k
    if (classwords[i] != ~(uint32_t)0) break;
1852
1853
39.6k
  if (i == 8)
1854
1.32k
    {
1855
1.32k
    *code++ = OP_ALLANY;
1856
1.32k
    goto DONE;   /* End of class handling */
1857
1.32k
    }
1858
39.6k
  }
1859
1860
141k
*code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
1861
141k
memcpy(code, classbits, 32);
1862
141k
code += 32 / sizeof(PCRE2_UCHAR);
1863
1864
199k
DONE:
1865
199k
*pcode = code;
1866
199k
return pptr - 1;
1867
141k
}
1868
1869
1870
1871
/* ===================================================================*/
1872
/* Here follows a block of ECLASS-compiling functions. You may well want to
1873
read them from top to bottom; they are ordered from leafmost (at the top) to
1874
outermost parser (at the bottom of the file). */
1875
1876
/* This function folds one operand using the negation operator.
1877
The new, combined chunk of stack code is written out to *pop_info. */
1878
1879
static void
1880
fold_negation(eclass_op_info *pop_info, PCRE2_SIZE *lengthptr,
1881
  BOOL preserve_classbits)
1882
11.4k
{
1883
/* If the chunk of stack code is already composed of multiple ops, we won't
1884
descend in and try and propagate the negation down the tree. (That would lead
1885
to O(n^2) compile-time, which could be exploitable with a malicious regex -
1886
although maybe that's not really too much of a worry in a library that offers
1887
an exponential-time matching function!) */
1888
1889
11.4k
if (pop_info->op_single_type == 0)
1890
4.38k
  {
1891
4.38k
  if (lengthptr != NULL)
1892
2.34k
    *lengthptr += 1;
1893
2.04k
  else
1894
2.04k
    pop_info->code_start[pop_info->length] = ECL_NOT;
1895
4.38k
  pop_info->length += 1;
1896
4.38k
  }
1897
1898
/* Otherwise, it's a nice single-op item, so we can easily fold in the negation
1899
without needing to produce an ECL_NOT. */
1900
1901
7.10k
else if (pop_info->op_single_type == ECL_ANY ||
1902
7.10k
         pop_info->op_single_type == ECL_NONE)
1903
3.40k
  {
1904
3.40k
  pop_info->op_single_type = (pop_info->op_single_type == ECL_NONE)?
1905
2.20k
      ECL_ANY : ECL_NONE;
1906
3.40k
  if (lengthptr == NULL)
1907
1.35k
    *(pop_info->code_start) = pop_info->op_single_type;
1908
3.40k
  }
1909
3.69k
else
1910
3.69k
  {
1911
3.69k
  PCRE2_ASSERT(pop_info->op_single_type == ECL_XCLASS &&
1912
3.69k
               pop_info->length >= 1 + LINK_SIZE + 1);
1913
3.69k
  if (lengthptr == NULL)
1914
1.78k
    pop_info->code_start[1 + LINK_SIZE] ^= XCL_NOT;
1915
3.69k
  }
1916
1917
11.4k
if (!preserve_classbits)
1918
6.79k
  {
1919
61.1k
  for (int i = 0; i < 8; i++)
1920
54.3k
    pop_info->bits.classwords[i] = ~pop_info->bits.classwords[i];
1921
6.79k
  }
1922
11.4k
}
1923
1924
1925
1926
/* This function folds together two operands using a binary operator.
1927
The new, combined chunk of stack code is written out to *lhs_op_info. */
1928
1929
static void
1930
fold_binary(int op, eclass_op_info *lhs_op_info, eclass_op_info *rhs_op_info,
1931
  PCRE2_SIZE *lengthptr)
1932
57.1k
{
1933
57.1k
switch (op)
1934
57.1k
  {
1935
  /* ECL_AND truth table:
1936
1937
     LHS  RHS  RESULT
1938
     ----------------
1939
     ANY  *    RHS
1940
     *    ANY  LHS
1941
     NONE *    NONE
1942
     *    NONE NONE
1943
     X    Y    X & Y
1944
  */
1945
1946
14.7k
  case ECL_AND:
1947
14.7k
  if (rhs_op_info->op_single_type == ECL_ANY)
1948
7.26k
    {
1949
    /* no-op: drop the RHS */
1950
7.26k
    }
1951
7.48k
  else if (lhs_op_info->op_single_type == ECL_ANY)
1952
1.93k
    {
1953
    /* no-op: drop the LHS, and memmove the RHS into its place */
1954
1.93k
    if (lengthptr == NULL)
1955
862
      memmove(lhs_op_info->code_start, rhs_op_info->code_start,
1956
862
              CU2BYTES(rhs_op_info->length));
1957
1.93k
    lhs_op_info->length = rhs_op_info->length;
1958
1.93k
    lhs_op_info->op_single_type = rhs_op_info->op_single_type;
1959
1.93k
    }
1960
5.55k
  else if (rhs_op_info->op_single_type == ECL_NONE)
1961
2.21k
    {
1962
    /* the result is ECL_NONE: write into the LHS */
1963
2.21k
    if (lengthptr == NULL)
1964
1.03k
      lhs_op_info->code_start[0] = ECL_NONE;
1965
2.21k
    lhs_op_info->length = 1;
1966
2.21k
    lhs_op_info->op_single_type = ECL_NONE;
1967
2.21k
    }
1968
3.33k
  else if (lhs_op_info->op_single_type == ECL_NONE)
1969
809
    {
1970
    /* the result is ECL_NONE: drop the RHS */
1971
809
    }
1972
2.52k
  else
1973
2.52k
    {
1974
    /* Both of LHS & RHS are either ECL_XCLASS, or compound operations. */
1975
2.52k
    if (lengthptr != NULL)
1976
1.63k
      *lengthptr += 1;
1977
897
    else
1978
897
      {
1979
897
      PCRE2_ASSERT(rhs_op_info->code_start ==
1980
897
          lhs_op_info->code_start + lhs_op_info->length);
1981
897
      rhs_op_info->code_start[rhs_op_info->length] = ECL_AND;
1982
897
      }
1983
2.52k
    lhs_op_info->length += rhs_op_info->length + 1;
1984
2.52k
    lhs_op_info->op_single_type = 0;
1985
2.52k
    }
1986
1987
132k
  for (int i = 0; i < 8; i++)
1988
118k
    lhs_op_info->bits.classwords[i] &= rhs_op_info->bits.classwords[i];
1989
14.7k
  break;
1990
1991
  /* ECL_OR truth table:
1992
1993
     LHS  RHS  RESULT
1994
     ----------------
1995
     ANY  *    ANY
1996
     *    ANY  ANY
1997
     NONE *    RHS
1998
     *    NONE LHS
1999
     X    Y    X | Y
2000
  */
2001
2002
34.3k
  case ECL_OR:
2003
34.3k
  if (rhs_op_info->op_single_type == ECL_NONE)
2004
15.6k
    {
2005
    /* no-op: drop the RHS */
2006
15.6k
    }
2007
18.6k
  else if (lhs_op_info->op_single_type == ECL_NONE)
2008
5.09k
    {
2009
    /* no-op: drop the LHS, and memmove the RHS into its place */
2010
5.09k
    if (lengthptr == NULL)
2011
1.73k
      memmove(lhs_op_info->code_start, rhs_op_info->code_start,
2012
1.73k
              CU2BYTES(rhs_op_info->length));
2013
5.09k
    lhs_op_info->length = rhs_op_info->length;
2014
5.09k
    lhs_op_info->op_single_type = rhs_op_info->op_single_type;
2015
5.09k
    }
2016
13.5k
  else if (rhs_op_info->op_single_type == ECL_ANY)
2017
1.76k
    {
2018
    /* the result is ECL_ANY: write into the LHS */
2019
1.76k
    if (lengthptr == NULL)
2020
793
      lhs_op_info->code_start[0] = ECL_ANY;
2021
1.76k
    lhs_op_info->length = 1;
2022
1.76k
    lhs_op_info->op_single_type = ECL_ANY;
2023
1.76k
    }
2024
11.8k
  else if (lhs_op_info->op_single_type == ECL_ANY)
2025
313
    {
2026
    /* the result is ECL_ANY: drop the RHS */
2027
313
    }
2028
11.4k
  else
2029
11.4k
    {
2030
    /* Both of LHS & RHS are either ECL_XCLASS, or compound operations. */
2031
11.4k
    if (lengthptr != NULL)
2032
8.12k
      *lengthptr += 1;
2033
3.36k
    else
2034
3.36k
      {
2035
3.36k
      PCRE2_ASSERT(rhs_op_info->code_start ==
2036
3.36k
          lhs_op_info->code_start + lhs_op_info->length);
2037
3.36k
      rhs_op_info->code_start[rhs_op_info->length] = ECL_OR;
2038
3.36k
      }
2039
11.4k
    lhs_op_info->length += rhs_op_info->length + 1;
2040
11.4k
    lhs_op_info->op_single_type = 0;
2041
11.4k
    }
2042
2043
308k
  for (int i = 0; i < 8; i++)
2044
274k
    lhs_op_info->bits.classwords[i] |= rhs_op_info->bits.classwords[i];
2045
34.3k
  break;
2046
2047
  /* ECL_XOR truth table:
2048
2049
     LHS  RHS  RESULT
2050
     ----------------
2051
     ANY  *    !RHS
2052
     *    ANY  !LHS
2053
     NONE *    RHS
2054
     *    NONE LHS
2055
     X    Y    X ^ Y
2056
  */
2057
2058
8.07k
  case ECL_XOR:
2059
8.07k
  if (rhs_op_info->op_single_type == ECL_NONE)
2060
685
    {
2061
    /* no-op: drop the RHS */
2062
685
    }
2063
7.39k
  else if (lhs_op_info->op_single_type == ECL_NONE)
2064
1.01k
    {
2065
    /* no-op: drop the LHS, and memmove the RHS into its place */
2066
1.01k
    if (lengthptr == NULL)
2067
474
      memmove(lhs_op_info->code_start, rhs_op_info->code_start,
2068
474
              CU2BYTES(rhs_op_info->length));
2069
1.01k
    lhs_op_info->length = rhs_op_info->length;
2070
1.01k
    lhs_op_info->op_single_type = rhs_op_info->op_single_type;
2071
1.01k
    }
2072
6.38k
  else if (rhs_op_info->op_single_type == ECL_ANY)
2073
3.21k
    {
2074
    /* the result is !LHS: fold in the negation, and drop the RHS */
2075
    /* Preserve the classbits, because we promise to deal with them later. */
2076
3.21k
    fold_negation(lhs_op_info, lengthptr, TRUE);
2077
3.21k
    }
2078
3.16k
  else if (lhs_op_info->op_single_type == ECL_ANY)
2079
1.48k
    {
2080
    /* the result is !RHS: drop the LHS, memmove the RHS into its place, and
2081
    fold in the negation */
2082
1.48k
    if (lengthptr == NULL)
2083
727
      memmove(lhs_op_info->code_start, rhs_op_info->code_start,
2084
727
              CU2BYTES(rhs_op_info->length));
2085
1.48k
    lhs_op_info->length = rhs_op_info->length;
2086
1.48k
    lhs_op_info->op_single_type = rhs_op_info->op_single_type;
2087
2088
    /* Preserve the classbits, because we promise to deal with them later. */
2089
1.48k
    fold_negation(lhs_op_info, lengthptr, TRUE);
2090
1.48k
    }
2091
1.68k
  else
2092
1.68k
    {
2093
    /* Both of LHS & RHS are either ECL_XCLASS, or compound operations. */
2094
1.68k
    if (lengthptr != NULL)
2095
1.03k
      *lengthptr += 1;
2096
643
    else
2097
643
      {
2098
643
      PCRE2_ASSERT(rhs_op_info->code_start ==
2099
643
          lhs_op_info->code_start + lhs_op_info->length);
2100
643
      rhs_op_info->code_start[rhs_op_info->length] = ECL_XOR;
2101
643
      }
2102
1.68k
    lhs_op_info->length += rhs_op_info->length + 1;
2103
1.68k
    lhs_op_info->op_single_type = 0;
2104
1.68k
    }
2105
2106
72.7k
  for (int i = 0; i < 8; i++)
2107
64.6k
    lhs_op_info->bits.classwords[i] ^= rhs_op_info->bits.classwords[i];
2108
8.07k
  break;
2109
2110
0
  default:
2111
0
  PCRE2_DEBUG_UNREACHABLE();
2112
0
  break;
2113
57.1k
  }
2114
57.1k
}
2115
2116
2117
2118
static BOOL
2119
compile_eclass_nested(eclass_context *context, BOOL negated,
2120
  uint32_t **pptr, PCRE2_UCHAR **pcode,
2121
  eclass_op_info *pop_info, PCRE2_SIZE *lengthptr);
2122
2123
/* This function consumes a group of implicitly-unioned class elements.
2124
These can be characters, ranges, properties, or nested classes, as long
2125
as they are all joined by being placed adjacently. */
2126
2127
static BOOL
2128
compile_class_operand(eclass_context *context, BOOL negated,
2129
  uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info,
2130
  PCRE2_SIZE *lengthptr)
2131
79.5k
{
2132
79.5k
uint32_t *ptr = *pptr;
2133
79.5k
uint32_t *prev_ptr;
2134
79.5k
PCRE2_UCHAR *code = *pcode;
2135
79.5k
PCRE2_UCHAR *code_start = code;
2136
79.5k
PCRE2_SIZE prev_length = (lengthptr != NULL)? *lengthptr : 0;
2137
79.5k
PCRE2_SIZE extra_length;
2138
79.5k
uint32_t meta = META_CODE(*ptr);
2139
2140
79.5k
switch (meta)
2141
79.5k
  {
2142
443
  case META_CLASS_EMPTY_NOT:
2143
4.02k
  case META_CLASS_EMPTY:
2144
4.02k
  ++ptr;
2145
4.02k
  pop_info->length = 1;
2146
4.02k
  if ((meta == META_CLASS_EMPTY) == negated)
2147
1.45k
    {
2148
1.45k
    *code++ = pop_info->op_single_type = ECL_ANY;
2149
1.45k
    memset(pop_info->bits.classbits, 0xff, 32);
2150
1.45k
    }
2151
2.56k
  else
2152
2.56k
    {
2153
2.56k
    *code++ = pop_info->op_single_type = ECL_NONE;
2154
2.56k
    memset(pop_info->bits.classbits, 0, 32);
2155
2.56k
    }
2156
4.02k
  break;
2157
2158
21.1k
  case META_CLASS:
2159
27.3k
  case META_CLASS_NOT:
2160
27.3k
  if ((*ptr & CLASS_IS_ECLASS) != 0)
2161
9.02k
    {
2162
9.02k
    if (!compile_eclass_nested(context, negated, &ptr, &code,
2163
9.02k
                               pop_info, lengthptr))
2164
35
      return FALSE;
2165
2166
8.99k
    PCRE2_ASSERT(*ptr == META_CLASS_END);
2167
8.99k
    ptr++;
2168
8.99k
    goto DONE;
2169
9.02k
    }
2170
2171
18.2k
  ptr++;
2172
  /* Fall through */
2173
2174
66.5k
  default:
2175
  /* Scan forward characters, ranges, and properties.
2176
  For example: inside [a-z_ -- m] we don't have brackets around "a-z_" but
2177
  we still need to collect that fragment up into a "leaf" OP_CLASS. */
2178
2179
66.5k
  prev_ptr = ptr;
2180
66.5k
  ptr = PRIV(compile_class_not_nested)(
2181
66.5k
    context->options, context->xoptions, ptr, &code,
2182
66.5k
    (meta != META_CLASS_NOT) == negated, &context->needs_bitmap,
2183
66.5k
    context->errorcodeptr, context->cb, lengthptr);
2184
66.5k
  if (ptr == NULL) return FALSE;
2185
2186
  /* We must have a 100% guarantee that ptr increases when
2187
  compile_class_operand() returns, even on Release builds, so that we can
2188
  statically prove our loops terminate. */
2189
66.5k
  if (ptr <= prev_ptr)
2190
0
    {
2191
0
    PCRE2_DEBUG_UNREACHABLE();
2192
0
    return FALSE;
2193
0
    }
2194
2195
  /* If we fell through above, consume the closing ']'. */
2196
66.5k
  if (meta == META_CLASS || meta == META_CLASS_NOT)
2197
18.2k
    {
2198
18.2k
    PCRE2_ASSERT(*ptr == META_CLASS_END);
2199
18.2k
    ptr++;
2200
18.2k
    }
2201
2202
  /* Regardless of whether (lengthptr == NULL), some data will still be written
2203
  out to *pcode, which we need: we have to peek at it, to transform the opcode
2204
  into the ECLASS version (since we need to hoist up the bitmaps). */
2205
66.5k
  PCRE2_ASSERT(code > code_start);
2206
66.5k
  extra_length = (lengthptr != NULL)? *lengthptr - prev_length : 0;
2207
2208
  /* Easiest case: convert OP_ALLANY to ECL_ANY */
2209
2210
66.5k
  if (*code_start == OP_ALLANY)
2211
247
    {
2212
247
    PCRE2_ASSERT(code - code_start == 1 && extra_length == 0);
2213
247
    pop_info->length = 1;
2214
247
    *code_start = pop_info->op_single_type = ECL_ANY;
2215
247
    memset(pop_info->bits.classbits, 0xff, 32);
2216
247
    }
2217
2218
  /* For OP_CLASS and OP_NCLASS, we hoist out the bitmap and convert to
2219
  ECL_NONE / ECL_ANY respectively. */
2220
2221
66.2k
  else if (*code_start == OP_CLASS || *code_start == OP_NCLASS)
2222
40.9k
    {
2223
40.9k
    PCRE2_ASSERT(code - code_start == 1 + 32 / sizeof(PCRE2_UCHAR) &&
2224
40.9k
                 extra_length == 0);
2225
40.9k
    pop_info->length = 1;
2226
40.9k
    *code_start = pop_info->op_single_type =
2227
40.9k
        (*code_start == OP_CLASS)? ECL_NONE : ECL_ANY;
2228
40.9k
    memcpy(pop_info->bits.classbits, code_start + 1, 32);
2229
    /* Rewind the code pointer, but make sure we adjust *lengthptr, because we
2230
    do need to reserve that space (even though we only use it temporarily). */
2231
40.9k
    if (lengthptr != NULL)
2232
25.3k
      *lengthptr += code - (code_start + 1);
2233
40.9k
    code = code_start + 1;
2234
2235
40.9k
    if (!context->needs_bitmap && *code_start == ECL_NONE)
2236
4.84k
      {
2237
4.84k
      uint32_t *classwords = pop_info->bits.classwords;
2238
2239
14.6k
      for (int i = 0; i < 8; i++)
2240
14.3k
        if (classwords[i] != 0)
2241
4.49k
          {
2242
4.49k
          context->needs_bitmap = TRUE;
2243
4.49k
          break;
2244
4.49k
          }
2245
4.84k
      }
2246
36.0k
    else
2247
36.0k
      context->needs_bitmap = TRUE;
2248
40.9k
    }
2249
2250
  /* Finally, for OP_XCLASS we hoist out the bitmap (if any), and convert to
2251
  ECL_XCLASS. */
2252
2253
25.3k
  else
2254
25.3k
    {
2255
25.3k
    PCRE2_ASSERT(*code_start == OP_XCLASS);
2256
25.3k
    *code_start = pop_info->op_single_type = ECL_XCLASS;
2257
2258
25.3k
    PCRE2_ASSERT(code - code_start >= 1 + LINK_SIZE + 1);
2259
2260
25.3k
    memcpy(pop_info->bits.classbits, context->cb->classbits.classbits, 32);
2261
25.3k
    pop_info->length = (code - code_start) + extra_length;
2262
25.3k
    }
2263
2264
66.5k
  break;
2265
79.5k
  }  /* End of switch(meta) */
2266
2267
70.5k
pop_info->code_start = (lengthptr == NULL)? code_start : NULL;
2268
2269
70.5k
if (lengthptr != NULL)
2270
43.9k
  {
2271
43.9k
  *lengthptr += code - code_start;
2272
43.9k
  code = code_start;
2273
43.9k
  }
2274
2275
79.5k
DONE:
2276
79.5k
PCRE2_ASSERT(lengthptr == NULL || (code == code_start));
2277
2278
79.5k
*pptr = ptr;
2279
79.5k
*pcode = code;
2280
79.5k
return TRUE;
2281
70.5k
}
2282
2283
2284
2285
/* This function consumes a group of implicitly-unioned class elements.
2286
These can be characters, ranges, properties, or nested classes, as long
2287
as they are all joined by being placed adjacently. */
2288
2289
static BOOL
2290
compile_class_juxtaposition(eclass_context *context, BOOL negated,
2291
  uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info,
2292
  PCRE2_SIZE *lengthptr)
2293
34.4k
{
2294
34.4k
uint32_t *ptr = *pptr;
2295
34.4k
PCRE2_UCHAR *code = *pcode;
2296
#ifdef PCRE2_DEBUG
2297
PCRE2_UCHAR *start_code = *pcode;
2298
#endif
2299
2300
/* See compile_class_binary_loose() for comments on compile-time folding of
2301
the "negated" flag. */
2302
2303
/* Because it's a non-empty class, there must be an operand at the start. */
2304
34.4k
if (!compile_class_operand(context, negated, &ptr, &code, pop_info, lengthptr))
2305
12
  return FALSE;
2306
2307
79.5k
while (*ptr != META_CLASS_END &&
2308
79.5k
       !(*ptr >= META_ECLASS_AND && *ptr <= META_ECLASS_NOT))
2309
45.1k
  {
2310
45.1k
  uint32_t op;
2311
45.1k
  BOOL rhs_negated;
2312
45.1k
  eclass_op_info rhs_op_info;
2313
2314
45.1k
  if (negated)
2315
13.2k
    {
2316
    /* !(A juxtapose B)  ->  !A && !B */
2317
13.2k
    op = ECL_AND;
2318
13.2k
    rhs_negated = TRUE;
2319
13.2k
    }
2320
31.8k
  else
2321
31.8k
    {
2322
    /* A juxtapose B  ->  A || B */
2323
31.8k
    op = ECL_OR;
2324
31.8k
    rhs_negated = FALSE;
2325
31.8k
    }
2326
2327
  /* An operand must follow the operator. */
2328
45.1k
  if (!compile_class_operand(context, rhs_negated, &ptr, &code,
2329
45.1k
                             &rhs_op_info, lengthptr))
2330
36
    return FALSE;
2331
2332
  /* Convert infix to postfix (RPN). */
2333
45.0k
  fold_binary(op, pop_info, &rhs_op_info, lengthptr);
2334
45.0k
  if (lengthptr == NULL)
2335
16.0k
    code = pop_info->code_start + pop_info->length;
2336
45.0k
  }
2337
2338
34.3k
PCRE2_ASSERT(lengthptr == NULL || code == start_code);
2339
2340
34.3k
*pptr = ptr;
2341
34.3k
*pcode = code;
2342
34.3k
return TRUE;
2343
34.4k
}
2344
2345
2346
2347
/* This function consumes unary prefix operators. */
2348
2349
static BOOL
2350
compile_class_unary(eclass_context *context, BOOL negated,
2351
  uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info,
2352
  PCRE2_SIZE *lengthptr)
2353
34.4k
{
2354
34.4k
uint32_t *ptr = *pptr;
2355
#ifdef PCRE2_DEBUG
2356
PCRE2_UCHAR *start_code = *pcode;
2357
#endif
2358
2359
35.6k
while (*ptr == META_ECLASS_NOT)
2360
1.15k
  {
2361
1.15k
  ++ptr;
2362
1.15k
  negated = !negated;
2363
1.15k
  }
2364
2365
34.4k
*pptr = ptr;
2366
/* Because it's a non-empty class, there must be an operand. */
2367
34.4k
if (!compile_class_juxtaposition(context, negated, pptr, pcode,
2368
34.4k
                                 pop_info, lengthptr))
2369
48
  return FALSE;
2370
2371
34.3k
PCRE2_ASSERT(lengthptr == NULL || *pcode == start_code);
2372
34.3k
return TRUE;
2373
34.4k
}
2374
2375
2376
2377
/* This function consumes tightly-binding binary operators. */
2378
2379
static BOOL
2380
compile_class_binary_tight(eclass_context *context, BOOL negated,
2381
  uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info,
2382
  PCRE2_SIZE *lengthptr)
2383
33.5k
{
2384
33.5k
uint32_t *ptr = *pptr;
2385
33.5k
PCRE2_UCHAR *code = *pcode;
2386
#ifdef PCRE2_DEBUG
2387
PCRE2_UCHAR *start_code = *pcode;
2388
#endif
2389
2390
/* See compile_class_binary_loose() for comments on compile-time folding of
2391
the "negated" flag. */
2392
2393
/* Because it's a non-empty class, there must be an operand at the start. */
2394
33.5k
if (!compile_class_unary(context, negated, &ptr, &code, pop_info, lengthptr))
2395
48
  return FALSE;
2396
2397
34.3k
while (*ptr == META_ECLASS_AND)
2398
882
  {
2399
882
  uint32_t op;
2400
882
  BOOL rhs_negated;
2401
882
  eclass_op_info rhs_op_info;
2402
2403
882
  if (negated)
2404
236
    {
2405
    /* !(A && B)  ->  !A || !B */
2406
236
    op = ECL_OR;
2407
236
    rhs_negated = TRUE;
2408
236
    }
2409
646
  else
2410
646
    {
2411
    /* A && B  ->  A && B */
2412
646
    op = ECL_AND;
2413
646
    rhs_negated = FALSE;
2414
646
    }
2415
2416
882
  ++ptr;
2417
2418
  /* An operand must follow the operator. */
2419
882
  if (!compile_class_unary(context, rhs_negated, &ptr, &code,
2420
882
                           &rhs_op_info, lengthptr))
2421
0
    return FALSE;
2422
2423
  /* Convert infix to postfix (RPN). */
2424
882
  fold_binary(op, pop_info, &rhs_op_info, lengthptr);
2425
882
  if (lengthptr == NULL)
2426
432
    code = pop_info->code_start + pop_info->length;
2427
882
  }
2428
2429
33.5k
PCRE2_ASSERT(lengthptr == NULL || code == start_code);
2430
2431
33.5k
*pptr = ptr;
2432
33.5k
*pcode = code;
2433
33.5k
return TRUE;
2434
33.5k
}
2435
2436
2437
2438
/* This function consumes loosely-binding binary operators. */
2439
2440
static BOOL
2441
compile_class_binary_loose(eclass_context *context, BOOL negated,
2442
  uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info,
2443
  PCRE2_SIZE *lengthptr)
2444
22.3k
{
2445
22.3k
uint32_t *ptr = *pptr;
2446
22.3k
PCRE2_UCHAR *code = *pcode;
2447
#ifdef PCRE2_DEBUG
2448
PCRE2_UCHAR *start_code = *pcode;
2449
#endif
2450
2451
/* We really want to fold the negation operator, if at all possible, so that
2452
simple cases can be reduced down. In particular, in 8-bit no-UTF mode, we want
2453
to produce a fully-folded expression, so that we can guarantee not to emit any
2454
OP_ECLASS codes (in the same way that we never emit OP_XCLASS in this mode).
2455
2456
This has the consequence that with a little ingenuity, we can in fact avoid
2457
emitting (nearly...) all cases of the "NOT" operator. Imagine that we have:
2458
    !(A ...
2459
We have parsed the preceding "!", and we are about to parse the "A" operand. We
2460
don't know yet whether there will even be a following binary operand! Both of
2461
these are possibilities for what follows:
2462
    !(A && B)
2463
    !(A)
2464
However, we can still fold the "!" into the "A" operand, because no matter what
2465
the following binary operator will be, we can produce an expression which is
2466
equivalent. */
2467
2468
/* Because it's a non-empty class, there must be an operand at the start. */
2469
22.3k
if (!compile_class_binary_tight(context, negated, &ptr, &code,
2470
22.3k
                                pop_info, lengthptr))
2471
20
  return FALSE;
2472
2473
33.5k
while (*ptr >= META_ECLASS_OR && *ptr <= META_ECLASS_XOR)
2474
11.1k
  {
2475
11.1k
  uint32_t op;
2476
11.1k
  BOOL op_neg;
2477
11.1k
  BOOL rhs_negated;
2478
11.1k
  eclass_op_info rhs_op_info;
2479
2480
11.1k
  if (negated)
2481
7.44k
    {
2482
    /* The whole expression is being negated; we respond by unconditionally
2483
    negating the LHS A, before seeing what follows. And hooray! We can recover,
2484
    no matter what follows. */
2485
    /* !(A || B)   ->  !A && !B                     */
2486
    /* !(A -- B)   ->  !(A && !B)    ->  !A || B    */
2487
    /* !(A XOR B)  ->  !(!A XOR !B)  ->  !A XNOR !B */
2488
7.44k
    op = (*ptr == META_ECLASS_OR )? ECL_AND :
2489
7.44k
         (*ptr == META_ECLASS_SUB)? ECL_OR  :
2490
6.95k
         /*ptr == META_ECLASS_XOR*/ ECL_XOR;
2491
7.44k
    op_neg = (*ptr == META_ECLASS_XOR);
2492
7.44k
    rhs_negated = *ptr != META_ECLASS_SUB;
2493
7.44k
    }
2494
3.74k
  else
2495
3.74k
    {
2496
    /* A || B   ->  A || B  */
2497
    /* A -- B   ->  A && !B */
2498
    /* A XOR B  ->  A XOR B */
2499
3.74k
    op = (*ptr == META_ECLASS_OR )? ECL_OR  :
2500
3.74k
         (*ptr == META_ECLASS_SUB)? ECL_AND :
2501
1.64k
         /*ptr == META_ECLASS_XOR*/ ECL_XOR;
2502
3.74k
    op_neg = FALSE;
2503
3.74k
    rhs_negated = *ptr == META_ECLASS_SUB;
2504
3.74k
    }
2505
2506
11.1k
  ++ptr;
2507
2508
  /* An operand must follow the operator. */
2509
11.1k
  if (!compile_class_binary_tight(context, rhs_negated, &ptr, &code,
2510
11.1k
                                  &rhs_op_info, lengthptr))
2511
28
    return FALSE;
2512
2513
  /* Convert infix to postfix (RPN). */
2514
11.1k
  fold_binary(op, pop_info, &rhs_op_info, lengthptr);
2515
11.1k
  if (op_neg) fold_negation(pop_info, lengthptr, FALSE);
2516
11.1k
  if (lengthptr == NULL)
2517
4.36k
    code = pop_info->code_start + pop_info->length;
2518
11.1k
  }
2519
2520
22.3k
PCRE2_ASSERT(lengthptr == NULL || code == start_code);
2521
2522
22.3k
*pptr = ptr;
2523
22.3k
*pcode = code;
2524
22.3k
return TRUE;
2525
22.3k
}
2526
2527
2528
2529
/* This function converts the META codes in pptr into opcodes written to
2530
pcode. The pptr must start at a META_CLASS or META_CLASS_NOT.
2531
2532
The class is compiled as a left-associative sequence of operator
2533
applications.
2534
2535
The pptr will be left pointing at the matching META_CLASS_END. */
2536
2537
static BOOL
2538
compile_eclass_nested(eclass_context *context, BOOL negated,
2539
  uint32_t **pptr, PCRE2_UCHAR **pcode,
2540
  eclass_op_info *pop_info, PCRE2_SIZE *lengthptr)
2541
22.3k
{
2542
22.3k
uint32_t *ptr = *pptr;
2543
#ifdef PCRE2_DEBUG
2544
PCRE2_UCHAR *start_code = *pcode;
2545
#endif
2546
2547
/* The CLASS_IS_ECLASS bit must be set since it is a nested class. */
2548
22.3k
PCRE2_ASSERT(*ptr == (META_CLASS | CLASS_IS_ECLASS) ||
2549
22.3k
             *ptr == (META_CLASS_NOT | CLASS_IS_ECLASS));
2550
2551
22.3k
if (*ptr++ == (META_CLASS_NOT | CLASS_IS_ECLASS))
2552
5.57k
  negated = !negated;
2553
2554
22.3k
(*pptr)++;
2555
2556
/* Because it's a non-empty class, there must be an operand at the start. */
2557
22.3k
if (!compile_class_binary_loose(context, negated, pptr, pcode,
2558
22.3k
                                pop_info, lengthptr))
2559
48
  return FALSE;
2560
2561
22.3k
PCRE2_ASSERT(**pptr == META_CLASS_END);
2562
22.3k
PCRE2_ASSERT(lengthptr == NULL || *pcode == start_code);
2563
22.3k
return TRUE;
2564
22.3k
}
2565
2566
BOOL
2567
PRIV(compile_class_nested)(uint32_t options, uint32_t xoptions,
2568
  uint32_t **pptr, PCRE2_UCHAR **pcode, int *errorcodeptr,
2569
  compile_block *cb, PCRE2_SIZE *lengthptr)
2570
13.3k
{
2571
13.3k
eclass_context context;
2572
13.3k
eclass_op_info op_info;
2573
13.3k
PCRE2_SIZE previous_length = (lengthptr != NULL)? *lengthptr : 0;
2574
13.3k
PCRE2_UCHAR *code = *pcode;
2575
13.3k
PCRE2_UCHAR *previous;
2576
13.3k
BOOL allbitsone = TRUE;
2577
2578
13.3k
context.needs_bitmap = FALSE;
2579
13.3k
context.options = options;
2580
13.3k
context.xoptions = xoptions;
2581
13.3k
context.errorcodeptr = errorcodeptr;
2582
13.3k
context.cb = cb;
2583
2584
13.3k
previous = code;
2585
13.3k
*code++ = OP_ECLASS;
2586
13.3k
code += LINK_SIZE;
2587
13.3k
*code++ = 0;  /* Flags, currently zero. */
2588
13.3k
if (!compile_eclass_nested(&context, FALSE, pptr, &code, &op_info, lengthptr))
2589
13
  return FALSE;
2590
2591
13.3k
if (lengthptr != NULL)
2592
7.60k
  {
2593
7.60k
  *lengthptr += code - previous;
2594
7.60k
  code = previous;
2595
  /* (*lengthptr - previous_length) now holds the amount of buffer that
2596
  we require to make the call to compile_class_nested() with
2597
  lengthptr = NULL, and including the (1+LINK_SIZE+1) that we write out
2598
  before that call. */
2599
7.60k
  }
2600
2601
/* Do some useful counting of what's in the bitmap. */
2602
31.2k
for (int i = 0; i < 8; i++)
2603
29.5k
  if (op_info.bits.classwords[i] != 0xffffffff)
2604
11.6k
    {
2605
11.6k
    allbitsone = FALSE;
2606
11.6k
    break;
2607
11.6k
    }
2608
2609
/* After constant-folding the extended class syntax, it may turn out to be
2610
a simple class after all. In that case, we can unwrap it from the
2611
OP_ECLASS container - and in fact, we must do so, because in 8-bit
2612
no-Unicode mode the matcher is compiled without support for OP_ECLASS. */
2613
2614
#ifndef SUPPORT_WIDE_CHARS
2615
PCRE2_ASSERT(op_info.op_single_type != 0);
2616
#else
2617
13.3k
if (op_info.op_single_type != 0)
2618
7.99k
#endif
2619
7.99k
  {
2620
  /* Rewind back over the OP_ECLASS. */
2621
7.99k
  code = previous;
2622
2623
  /* If the bits are all ones, and the "high characters" are all matched
2624
  too, we use a special-cased encoding of OP_ALLANY. */
2625
2626
7.99k
  if (op_info.op_single_type == ECL_ANY && allbitsone)
2627
901
    {
2628
    /* Advancing code means rewinding lengthptr, at this point. */
2629
901
    if (lengthptr != NULL) *lengthptr -= 1;
2630
901
    *code++ = OP_ALLANY;
2631
901
    }
2632
2633
  /* If the high bits are all matched / all not-matched, then we emit an
2634
  OP_NCLASS/OP_CLASS respectively. */
2635
2636
7.09k
  else if (op_info.op_single_type == ECL_ANY ||
2637
7.09k
           op_info.op_single_type == ECL_NONE)
2638
4.22k
    {
2639
4.22k
    PCRE2_SIZE required_len = 1 + (32 / sizeof(PCRE2_UCHAR));
2640
2641
4.22k
    if (lengthptr != NULL)
2642
2.30k
      {
2643
2.30k
      if (required_len > (*lengthptr - previous_length))
2644
225
      *lengthptr = previous_length + required_len;
2645
2.30k
      }
2646
2647
    /* Advancing code means rewinding lengthptr, at this point. */
2648
4.22k
    if (lengthptr != NULL) *lengthptr -= required_len;
2649
4.22k
    *code++ = (op_info.op_single_type == ECL_ANY)? OP_NCLASS : OP_CLASS;
2650
4.22k
    memcpy(code, op_info.bits.classbits, 32);
2651
4.22k
    code += 32 / sizeof(PCRE2_UCHAR);
2652
4.22k
    }
2653
2654
  /* Otherwise, we have an ECL_XCLASS, so we have the OP_XCLASS data
2655
  there, but, we pulled out its bitmap into op_info, so now we have to
2656
  put that back into the OP_XCLASS. */
2657
2658
2.86k
  else
2659
2.86k
    {
2660
#ifndef SUPPORT_WIDE_CHARS
2661
    PCRE2_DEBUG_UNREACHABLE();
2662
#else
2663
2.86k
    BOOL need_map = context.needs_bitmap;
2664
2.86k
    PCRE2_SIZE required_len;
2665
2666
2.86k
    PCRE2_ASSERT(op_info.op_single_type == ECL_XCLASS);
2667
2.86k
    required_len = op_info.length + (need_map? 32/sizeof(PCRE2_UCHAR) : 0);
2668
2669
2.86k
    if (lengthptr != NULL)
2670
1.70k
      {
2671
      /* Don't unconditionally request all the space we need - we may
2672
      already have asked for more during processing of the ECLASS. */
2673
1.70k
      if (required_len > (*lengthptr - previous_length))
2674
245
        *lengthptr = previous_length + required_len;
2675
2676
      /* The code we write out here won't be ignored, even during the
2677
      (lengthptr != NULL) phase, because if there's a following quantifier
2678
      it will peek backwards. So we do have to write out a (truncated)
2679
      OP_XCLASS, even on this branch. */
2680
1.70k
      *lengthptr -= 1 + LINK_SIZE + 1;
2681
1.70k
      *code++ = OP_XCLASS;
2682
1.70k
      PUT(code, 0, 1 + LINK_SIZE + 1);
2683
1.70k
      code += LINK_SIZE;
2684
1.70k
      *code++ = 0;
2685
1.70k
      }
2686
1.16k
    else
2687
1.16k
      {
2688
1.16k
      PCRE2_UCHAR *rest;
2689
1.16k
      PCRE2_SIZE rest_len;
2690
1.16k
      PCRE2_UCHAR flags;
2691
2692
      /* 1 unit: OP_XCLASS | LINK_SIZE units | 1 unit: flags | ...rest */
2693
1.16k
      PCRE2_ASSERT(op_info.length >= 1 + LINK_SIZE + 1);
2694
1.16k
      rest = op_info.code_start + 1 + LINK_SIZE + 1;
2695
1.16k
      rest_len = (op_info.code_start + op_info.length) - rest;
2696
2697
      /* First read any data we use, before memmove splats it. */
2698
1.16k
      flags = op_info.code_start[1 + LINK_SIZE];
2699
1.16k
      PCRE2_ASSERT((flags & XCL_MAP) == 0);
2700
2701
      /* Next do the memmove before any writes. */
2702
1.16k
      memmove(code + 1 + LINK_SIZE + 1 + (need_map? 32/sizeof(PCRE2_UCHAR) : 0),
2703
1.16k
              rest, CU2BYTES(rest_len));
2704
2705
      /* Finally write the header data. */
2706
1.16k
      *code++ = OP_XCLASS;
2707
1.16k
      PUT(code, 0, (int)required_len);
2708
1.16k
      code += LINK_SIZE;
2709
1.16k
      *code++ = flags | (need_map? XCL_MAP : 0);
2710
1.16k
      if (need_map)
2711
908
        {
2712
908
        memcpy(code, op_info.bits.classbits, 32);
2713
908
        code += 32 / sizeof(PCRE2_UCHAR);
2714
908
        }
2715
1.16k
      code += rest_len;
2716
1.16k
      }
2717
2.86k
#endif /* SUPPORT_WIDE_CHARS */
2718
2.86k
    }
2719
7.99k
  }
2720
2721
/* Otherwise, we're going to keep the OP_ECLASS. However, again we need
2722
to do some adjustment to insert the bitmap if we have one. */
2723
2724
5.33k
#ifdef SUPPORT_WIDE_CHARS
2725
5.33k
else
2726
5.33k
  {
2727
5.33k
  BOOL need_map = context.needs_bitmap;
2728
5.33k
  PCRE2_SIZE required_len =
2729
5.33k
    1 + LINK_SIZE + 1 + (need_map? 32/sizeof(PCRE2_UCHAR) : 0) + op_info.length;
2730
2731
5.33k
  if (lengthptr != NULL)
2732
3.11k
    {
2733
3.11k
    if (required_len > (*lengthptr - previous_length))
2734
1.01k
      *lengthptr = previous_length + required_len;
2735
2736
    /* As for the XCLASS branch above, we do have to write out a dummy
2737
    OP_ECLASS, because of the backwards peek by the quantifier code. Write
2738
    out a (truncated) OP_ECLASS, even on this branch. */
2739
3.11k
    *lengthptr -= 1 + LINK_SIZE + 1;
2740
3.11k
    *code++ = OP_ECLASS;
2741
3.11k
    PUT(code, 0, 1 + LINK_SIZE + 1);
2742
3.11k
    code += LINK_SIZE;
2743
3.11k
    *code++ = 0;
2744
3.11k
    }
2745
2.22k
  else
2746
2.22k
    {
2747
2.22k
    if (need_map)
2748
1.98k
      {
2749
1.98k
      PCRE2_UCHAR *map_start = previous + 1 + LINK_SIZE + 1;
2750
1.98k
      previous[1 + LINK_SIZE] |= ECL_MAP;
2751
1.98k
      memmove(map_start + 32/sizeof(PCRE2_UCHAR), map_start,
2752
1.98k
              CU2BYTES(code - map_start));
2753
1.98k
      memcpy(map_start, op_info.bits.classbits, 32);
2754
1.98k
      code += 32 / sizeof(PCRE2_UCHAR);
2755
1.98k
      }
2756
2.22k
    PUT(previous, 1, (int)(code - previous));
2757
2.22k
    }
2758
5.33k
  }
2759
13.3k
#endif /* SUPPORT_WIDE_CHARS */
2760
2761
13.3k
*pcode = code;
2762
13.3k
return TRUE;
2763
13.3k
}
2764
2765
/* End of pcre2_compile_class.c */