Coverage Report

Created: 2025-06-13 06:43

/src/php-src/ext/pcre/pcre2lib/pcre2_compile_class.c
Line
Count
Source (jump to first uncovered line)
1
/*************************************************
2
*      Perl-Compatible Regular Expressions       *
3
*************************************************/
4
5
/* PCRE is a library of functions to support regular expressions whose syntax
6
and semantics are as close as possible to those of the Perl 5 language.
7
8
                       Written by Philip Hazel
9
     Original API code Copyright (c) 1997-2012 University of Cambridge
10
          New API code Copyright (c) 2016-2024 University of Cambridge
11
12
-----------------------------------------------------------------------------
13
Redistribution and use in source and binary forms, with or without
14
modification, are permitted provided that the following conditions are met:
15
16
    * Redistributions of source code must retain the above copyright notice,
17
      this list of conditions and the following disclaimer.
18
19
    * Redistributions in binary form must reproduce the above copyright
20
      notice, this list of conditions and the following disclaimer in the
21
      documentation and/or other materials provided with the distribution.
22
23
    * Neither the name of the University of Cambridge nor the names of its
24
      contributors may be used to endorse or promote products derived from
25
      this software without specific prior written permission.
26
27
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37
POSSIBILITY OF SUCH DAMAGE.
38
-----------------------------------------------------------------------------
39
*/
40
41
#ifdef HAVE_CONFIG_H
42
#include "config.h"
43
#endif
44
45
#include "pcre2_compile.h"
46
47
typedef struct {
48
  /* Option bits for eclass. */
49
  uint32_t options;
50
  uint32_t xoptions;
51
  /* Rarely used members. */
52
  int *errorcodeptr;
53
  compile_block *cb;
54
  /* Bitmap is needed. */
55
  BOOL needs_bitmap;
56
} eclass_context;
57
58
/* Checks the allowed tokens at the end of a class structure in debug mode.
59
When a new token is not processed by all loops, and the token is equals to
60
a) one of the cases here:
61
   the compiler will complain about a duplicated case value.
62
b) none of the cases here:
63
   the loop without the handler will stop with an assertion failure. */
64
65
#ifdef PCRE2_DEBUG
66
#define CLASS_END_CASES(meta) \
67
  default: \
68
  PCRE2_ASSERT((meta) <= META_END); \
69
  /* Fall through */ \
70
  case META_CLASS: \
71
  case META_CLASS_NOT: \
72
  case META_CLASS_EMPTY: \
73
  case META_CLASS_EMPTY_NOT: \
74
  case META_CLASS_END: \
75
  case META_ECLASS_AND: \
76
  case META_ECLASS_OR: \
77
  case META_ECLASS_SUB: \
78
  case META_ECLASS_XOR: \
79
  case META_ECLASS_NOT:
80
#else
81
#define CLASS_END_CASES(meta) \
82
499k
  default:
83
#endif
84
85
#ifdef SUPPORT_WIDE_CHARS
86
87
/* Heapsort algorithm. */
88
89
static void do_heapify(uint32_t *buffer, size_t size, size_t i)
90
62.8k
{
91
62.8k
size_t max;
92
62.8k
size_t left;
93
62.8k
size_t right;
94
62.8k
uint32_t tmp1, tmp2;
95
96
187k
while (TRUE)
97
187k
  {
98
187k
  max = i;
99
187k
  left = (i << 1) + 2;
100
187k
  right = left + 2;
101
102
187k
  if (left < size && buffer[left] > buffer[max]) max = left;
103
187k
  if (right < size && buffer[right] > buffer[max]) max = right;
104
187k
  if (i == max) return;
105
106
  /* Swap items. */
107
124k
  tmp1 = buffer[i];
108
124k
  tmp2 = buffer[i + 1];
109
124k
  buffer[i] = buffer[max];
110
124k
  buffer[i + 1] = buffer[max + 1];
111
124k
  buffer[max] = tmp1;
112
124k
  buffer[max + 1] = tmp2;
113
124k
  i = max;
114
124k
  }
115
62.8k
}
116
117
#ifdef SUPPORT_UNICODE
118
119
1.71k
#define PARSE_CLASS_UTF               0x1
120
54.0k
#define PARSE_CLASS_CASELESS_UTF      0x2
121
46.2k
#define PARSE_CLASS_RESTRICTED_UTF    0x4
122
131k
#define PARSE_CLASS_TURKISH_UTF       0x8
123
124
/* Get the range of nocase characters which includes the
125
'c' character passed as argument, or directly follows 'c'. */
126
127
static const uint32_t*
128
get_nocase_range(uint32_t c)
129
51.2k
{
130
51.2k
uint32_t left = 0;
131
51.2k
uint32_t right = PRIV(ucd_nocase_ranges_size);
132
51.2k
uint32_t middle;
133
134
51.2k
if (c > MAX_UTF_CODE_POINT) return PRIV(ucd_nocase_ranges) + right;
135
136
284k
while (TRUE)
137
284k
  {
138
  /* Range end of the middle element. */
139
284k
  middle = ((left + right) >> 1) | 0x1;
140
141
284k
  if (PRIV(ucd_nocase_ranges)[middle] <= c)
142
220
    left = middle + 1;
143
284k
  else if (middle > 1 && PRIV(ucd_nocase_ranges)[middle - 2] > c)
144
233k
    right = middle - 1;
145
51.2k
  else
146
51.2k
    return PRIV(ucd_nocase_ranges) + (middle - 1);
147
284k
  }
148
51.2k
}
149
150
/* Get the list of othercase characters, which belongs to the passed range.
151
Create ranges from these characters, and append them to the buffer argument. */
152
153
static size_t
154
utf_caseless_extend(uint32_t start, uint32_t end, uint32_t options,
155
  uint32_t *buffer)
156
51.2k
{
157
51.2k
uint32_t new_start = start;
158
51.2k
uint32_t new_end = end;
159
51.2k
uint32_t c = start;
160
51.2k
const uint32_t *list;
161
51.2k
uint32_t tmp[3];
162
51.2k
size_t result = 2;
163
51.2k
const uint32_t *skip_range = get_nocase_range(c);
164
51.2k
uint32_t skip_start = skip_range[0];
165
166
51.2k
#if PCRE2_CODE_UNIT_WIDTH == 8
167
51.2k
PCRE2_ASSERT(options & PARSE_CLASS_UTF);
168
51.2k
#endif
169
170
#if PCRE2_CODE_UNIT_WIDTH == 32
171
if (end > MAX_UTF_CODE_POINT) end = MAX_UTF_CODE_POINT;
172
#endif
173
174
117k
while (c <= end)
175
66.2k
  {
176
66.2k
  uint32_t co;
177
178
66.2k
  if (c > skip_start)
179
22.4k
    {
180
22.4k
    c = skip_range[1];
181
22.4k
    skip_range += 2;
182
22.4k
    skip_start = skip_range[0];
183
22.4k
    continue;
184
22.4k
    }
185
186
  /* Compute caseless set. */
187
188
43.7k
  if ((options & (PARSE_CLASS_TURKISH_UTF|PARSE_CLASS_RESTRICTED_UTF)) ==
189
43.7k
        PARSE_CLASS_TURKISH_UTF &&
190
43.7k
      UCD_ANY_I(c))
191
0
    {
192
0
    co = PRIV(ucd_turkish_dotted_i_caseset) + (UCD_DOTTED_I(c)? 0 : 3);
193
0
    }
194
43.7k
  else if ((co = UCD_CASESET(c)) != 0 &&
195
43.7k
           (options & PARSE_CLASS_RESTRICTED_UTF) != 0 &&
196
43.7k
           PRIV(ucd_caseless_sets)[co] < 128)
197
0
    {
198
0
    co = 0;  /* Ignore the caseless set if it's restricted. */
199
0
    }
200
201
43.7k
  if (co != 0)
202
2.45k
    list = PRIV(ucd_caseless_sets) + co;
203
41.2k
  else
204
41.2k
    {
205
41.2k
    co = UCD_OTHERCASE(c);
206
41.2k
    list = tmp;
207
41.2k
    tmp[0] = c;
208
41.2k
    tmp[1] = NOTACHAR;
209
210
41.2k
    if (co != c)
211
28.0k
      {
212
28.0k
      tmp[1] = co;
213
28.0k
      tmp[2] = NOTACHAR;
214
28.0k
      }
215
41.2k
    }
216
43.7k
  c++;
217
218
  /* Add characters. */
219
43.7k
  do
220
76.6k
    {
221
#if PCRE2_CODE_UNIT_WIDTH == 16
222
    if (!(options & PARSE_CLASS_UTF) && *list > 0xffff) continue;
223
#endif
224
225
76.6k
    if (*list < new_start)
226
13.2k
      {
227
13.2k
      if (*list + 1 == new_start)
228
0
        {
229
0
        new_start--;
230
0
        continue;
231
0
        }
232
13.2k
      }
233
63.4k
    else if (*list > new_end)
234
19.2k
      {
235
19.2k
      if (*list - 1 == new_end)
236
482
        {
237
482
        new_end++;
238
482
        continue;
239
482
        }
240
19.2k
      }
241
44.2k
    else continue;
242
243
31.9k
    result += 2;
244
31.9k
    if (buffer != NULL)
245
15.9k
      {
246
15.9k
      buffer[0] = *list;
247
15.9k
      buffer[1] = *list;
248
15.9k
      buffer += 2;
249
15.9k
      }
250
31.9k
    }
251
76.6k
  while (*(++list) != NOTACHAR);
252
43.7k
  }
253
254
51.2k
  if (buffer != NULL)
255
25.6k
    {
256
25.6k
    buffer[0] = new_start;
257
25.6k
    buffer[1] = new_end;
258
25.6k
    buffer += 2;
259
25.6k
    (void)buffer;
260
25.6k
    }
261
51.2k
  return result;
262
51.2k
}
263
264
#endif
265
266
/* Add a character list to a buffer. */
267
268
static size_t
269
append_char_list(const uint32_t *p, uint32_t *buffer)
270
0
{
271
0
const uint32_t *n;
272
0
size_t result = 0;
273
274
0
while (*p != NOTACHAR)
275
0
  {
276
0
  n = p;
277
0
  while (n[0] == n[1] - 1) n++;
278
279
0
  PCRE2_ASSERT(*p < 0xffff);
280
281
0
  if (buffer != NULL)
282
0
    {
283
0
    buffer[0] = *p;
284
0
    buffer[1] = *n;
285
0
    buffer += 2;
286
0
    }
287
288
0
  result += 2;
289
0
  p = n + 1;
290
0
  }
291
292
0
  return result;
293
0
}
294
295
static uint32_t
296
get_highest_char(uint32_t options)
297
0
{
298
0
(void)options; /* Avoid compiler warning. */
299
300
0
#if PCRE2_CODE_UNIT_WIDTH == 8
301
0
return MAX_UTF_CODE_POINT;
302
#else
303
#ifdef SUPPORT_UNICODE
304
return GET_MAX_CHAR_VALUE((options & PARSE_CLASS_UTF) != 0);
305
#else
306
return MAX_UCHAR_VALUE;
307
#endif
308
#endif
309
0
}
310
311
/* Add a negated character list to a buffer. */
312
static size_t
313
append_negated_char_list(const uint32_t *p, uint32_t options, uint32_t *buffer)
314
0
{
315
0
const uint32_t *n;
316
0
uint32_t start = 0;
317
0
size_t result = 2;
318
319
0
PCRE2_ASSERT(*p > 0);
320
321
0
while (*p != NOTACHAR)
322
0
  {
323
0
  n = p;
324
0
  while (n[0] == n[1] - 1) n++;
325
326
0
  PCRE2_ASSERT(*p < 0xffff);
327
328
0
  if (buffer != NULL)
329
0
    {
330
0
    buffer[0] = start;
331
0
    buffer[1] = *p - 1;
332
0
    buffer += 2;
333
0
    }
334
335
0
  result += 2;
336
0
  start = *n + 1;
337
0
  p = n + 1;
338
0
  }
339
340
0
  if (buffer != NULL)
341
0
    {
342
0
    buffer[0] = start;
343
0
    buffer[1] = get_highest_char(options);
344
0
    buffer += 2;
345
0
    (void)buffer;
346
0
    }
347
348
0
  return result;
349
0
}
350
351
static uint32_t *
352
append_non_ascii_range(uint32_t options, uint32_t *buffer)
353
0
{
354
0
  if (buffer == NULL) return NULL;
355
356
0
  buffer[0] = 0x100;
357
0
  buffer[1] = get_highest_char(options);
358
0
  return buffer + 2;
359
0
}
360
361
static size_t
362
parse_class(uint32_t *ptr, uint32_t options, uint32_t *buffer)
363
3.43k
{
364
3.43k
size_t total_size = 0;
365
3.43k
size_t size;
366
3.43k
uint32_t meta_arg;
367
3.43k
uint32_t start_char;
368
369
56.1k
while (TRUE)
370
56.1k
  {
371
56.1k
  switch (META_CODE(*ptr))
372
56.1k
    {
373
380
    case META_ESCAPE:
374
380
      meta_arg = META_DATA(*ptr);
375
380
      switch (meta_arg)
376
380
        {
377
0
        case ESC_D:
378
0
        case ESC_W:
379
0
        case ESC_S:
380
0
        buffer = append_non_ascii_range(options, buffer);
381
0
        total_size += 2;
382
0
        break;
383
384
0
        case ESC_h:
385
0
        size = append_char_list(PRIV(hspace_list), buffer);
386
0
        total_size += size;
387
0
        if (buffer != NULL) buffer += size;
388
0
        break;
389
390
0
        case ESC_H:
391
0
        size = append_negated_char_list(PRIV(hspace_list), options, buffer);
392
0
        total_size += size;
393
0
        if (buffer != NULL) buffer += size;
394
0
        break;
395
396
0
        case ESC_v:
397
0
        size = append_char_list(PRIV(vspace_list), buffer);
398
0
        total_size += size;
399
0
        if (buffer != NULL) buffer += size;
400
0
        break;
401
402
0
        case ESC_V:
403
0
        size = append_negated_char_list(PRIV(vspace_list), options, buffer);
404
0
        total_size += size;
405
0
        if (buffer != NULL) buffer += size;
406
0
        break;
407
408
380
        case ESC_p:
409
380
        case ESC_P:
410
380
        ptr++;
411
380
        if (meta_arg == ESC_p && (*ptr >> 16) == PT_ANY)
412
0
          {
413
0
          if (buffer != NULL)
414
0
            {
415
0
            buffer[0] = 0;
416
0
            buffer[1] = get_highest_char(options);
417
0
            buffer += 2;
418
0
            }
419
0
          total_size += 2;
420
0
          }
421
380
        break;
422
380
        }
423
380
      ptr++;
424
380
      continue;
425
0
    case META_POSIX_NEG:
426
0
      buffer = append_non_ascii_range(options, buffer);
427
0
      total_size += 2;
428
0
      ptr += 2;
429
0
      continue;
430
0
    case META_POSIX:
431
0
      ptr += 2;
432
0
      continue;
433
0
    case META_BIGVALUE:
434
      /* Character literal */
435
0
      ptr++;
436
0
      break;
437
55.7k
    CLASS_END_CASES(*ptr)
438
55.7k
      if (*ptr >= META_END) return total_size;
439
52.3k
      break;
440
56.1k
    }
441
442
52.3k
    start_char = *ptr;
443
444
52.3k
    if (ptr[1] == META_RANGE_LITERAL || ptr[1] == META_RANGE_ESCAPED)
445
566
      {
446
566
      ptr += 2;
447
566
      PCRE2_ASSERT(*ptr < META_END || *ptr == META_BIGVALUE);
448
449
566
      if (*ptr == META_BIGVALUE) ptr++;
450
451
#ifdef EBCDIC
452
#error "Missing EBCDIC support"
453
#endif
454
566
      }
455
456
52.3k
#ifdef SUPPORT_UNICODE
457
52.3k
    if (options & PARSE_CLASS_CASELESS_UTF)
458
51.2k
      {
459
51.2k
      size = utf_caseless_extend(start_char, *ptr++, options, buffer);
460
51.2k
      if (buffer != NULL) buffer += size;
461
51.2k
      total_size += size;
462
51.2k
      continue;
463
51.2k
      }
464
1.13k
#endif
465
466
1.13k
    if (buffer != NULL)
467
568
      {
468
568
      buffer[0] = start_char;
469
568
      buffer[1] = *ptr;
470
568
      buffer += 2;
471
568
      }
472
473
1.13k
    ptr++;
474
1.13k
    total_size += 2;
475
1.13k
  }
476
477
0
  return total_size;
478
3.43k
}
479
480
/* Extra uint32_t values for storing the lengths of range lists in
481
the worst case. Two uint32_t lengths and a range end for a range
482
starting before 255 */
483
1.71k
#define CHAR_LIST_EXTRA_SIZE 3
484
485
/* Starting character values for each character list. */
486
487
static const uint32_t char_list_starts[] = {
488
#if PCRE2_CODE_UNIT_WIDTH == 32
489
  XCL_CHAR_LIST_HIGH_32_START,
490
#endif
491
#if PCRE2_CODE_UNIT_WIDTH == 32 || defined SUPPORT_UNICODE
492
  XCL_CHAR_LIST_LOW_32_START,
493
#endif
494
  XCL_CHAR_LIST_HIGH_16_START,
495
  /* Must be terminated by XCL_CHAR_LIST_LOW_16_START,
496
  which also represents the end of the bitset. */
497
  XCL_CHAR_LIST_LOW_16_START,
498
};
499
500
static class_ranges *
501
compile_optimize_class(uint32_t *start_ptr, uint32_t options,
502
  uint32_t xoptions, compile_block *cb)
503
1.71k
{
504
1.71k
class_ranges* cranges;
505
1.71k
uint32_t *ptr;
506
1.71k
uint32_t *buffer;
507
1.71k
uint32_t *dst;
508
1.71k
uint32_t class_options = 0;
509
1.71k
size_t range_list_size = 0, total_size, i;
510
1.71k
uint32_t tmp1, tmp2;
511
1.71k
const uint32_t *char_list_next;
512
1.71k
uint16_t *next_char;
513
1.71k
uint32_t char_list_start, char_list_end;
514
1.71k
uint32_t range_start, range_end;
515
516
1.71k
#ifdef SUPPORT_UNICODE
517
1.71k
if (options & PCRE2_UTF)
518
1.71k
  class_options |= PARSE_CLASS_UTF;
519
520
1.71k
if ((options & PCRE2_CASELESS) && (options & (PCRE2_UTF|PCRE2_UCP)))
521
1.69k
  class_options |= PARSE_CLASS_CASELESS_UTF;
522
523
1.71k
if (xoptions & PCRE2_EXTRA_CASELESS_RESTRICT)
524
0
  class_options |= PARSE_CLASS_RESTRICTED_UTF;
525
526
1.71k
if (xoptions & PCRE2_EXTRA_TURKISH_CASING)
527
0
  class_options |= PARSE_CLASS_TURKISH_UTF;
528
1.71k
#endif
529
530
/* Compute required space for the range. */
531
532
1.71k
range_list_size = parse_class(start_ptr, class_options, NULL);
533
1.71k
PCRE2_ASSERT((range_list_size & 0x1) == 0);
534
535
/* Allocate buffer. The total_size also represents the end of the buffer. */
536
537
1.71k
total_size = range_list_size +
538
1.71k
   ((range_list_size >= 2) ? CHAR_LIST_EXTRA_SIZE : 0);
539
540
1.71k
cranges = cb->cx->memctl.malloc(
541
1.71k
  sizeof(class_ranges) + total_size * sizeof(uint32_t),
542
1.71k
  cb->cx->memctl.memory_data);
543
544
1.71k
if (cranges == NULL) return NULL;
545
546
1.71k
cranges->next = NULL;
547
1.71k
cranges->range_list_size = (uint16_t)range_list_size;
548
1.71k
cranges->char_lists_types = 0;
549
1.71k
cranges->char_lists_size = 0;
550
1.71k
cranges->char_lists_start = 0;
551
552
1.71k
if (range_list_size == 0) return cranges;
553
554
1.71k
buffer = (uint32_t*)(cranges + 1);
555
1.71k
parse_class(start_ptr, class_options, buffer);
556
557
/* Using <= instead of == to help static analysis. */
558
1.71k
if (range_list_size <= 2) return cranges;
559
560
/* In-place sorting of ranges. */
561
562
1.71k
i = (((range_list_size >> 2) - 1) << 1);
563
20.7k
while (TRUE)
564
20.7k
  {
565
20.7k
  do_heapify(buffer, range_list_size, i);
566
20.7k
  if (i == 0) break;
567
18.9k
  i -= 2;
568
18.9k
  }
569
570
1.71k
i = range_list_size - 2;
571
42.1k
while (TRUE)
572
42.1k
  {
573
42.1k
  tmp1 = buffer[i];
574
42.1k
  tmp2 = buffer[i + 1];
575
42.1k
  buffer[i] = buffer[0];
576
42.1k
  buffer[i + 1] = buffer[1];
577
42.1k
  buffer[0] = tmp1;
578
42.1k
  buffer[1] = tmp2;
579
580
42.1k
  do_heapify(buffer, i, 0);
581
42.1k
  if (i == 0) break;
582
40.4k
  i -= 2;
583
40.4k
  }
584
585
/* Merge ranges whenever possible. */
586
1.71k
dst = buffer;
587
1.71k
ptr = buffer + 2;
588
1.71k
range_list_size -= 2;
589
590
/* The second condition is a very rare corner case, where the end of the last
591
range is the maximum character. This range cannot be extended further. */
592
593
42.1k
while (range_list_size > 0 && dst[1] != ~(uint32_t)0)
594
40.4k
  {
595
40.4k
  if (dst[1] + 1 < ptr[0])
596
12.7k
    {
597
12.7k
    dst += 2;
598
12.7k
    dst[0] = ptr[0];
599
12.7k
    dst[1] = ptr[1];
600
12.7k
    }
601
27.6k
  else if (dst[1] < ptr[1]) dst[1] = ptr[1];
602
603
40.4k
  ptr += 2;
604
40.4k
  range_list_size -= 2;
605
40.4k
  }
606
607
1.71k
PCRE2_ASSERT(dst[1] <= get_highest_char(class_options));
608
609
/* When the number of ranges are less than six,
610
they are not converted to range lists. */
611
612
1.71k
ptr = buffer;
613
14.1k
while (ptr < dst && ptr[1] < 0x100) ptr += 2;
614
1.71k
if (dst - ptr < (2 * (6 - 1)))
615
1.71k
  {
616
1.71k
  cranges->range_list_size = (uint16_t)(dst + 2 - buffer);
617
1.71k
  return cranges;
618
1.71k
  }
619
620
/* Compute character lists structures. */
621
622
0
char_list_next = char_list_starts;
623
0
char_list_start = *char_list_next++;
624
#if PCRE2_CODE_UNIT_WIDTH == 32
625
char_list_end = XCL_CHAR_LIST_HIGH_32_END;
626
#elif defined SUPPORT_UNICODE
627
0
char_list_end = XCL_CHAR_LIST_LOW_32_END;
628
#else
629
char_list_end = XCL_CHAR_LIST_HIGH_16_END;
630
#endif
631
0
next_char = (uint16_t*)(buffer + total_size);
632
633
0
tmp1 = 0;
634
0
tmp2 = ((sizeof(char_list_starts) / sizeof(uint32_t)) - 1) * XCL_TYPE_BIT_LEN;
635
0
PCRE2_ASSERT(tmp2 <= 3 * XCL_TYPE_BIT_LEN && tmp2 >= XCL_TYPE_BIT_LEN);
636
0
range_start = dst[0];
637
0
range_end = dst[1];
638
639
0
while (TRUE)
640
0
  {
641
0
  if (range_start >= char_list_start)
642
0
    {
643
0
    if (range_start == range_end || range_end < char_list_end)
644
0
      {
645
0
      tmp1++;
646
0
      next_char--;
647
648
0
      if (char_list_start < XCL_CHAR_LIST_LOW_32_START)
649
0
        *next_char = (uint16_t)((range_end << XCL_CHAR_SHIFT) | XCL_CHAR_END);
650
0
      else
651
0
        *(uint32_t*)(--next_char) =
652
0
          (range_end << XCL_CHAR_SHIFT) | XCL_CHAR_END;
653
0
      }
654
655
0
    if (range_start < range_end)
656
0
      {
657
0
      if (range_start > char_list_start)
658
0
        {
659
0
        tmp1++;
660
0
        next_char--;
661
662
0
        if (char_list_start < XCL_CHAR_LIST_LOW_32_START)
663
0
          *next_char = (uint16_t)(range_start << XCL_CHAR_SHIFT);
664
0
        else
665
0
          *(uint32_t*)(--next_char) = (range_start << XCL_CHAR_SHIFT);
666
0
        }
667
0
      else
668
0
        cranges->char_lists_types |= XCL_BEGIN_WITH_RANGE << tmp2;
669
0
      }
670
671
0
    PCRE2_ASSERT((uint32_t*)next_char >= dst + 2);
672
673
0
    if (dst > buffer)
674
0
      {
675
0
      dst -= 2;
676
0
      range_start = dst[0];
677
0
      range_end = dst[1];
678
0
      continue;
679
0
      }
680
681
0
    range_start = 0;
682
0
    range_end = 0;
683
0
    }
684
685
0
  if (range_end >= char_list_start)
686
0
    {
687
0
    PCRE2_ASSERT(range_start < char_list_start);
688
689
0
    if (range_end < char_list_end)
690
0
      {
691
0
      tmp1++;
692
0
      next_char--;
693
694
0
      if (char_list_start < XCL_CHAR_LIST_LOW_32_START)
695
0
        *next_char = (uint16_t)((range_end << XCL_CHAR_SHIFT) | XCL_CHAR_END);
696
0
      else
697
0
        *(uint32_t*)(--next_char) =
698
0
          (range_end << XCL_CHAR_SHIFT) | XCL_CHAR_END;
699
700
0
      PCRE2_ASSERT((uint32_t*)next_char >= dst + 2);
701
0
      }
702
703
0
    cranges->char_lists_types |= XCL_BEGIN_WITH_RANGE << tmp2;
704
0
    }
705
706
0
  if (tmp1 >= XCL_ITEM_COUNT_MASK)
707
0
    {
708
0
    cranges->char_lists_types |= XCL_ITEM_COUNT_MASK << tmp2;
709
0
    next_char--;
710
711
0
    if (char_list_start < XCL_CHAR_LIST_LOW_32_START)
712
0
      *next_char = (uint16_t)tmp1;
713
0
    else
714
0
      *(uint32_t*)(--next_char) = tmp1;
715
0
    }
716
0
  else
717
0
    cranges->char_lists_types |= tmp1 << tmp2;
718
719
0
  if (range_start < XCL_CHAR_LIST_LOW_16_START) break;
720
721
0
  PCRE2_ASSERT(tmp2 >= XCL_TYPE_BIT_LEN);
722
0
  char_list_end = char_list_start - 1;
723
0
  char_list_start = *char_list_next++;
724
0
  tmp1 = 0;
725
0
  tmp2 -= XCL_TYPE_BIT_LEN;
726
0
  }
727
728
0
if (dst[0] < XCL_CHAR_LIST_LOW_16_START) dst += 2;
729
0
PCRE2_ASSERT((uint16_t*)dst <= next_char);
730
731
0
cranges->char_lists_size =
732
0
  (size_t)((uint8_t*)(buffer + total_size) - (uint8_t*)next_char);
733
0
cranges->char_lists_start = (size_t)((uint8_t*)next_char - (uint8_t*)buffer);
734
0
cranges->range_list_size = (uint16_t)(dst - buffer);
735
0
return cranges;
736
1.71k
}
737
738
#endif /* SUPPORT_WIDE_CHARS */
739
740
#ifdef SUPPORT_UNICODE
741
742
void PRIV(update_classbits)(uint32_t ptype, uint32_t pdata, BOOL negated,
743
  uint8_t *classbits)
744
2.52k
{
745
/* Update PRIV(xclass) when this function is changed. */
746
2.52k
int c, chartype;
747
2.52k
const ucd_record *prop;
748
2.52k
uint32_t gentype;
749
2.52k
BOOL set_bit;
750
751
2.52k
if (ptype == PT_ANY)
752
0
  {
753
0
  if (!negated) memset(classbits, 0xff, 32);
754
0
  return;
755
0
  }
756
757
648k
for (c = 0; c < 256; c++)
758
646k
  {
759
646k
  prop = GET_UCD(c);
760
646k
  set_bit = FALSE;
761
646k
  (void)set_bit;
762
763
646k
  switch (ptype)
764
646k
    {
765
0
    case PT_LAMP:
766
0
    chartype = prop->chartype;
767
0
    set_bit = (chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt);
768
0
    break;
769
770
548k
    case PT_GC:
771
548k
    set_bit = (PRIV(ucp_gentype)[prop->chartype] == pdata);
772
548k
    break;
773
774
52.2k
    case PT_PC:
775
52.2k
    set_bit = (prop->chartype == pdata);
776
52.2k
    break;
777
778
0
    case PT_SC:
779
0
    set_bit = (prop->script == pdata);
780
0
    break;
781
782
0
    case PT_SCX:
783
0
    set_bit = (prop->script == pdata ||
784
0
      MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), pdata) != 0);
785
0
    break;
786
787
0
    case PT_ALNUM:
788
0
    gentype = PRIV(ucp_gentype)[prop->chartype];
789
0
    set_bit = (gentype == ucp_L || gentype == ucp_N);
790
0
    break;
791
792
0
    case PT_SPACE:    /* Perl space */
793
0
    case PT_PXSPACE:  /* POSIX space */
794
0
    switch(c)
795
0
      {
796
0
      HSPACE_BYTE_CASES:
797
0
      VSPACE_BYTE_CASES:
798
0
      set_bit = TRUE;
799
0
      break;
800
801
0
      default:
802
0
      set_bit = (PRIV(ucp_gentype)[prop->chartype] == ucp_Z);
803
0
      break;
804
0
      }
805
0
    break;
806
807
45.0k
    case PT_WORD:
808
45.0k
    chartype = prop->chartype;
809
45.0k
    gentype = PRIV(ucp_gentype)[chartype];
810
45.0k
    set_bit = (gentype == ucp_L || gentype == ucp_N ||
811
45.0k
               chartype == ucp_Mn || chartype == ucp_Pc);
812
45.0k
    break;
813
814
0
    case PT_UCNC:
815
0
    set_bit = (c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
816
0
               c == CHAR_GRAVE_ACCENT || c >= 0xa0);
817
0
    break;
818
819
0
    case PT_BIDICL:
820
0
    set_bit = (UCD_BIDICLASS_PROP(prop) == pdata);
821
0
    break;
822
823
0
    case PT_BOOL:
824
0
    set_bit = MAPBIT(PRIV(ucd_boolprop_sets) +
825
0
                     UCD_BPROPS_PROP(prop), pdata) != 0;
826
0
    break;
827
828
0
    case PT_PXGRAPH:
829
0
    chartype = prop->chartype;
830
0
    gentype = PRIV(ucp_gentype)[chartype];
831
0
    set_bit = (gentype != ucp_Z && (gentype != ucp_C || chartype == ucp_Cf));
832
0
    break;
833
834
0
    case PT_PXPRINT:
835
0
    chartype = prop->chartype;
836
0
    set_bit = (chartype != ucp_Zl && chartype != ucp_Zp &&
837
0
       (PRIV(ucp_gentype)[chartype] != ucp_C || chartype == ucp_Cf));
838
0
    break;
839
840
0
    case PT_PXPUNCT:
841
0
    gentype = PRIV(ucp_gentype)[prop->chartype];
842
0
    set_bit = (gentype == ucp_P || (c < 128 && gentype == ucp_S));
843
0
    break;
844
845
0
    default:
846
0
    PCRE2_ASSERT(ptype == PT_PXXDIGIT);
847
0
    set_bit = (c >= CHAR_0 && c <= CHAR_9) ||
848
0
              (c >= CHAR_A && c <= CHAR_F) ||
849
0
              (c >= CHAR_a && c <= CHAR_f);
850
0
    break;
851
646k
    }
852
853
646k
  if (negated) set_bit = !set_bit;
854
646k
  if (set_bit) *classbits |= (uint8_t)(1 << (c & 0x7));
855
646k
  if ((c & 0x7) == 0x7) classbits++;
856
646k
  }
857
2.52k
}
858
859
#endif /* SUPPORT_UNICODE */
860
861
862
863
#ifdef SUPPORT_WIDE_CHARS
864
865
/*************************************************
866
*           XClass related properties            *
867
*************************************************/
868
869
/* XClass needs to be generated. */
870
24.3k
#define XCLASS_REQUIRED 0x1
871
/* XClass has 8 bit character. */
872
383k
#define XCLASS_HAS_8BIT_CHARS 0x2
873
/* XClass has properties. */
874
5.34k
#define XCLASS_HAS_PROPS 0x4
875
/* XClass has character lists. */
876
5.63k
#define XCLASS_HAS_CHAR_LISTS 0x8
877
/* XClass matches to all >= 256 characters. */
878
5.91k
#define XCLASS_HIGH_ANY 0x10
879
880
#endif
881
882
883
/*************************************************
884
*   Internal entry point for add range to class  *
885
*************************************************/
886
887
/* This function sets the overall range for characters < 256.
888
It also handles non-utf case folding.
889
890
Arguments:
891
  options       the options bits
892
  xoptions      the extra options bits
893
  cb            compile data
894
  start         start of range character
895
  end           end of range character
896
897
Returns:        cb->classbits is updated
898
*/
899
900
static void
901
add_to_class(uint32_t options, uint32_t xoptions, compile_block *cb,
902
  uint32_t start, uint32_t end)
903
399k
{
904
399k
uint8_t *classbits = cb->classbits.classbits;
905
399k
uint32_t c, byte_start, byte_end;
906
399k
uint32_t classbits_end = (end <= 0xff ? end : 0xff);
907
908
/* If caseless matching is required, scan the range and process alternate
909
cases. In Unicode, there are 8-bit characters that have alternate cases that
910
are greater than 255 and vice-versa (though these may be ignored if caseless
911
restriction is in force). Sometimes we can just extend the original range. */
912
913
399k
if ((options & PCRE2_CASELESS) != 0)
914
238k
  {
915
238k
#ifdef SUPPORT_UNICODE
916
  /* UTF mode. This branch is taken if we don't support wide characters (e.g.
917
  8-bit library, without UTF), but we do treat those characters as Unicode
918
  (if UCP flag is set). In this case, we only need to expand the character class
919
  set to include the case pairs which are in the 0-255 codepoint range. */
920
238k
  if ((options & (PCRE2_UTF|PCRE2_UCP)) != 0)
921
0
    {
922
0
      BOOL turkish_i = (xoptions & (PCRE2_EXTRA_TURKISH_CASING|PCRE2_EXTRA_CASELESS_RESTRICT)) ==
923
0
        PCRE2_EXTRA_TURKISH_CASING;
924
0
      if (start < 128)
925
0
        {
926
0
        uint32_t lo_end = (classbits_end < 127 ? classbits_end : 127);
927
0
        for (c = start; c <= lo_end; c++)
928
0
          {
929
0
          if (turkish_i && UCD_ANY_I(c)) continue;
930
0
          SETBIT(classbits, cb->fcc[c]);
931
0
          }
932
0
        }
933
0
      if (classbits_end >= 128)
934
0
        {
935
0
        uint32_t hi_start = (start > 128 ? start : 128);
936
0
        for (c = hi_start; c <= classbits_end; c++)
937
0
          {
938
0
          uint32_t co = UCD_OTHERCASE(c);
939
0
          if (co <= 0xff) SETBIT(classbits, co);
940
0
          }
941
0
        }
942
0
    }
943
944
238k
  else
945
238k
#endif  /* SUPPORT_UNICODE */
946
947
  /* Not UTF mode */
948
238k
    {
949
491k
    for (c = start; c <= classbits_end; c++)
950
253k
      SETBIT(classbits, cb->fcc[c]);
951
238k
    }
952
238k
  }
953
954
/* Use the bitmap for characters < 256. Otherwise use extra data. */
955
956
399k
byte_start = (start + 7) >> 3;
957
399k
byte_end = (classbits_end + 1) >> 3;
958
959
399k
if (byte_start >= byte_end)
960
397k
  {
961
799k
  for (c = start; c <= classbits_end; c++)
962
    /* Regardless of start, c will always be <= 255. */
963
402k
    SETBIT(classbits, c);
964
397k
  return;
965
397k
  }
966
967
24.6k
for (c = byte_start; c < byte_end; c++)
968
22.3k
  classbits[c] = 0xff;
969
970
2.32k
byte_start <<= 3;
971
2.32k
byte_end <<= 3;
972
973
9.17k
for (c = start; c < byte_start; c++)
974
6.84k
  SETBIT(classbits, c);
975
976
9.18k
for (c = byte_end; c <= classbits_end; c++)
977
6.85k
  SETBIT(classbits, c);
978
2.32k
}
979
980
981
#if PCRE2_CODE_UNIT_WIDTH == 8
982
/*************************************************
983
*   Internal entry point for add list to class   *
984
*************************************************/
985
986
/* This function is used for adding a list of horizontal or vertical whitespace
987
characters to a class. The list must be in order so that ranges of characters
988
can be detected and handled appropriately. This function sets the overall range
989
so that the internal functions can try to avoid duplication when handling
990
case-independence.
991
992
Arguments:
993
  options       the options bits
994
  xoptions      the extra options bits
995
  cb            contains pointers to tables etc.
996
  p             points to row of 32-bit values, terminated by NOTACHAR
997
998
Returns:        cb->classbits is updated
999
*/
1000
1001
static void
1002
add_list_to_class(uint32_t options, uint32_t xoptions, compile_block *cb,
1003
  const uint32_t *p)
1004
4
{
1005
16
while (p[0] < 256)
1006
12
  {
1007
12
  unsigned int n = 0;
1008
1009
12
  while(p[n+1] == p[0] + n + 1) n++;
1010
12
  add_to_class(options, xoptions, cb, p[0], p[n]);
1011
1012
12
  p += n + 1;
1013
12
  }
1014
4
}
1015
1016
1017
1018
/*************************************************
1019
*    Add characters not in a list to a class     *
1020
*************************************************/
1021
1022
/* This function is used for adding the complement of a list of horizontal or
1023
vertical whitespace to a class. The list must be in order.
1024
1025
Arguments:
1026
  options       the options bits
1027
  xoptions      the extra options bits
1028
  cb            contains pointers to tables etc.
1029
  p             points to row of 32-bit values, terminated by NOTACHAR
1030
1031
Returns:        cb->classbits is updated
1032
*/
1033
1034
static void
1035
add_not_list_to_class(uint32_t options, uint32_t xoptions, compile_block *cb,
1036
  const uint32_t *p)
1037
198
{
1038
198
if (p[0] > 0)
1039
198
  add_to_class(options, xoptions, cb, 0, p[0] - 1);
1040
608
while (p[0] < 256)
1041
410
  {
1042
962
  while (p[1] == p[0] + 1) p++;
1043
410
  add_to_class(options, xoptions, cb, p[0] + 1, (p[1] > 255) ? 255 : p[1] - 1);
1044
410
  p++;
1045
410
  }
1046
198
}
1047
#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
1048
1049
1050
1051
/*************************************************
1052
*  Main entry-point to compile a character class *
1053
*************************************************/
1054
1055
/* This function consumes a "leaf", which is a set of characters that will
1056
become a single OP_CLASS OP_NCLASS, OP_XCLASS, or OP_ALLANY. */
1057
1058
uint32_t *
1059
PRIV(compile_class_not_nested)(uint32_t options, uint32_t xoptions,
1060
  uint32_t *start_ptr, PCRE2_UCHAR **pcode, BOOL negate_class, BOOL* has_bitmap,
1061
  int *errorcodeptr, compile_block *cb, PCRE2_SIZE *lengthptr)
1062
19.7k
{
1063
19.7k
uint32_t *pptr = start_ptr;
1064
19.7k
PCRE2_UCHAR *code = *pcode;
1065
19.7k
BOOL should_flip_negation;
1066
19.7k
const uint8_t *cbits = cb->cbits;
1067
/* Some functions such as add_to_class() or eclass processing
1068
expects that the bitset is stored in cb->classbits.classbits. */
1069
19.7k
uint8_t *const classbits = cb->classbits.classbits;
1070
1071
19.7k
#ifdef SUPPORT_UNICODE
1072
19.7k
BOOL utf = (options & PCRE2_UTF) != 0;
1073
#else  /* No Unicode support */
1074
BOOL utf = FALSE;
1075
#endif
1076
1077
/* Helper variables for OP_XCLASS opcode (for characters > 255). */
1078
1079
19.7k
#ifdef SUPPORT_WIDE_CHARS
1080
19.7k
uint32_t xclass_props;
1081
19.7k
PCRE2_UCHAR *class_uchardata;
1082
19.7k
class_ranges* cranges;
1083
19.7k
#endif
1084
1085
/* If an XClass contains a negative special such as \S, we need to flip the
1086
negation flag at the end, so that support for characters > 255 works correctly
1087
(they are all included in the class). An XClass may need to insert specific
1088
matching or non-matching code for wide characters.
1089
*/
1090
1091
19.7k
should_flip_negation = FALSE;
1092
1093
/* XClass will be used when characters > 255 might match. */
1094
1095
19.7k
#ifdef SUPPORT_WIDE_CHARS
1096
19.7k
xclass_props = 0;
1097
1098
19.7k
#if PCRE2_CODE_UNIT_WIDTH == 8
1099
19.7k
cranges = NULL;
1100
1101
19.7k
if (utf)
1102
3.39k
#endif
1103
3.39k
  {
1104
3.39k
  if (lengthptr != NULL)
1105
1.71k
    {
1106
1.71k
    cranges = compile_optimize_class(pptr, options, xoptions, cb);
1107
1108
1.71k
    if (cranges == NULL)
1109
0
      {
1110
0
      *errorcodeptr = ERR21;
1111
0
      return NULL;
1112
0
      }
1113
1114
    /* Caching the pre-processed character ranges. */
1115
1.71k
    if (cb->next_cranges != NULL)
1116
1.53k
      cb->next_cranges->next = cranges;
1117
183
    else
1118
183
      cb->cranges = cranges;
1119
1120
1.71k
    cb->next_cranges = cranges;
1121
1.71k
    }
1122
1.67k
  else
1123
1.67k
    {
1124
    /* Reuse the pre-processed character ranges. */
1125
1.67k
    cranges = cb->cranges;
1126
1.67k
    PCRE2_ASSERT(cranges != NULL);
1127
1.67k
    cb->cranges = cranges->next;
1128
1.67k
    }
1129
1130
3.39k
  if (cranges->range_list_size > 0)
1131
3.39k
    {
1132
3.39k
    const uint32_t *ranges = (const uint32_t*)(cranges + 1);
1133
1134
3.39k
    if (ranges[0] <= 255)
1135
3.39k
      xclass_props |= XCLASS_HAS_8BIT_CHARS;
1136
1137
3.39k
    if (ranges[cranges->range_list_size - 1] == GET_MAX_CHAR_VALUE(utf) &&
1138
3.39k
        ranges[cranges->range_list_size - 2] <= 256)
1139
0
      xclass_props |= XCLASS_HIGH_ANY;
1140
3.39k
    }
1141
3.39k
  }
1142
1143
19.7k
class_uchardata = code + LINK_SIZE + 2;   /* For XCLASS items */
1144
19.7k
#endif /* SUPPORT_WIDE_CHARS */
1145
1146
/* Initialize the 256-bit (32-byte) bit map to all zeros. We build the map
1147
in a temporary bit of memory, in case the class contains fewer than two
1148
8-bit characters because in that case the compiled code doesn't use the bit
1149
map. */
1150
1151
19.7k
memset(classbits, 0, 32);
1152
1153
/* Process items until end_ptr is reached. */
1154
1155
450k
while (TRUE)
1156
450k
  {
1157
450k
  uint32_t meta = *(pptr++);
1158
450k
  BOOL local_negate;
1159
450k
  int posix_class;
1160
450k
  int taboffset, tabopt;
1161
450k
  class_bits_storage pbits;
1162
450k
  uint32_t escape, c;
1163
1164
  /* Handle POSIX classes such as [:alpha:] etc. */
1165
450k
  switch (META_CODE(meta))
1166
450k
    {
1167
0
    case META_POSIX:
1168
0
    case META_POSIX_NEG:
1169
1170
0
    local_negate = (meta == META_POSIX_NEG);
1171
0
    posix_class = *(pptr++);
1172
1173
0
    if (local_negate) should_flip_negation = TRUE;  /* Note negative special */
1174
1175
    /* If matching is caseless, upper and lower are converted to alpha.
1176
    This relies on the fact that the class table starts with alpha,
1177
    lower, upper as the first 3 entries. */
1178
1179
0
    if ((options & PCRE2_CASELESS) != 0 && posix_class <= 2)
1180
0
      posix_class = 0;
1181
1182
    /* When PCRE2_UCP is set, some of the POSIX classes are converted to
1183
    different escape sequences that use Unicode properties \p or \P.
1184
    Others that are not available via \p or \P have to generate
1185
    XCL_PROP/XCL_NOTPROP directly, which is done here. */
1186
1187
0
#ifdef SUPPORT_UNICODE
1188
    /* TODO This entire block of code here appears to be unreachable!? I simply
1189
    can't see how it can be hit, given that the frontend parser doesn't emit
1190
    META_POSIX for GRAPH/PRINT/PUNCT when UCP is set. */
1191
0
    if ((options & PCRE2_UCP) != 0 &&
1192
0
        (xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0)
1193
0
      {
1194
0
      uint32_t ptype;
1195
1196
0
      switch(posix_class)
1197
0
        {
1198
0
        case PC_GRAPH:
1199
0
        case PC_PRINT:
1200
0
        case PC_PUNCT:
1201
0
        ptype = (posix_class == PC_GRAPH)? PT_PXGRAPH :
1202
0
                (posix_class == PC_PRINT)? PT_PXPRINT : PT_PXPUNCT;
1203
1204
0
        PRIV(update_classbits)(ptype, 0, local_negate, classbits);
1205
1206
0
        if ((xclass_props & XCLASS_HIGH_ANY) == 0)
1207
0
          {
1208
0
          if (lengthptr != NULL)
1209
0
            *lengthptr += 3;
1210
0
          else
1211
0
            {
1212
0
            *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
1213
0
            *class_uchardata++ = (PCRE2_UCHAR)ptype;
1214
0
            *class_uchardata++ = 0;
1215
0
            }
1216
0
          xclass_props |= XCLASS_REQUIRED | XCLASS_HAS_PROPS;
1217
0
          }
1218
0
        continue;
1219
1220
        /* For the other POSIX classes (ex: ascii) we are going to
1221
        fall through to the non-UCP case and build a bit map for
1222
        characters with code points less than 256. However, if we are in
1223
        a negated POSIX class, characters with code points greater than
1224
        255 must either all match or all not match, depending on whether
1225
        the whole class is not or is negated. For example, for
1226
        [[:^ascii:]... they must all match, whereas for [^[:^ascii:]...
1227
        they must not.
1228
1229
        In the special case where there are no xclass items, this is
1230
        automatically handled by the use of OP_CLASS or OP_NCLASS, but an
1231
        explicit range is needed for OP_XCLASS. Setting a flag here
1232
        causes the range to be generated later when it is known that
1233
        OP_XCLASS is required. In the 8-bit library this is relevant only in
1234
        utf mode, since no wide characters can exist otherwise. */
1235
1236
0
        default:
1237
0
        break;
1238
0
        }
1239
0
      }
1240
0
#endif  /* SUPPORT_UNICODE */
1241
1242
    /* In the non-UCP case, or when UCP makes no difference, we build the
1243
    bit map for the POSIX class in a chunk of local store because we may
1244
    be adding and subtracting from it, and we don't want to subtract bits
1245
    that may be in the main map already. At the end we or the result into
1246
    the bit map that is being built. */
1247
1248
0
    posix_class *= 3;
1249
1250
    /* Copy in the first table (always present) */
1251
1252
0
    memcpy(pbits.classbits, cbits + PRIV(posix_class_maps)[posix_class], 32);
1253
1254
    /* If there is a second table, add or remove it as required. */
1255
1256
0
    taboffset = PRIV(posix_class_maps)[posix_class + 1];
1257
0
    tabopt = PRIV(posix_class_maps)[posix_class + 2];
1258
1259
0
    if (taboffset >= 0)
1260
0
      {
1261
0
      if (tabopt >= 0)
1262
0
        for (int i = 0; i < 32; i++)
1263
0
          pbits.classbits[i] |= cbits[i + taboffset];
1264
0
      else
1265
0
        for (int i = 0; i < 32; i++)
1266
0
          pbits.classbits[i] &= (uint8_t)(~cbits[i + taboffset]);
1267
0
      }
1268
1269
    /* Now see if we need to remove any special characters. An option
1270
    value of 1 removes vertical space and 2 removes underscore. */
1271
1272
0
    if (tabopt < 0) tabopt = -tabopt;
1273
0
    if (tabopt == 1) pbits.classbits[1] &= ~0x3c;
1274
0
      else if (tabopt == 2) pbits.classbits[11] &= 0x7f;
1275
1276
    /* Add the POSIX table or its complement into the main table that is
1277
    being built and we are done. */
1278
1279
0
      {
1280
0
      uint32_t *classwords = cb->classbits.classwords;
1281
1282
0
      if (local_negate)
1283
0
        for (int i = 0; i < 8; i++)
1284
0
          classwords[i] |= (uint32_t)(~pbits.classwords[i]);
1285
0
      else
1286
0
        for (int i = 0; i < 8; i++)
1287
0
          classwords[i] |= pbits.classwords[i];
1288
0
      }
1289
1290
0
#ifdef SUPPORT_WIDE_CHARS
1291
    /* Every class contains at least one < 256 character. */
1292
0
    xclass_props |= XCLASS_HAS_8BIT_CHARS;
1293
0
#endif
1294
0
    continue;               /* End of POSIX handling */
1295
1296
    /* Other than POSIX classes, the only items we should encounter are
1297
    \d-type escapes and literal characters (possibly as ranges). */
1298
0
    case META_BIGVALUE:
1299
0
    meta = *(pptr++);
1300
0
    break;
1301
1302
7.02k
    case META_ESCAPE:
1303
7.02k
    escape = META_DATA(meta);
1304
1305
7.02k
    switch(escape)
1306
7.02k
      {
1307
946
      case ESC_d:
1308
31.2k
      for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_digit];
1309
946
      break;
1310
1311
1.29k
      case ESC_D:
1312
1.29k
      should_flip_negation = TRUE;
1313
42.7k
      for (int i = 0; i < 32; i++)
1314
41.4k
        classbits[i] |= (uint8_t)(~cbits[i+cbit_digit]);
1315
1.29k
      break;
1316
1317
198
      case ESC_w:
1318
6.53k
      for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_word];
1319
198
      break;
1320
1321
1.74k
      case ESC_W:
1322
1.74k
      should_flip_negation = TRUE;
1323
57.6k
      for (int i = 0; i < 32; i++)
1324
55.9k
        classbits[i] |= (uint8_t)(~cbits[i+cbit_word]);
1325
1.74k
      break;
1326
1327
      /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
1328
      5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
1329
      previously set by something earlier in the character class.
1330
      Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
1331
      we could just adjust the appropriate bit. From PCRE 8.34 we no
1332
      longer treat \s and \S specially. */
1333
1334
42
      case ESC_s:
1335
1.38k
      for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_space];
1336
42
      break;
1337
1338
72
      case ESC_S:
1339
72
      should_flip_negation = TRUE;
1340
2.37k
      for (int i = 0; i < 32; i++)
1341
2.30k
        classbits[i] |= (uint8_t)(~cbits[i+cbit_space]);
1342
72
      break;
1343
1344
      /* When adding the horizontal or vertical space lists to a class, or
1345
      their complements, disable PCRE2_CASELESS, because it justs wastes
1346
      time, and in the "not-x" UTF cases can create unwanted duplicates in
1347
      the XCLASS list (provoked by characters that have more than one other
1348
      case and by both cases being in the same "not-x" sublist). */
1349
1350
4
      case ESC_h:
1351
4
#if PCRE2_CODE_UNIT_WIDTH == 8
1352
4
#ifdef SUPPORT_UNICODE
1353
4
      if (cranges != NULL) break;
1354
4
#endif
1355
4
      add_list_to_class(options & ~PCRE2_CASELESS, xoptions,
1356
4
        cb, PRIV(hspace_list));
1357
#else
1358
      PCRE2_ASSERT(cranges != NULL);
1359
#endif
1360
4
      break;
1361
1362
14
      case ESC_H:
1363
14
#if PCRE2_CODE_UNIT_WIDTH == 8
1364
14
#ifdef SUPPORT_UNICODE
1365
14
      if (cranges != NULL) break;
1366
14
#endif
1367
14
      add_not_list_to_class(options & ~PCRE2_CASELESS, xoptions,
1368
14
        cb, PRIV(hspace_list));
1369
#else
1370
      PCRE2_ASSERT(cranges != NULL);
1371
#endif
1372
14
      break;
1373
1374
0
      case ESC_v:
1375
0
#if PCRE2_CODE_UNIT_WIDTH == 8
1376
0
#ifdef SUPPORT_UNICODE
1377
0
      if (cranges != NULL) break;
1378
0
#endif
1379
0
      add_list_to_class(options & ~PCRE2_CASELESS, xoptions,
1380
0
        cb, PRIV(vspace_list));
1381
#else
1382
      PCRE2_ASSERT(cranges != NULL);
1383
#endif
1384
0
      break;
1385
1386
184
      case ESC_V:
1387
184
#if PCRE2_CODE_UNIT_WIDTH == 8
1388
184
#ifdef SUPPORT_UNICODE
1389
184
      if (cranges != NULL) break;
1390
184
#endif
1391
184
      add_not_list_to_class(options & ~PCRE2_CASELESS, xoptions,
1392
184
        cb, PRIV(vspace_list));
1393
#else
1394
      PCRE2_ASSERT(cranges != NULL);
1395
#endif
1396
184
      break;
1397
1398
      /* If Unicode is not supported, \P and \p are not allowed and are
1399
      faulted at parse time, so will never appear here. */
1400
1401
0
#ifdef SUPPORT_UNICODE
1402
1.83k
      case ESC_p:
1403
2.52k
      case ESC_P:
1404
2.52k
        {
1405
2.52k
        uint32_t ptype = *pptr >> 16;
1406
2.52k
        uint32_t pdata = *(pptr++) & 0xffff;
1407
1408
        /* The "Any" is processed by PRIV(update_classbits)(). */
1409
2.52k
        if (ptype == PT_ANY)
1410
0
          {
1411
0
#if PCRE2_CODE_UNIT_WIDTH == 8
1412
0
          if (!utf && escape == ESC_p) memset(classbits, 0xff, 32);
1413
0
#endif
1414
0
          continue;
1415
0
          }
1416
1417
2.52k
        PRIV(update_classbits)(ptype, pdata, (escape == ESC_P), classbits);
1418
1419
2.52k
        if ((xclass_props & XCLASS_HIGH_ANY) == 0)
1420
2.52k
          {
1421
2.52k
          if (lengthptr != NULL)
1422
1.26k
            *lengthptr += 3;
1423
1.26k
          else
1424
1.26k
            {
1425
1.26k
            *class_uchardata++ = (escape == ESC_p)? XCL_PROP : XCL_NOTPROP;
1426
1.26k
            *class_uchardata++ = ptype;
1427
1.26k
            *class_uchardata++ = pdata;
1428
1.26k
            }
1429
2.52k
          xclass_props |= XCLASS_REQUIRED | XCLASS_HAS_PROPS;
1430
2.52k
          }
1431
2.52k
        }
1432
0
      continue;
1433
7.02k
#endif
1434
7.02k
      }
1435
1436
4.50k
#ifdef SUPPORT_WIDE_CHARS
1437
    /* Every non-property class contains at least one < 256 character. */
1438
4.50k
    xclass_props |= XCLASS_HAS_8BIT_CHARS;
1439
4.50k
#endif
1440
    /* End handling \d-type escapes */
1441
4.50k
    continue;
1442
1443
443k
    CLASS_END_CASES(meta)
1444
    /* Literals. */
1445
443k
    if (meta < META_END) break;
1446
    /* Non-literals: end of class contents. */
1447
19.7k
    goto END_PROCESSING;
1448
450k
    }
1449
1450
  /* A literal character may be followed by a range meta. At parse time
1451
  there are checks for out-of-order characters, for ranges where the two
1452
  characters are equal, and for hyphens that cannot indicate a range. At
1453
  this point, therefore, no checking is needed. */
1454
1455
424k
  c = meta;
1456
1457
  /* Remember if \r or \n were explicitly used */
1458
1459
424k
  if (c == CHAR_CR || c == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
1460
1461
  /* Process a character range */
1462
1463
424k
  if (*pptr == META_RANGE_LITERAL || *pptr == META_RANGE_ESCAPED)
1464
1.25k
    {
1465
1.25k
    uint32_t d;
1466
1467
#ifdef EBCDIC
1468
    BOOL range_is_literal = (*pptr == META_RANGE_LITERAL);
1469
#endif
1470
1.25k
    ++pptr;
1471
1.25k
    d = *(pptr++);
1472
1.25k
    if (d == META_BIGVALUE) d = *(pptr++);
1473
1474
    /* Remember an explicit \r or \n, and add the range to the class. */
1475
1476
1.25k
    if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
1477
1478
1.25k
#if PCRE2_CODE_UNIT_WIDTH == 8
1479
1.25k
#ifdef SUPPORT_UNICODE
1480
1.25k
    if (cranges != NULL) continue;
1481
686
    xclass_props |= XCLASS_HAS_8BIT_CHARS;
1482
686
#endif
1483
1484
    /* In an EBCDIC environment, Perl treats alphabetic ranges specially
1485
    because there are holes in the encoding, and simply using the range
1486
    A-Z (for example) would include the characters in the holes. This
1487
    applies only to literal ranges; [\xC1-\xE9] is different to [A-Z]. */
1488
1489
#ifdef EBCDIC
1490
    if (range_is_literal &&
1491
         (cb->ctypes[c] & ctype_letter) != 0 &&
1492
         (cb->ctypes[d] & ctype_letter) != 0 &&
1493
         (c <= CHAR_z) == (d <= CHAR_z))
1494
      {
1495
      uint32_t uc = (d <= CHAR_z)? 0 : 64;
1496
      uint32_t C = c - uc;
1497
      uint32_t D = d - uc;
1498
1499
      if (C <= CHAR_i)
1500
        {
1501
        add_to_class(options, xoptions, cb, C + uc,
1502
          ((D < CHAR_i)? D : CHAR_i) + uc);
1503
        C = CHAR_j;
1504
        }
1505
1506
      if (C <= D && C <= CHAR_r)
1507
        {
1508
        add_to_class(options, xoptions, cb, C + uc,
1509
          ((D < CHAR_r)? D : CHAR_r) + uc);
1510
        C = CHAR_s;
1511
        }
1512
1513
      if (C <= D)
1514
        add_to_class(options, xoptions, cb, C + uc, D + uc);
1515
      }
1516
    else
1517
#endif
1518
    /* Not an EBCDIC special range */
1519
1520
686
    add_to_class(options, xoptions, cb, c, d);
1521
#else
1522
    PCRE2_ASSERT(cranges != NULL);
1523
#endif
1524
686
    continue;
1525
1.25k
    }  /* End of range handling */
1526
1527
  /* Character ranges are ignored when class_ranges is present. */
1528
422k
#if PCRE2_CODE_UNIT_WIDTH == 8
1529
422k
#ifdef SUPPORT_UNICODE
1530
422k
  if (cranges != NULL) continue;
1531
371k
  xclass_props |= XCLASS_HAS_8BIT_CHARS;
1532
371k
#endif
1533
  /* Handle a single character. */
1534
1535
371k
  add_to_class(options, xoptions, cb, meta, meta);
1536
#else
1537
  PCRE2_ASSERT(cranges != NULL);
1538
#endif
1539
371k
  }   /* End of main class-processing loop */
1540
1541
19.7k
END_PROCESSING:
1542
1543
19.7k
#ifdef SUPPORT_WIDE_CHARS
1544
19.7k
PCRE2_ASSERT((xclass_props & XCLASS_HAS_PROPS) == 0 ||
1545
19.7k
             (xclass_props & XCLASS_HIGH_ANY) == 0);
1546
1547
19.7k
if (cranges != NULL)
1548
3.39k
  {
1549
3.39k
  uint32_t *range = (uint32_t*)(cranges + 1);
1550
3.39k
  uint32_t *end = range + cranges->range_list_size;
1551
1552
29.9k
  while (range < end && range[0] < 256)
1553
26.5k
    {
1554
26.5k
    PCRE2_ASSERT((xclass_props & XCLASS_HAS_8BIT_CHARS) != 0);
1555
    /* Add range to bitset. If we are in UTF or UCP mode, then clear the
1556
    caseless bit, because the cranges handle caselessness (only) in this
1557
    condition; see the condition for PARSE_CLASS_CASELESS_UTF in
1558
    compile_optimize_class(). */
1559
26.5k
    add_to_class(((options & (PCRE2_UTF|PCRE2_UCP)) != 0)?
1560
26.5k
        (options & ~PCRE2_CASELESS) : options, xoptions, cb, range[0], range[1]);
1561
1562
26.5k
    if (range[1] > 255) break;
1563
26.5k
    range += 2;
1564
26.5k
    }
1565
1566
3.39k
  if (cranges->char_lists_size > 0)
1567
0
    {
1568
    /* The cranges structure is still used and freed later. */
1569
0
    PCRE2_ASSERT((xclass_props & XCLASS_HIGH_ANY) == 0);
1570
0
    xclass_props |= XCLASS_REQUIRED | XCLASS_HAS_CHAR_LISTS;
1571
0
    }
1572
3.39k
  else
1573
3.39k
    {
1574
3.39k
    if ((xclass_props & XCLASS_HIGH_ANY) != 0)
1575
0
      {
1576
0
      PCRE2_ASSERT(range + 2 == end && range[0] <= 256 &&
1577
0
        range[1] >= GET_MAX_CHAR_VALUE(utf));
1578
0
      should_flip_negation = TRUE;
1579
0
      range = end;
1580
0
      }
1581
1582
5.45k
    while (range < end)
1583
2.06k
      {
1584
2.06k
      uint32_t range_start = range[0];
1585
2.06k
      uint32_t range_end = range[1];
1586
1587
2.06k
      range += 2;
1588
2.06k
      xclass_props |= XCLASS_REQUIRED;
1589
1590
2.06k
      if (range_start < 256) range_start = 256;
1591
1592
2.06k
      if (lengthptr != NULL)
1593
1.03k
        {
1594
1.03k
#ifdef SUPPORT_UNICODE
1595
1.03k
        if (utf)
1596
1.03k
          {
1597
1.03k
          *lengthptr += 1;
1598
1599
1.03k
          if (range_start < range_end)
1600
1
            *lengthptr += PRIV(ord2utf)(range_start, class_uchardata);
1601
1602
1.03k
          *lengthptr += PRIV(ord2utf)(range_end, class_uchardata);
1603
1.03k
          continue;
1604
1.03k
          }
1605
0
#endif  /* SUPPORT_UNICODE */
1606
1607
0
        *lengthptr += range_start < range_end ? 3 : 2;
1608
0
        continue;
1609
1.03k
        }
1610
1611
1.03k
#ifdef SUPPORT_UNICODE
1612
1.03k
      if (utf)
1613
1.03k
        {
1614
1.03k
        if (range_start < range_end)
1615
1
          {
1616
1
          *class_uchardata++ = XCL_RANGE;
1617
1
          class_uchardata += PRIV(ord2utf)(range_start, class_uchardata);
1618
1
          }
1619
1.02k
        else
1620
1.02k
          *class_uchardata++ = XCL_SINGLE;
1621
1622
1.03k
        class_uchardata += PRIV(ord2utf)(range_end, class_uchardata);
1623
1.03k
        continue;
1624
1.03k
        }
1625
1.03k
#endif  /* SUPPORT_UNICODE */
1626
1627
      /* Without UTF support, character values are constrained
1628
      by the bit length, and can only be > 256 for 16-bit and
1629
      32-bit libraries. */
1630
#if PCRE2_CODE_UNIT_WIDTH != 8
1631
      if (range_start < range_end)
1632
        {
1633
        *class_uchardata++ = XCL_RANGE;
1634
        *class_uchardata++ = range_start;
1635
        }
1636
      else
1637
        *class_uchardata++ = XCL_SINGLE;
1638
1639
      *class_uchardata++ = range_end;
1640
#endif  /* PCRE2_CODE_UNIT_WIDTH == 8 */
1641
1.03k
      }
1642
1643
3.39k
    if (lengthptr == NULL)
1644
1.67k
      cb->cx->memctl.free(cranges, cb->cx->memctl.memory_data);
1645
3.39k
    }
1646
3.39k
  }
1647
19.7k
#endif /* SUPPORT_WIDE_CHARS */
1648
1649
/* If there are characters with values > 255, or Unicode property settings
1650
(\p or \P), we have to compile an extended class, with its own opcode,
1651
unless there were no property settings and there was a negated special such
1652
as \S in the class, and PCRE2_UCP is not set, because in that case all
1653
characters > 255 are in or not in the class, so any that were explicitly
1654
given as well can be ignored.
1655
1656
In the UCP case, if certain negated POSIX classes (ex: [:^ascii:]) were
1657
were present in a class, we either have to match or not match all wide
1658
characters (depending on whether the whole class is or is not negated).
1659
This requirement is indicated by match_all_or_no_wide_chars being true.
1660
We do this by including an explicit range, which works in both cases.
1661
This applies only in UTF and 16-bit and 32-bit non-UTF modes, since there
1662
cannot be any wide characters in 8-bit non-UTF mode.
1663
1664
When there *are* properties in a positive UTF-8 or any 16-bit or 32_bit
1665
class where \S etc is present without PCRE2_UCP, causing an extended class
1666
to be compiled, we make sure that all characters > 255 are included by
1667
forcing match_all_or_no_wide_chars to be true.
1668
1669
If, when generating an xclass, there are no characters < 256, we can omit
1670
the bitmap in the actual compiled code. */
1671
1672
19.7k
#ifdef SUPPORT_WIDE_CHARS  /* Defined for 16/32 bits, or 8-bit with Unicode */
1673
19.7k
if ((xclass_props & XCLASS_REQUIRED) != 0)
1674
2.81k
  {
1675
2.81k
  PCRE2_UCHAR *previous = code;
1676
1677
2.81k
  if ((xclass_props & XCLASS_HAS_CHAR_LISTS) == 0)
1678
2.81k
    *class_uchardata++ = XCL_END;    /* Marks the end of extra data */
1679
2.81k
  *code++ = OP_XCLASS;
1680
2.81k
  code += LINK_SIZE;
1681
2.81k
  *code = negate_class? XCL_NOT:0;
1682
2.81k
  if ((xclass_props & XCLASS_HAS_PROPS) != 0) *code |= XCL_HASPROP;
1683
1684
  /* If the map is required, move up the extra data to make room for it;
1685
  otherwise just move the code pointer to the end of the extra data. */
1686
1687
2.81k
  if ((xclass_props & XCLASS_HAS_8BIT_CHARS) != 0 || has_bitmap != NULL)
1688
2.81k
    {
1689
2.81k
    if (negate_class)
1690
1.38k
      {
1691
1.38k
      uint32_t *classwords = cb->classbits.classwords;
1692
12.4k
      for (int i = 0; i < 8; i++) classwords[i] = ~classwords[i];
1693
1.38k
      }
1694
1695
2.81k
    if (has_bitmap == NULL)
1696
2.81k
      {
1697
2.81k
      *code++ |= XCL_MAP;
1698
2.81k
      (void)memmove(code + (32 / sizeof(PCRE2_UCHAR)), code,
1699
2.81k
        CU2BYTES(class_uchardata - code));
1700
2.81k
      memcpy(code, classbits, 32);
1701
2.81k
      code = class_uchardata + (32 / sizeof(PCRE2_UCHAR));
1702
2.81k
      }
1703
0
    else
1704
0
      {
1705
0
      code = class_uchardata;
1706
0
      if ((xclass_props & XCLASS_HAS_8BIT_CHARS) != 0)
1707
0
        *has_bitmap = TRUE;
1708
0
      }
1709
2.81k
    }
1710
0
  else code = class_uchardata;
1711
1712
2.81k
  if ((xclass_props & XCLASS_HAS_CHAR_LISTS) != 0)
1713
0
    {
1714
    /* Char lists size is an even number, because all items are 16 or 32
1715
    bit values. The character list data is always aligned to 32 bits. */
1716
0
    size_t char_lists_size = cranges->char_lists_size;
1717
0
    PCRE2_ASSERT((char_lists_size & 0x1) == 0 &&
1718
0
                 (cb->char_lists_size & 0x3) == 0);
1719
1720
0
    if (lengthptr != NULL)
1721
0
      {
1722
0
      char_lists_size = CLIST_ALIGN_TO(char_lists_size, sizeof(uint32_t));
1723
1724
0
#if PCRE2_CODE_UNIT_WIDTH == 8
1725
0
      *lengthptr += 2 + LINK_SIZE;
1726
#else
1727
      *lengthptr += 1 + LINK_SIZE;
1728
#endif
1729
1730
0
      cb->char_lists_size += char_lists_size;
1731
1732
0
      char_lists_size /= sizeof(PCRE2_UCHAR);
1733
1734
      /* Storage space for character lists is included
1735
      in the maximum pattern size. */
1736
0
      if (*lengthptr > MAX_PATTERN_SIZE ||
1737
0
          MAX_PATTERN_SIZE - *lengthptr < char_lists_size)
1738
0
        {
1739
0
        *errorcodeptr = ERR20;   /* Pattern is too large */
1740
0
        return NULL;
1741
0
        }
1742
0
      }
1743
0
    else
1744
0
      {
1745
0
      uint8_t *data;
1746
1747
0
      PCRE2_ASSERT(cranges->char_lists_types <= XCL_TYPE_MASK);
1748
0
#if PCRE2_CODE_UNIT_WIDTH == 8
1749
      /* Encode as high / low bytes. */
1750
0
      code[0] = (uint8_t)(XCL_LIST |
1751
0
        (cranges->char_lists_types >> 8));
1752
0
      code[1] = (uint8_t)cranges->char_lists_types;
1753
0
      code += 2;
1754
#else
1755
      *code++ = (PCRE2_UCHAR)(XCL_LIST | cranges->char_lists_types);
1756
#endif
1757
1758
      /* Character lists are stored in backwards direction from
1759
      byte code start. The non-dfa/dfa matchers can access these
1760
      lists using the byte code start stored in match blocks.
1761
      Each list is aligned to 32 bit with an optional unused
1762
      16 bit value at the beginning of the character list. */
1763
1764
0
      cb->char_lists_size += char_lists_size;
1765
0
      data = (uint8_t*)cb->start_code - cb->char_lists_size;
1766
1767
0
      memcpy(data, (uint8_t*)(cranges + 1) + cranges->char_lists_start,
1768
0
        char_lists_size);
1769
1770
      /* Since character lists total size is less than MAX_PATTERN_SIZE,
1771
      their starting offset fits into a value which size is LINK_SIZE. */
1772
1773
0
      char_lists_size = cb->char_lists_size;
1774
0
      PUT(code, 0, (uint32_t)(char_lists_size >> 1));
1775
0
      code += LINK_SIZE;
1776
1777
#if defined PCRE2_DEBUG || defined SUPPORT_VALGRIND
1778
      if ((char_lists_size & 0x2) != 0)
1779
        {
1780
        /* In debug the unused 16 bit value is set
1781
        to a fixed value and marked unused. */
1782
        ((uint16_t*)data)[-1] = 0x5555;
1783
#ifdef SUPPORT_VALGRIND
1784
        VALGRIND_MAKE_MEM_NOACCESS(data - 2, 2);
1785
#endif
1786
        }
1787
#endif
1788
1789
0
      cb->char_lists_size =
1790
0
        CLIST_ALIGN_TO(char_lists_size, sizeof(uint32_t));
1791
1792
0
      cb->cx->memctl.free(cranges, cb->cx->memctl.memory_data);
1793
0
      }
1794
0
    }
1795
1796
  /* Now fill in the complete length of the item */
1797
1798
2.81k
  PUT(previous, 1, (int)(code - previous));
1799
2.81k
  goto DONE;   /* End of class handling */
1800
2.81k
  }
1801
16.9k
#endif  /* SUPPORT_WIDE_CHARS */
1802
1803
/* If there are no characters > 255, or they are all to be included or
1804
excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
1805
whole class was negated and whether there were negative specials such as \S
1806
(non-UCP) in the class. Then copy the 32-byte map into the code vector,
1807
negating it if necessary. */
1808
1809
16.9k
if (negate_class)
1810
5.03k
  {
1811
5.03k
  uint32_t *classwords = cb->classbits.classwords;
1812
1813
45.2k
  for (int i = 0; i < 8; i++) classwords[i] = ~classwords[i];
1814
5.03k
  }
1815
1816
16.9k
if ((SELECT_VALUE8(!utf, 0) || negate_class != should_flip_negation) &&
1817
16.9k
    cb->classbits.classwords[0] == ~(uint32_t)0)
1818
1.45k
  {
1819
1.45k
  const uint32_t *classwords = cb->classbits.classwords;
1820
1.45k
  int i;
1821
1822
4.28k
  for (i = 0; i < 8; i++)
1823
4.11k
    if (classwords[i] != ~(uint32_t)0) break;
1824
1825
1.45k
  if (i == 8)
1826
166
    {
1827
166
    *code++ = OP_ALLANY;
1828
166
    goto DONE;   /* End of class handling */
1829
166
    }
1830
1.45k
  }
1831
1832
16.7k
*code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
1833
16.7k
memcpy(code, classbits, 32);
1834
16.7k
code += 32 / sizeof(PCRE2_UCHAR);
1835
1836
19.7k
DONE:
1837
19.7k
*pcode = code;
1838
19.7k
return pptr - 1;
1839
16.7k
}
1840
1841
1842
1843
/* ===================================================================*/
1844
/* Here follows a block of ECLASS-compiling functions. You may well want to
1845
read them from top to bottom; they are ordered from leafmost (at the top) to
1846
outermost parser (at the bottom of the file). */
1847
1848
/* This function folds one operand using the negation operator.
1849
The new, combined chunk of stack code is written out to *pop_info. */
1850
1851
static void
1852
fold_negation(eclass_op_info *pop_info, PCRE2_SIZE *lengthptr,
1853
  BOOL preserve_classbits)
1854
0
{
1855
/* If the chunk of stack code is already composed of multiple ops, we won't
1856
descend in and try and propagate the negation down the tree. (That would lead
1857
to O(n^2) compile-time, which could be exploitable with a malicious regex -
1858
although maybe that's not really too much of a worry in a library that offers
1859
an exponential-time matching function!) */
1860
1861
0
if (pop_info->op_single_type == 0)
1862
0
  {
1863
0
  if (lengthptr != NULL)
1864
0
    *lengthptr += 1;
1865
0
  else
1866
0
    pop_info->code_start[pop_info->length] = ECL_NOT;
1867
0
  pop_info->length += 1;
1868
0
  }
1869
1870
/* Otherwise, it's a nice single-op item, so we can easily fold in the negation
1871
without needing to produce an ECL_NOT. */
1872
1873
0
else if (pop_info->op_single_type == ECL_ANY ||
1874
0
         pop_info->op_single_type == ECL_NONE)
1875
0
  {
1876
0
  pop_info->op_single_type = (pop_info->op_single_type == ECL_NONE)?
1877
0
      ECL_ANY : ECL_NONE;
1878
0
  if (lengthptr == NULL)
1879
0
    *(pop_info->code_start) = pop_info->op_single_type;
1880
0
  }
1881
0
else
1882
0
  {
1883
0
  PCRE2_ASSERT(pop_info->op_single_type == ECL_XCLASS &&
1884
0
               pop_info->length >= 1 + LINK_SIZE + 1);
1885
0
  if (lengthptr == NULL)
1886
0
    pop_info->code_start[1 + LINK_SIZE] ^= XCL_NOT;
1887
0
  }
1888
1889
0
if (!preserve_classbits)
1890
0
  {
1891
0
  for (int i = 0; i < 8; i++)
1892
0
    pop_info->bits.classwords[i] = ~pop_info->bits.classwords[i];
1893
0
  }
1894
0
}
1895
1896
1897
1898
/* This function folds together two operands using a binary operator.
1899
The new, combined chunk of stack code is written out to *lhs_op_info. */
1900
1901
static void
1902
fold_binary(int op, eclass_op_info *lhs_op_info, eclass_op_info *rhs_op_info,
1903
  PCRE2_SIZE *lengthptr)
1904
0
{
1905
0
switch (op)
1906
0
  {
1907
  /* ECL_AND truth table:
1908
1909
     LHS  RHS  RESULT
1910
     ----------------
1911
     ANY  *    RHS
1912
     *    ANY  LHS
1913
     NONE *    NONE
1914
     *    NONE NONE
1915
     X    Y    X & Y
1916
  */
1917
1918
0
  case ECL_AND:
1919
0
  if (rhs_op_info->op_single_type == ECL_ANY)
1920
0
    {
1921
    /* no-op: drop the RHS */
1922
0
    }
1923
0
  else if (lhs_op_info->op_single_type == ECL_ANY)
1924
0
    {
1925
    /* no-op: drop the LHS, and memmove the RHS into its place */
1926
0
    if (lengthptr == NULL)
1927
0
      memmove(lhs_op_info->code_start, rhs_op_info->code_start,
1928
0
              CU2BYTES(rhs_op_info->length));
1929
0
    lhs_op_info->length = rhs_op_info->length;
1930
0
    lhs_op_info->op_single_type = rhs_op_info->op_single_type;
1931
0
    }
1932
0
  else if (rhs_op_info->op_single_type == ECL_NONE)
1933
0
    {
1934
    /* the result is ECL_NONE: write into the LHS */
1935
0
    if (lengthptr == NULL)
1936
0
      lhs_op_info->code_start[0] = ECL_NONE;
1937
0
    lhs_op_info->length = 1;
1938
0
    lhs_op_info->op_single_type = ECL_NONE;
1939
0
    }
1940
0
  else if (lhs_op_info->op_single_type == ECL_NONE)
1941
0
    {
1942
    /* the result is ECL_NONE: drop the RHS */
1943
0
    }
1944
0
  else
1945
0
    {
1946
    /* Both of LHS & RHS are either ECL_XCLASS, or compound operations. */
1947
0
    if (lengthptr != NULL)
1948
0
      *lengthptr += 1;
1949
0
    else
1950
0
      {
1951
0
      PCRE2_ASSERT(rhs_op_info->code_start ==
1952
0
          lhs_op_info->code_start + lhs_op_info->length);
1953
0
      rhs_op_info->code_start[rhs_op_info->length] = ECL_AND;
1954
0
      }
1955
0
    lhs_op_info->length += rhs_op_info->length + 1;
1956
0
    lhs_op_info->op_single_type = 0;
1957
0
    }
1958
1959
0
  for (int i = 0; i < 8; i++)
1960
0
    lhs_op_info->bits.classwords[i] &= rhs_op_info->bits.classwords[i];
1961
0
  break;
1962
1963
  /* ECL_OR truth table:
1964
1965
     LHS  RHS  RESULT
1966
     ----------------
1967
     ANY  *    ANY
1968
     *    ANY  ANY
1969
     NONE *    RHS
1970
     *    NONE LHS
1971
     X    Y    X | Y
1972
  */
1973
1974
0
  case ECL_OR:
1975
0
  if (rhs_op_info->op_single_type == ECL_NONE)
1976
0
    {
1977
    /* no-op: drop the RHS */
1978
0
    }
1979
0
  else if (lhs_op_info->op_single_type == ECL_NONE)
1980
0
    {
1981
    /* no-op: drop the LHS, and memmove the RHS into its place */
1982
0
    if (lengthptr == NULL)
1983
0
      memmove(lhs_op_info->code_start, rhs_op_info->code_start,
1984
0
              CU2BYTES(rhs_op_info->length));
1985
0
    lhs_op_info->length = rhs_op_info->length;
1986
0
    lhs_op_info->op_single_type = rhs_op_info->op_single_type;
1987
0
    }
1988
0
  else if (rhs_op_info->op_single_type == ECL_ANY)
1989
0
    {
1990
    /* the result is ECL_ANY: write into the LHS */
1991
0
    if (lengthptr == NULL)
1992
0
      lhs_op_info->code_start[0] = ECL_ANY;
1993
0
    lhs_op_info->length = 1;
1994
0
    lhs_op_info->op_single_type = ECL_ANY;
1995
0
    }
1996
0
  else if (lhs_op_info->op_single_type == ECL_ANY)
1997
0
    {
1998
    /* the result is ECL_ANY: drop the RHS */
1999
0
    }
2000
0
  else
2001
0
    {
2002
    /* Both of LHS & RHS are either ECL_XCLASS, or compound operations. */
2003
0
    if (lengthptr != NULL)
2004
0
      *lengthptr += 1;
2005
0
    else
2006
0
      {
2007
0
      PCRE2_ASSERT(rhs_op_info->code_start ==
2008
0
          lhs_op_info->code_start + lhs_op_info->length);
2009
0
      rhs_op_info->code_start[rhs_op_info->length] = ECL_OR;
2010
0
      }
2011
0
    lhs_op_info->length += rhs_op_info->length + 1;
2012
0
    lhs_op_info->op_single_type = 0;
2013
0
    }
2014
2015
0
  for (int i = 0; i < 8; i++)
2016
0
    lhs_op_info->bits.classwords[i] |= rhs_op_info->bits.classwords[i];
2017
0
  break;
2018
2019
  /* ECL_XOR truth table:
2020
2021
     LHS  RHS  RESULT
2022
     ----------------
2023
     ANY  *    !RHS
2024
     *    ANY  !LHS
2025
     NONE *    RHS
2026
     *    NONE LHS
2027
     X    Y    X ^ Y
2028
  */
2029
2030
0
  case ECL_XOR:
2031
0
  if (rhs_op_info->op_single_type == ECL_NONE)
2032
0
    {
2033
    /* no-op: drop the RHS */
2034
0
    }
2035
0
  else if (lhs_op_info->op_single_type == ECL_NONE)
2036
0
    {
2037
    /* no-op: drop the LHS, and memmove the RHS into its place */
2038
0
    if (lengthptr == NULL)
2039
0
      memmove(lhs_op_info->code_start, rhs_op_info->code_start,
2040
0
              CU2BYTES(rhs_op_info->length));
2041
0
    lhs_op_info->length = rhs_op_info->length;
2042
0
    lhs_op_info->op_single_type = rhs_op_info->op_single_type;
2043
0
    }
2044
0
  else if (rhs_op_info->op_single_type == ECL_ANY)
2045
0
    {
2046
    /* the result is !LHS: fold in the negation, and drop the RHS */
2047
    /* Preserve the classbits, because we promise to deal with them later. */
2048
0
    fold_negation(lhs_op_info, lengthptr, TRUE);
2049
0
    }
2050
0
  else if (lhs_op_info->op_single_type == ECL_ANY)
2051
0
    {
2052
    /* the result is !RHS: drop the LHS, memmove the RHS into its place, and
2053
    fold in the negation */
2054
0
    if (lengthptr == NULL)
2055
0
      memmove(lhs_op_info->code_start, rhs_op_info->code_start,
2056
0
              CU2BYTES(rhs_op_info->length));
2057
0
    lhs_op_info->length = rhs_op_info->length;
2058
0
    lhs_op_info->op_single_type = rhs_op_info->op_single_type;
2059
2060
    /* Preserve the classbits, because we promise to deal with them later. */
2061
0
    fold_negation(lhs_op_info, lengthptr, TRUE);
2062
0
    }
2063
0
  else
2064
0
    {
2065
    /* Both of LHS & RHS are either ECL_XCLASS, or compound operations. */
2066
0
    if (lengthptr != NULL)
2067
0
      *lengthptr += 1;
2068
0
    else
2069
0
      {
2070
0
      PCRE2_ASSERT(rhs_op_info->code_start ==
2071
0
          lhs_op_info->code_start + lhs_op_info->length);
2072
0
      rhs_op_info->code_start[rhs_op_info->length] = ECL_XOR;
2073
0
      }
2074
0
    lhs_op_info->length += rhs_op_info->length + 1;
2075
0
    lhs_op_info->op_single_type = 0;
2076
0
    }
2077
2078
0
  for (int i = 0; i < 8; i++)
2079
0
    lhs_op_info->bits.classwords[i] ^= rhs_op_info->bits.classwords[i];
2080
0
  break;
2081
2082
0
  default:
2083
0
  PCRE2_DEBUG_UNREACHABLE();
2084
0
  break;
2085
0
  }
2086
0
}
2087
2088
2089
2090
static BOOL
2091
compile_eclass_nested(eclass_context *context, BOOL negated,
2092
  uint32_t **pptr, PCRE2_UCHAR **pcode,
2093
  eclass_op_info *pop_info, PCRE2_SIZE *lengthptr);
2094
2095
/* This function consumes a group of implicitly-unioned class elements.
2096
These can be characters, ranges, properties, or nested classes, as long
2097
as they are all joined by being placed adjacently. */
2098
2099
static BOOL
2100
compile_class_operand(eclass_context *context, BOOL negated,
2101
  uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info,
2102
  PCRE2_SIZE *lengthptr)
2103
0
{
2104
0
uint32_t *ptr = *pptr;
2105
0
uint32_t *prev_ptr;
2106
0
PCRE2_UCHAR *code = *pcode;
2107
0
PCRE2_UCHAR *code_start = code;
2108
0
PCRE2_SIZE prev_length = (lengthptr != NULL)? *lengthptr : 0;
2109
0
PCRE2_SIZE extra_length;
2110
0
uint32_t meta = META_CODE(*ptr);
2111
2112
0
switch (meta)
2113
0
  {
2114
0
  case META_CLASS_EMPTY_NOT:
2115
0
  case META_CLASS_EMPTY:
2116
0
  ++ptr;
2117
0
  pop_info->length = 1;
2118
0
  if ((meta == META_CLASS_EMPTY) == negated)
2119
0
    {
2120
0
    *code++ = pop_info->op_single_type = ECL_ANY;
2121
0
    memset(pop_info->bits.classbits, 0xff, 32);
2122
0
    }
2123
0
  else
2124
0
    {
2125
0
    *code++ = pop_info->op_single_type = ECL_NONE;
2126
0
    memset(pop_info->bits.classbits, 0, 32);
2127
0
    }
2128
0
  break;
2129
2130
0
  case META_CLASS:
2131
0
  case META_CLASS_NOT:
2132
0
  if ((*ptr & CLASS_IS_ECLASS) != 0)
2133
0
    {
2134
0
    if (!compile_eclass_nested(context, negated, &ptr, &code,
2135
0
                               pop_info, lengthptr))
2136
0
      return FALSE;
2137
2138
0
    PCRE2_ASSERT(*ptr == META_CLASS_END);
2139
0
    ptr++;
2140
0
    goto DONE;
2141
0
    }
2142
2143
0
  ptr++;
2144
  /* Fall through */
2145
2146
0
  default:
2147
  /* Scan forward characters, ranges, and properties.
2148
  For example: inside [a-z_ -- m] we don't have brackets around "a-z_" but
2149
  we still need to collect that fragment up into a "leaf" OP_CLASS. */
2150
2151
0
  prev_ptr = ptr;
2152
0
  ptr = PRIV(compile_class_not_nested)(
2153
0
    context->options, context->xoptions, ptr, &code,
2154
0
    (meta != META_CLASS_NOT) == negated, &context->needs_bitmap,
2155
0
    context->errorcodeptr, context->cb, lengthptr);
2156
0
  if (ptr == NULL) return FALSE;
2157
2158
  /* We must have a 100% guarantee that ptr increases when
2159
  compile_class_operand() returns, even on Release builds, so that we can
2160
  statically prove our loops terminate. */
2161
0
  if (ptr <= prev_ptr)
2162
0
    {
2163
0
    PCRE2_DEBUG_UNREACHABLE();
2164
0
    return FALSE;
2165
0
    }
2166
2167
  /* If we fell through above, consume the closing ']'. */
2168
0
  if (meta == META_CLASS || meta == META_CLASS_NOT)
2169
0
    {
2170
0
    PCRE2_ASSERT(*ptr == META_CLASS_END);
2171
0
    ptr++;
2172
0
    }
2173
2174
  /* Regardless of whether (lengthptr == NULL), some data will still be written
2175
  out to *pcode, which we need: we have to peek at it, to transform the opcode
2176
  into the ECLASS version (since we need to hoist up the bitmaps). */
2177
0
  PCRE2_ASSERT(code > code_start);
2178
0
  extra_length = (lengthptr != NULL)? *lengthptr - prev_length : 0;
2179
2180
  /* Easiest case: convert OP_ALLANY to ECL_ANY */
2181
2182
0
  if (*code_start == OP_ALLANY)
2183
0
    {
2184
0
    PCRE2_ASSERT(code - code_start == 1 && extra_length == 0);
2185
0
    pop_info->length = 1;
2186
0
    *code_start = pop_info->op_single_type = ECL_ANY;
2187
0
    memset(pop_info->bits.classbits, 0xff, 32);
2188
0
    }
2189
2190
  /* For OP_CLASS and OP_NCLASS, we hoist out the bitmap and convert to
2191
  ECL_NONE / ECL_ANY respectively. */
2192
2193
0
  else if (*code_start == OP_CLASS || *code_start == OP_NCLASS)
2194
0
    {
2195
0
    PCRE2_ASSERT(code - code_start == 1 + 32 / sizeof(PCRE2_UCHAR) &&
2196
0
                 extra_length == 0);
2197
0
    pop_info->length = 1;
2198
0
    *code_start = pop_info->op_single_type =
2199
0
        (*code_start == OP_CLASS)? ECL_NONE : ECL_ANY;
2200
0
    memcpy(pop_info->bits.classbits, code_start + 1, 32);
2201
    /* Rewind the code pointer, but make sure we adjust *lengthptr, because we
2202
    do need to reserve that space (even though we only use it temporarily). */
2203
0
    if (lengthptr != NULL)
2204
0
      *lengthptr += code - (code_start + 1);
2205
0
    code = code_start + 1;
2206
2207
0
    if (!context->needs_bitmap && *code_start == ECL_NONE)
2208
0
      {
2209
0
      uint32_t *classwords = pop_info->bits.classwords;
2210
2211
0
      for (int i = 0; i < 8; i++)
2212
0
        if (classwords[i] != 0)
2213
0
          {
2214
0
          context->needs_bitmap = TRUE;
2215
0
          break;
2216
0
          }
2217
0
      }
2218
0
    else
2219
0
      context->needs_bitmap = TRUE;
2220
0
    }
2221
2222
  /* Finally, for OP_XCLASS we hoist out the bitmap (if any), and convert to
2223
  ECL_XCLASS. */
2224
2225
0
  else
2226
0
    {
2227
0
    PCRE2_ASSERT(*code_start == OP_XCLASS);
2228
0
    *code_start = pop_info->op_single_type = ECL_XCLASS;
2229
2230
0
    PCRE2_ASSERT(code - code_start >= 1 + LINK_SIZE + 1);
2231
2232
0
    memcpy(pop_info->bits.classbits, context->cb->classbits.classbits, 32);
2233
0
    pop_info->length = (code - code_start) + extra_length;
2234
0
    }
2235
2236
0
  break;
2237
0
  }  /* End of switch(meta) */
2238
2239
0
pop_info->code_start = (lengthptr == NULL)? code_start : NULL;
2240
2241
0
if (lengthptr != NULL)
2242
0
  {
2243
0
  *lengthptr += code - code_start;
2244
0
  code = code_start;
2245
0
  }
2246
2247
0
DONE:
2248
0
PCRE2_ASSERT(lengthptr == NULL || (code == code_start));
2249
2250
0
*pptr = ptr;
2251
0
*pcode = code;
2252
0
return TRUE;
2253
0
}
2254
2255
2256
2257
/* This function consumes a group of implicitly-unioned class elements.
2258
These can be characters, ranges, properties, or nested classes, as long
2259
as they are all joined by being placed adjacently. */
2260
2261
static BOOL
2262
compile_class_juxtaposition(eclass_context *context, BOOL negated,
2263
  uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info,
2264
  PCRE2_SIZE *lengthptr)
2265
0
{
2266
0
uint32_t *ptr = *pptr;
2267
0
PCRE2_UCHAR *code = *pcode;
2268
#ifdef PCRE2_DEBUG
2269
PCRE2_UCHAR *start_code = *pcode;
2270
#endif
2271
2272
/* See compile_class_binary_loose() for comments on compile-time folding of
2273
the "negated" flag. */
2274
2275
/* Because it's a non-empty class, there must be an operand at the start. */
2276
0
if (!compile_class_operand(context, negated, &ptr, &code, pop_info, lengthptr))
2277
0
  return FALSE;
2278
2279
0
while (*ptr != META_CLASS_END &&
2280
0
       !(*ptr >= META_ECLASS_AND && *ptr <= META_ECLASS_NOT))
2281
0
  {
2282
0
  uint32_t op;
2283
0
  BOOL rhs_negated;
2284
0
  eclass_op_info rhs_op_info;
2285
2286
0
  if (negated)
2287
0
    {
2288
    /* !(A juxtapose B)  ->  !A && !B */
2289
0
    op = ECL_AND;
2290
0
    rhs_negated = TRUE;
2291
0
    }
2292
0
  else
2293
0
    {
2294
    /* A juxtapose B  ->  A || B */
2295
0
    op = ECL_OR;
2296
0
    rhs_negated = FALSE;
2297
0
    }
2298
2299
  /* An operand must follow the operator. */
2300
0
  if (!compile_class_operand(context, rhs_negated, &ptr, &code,
2301
0
                             &rhs_op_info, lengthptr))
2302
0
    return FALSE;
2303
2304
  /* Convert infix to postfix (RPN). */
2305
0
  fold_binary(op, pop_info, &rhs_op_info, lengthptr);
2306
0
  if (lengthptr == NULL)
2307
0
    code = pop_info->code_start + pop_info->length;
2308
0
  }
2309
2310
0
PCRE2_ASSERT(lengthptr == NULL || code == start_code);
2311
2312
0
*pptr = ptr;
2313
0
*pcode = code;
2314
0
return TRUE;
2315
0
}
2316
2317
2318
2319
/* This function consumes unary prefix operators. */
2320
2321
static BOOL
2322
compile_class_unary(eclass_context *context, BOOL negated,
2323
  uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info,
2324
  PCRE2_SIZE *lengthptr)
2325
0
{
2326
0
uint32_t *ptr = *pptr;
2327
#ifdef PCRE2_DEBUG
2328
PCRE2_UCHAR *start_code = *pcode;
2329
#endif
2330
2331
0
while (*ptr == META_ECLASS_NOT)
2332
0
  {
2333
0
  ++ptr;
2334
0
  negated = !negated;
2335
0
  }
2336
2337
0
*pptr = ptr;
2338
/* Because it's a non-empty class, there must be an operand. */
2339
0
if (!compile_class_juxtaposition(context, negated, pptr, pcode,
2340
0
                                 pop_info, lengthptr))
2341
0
  return FALSE;
2342
2343
0
PCRE2_ASSERT(lengthptr == NULL || *pcode == start_code);
2344
0
return TRUE;
2345
0
}
2346
2347
2348
2349
/* This function consumes tightly-binding binary operators. */
2350
2351
static BOOL
2352
compile_class_binary_tight(eclass_context *context, BOOL negated,
2353
  uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info,
2354
  PCRE2_SIZE *lengthptr)
2355
0
{
2356
0
uint32_t *ptr = *pptr;
2357
0
PCRE2_UCHAR *code = *pcode;
2358
#ifdef PCRE2_DEBUG
2359
PCRE2_UCHAR *start_code = *pcode;
2360
#endif
2361
2362
/* See compile_class_binary_loose() for comments on compile-time folding of
2363
the "negated" flag. */
2364
2365
/* Because it's a non-empty class, there must be an operand at the start. */
2366
0
if (!compile_class_unary(context, negated, &ptr, &code, pop_info, lengthptr))
2367
0
  return FALSE;
2368
2369
0
while (*ptr == META_ECLASS_AND)
2370
0
  {
2371
0
  uint32_t op;
2372
0
  BOOL rhs_negated;
2373
0
  eclass_op_info rhs_op_info;
2374
2375
0
  if (negated)
2376
0
    {
2377
    /* !(A && B)  ->  !A || !B */
2378
0
    op = ECL_OR;
2379
0
    rhs_negated = TRUE;
2380
0
    }
2381
0
  else
2382
0
    {
2383
    /* A && B  ->  A && B */
2384
0
    op = ECL_AND;
2385
0
    rhs_negated = FALSE;
2386
0
    }
2387
2388
0
  ++ptr;
2389
2390
  /* An operand must follow the operator. */
2391
0
  if (!compile_class_unary(context, rhs_negated, &ptr, &code,
2392
0
                           &rhs_op_info, lengthptr))
2393
0
    return FALSE;
2394
2395
  /* Convert infix to postfix (RPN). */
2396
0
  fold_binary(op, pop_info, &rhs_op_info, lengthptr);
2397
0
  if (lengthptr == NULL)
2398
0
    code = pop_info->code_start + pop_info->length;
2399
0
  }
2400
2401
0
PCRE2_ASSERT(lengthptr == NULL || code == start_code);
2402
2403
0
*pptr = ptr;
2404
0
*pcode = code;
2405
0
return TRUE;
2406
0
}
2407
2408
2409
2410
/* This function consumes loosely-binding binary operators. */
2411
2412
static BOOL
2413
compile_class_binary_loose(eclass_context *context, BOOL negated,
2414
  uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info,
2415
  PCRE2_SIZE *lengthptr)
2416
0
{
2417
0
uint32_t *ptr = *pptr;
2418
0
PCRE2_UCHAR *code = *pcode;
2419
#ifdef PCRE2_DEBUG
2420
PCRE2_UCHAR *start_code = *pcode;
2421
#endif
2422
2423
/* We really want to fold the negation operator, if at all possible, so that
2424
simple cases can be reduced down. In particular, in 8-bit no-UTF mode, we want
2425
to produce a fully-folded expression, so that we can guarantee not to emit any
2426
OP_ECLASS codes (in the same way that we never emit OP_XCLASS in this mode).
2427
2428
This has the consequence that with a little ingenuity, we can in fact avoid
2429
emitting (nearly...) all cases of the "NOT" operator. Imagine that we have:
2430
    !(A ...
2431
We have parsed the preceding "!", and we are about to parse the "A" operand. We
2432
don't know yet whether there will even be a following binary operand! Both of
2433
these are possibilities for what follows:
2434
    !(A && B)
2435
    !(A)
2436
However, we can still fold the "!" into the "A" operand, because no matter what
2437
the following binary operator will be, we can produce an expression which is
2438
equivalent. */
2439
2440
/* Because it's a non-empty class, there must be an operand at the start. */
2441
0
if (!compile_class_binary_tight(context, negated, &ptr, &code,
2442
0
                                pop_info, lengthptr))
2443
0
  return FALSE;
2444
2445
0
while (*ptr >= META_ECLASS_OR && *ptr <= META_ECLASS_XOR)
2446
0
  {
2447
0
  uint32_t op;
2448
0
  BOOL op_neg;
2449
0
  BOOL rhs_negated;
2450
0
  eclass_op_info rhs_op_info;
2451
2452
0
  if (negated)
2453
0
    {
2454
    /* The whole expression is being negated; we respond by unconditionally
2455
    negating the LHS A, before seeing what follows. And hooray! We can recover,
2456
    no matter what follows. */
2457
    /* !(A || B)   ->  !A && !B                     */
2458
    /* !(A -- B)   ->  !(A && !B)    ->  !A || B    */
2459
    /* !(A XOR B)  ->  !(!A XOR !B)  ->  !A XNOR !B */
2460
0
    op = (*ptr == META_ECLASS_OR )? ECL_AND :
2461
0
         (*ptr == META_ECLASS_SUB)? ECL_OR  :
2462
0
         /*ptr == META_ECLASS_XOR*/ ECL_XOR;
2463
0
    op_neg = (*ptr == META_ECLASS_XOR);
2464
0
    rhs_negated = *ptr != META_ECLASS_SUB;
2465
0
    }
2466
0
  else
2467
0
    {
2468
    /* A || B   ->  A || B  */
2469
    /* A -- B   ->  A && !B */
2470
    /* A XOR B  ->  A XOR B */
2471
0
    op = (*ptr == META_ECLASS_OR )? ECL_OR  :
2472
0
         (*ptr == META_ECLASS_SUB)? ECL_AND :
2473
0
         /*ptr == META_ECLASS_XOR*/ ECL_XOR;
2474
0
    op_neg = FALSE;
2475
0
    rhs_negated = *ptr == META_ECLASS_SUB;
2476
0
    }
2477
2478
0
  ++ptr;
2479
2480
  /* An operand must follow the operator. */
2481
0
  if (!compile_class_binary_tight(context, rhs_negated, &ptr, &code,
2482
0
                                  &rhs_op_info, lengthptr))
2483
0
    return FALSE;
2484
2485
  /* Convert infix to postfix (RPN). */
2486
0
  fold_binary(op, pop_info, &rhs_op_info, lengthptr);
2487
0
  if (op_neg) fold_negation(pop_info, lengthptr, FALSE);
2488
0
  if (lengthptr == NULL)
2489
0
    code = pop_info->code_start + pop_info->length;
2490
0
  }
2491
2492
0
PCRE2_ASSERT(lengthptr == NULL || code == start_code);
2493
2494
0
*pptr = ptr;
2495
0
*pcode = code;
2496
0
return TRUE;
2497
0
}
2498
2499
2500
2501
/* This function converts the META codes in pptr into opcodes written to
2502
pcode. The pptr must start at a META_CLASS or META_CLASS_NOT.
2503
2504
The class is compiled as a left-associative sequence of operator
2505
applications.
2506
2507
The pptr will be left pointing at the matching META_CLASS_END. */
2508
2509
static BOOL
2510
compile_eclass_nested(eclass_context *context, BOOL negated,
2511
  uint32_t **pptr, PCRE2_UCHAR **pcode,
2512
  eclass_op_info *pop_info, PCRE2_SIZE *lengthptr)
2513
0
{
2514
0
uint32_t *ptr = *pptr;
2515
#ifdef PCRE2_DEBUG
2516
PCRE2_UCHAR *start_code = *pcode;
2517
#endif
2518
2519
/* The CLASS_IS_ECLASS bit must be set since it is a nested class. */
2520
0
PCRE2_ASSERT(*ptr == (META_CLASS | CLASS_IS_ECLASS) ||
2521
0
             *ptr == (META_CLASS_NOT | CLASS_IS_ECLASS));
2522
2523
0
if (*ptr++ == (META_CLASS_NOT | CLASS_IS_ECLASS))
2524
0
  negated = !negated;
2525
2526
0
(*pptr)++;
2527
2528
/* Because it's a non-empty class, there must be an operand at the start. */
2529
0
if (!compile_class_binary_loose(context, negated, pptr, pcode,
2530
0
                                pop_info, lengthptr))
2531
0
  return FALSE;
2532
2533
0
PCRE2_ASSERT(**pptr == META_CLASS_END);
2534
0
PCRE2_ASSERT(lengthptr == NULL || *pcode == start_code);
2535
0
return TRUE;
2536
0
}
2537
2538
BOOL
2539
PRIV(compile_class_nested)(uint32_t options, uint32_t xoptions,
2540
  uint32_t **pptr, PCRE2_UCHAR **pcode, int *errorcodeptr,
2541
  compile_block *cb, PCRE2_SIZE *lengthptr)
2542
0
{
2543
0
eclass_context context;
2544
0
eclass_op_info op_info;
2545
0
PCRE2_SIZE previous_length = (lengthptr != NULL)? *lengthptr : 0;
2546
0
PCRE2_UCHAR *code = *pcode;
2547
0
PCRE2_UCHAR *previous;
2548
0
BOOL allbitsone = TRUE;
2549
2550
0
context.needs_bitmap = FALSE;
2551
0
context.options = options;
2552
0
context.xoptions = xoptions;
2553
0
context.errorcodeptr = errorcodeptr;
2554
0
context.cb = cb;
2555
2556
0
previous = code;
2557
0
*code++ = OP_ECLASS;
2558
0
code += LINK_SIZE;
2559
0
*code++ = 0;  /* Flags, currently zero. */
2560
0
if (!compile_eclass_nested(&context, FALSE, pptr, &code, &op_info, lengthptr))
2561
0
  return FALSE;
2562
2563
0
if (lengthptr != NULL)
2564
0
  {
2565
0
  *lengthptr += code - previous;
2566
0
  code = previous;
2567
  /* (*lengthptr - previous_length) now holds the amount of buffer that
2568
  we require to make the call to compile_class_nested() with
2569
  lengthptr = NULL, and including the (1+LINK_SIZE+1) that we write out
2570
  before that call. */
2571
0
  }
2572
2573
/* Do some useful counting of what's in the bitmap. */
2574
0
for (int i = 0; i < 8; i++)
2575
0
  if (op_info.bits.classwords[i] != 0xffffffff)
2576
0
    {
2577
0
    allbitsone = FALSE;
2578
0
    break;
2579
0
    }
2580
2581
/* After constant-folding the extended class syntax, it may turn out to be
2582
a simple class after all. In that case, we can unwrap it from the
2583
OP_ECLASS container - and in fact, we must do so, because in 8-bit
2584
no-Unicode mode the matcher is compiled without support for OP_ECLASS. */
2585
2586
#ifndef SUPPORT_WIDE_CHARS
2587
PCRE2_ASSERT(op_info.op_single_type != 0);
2588
#else
2589
0
if (op_info.op_single_type != 0)
2590
0
#endif
2591
0
  {
2592
  /* Rewind back over the OP_ECLASS. */
2593
0
  code = previous;
2594
2595
  /* If the bits are all ones, and the "high characters" are all matched
2596
  too, we use a special-cased encoding of OP_ALLANY. */
2597
2598
0
  if (op_info.op_single_type == ECL_ANY && allbitsone)
2599
0
    {
2600
    /* Advancing code means rewinding lengthptr, at this point. */
2601
0
    if (lengthptr != NULL) *lengthptr -= 1;
2602
0
    *code++ = OP_ALLANY;
2603
0
    }
2604
2605
  /* If the high bits are all matched / all not-matched, then we emit an
2606
  OP_NCLASS/OP_CLASS respectively. */
2607
2608
0
  else if (op_info.op_single_type == ECL_ANY ||
2609
0
           op_info.op_single_type == ECL_NONE)
2610
0
    {
2611
0
    PCRE2_SIZE required_len = 1 + (32 / sizeof(PCRE2_UCHAR));
2612
2613
0
    if (lengthptr != NULL)
2614
0
      {
2615
0
      if (required_len > (*lengthptr - previous_length))
2616
0
      *lengthptr = previous_length + required_len;
2617
0
      }
2618
2619
    /* Advancing code means rewinding lengthptr, at this point. */
2620
0
    if (lengthptr != NULL) *lengthptr -= required_len;
2621
0
    *code++ = (op_info.op_single_type == ECL_ANY)? OP_NCLASS : OP_CLASS;
2622
0
    memcpy(code, op_info.bits.classbits, 32);
2623
0
    code += 32 / sizeof(PCRE2_UCHAR);
2624
0
    }
2625
2626
  /* Otherwise, we have an ECL_XCLASS, so we have the OP_XCLASS data
2627
  there, but, we pulled out its bitmap into op_info, so now we have to
2628
  put that back into the OP_XCLASS. */
2629
2630
0
  else
2631
0
    {
2632
#ifndef SUPPORT_WIDE_CHARS
2633
    PCRE2_DEBUG_UNREACHABLE();
2634
#else
2635
0
    BOOL need_map = context.needs_bitmap;
2636
0
    PCRE2_SIZE required_len;
2637
2638
0
    PCRE2_ASSERT(op_info.op_single_type == ECL_XCLASS);
2639
0
    required_len = op_info.length + (need_map? 32/sizeof(PCRE2_UCHAR) : 0);
2640
2641
0
    if (lengthptr != NULL)
2642
0
      {
2643
      /* Don't unconditionally request all the space we need - we may
2644
      already have asked for more during processing of the ECLASS. */
2645
0
      if (required_len > (*lengthptr - previous_length))
2646
0
        *lengthptr = previous_length + required_len;
2647
2648
      /* The code we write out here won't be ignored, even during the
2649
      (lengthptr != NULL) phase, because if there's a following quantifier
2650
      it will peek backwards. So we do have to write out a (truncated)
2651
      OP_XCLASS, even on this branch. */
2652
0
      *lengthptr -= 1 + LINK_SIZE + 1;
2653
0
      *code++ = OP_XCLASS;
2654
0
      PUT(code, 0, 1 + LINK_SIZE + 1);
2655
0
      code += LINK_SIZE;
2656
0
      *code++ = 0;
2657
0
      }
2658
0
    else
2659
0
      {
2660
0
      PCRE2_UCHAR *rest;
2661
0
      PCRE2_SIZE rest_len;
2662
0
      PCRE2_UCHAR flags;
2663
2664
      /* 1 unit: OP_XCLASS | LINK_SIZE units | 1 unit: flags | ...rest */
2665
0
      PCRE2_ASSERT(op_info.length >= 1 + LINK_SIZE + 1);
2666
0
      rest = op_info.code_start + 1 + LINK_SIZE + 1;
2667
0
      rest_len = (op_info.code_start + op_info.length) - rest;
2668
2669
      /* First read any data we use, before memmove splats it. */
2670
0
      flags = op_info.code_start[1 + LINK_SIZE];
2671
0
      PCRE2_ASSERT((flags & XCL_MAP) == 0);
2672
2673
      /* Next do the memmove before any writes. */
2674
0
      memmove(code + 1 + LINK_SIZE + 1 + (need_map? 32/sizeof(PCRE2_UCHAR) : 0),
2675
0
              rest, CU2BYTES(rest_len));
2676
2677
      /* Finally write the header data. */
2678
0
      *code++ = OP_XCLASS;
2679
0
      PUT(code, 0, (int)required_len);
2680
0
      code += LINK_SIZE;
2681
0
      *code++ = flags | (need_map? XCL_MAP : 0);
2682
0
      if (need_map)
2683
0
        {
2684
0
        memcpy(code, op_info.bits.classbits, 32);
2685
0
        code += 32 / sizeof(PCRE2_UCHAR);
2686
0
        }
2687
0
      code += rest_len;
2688
0
      }
2689
0
#endif /* SUPPORT_WIDE_CHARS */
2690
0
    }
2691
0
  }
2692
2693
/* Otherwise, we're going to keep the OP_ECLASS. However, again we need
2694
to do some adjustment to insert the bitmap if we have one. */
2695
2696
0
#ifdef SUPPORT_WIDE_CHARS
2697
0
else
2698
0
  {
2699
0
  BOOL need_map = context.needs_bitmap;
2700
0
  PCRE2_SIZE required_len =
2701
0
    1 + LINK_SIZE + 1 + (need_map? 32/sizeof(PCRE2_UCHAR) : 0) + op_info.length;
2702
2703
0
  if (lengthptr != NULL)
2704
0
    {
2705
0
    if (required_len > (*lengthptr - previous_length))
2706
0
      *lengthptr = previous_length + required_len;
2707
2708
    /* As for the XCLASS branch above, we do have to write out a dummy
2709
    OP_ECLASS, because of the backwards peek by the quantifier code. Write
2710
    out a (truncated) OP_ECLASS, even on this branch. */
2711
0
    *lengthptr -= 1 + LINK_SIZE + 1;
2712
0
    *code++ = OP_ECLASS;
2713
0
    PUT(code, 0, 1 + LINK_SIZE + 1);
2714
0
    code += LINK_SIZE;
2715
0
    *code++ = 0;
2716
0
    }
2717
0
  else
2718
0
    {
2719
0
    if (need_map)
2720
0
      {
2721
0
      PCRE2_UCHAR *map_start = previous + 1 + LINK_SIZE + 1;
2722
0
      previous[1 + LINK_SIZE] |= ECL_MAP;
2723
0
      memmove(map_start + 32/sizeof(PCRE2_UCHAR), map_start,
2724
0
              CU2BYTES(code - map_start));
2725
0
      memcpy(map_start, op_info.bits.classbits, 32);
2726
0
      code += 32 / sizeof(PCRE2_UCHAR);
2727
0
      }
2728
0
    PUT(previous, 1, (int)(code - previous));
2729
0
    }
2730
0
  }
2731
0
#endif /* SUPPORT_WIDE_CHARS */
2732
2733
0
*pcode = code;
2734
0
return TRUE;
2735
0
}
2736
2737
/* End of pcre2_compile_class.c */