Coverage Report

Created: 2025-09-27 06:26

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/php-src/ext/pcre/pcre2lib/pcre2_compile_class.c
Line
Count
Source
1
/*************************************************
2
*      Perl-Compatible Regular Expressions       *
3
*************************************************/
4
5
/* PCRE is a library of functions to support regular expressions whose syntax
6
and semantics are as close as possible to those of the Perl 5 language.
7
8
                       Written by Philip Hazel
9
     Original API code Copyright (c) 1997-2012 University of Cambridge
10
          New API code Copyright (c) 2016-2024 University of Cambridge
11
12
-----------------------------------------------------------------------------
13
Redistribution and use in source and binary forms, with or without
14
modification, are permitted provided that the following conditions are met:
15
16
    * Redistributions of source code must retain the above copyright notice,
17
      this list of conditions and the following disclaimer.
18
19
    * Redistributions in binary form must reproduce the above copyright
20
      notice, this list of conditions and the following disclaimer in the
21
      documentation and/or other materials provided with the distribution.
22
23
    * Neither the name of the University of Cambridge nor the names of its
24
      contributors may be used to endorse or promote products derived from
25
      this software without specific prior written permission.
26
27
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37
POSSIBILITY OF SUCH DAMAGE.
38
-----------------------------------------------------------------------------
39
*/
40
41
#ifdef HAVE_CONFIG_H
42
#include "config.h"
43
#endif
44
45
#include "pcre2_compile.h"
46
47
typedef struct {
48
  /* Option bits for eclass. */
49
  uint32_t options;
50
  uint32_t xoptions;
51
  /* Rarely used members. */
52
  int *errorcodeptr;
53
  compile_block *cb;
54
  /* Bitmap is needed. */
55
  BOOL needs_bitmap;
56
} eclass_context;
57
58
/* Checks the allowed tokens at the end of a class structure in debug mode.
59
When a new token is not processed by all loops, and the token is equals to
60
a) one of the cases here:
61
   the compiler will complain about a duplicated case value.
62
b) none of the cases here:
63
   the loop without the handler will stop with an assertion failure. */
64
65
#ifdef PCRE2_DEBUG
66
#define CLASS_END_CASES(meta) \
67
  default: \
68
  PCRE2_ASSERT((meta) <= META_END); \
69
  /* Fall through */ \
70
  case META_CLASS: \
71
  case META_CLASS_NOT: \
72
  case META_CLASS_EMPTY: \
73
  case META_CLASS_EMPTY_NOT: \
74
  case META_CLASS_END: \
75
  case META_ECLASS_AND: \
76
  case META_ECLASS_OR: \
77
  case META_ECLASS_SUB: \
78
  case META_ECLASS_XOR: \
79
  case META_ECLASS_NOT:
80
#else
81
#define CLASS_END_CASES(meta) \
82
177k
  default:
83
#endif
84
85
#ifdef SUPPORT_WIDE_CHARS
86
87
/* Heapsort algorithm. */
88
89
static void do_heapify(uint32_t *buffer, size_t size, size_t i)
90
34.1k
{
91
34.1k
size_t max;
92
34.1k
size_t left;
93
34.1k
size_t right;
94
34.1k
uint32_t tmp1, tmp2;
95
96
98.9k
while (TRUE)
97
98.9k
  {
98
98.9k
  max = i;
99
98.9k
  left = (i << 1) + 2;
100
98.9k
  right = left + 2;
101
102
98.9k
  if (left < size && buffer[left] > buffer[max]) max = left;
103
98.9k
  if (right < size && buffer[right] > buffer[max]) max = right;
104
98.9k
  if (i == max) return;
105
106
  /* Swap items. */
107
64.7k
  tmp1 = buffer[i];
108
64.7k
  tmp2 = buffer[i + 1];
109
64.7k
  buffer[i] = buffer[max];
110
64.7k
  buffer[i + 1] = buffer[max + 1];
111
64.7k
  buffer[max] = tmp1;
112
64.7k
  buffer[max + 1] = tmp2;
113
64.7k
  i = max;
114
64.7k
  }
115
34.1k
}
116
117
#ifdef SUPPORT_UNICODE
118
119
1.08k
#define PARSE_CLASS_UTF               0x1
120
27.3k
#define PARSE_CLASS_CASELESS_UTF      0x2
121
29.3k
#define PARSE_CLASS_RESTRICTED_UTF    0x4
122
82.9k
#define PARSE_CLASS_TURKISH_UTF       0x8
123
124
/* Get the range of nocase characters which includes the
125
'c' character passed as argument, or directly follows 'c'. */
126
127
static const uint32_t*
128
get_nocase_range(uint32_t c)
129
24.9k
{
130
24.9k
uint32_t left = 0;
131
24.9k
uint32_t right = PRIV(ucd_nocase_ranges_size);
132
24.9k
uint32_t middle;
133
134
24.9k
if (c > MAX_UTF_CODE_POINT) return PRIV(ucd_nocase_ranges) + right;
135
136
138k
while (TRUE)
137
138k
  {
138
  /* Range end of the middle element. */
139
138k
  middle = ((left + right) >> 1) | 0x1;
140
141
138k
  if (PRIV(ucd_nocase_ranges)[middle] <= c)
142
64
    left = middle + 1;
143
138k
  else if (middle > 1 && PRIV(ucd_nocase_ranges)[middle - 2] > c)
144
113k
    right = middle - 1;
145
24.9k
  else
146
24.9k
    return PRIV(ucd_nocase_ranges) + (middle - 1);
147
138k
  }
148
24.9k
}
149
150
/* Get the list of othercase characters, which belongs to the passed range.
151
Create ranges from these characters, and append them to the buffer argument. */
152
153
static size_t
154
utf_caseless_extend(uint32_t start, uint32_t end, uint32_t options,
155
  uint32_t *buffer)
156
24.9k
{
157
24.9k
uint32_t new_start = start;
158
24.9k
uint32_t new_end = end;
159
24.9k
uint32_t c = start;
160
24.9k
const uint32_t *list;
161
24.9k
uint32_t tmp[3];
162
24.9k
size_t result = 2;
163
24.9k
const uint32_t *skip_range = get_nocase_range(c);
164
24.9k
uint32_t skip_start = skip_range[0];
165
166
24.9k
#if PCRE2_CODE_UNIT_WIDTH == 8
167
24.9k
PCRE2_ASSERT(options & PARSE_CLASS_UTF);
168
24.9k
#endif
169
170
#if PCRE2_CODE_UNIT_WIDTH == 32
171
if (end > MAX_UTF_CODE_POINT) end = MAX_UTF_CODE_POINT;
172
#endif
173
174
62.9k
while (c <= end)
175
38.0k
  {
176
38.0k
  uint32_t co;
177
178
38.0k
  if (c > skip_start)
179
10.4k
    {
180
10.4k
    c = skip_range[1];
181
10.4k
    skip_range += 2;
182
10.4k
    skip_start = skip_range[0];
183
10.4k
    continue;
184
10.4k
    }
185
186
  /* Compute caseless set. */
187
188
27.6k
  if ((options & (PARSE_CLASS_TURKISH_UTF|PARSE_CLASS_RESTRICTED_UTF)) ==
189
27.6k
        PARSE_CLASS_TURKISH_UTF &&
190
0
      UCD_ANY_I(c))
191
0
    {
192
0
    co = PRIV(ucd_turkish_dotted_i_caseset) + (UCD_DOTTED_I(c)? 0 : 3);
193
0
    }
194
27.6k
  else if ((co = UCD_CASESET(c)) != 0 &&
195
1.67k
           (options & PARSE_CLASS_RESTRICTED_UTF) != 0 &&
196
0
           PRIV(ucd_caseless_sets)[co] < 128)
197
0
    {
198
0
    co = 0;  /* Ignore the caseless set if it's restricted. */
199
0
    }
200
201
27.6k
  if (co != 0)
202
1.67k
    list = PRIV(ucd_caseless_sets) + co;
203
25.9k
  else
204
25.9k
    {
205
25.9k
    co = UCD_OTHERCASE(c);
206
25.9k
    list = tmp;
207
25.9k
    tmp[0] = c;
208
25.9k
    tmp[1] = NOTACHAR;
209
210
25.9k
    if (co != c)
211
17.9k
      {
212
17.9k
      tmp[1] = co;
213
17.9k
      tmp[2] = NOTACHAR;
214
17.9k
      }
215
25.9k
    }
216
27.6k
  c++;
217
218
  /* Add characters. */
219
27.6k
  do
220
48.9k
    {
221
#if PCRE2_CODE_UNIT_WIDTH == 16
222
    if (!(options & PARSE_CLASS_UTF) && *list > 0xffff) continue;
223
#endif
224
225
48.9k
    if (*list < new_start)
226
5.80k
      {
227
5.80k
      if (*list + 1 == new_start)
228
0
        {
229
0
        new_start--;
230
0
        continue;
231
0
        }
232
5.80k
      }
233
43.1k
    else if (*list > new_end)
234
14.1k
      {
235
14.1k
      if (*list - 1 == new_end)
236
358
        {
237
358
        new_end++;
238
358
        continue;
239
358
        }
240
14.1k
      }
241
28.9k
    else continue;
242
243
19.5k
    result += 2;
244
19.5k
    if (buffer != NULL)
245
9.78k
      {
246
9.78k
      buffer[0] = *list;
247
9.78k
      buffer[1] = *list;
248
9.78k
      buffer += 2;
249
9.78k
      }
250
19.5k
    }
251
48.9k
  while (*(++list) != NOTACHAR);
252
27.6k
  }
253
254
24.9k
  if (buffer != NULL)
255
12.4k
    {
256
12.4k
    buffer[0] = new_start;
257
12.4k
    buffer[1] = new_end;
258
12.4k
    buffer += 2;
259
12.4k
    (void)buffer;
260
12.4k
    }
261
24.9k
  return result;
262
24.9k
}
263
264
#endif
265
266
/* Add a character list to a buffer. */
267
268
static size_t
269
append_char_list(const uint32_t *p, uint32_t *buffer)
270
4
{
271
4
const uint32_t *n;
272
4
size_t result = 0;
273
274
40
while (*p != NOTACHAR)
275
36
  {
276
36
  n = p;
277
76
  while (n[0] == n[1] - 1) n++;
278
279
36
  PCRE2_ASSERT(*p < 0xffff);
280
281
36
  if (buffer != NULL)
282
18
    {
283
18
    buffer[0] = *p;
284
18
    buffer[1] = *n;
285
18
    buffer += 2;
286
18
    }
287
288
36
  result += 2;
289
36
  p = n + 1;
290
36
  }
291
292
4
  return result;
293
4
}
294
295
static uint32_t
296
get_highest_char(uint32_t options)
297
0
{
298
0
(void)options; /* Avoid compiler warning. */
299
300
0
#if PCRE2_CODE_UNIT_WIDTH == 8
301
0
return MAX_UTF_CODE_POINT;
302
#else
303
#ifdef SUPPORT_UNICODE
304
return GET_MAX_CHAR_VALUE((options & PARSE_CLASS_UTF) != 0);
305
#else
306
return MAX_UCHAR_VALUE;
307
#endif
308
#endif
309
0
}
310
311
/* Add a negated character list to a buffer. */
312
static size_t
313
append_negated_char_list(const uint32_t *p, uint32_t options, uint32_t *buffer)
314
0
{
315
0
const uint32_t *n;
316
0
uint32_t start = 0;
317
0
size_t result = 2;
318
319
0
PCRE2_ASSERT(*p > 0);
320
321
0
while (*p != NOTACHAR)
322
0
  {
323
0
  n = p;
324
0
  while (n[0] == n[1] - 1) n++;
325
326
0
  PCRE2_ASSERT(*p < 0xffff);
327
328
0
  if (buffer != NULL)
329
0
    {
330
0
    buffer[0] = start;
331
0
    buffer[1] = *p - 1;
332
0
    buffer += 2;
333
0
    }
334
335
0
  result += 2;
336
0
  start = *n + 1;
337
0
  p = n + 1;
338
0
  }
339
340
0
  if (buffer != NULL)
341
0
    {
342
0
    buffer[0] = start;
343
0
    buffer[1] = get_highest_char(options);
344
0
    buffer += 2;
345
0
    (void)buffer;
346
0
    }
347
348
0
  return result;
349
0
}
350
351
static uint32_t *
352
append_non_ascii_range(uint32_t options, uint32_t *buffer)
353
0
{
354
0
  if (buffer == NULL) return NULL;
355
356
0
  buffer[0] = 0x100;
357
0
  buffer[1] = get_highest_char(options);
358
0
  return buffer + 2;
359
0
}
360
361
static size_t
362
parse_class(uint32_t *ptr, uint32_t options, uint32_t *buffer)
363
2.16k
{
364
2.16k
size_t total_size = 0;
365
2.16k
size_t size;
366
2.16k
uint32_t meta_arg;
367
2.16k
uint32_t start_char;
368
369
28.6k
while (TRUE)
370
28.6k
  {
371
28.6k
  switch (META_CODE(*ptr))
372
28.6k
    {
373
132
    case META_ESCAPE:
374
132
      meta_arg = META_DATA(*ptr);
375
132
      switch (meta_arg)
376
132
        {
377
0
        case ESC_D:
378
0
        case ESC_W:
379
0
        case ESC_S:
380
0
        buffer = append_non_ascii_range(options, buffer);
381
0
        total_size += 2;
382
0
        break;
383
384
4
        case ESC_h:
385
4
        size = append_char_list(PRIV(hspace_list), buffer);
386
4
        total_size += size;
387
4
        if (buffer != NULL) buffer += size;
388
4
        break;
389
390
0
        case ESC_H:
391
0
        size = append_negated_char_list(PRIV(hspace_list), options, buffer);
392
0
        total_size += size;
393
0
        if (buffer != NULL) buffer += size;
394
0
        break;
395
396
0
        case ESC_v:
397
0
        size = append_char_list(PRIV(vspace_list), buffer);
398
0
        total_size += size;
399
0
        if (buffer != NULL) buffer += size;
400
0
        break;
401
402
0
        case ESC_V:
403
0
        size = append_negated_char_list(PRIV(vspace_list), options, buffer);
404
0
        total_size += size;
405
0
        if (buffer != NULL) buffer += size;
406
0
        break;
407
408
128
        case ESC_p:
409
128
        case ESC_P:
410
128
        ptr++;
411
128
        if (meta_arg == ESC_p && (*ptr >> 16) == PT_ANY)
412
0
          {
413
0
          if (buffer != NULL)
414
0
            {
415
0
            buffer[0] = 0;
416
0
            buffer[1] = get_highest_char(options);
417
0
            buffer += 2;
418
0
            }
419
0
          total_size += 2;
420
0
          }
421
128
        break;
422
132
        }
423
132
      ptr++;
424
132
      continue;
425
0
    case META_POSIX_NEG:
426
0
      buffer = append_non_ascii_range(options, buffer);
427
0
      total_size += 2;
428
0
      ptr += 2;
429
0
      continue;
430
0
    case META_POSIX:
431
0
      ptr += 2;
432
0
      continue;
433
0
    case META_BIGVALUE:
434
      /* Character literal */
435
0
      ptr++;
436
0
      break;
437
28.5k
    CLASS_END_CASES(*ptr)
438
28.5k
      if (*ptr >= META_END) return total_size;
439
26.3k
      break;
440
28.6k
    }
441
442
26.3k
    start_char = *ptr;
443
444
26.3k
    if (ptr[1] == META_RANGE_LITERAL || ptr[1] == META_RANGE_ESCAPED)
445
458
      {
446
458
      ptr += 2;
447
458
      PCRE2_ASSERT(*ptr < META_END || *ptr == META_BIGVALUE);
448
449
458
      if (*ptr == META_BIGVALUE) ptr++;
450
451
#ifdef EBCDIC
452
#error "Missing EBCDIC support"
453
#endif
454
458
      }
455
456
26.3k
#ifdef SUPPORT_UNICODE
457
26.3k
    if (options & PARSE_CLASS_CASELESS_UTF)
458
24.9k
      {
459
24.9k
      size = utf_caseless_extend(start_char, *ptr++, options, buffer);
460
24.9k
      if (buffer != NULL) buffer += size;
461
24.9k
      total_size += size;
462
24.9k
      continue;
463
24.9k
      }
464
1.43k
#endif
465
466
1.43k
    if (buffer != NULL)
467
717
      {
468
717
      buffer[0] = start_char;
469
717
      buffer[1] = *ptr;
470
717
      buffer += 2;
471
717
      }
472
473
1.43k
    ptr++;
474
1.43k
    total_size += 2;
475
1.43k
  }
476
477
0
  return total_size;
478
2.16k
}
479
480
/* Extra uint32_t values for storing the lengths of range lists in
481
the worst case. Two uint32_t lengths and a range end for a range
482
starting before 255 */
483
1.08k
#define CHAR_LIST_EXTRA_SIZE 3
484
485
/* Starting character values for each character list. */
486
487
static const uint32_t char_list_starts[] = {
488
#if PCRE2_CODE_UNIT_WIDTH == 32
489
  XCL_CHAR_LIST_HIGH_32_START,
490
#endif
491
#if PCRE2_CODE_UNIT_WIDTH == 32 || defined SUPPORT_UNICODE
492
  XCL_CHAR_LIST_LOW_32_START,
493
#endif
494
  XCL_CHAR_LIST_HIGH_16_START,
495
  /* Must be terminated by XCL_CHAR_LIST_LOW_16_START,
496
  which also represents the end of the bitset. */
497
  XCL_CHAR_LIST_LOW_16_START,
498
};
499
500
static class_ranges *
501
compile_optimize_class(uint32_t *start_ptr, uint32_t options,
502
  uint32_t xoptions, compile_block *cb)
503
1.08k
{
504
1.08k
class_ranges* cranges;
505
1.08k
uint32_t *ptr;
506
1.08k
uint32_t *buffer;
507
1.08k
uint32_t *dst;
508
1.08k
uint32_t class_options = 0;
509
1.08k
size_t range_list_size = 0, total_size, i;
510
1.08k
uint32_t tmp1, tmp2;
511
1.08k
const uint32_t *char_list_next;
512
1.08k
uint16_t *next_char;
513
1.08k
uint32_t char_list_start, char_list_end;
514
1.08k
uint32_t range_start, range_end;
515
516
1.08k
#ifdef SUPPORT_UNICODE
517
1.08k
if (options & PCRE2_UTF)
518
1.08k
  class_options |= PARSE_CLASS_UTF;
519
520
1.08k
if ((options & PCRE2_CASELESS) && (options & (PCRE2_UTF|PCRE2_UCP)))
521
1.05k
  class_options |= PARSE_CLASS_CASELESS_UTF;
522
523
1.08k
if (xoptions & PCRE2_EXTRA_CASELESS_RESTRICT)
524
0
  class_options |= PARSE_CLASS_RESTRICTED_UTF;
525
526
1.08k
if (xoptions & PCRE2_EXTRA_TURKISH_CASING)
527
0
  class_options |= PARSE_CLASS_TURKISH_UTF;
528
1.08k
#endif
529
530
/* Compute required space for the range. */
531
532
1.08k
range_list_size = parse_class(start_ptr, class_options, NULL);
533
1.08k
PCRE2_ASSERT((range_list_size & 0x1) == 0);
534
535
/* Allocate buffer. The total_size also represents the end of the buffer. */
536
537
1.08k
total_size = range_list_size +
538
1.08k
   ((range_list_size >= 2) ? CHAR_LIST_EXTRA_SIZE : 0);
539
540
1.08k
cranges = cb->cx->memctl.malloc(
541
1.08k
  sizeof(class_ranges) + total_size * sizeof(uint32_t),
542
1.08k
  cb->cx->memctl.memory_data);
543
544
1.08k
if (cranges == NULL) return NULL;
545
546
1.08k
cranges->next = NULL;
547
1.08k
cranges->range_list_size = (uint16_t)range_list_size;
548
1.08k
cranges->char_lists_types = 0;
549
1.08k
cranges->char_lists_size = 0;
550
1.08k
cranges->char_lists_start = 0;
551
552
1.08k
if (range_list_size == 0) return cranges;
553
554
1.08k
buffer = (uint32_t*)(cranges + 1);
555
1.08k
parse_class(start_ptr, class_options, buffer);
556
557
/* Using <= instead of == to help static analysis. */
558
1.08k
if (range_list_size <= 2) return cranges;
559
560
/* In-place sorting of ranges. */
561
562
1.08k
i = (((range_list_size >> 2) - 1) << 1);
563
11.2k
while (TRUE)
564
11.2k
  {
565
11.2k
  do_heapify(buffer, range_list_size, i);
566
11.2k
  if (i == 0) break;
567
10.1k
  i -= 2;
568
10.1k
  }
569
570
1.08k
i = range_list_size - 2;
571
22.9k
while (TRUE)
572
22.9k
  {
573
22.9k
  tmp1 = buffer[i];
574
22.9k
  tmp2 = buffer[i + 1];
575
22.9k
  buffer[i] = buffer[0];
576
22.9k
  buffer[i + 1] = buffer[1];
577
22.9k
  buffer[0] = tmp1;
578
22.9k
  buffer[1] = tmp2;
579
580
22.9k
  do_heapify(buffer, i, 0);
581
22.9k
  if (i == 0) break;
582
21.8k
  i -= 2;
583
21.8k
  }
584
585
/* Merge ranges whenever possible. */
586
1.08k
dst = buffer;
587
1.08k
ptr = buffer + 2;
588
1.08k
range_list_size -= 2;
589
590
/* The second condition is a very rare corner case, where the end of the last
591
range is the maximum character. This range cannot be extended further. */
592
593
22.9k
while (range_list_size > 0 && dst[1] != ~(uint32_t)0)
594
21.8k
  {
595
21.8k
  if (dst[1] + 1 < ptr[0])
596
7.12k
    {
597
7.12k
    dst += 2;
598
7.12k
    dst[0] = ptr[0];
599
7.12k
    dst[1] = ptr[1];
600
7.12k
    }
601
14.7k
  else if (dst[1] < ptr[1]) dst[1] = ptr[1];
602
603
21.8k
  ptr += 2;
604
21.8k
  range_list_size -= 2;
605
21.8k
  }
606
607
1.08k
PCRE2_ASSERT(dst[1] <= get_highest_char(class_options));
608
609
/* When the number of ranges are less than six,
610
they are not converted to range lists. */
611
612
1.08k
ptr = buffer;
613
7.88k
while (ptr < dst && ptr[1] < 0x100) ptr += 2;
614
1.08k
if (dst - ptr < (2 * (6 - 1)))
615
1.08k
  {
616
1.08k
  cranges->range_list_size = (uint16_t)(dst + 2 - buffer);
617
1.08k
  return cranges;
618
1.08k
  }
619
620
/* Compute character lists structures. */
621
622
3
char_list_next = char_list_starts;
623
3
char_list_start = *char_list_next++;
624
#if PCRE2_CODE_UNIT_WIDTH == 32
625
char_list_end = XCL_CHAR_LIST_HIGH_32_END;
626
#elif defined SUPPORT_UNICODE
627
3
char_list_end = XCL_CHAR_LIST_LOW_32_END;
628
#else
629
char_list_end = XCL_CHAR_LIST_HIGH_16_END;
630
#endif
631
3
next_char = (uint16_t*)(buffer + total_size);
632
633
3
tmp1 = 0;
634
3
tmp2 = ((sizeof(char_list_starts) / sizeof(uint32_t)) - 1) * XCL_TYPE_BIT_LEN;
635
3
PCRE2_ASSERT(tmp2 <= 3 * XCL_TYPE_BIT_LEN && tmp2 >= XCL_TYPE_BIT_LEN);
636
3
range_start = dst[0];
637
3
range_end = dst[1];
638
639
35
while (TRUE)
640
35
  {
641
35
  if (range_start >= char_list_start)
642
26
    {
643
26
    if (range_start == range_end || range_end < char_list_end)
644
26
      {
645
26
      tmp1++;
646
26
      next_char--;
647
648
26
      if (char_list_start < XCL_CHAR_LIST_LOW_32_START)
649
26
        *next_char = (uint16_t)((range_end << XCL_CHAR_SHIFT) | XCL_CHAR_END);
650
0
      else
651
0
        *(uint32_t*)(--next_char) =
652
0
          (range_end << XCL_CHAR_SHIFT) | XCL_CHAR_END;
653
26
      }
654
655
26
    if (range_start < range_end)
656
8
      {
657
8
      if (range_start > char_list_start)
658
8
        {
659
8
        tmp1++;
660
8
        next_char--;
661
662
8
        if (char_list_start < XCL_CHAR_LIST_LOW_32_START)
663
8
          *next_char = (uint16_t)(range_start << XCL_CHAR_SHIFT);
664
0
        else
665
0
          *(uint32_t*)(--next_char) = (range_start << XCL_CHAR_SHIFT);
666
8
        }
667
0
      else
668
0
        cranges->char_lists_types |= XCL_BEGIN_WITH_RANGE << tmp2;
669
8
      }
670
671
26
    PCRE2_ASSERT((uint32_t*)next_char >= dst + 2);
672
673
26
    if (dst > buffer)
674
26
      {
675
26
      dst -= 2;
676
26
      range_start = dst[0];
677
26
      range_end = dst[1];
678
26
      continue;
679
26
      }
680
681
0
    range_start = 0;
682
0
    range_end = 0;
683
0
    }
684
685
9
  if (range_end >= char_list_start)
686
1
    {
687
1
    PCRE2_ASSERT(range_start < char_list_start);
688
689
1
    if (range_end < char_list_end)
690
1
      {
691
1
      tmp1++;
692
1
      next_char--;
693
694
1
      if (char_list_start < XCL_CHAR_LIST_LOW_32_START)
695
1
        *next_char = (uint16_t)((range_end << XCL_CHAR_SHIFT) | XCL_CHAR_END);
696
0
      else
697
0
        *(uint32_t*)(--next_char) =
698
0
          (range_end << XCL_CHAR_SHIFT) | XCL_CHAR_END;
699
700
1
      PCRE2_ASSERT((uint32_t*)next_char >= dst + 2);
701
1
      }
702
703
1
    cranges->char_lists_types |= XCL_BEGIN_WITH_RANGE << tmp2;
704
1
    }
705
706
9
  if (tmp1 >= XCL_ITEM_COUNT_MASK)
707
4
    {
708
4
    cranges->char_lists_types |= XCL_ITEM_COUNT_MASK << tmp2;
709
4
    next_char--;
710
711
4
    if (char_list_start < XCL_CHAR_LIST_LOW_32_START)
712
4
      *next_char = (uint16_t)tmp1;
713
0
    else
714
0
      *(uint32_t*)(--next_char) = tmp1;
715
4
    }
716
5
  else
717
5
    cranges->char_lists_types |= tmp1 << tmp2;
718
719
9
  if (range_start < XCL_CHAR_LIST_LOW_16_START) break;
720
721
6
  PCRE2_ASSERT(tmp2 >= XCL_TYPE_BIT_LEN);
722
6
  char_list_end = char_list_start - 1;
723
6
  char_list_start = *char_list_next++;
724
6
  tmp1 = 0;
725
6
  tmp2 -= XCL_TYPE_BIT_LEN;
726
6
  }
727
728
3
if (dst[0] < XCL_CHAR_LIST_LOW_16_START) dst += 2;
729
3
PCRE2_ASSERT((uint16_t*)dst <= next_char);
730
731
3
cranges->char_lists_size =
732
3
  (size_t)((uint8_t*)(buffer + total_size) - (uint8_t*)next_char);
733
3
cranges->char_lists_start = (size_t)((uint8_t*)next_char - (uint8_t*)buffer);
734
3
cranges->range_list_size = (uint16_t)(dst - buffer);
735
3
return cranges;
736
1.08k
}
737
738
#endif /* SUPPORT_WIDE_CHARS */
739
740
#ifdef SUPPORT_UNICODE
741
742
void PRIV(update_classbits)(uint32_t ptype, uint32_t pdata, BOOL negated,
743
  uint8_t *classbits)
744
276
{
745
/* Update PRIV(xclass) when this function is changed. */
746
276
int c, chartype;
747
276
const ucd_record *prop;
748
276
uint32_t gentype;
749
276
BOOL set_bit;
750
751
276
if (ptype == PT_ANY)
752
0
  {
753
0
  if (!negated) memset(classbits, 0xff, 32);
754
0
  return;
755
0
  }
756
757
70.9k
for (c = 0; c < 256; c++)
758
70.6k
  {
759
70.6k
  prop = GET_UCD(c);
760
70.6k
  set_bit = FALSE;
761
70.6k
  (void)set_bit;
762
763
70.6k
  switch (ptype)
764
70.6k
    {
765
0
    case PT_LAMP:
766
0
    chartype = prop->chartype;
767
0
    set_bit = (chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt);
768
0
    break;
769
770
37.8k
    case PT_GC:
771
37.8k
    set_bit = (PRIV(ucp_gentype)[prop->chartype] == pdata);
772
37.8k
    break;
773
774
3.07k
    case PT_PC:
775
3.07k
    set_bit = (prop->chartype == pdata);
776
3.07k
    break;
777
778
0
    case PT_SC:
779
0
    set_bit = (prop->script == pdata);
780
0
    break;
781
782
0
    case PT_SCX:
783
0
    set_bit = (prop->script == pdata ||
784
0
      MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), pdata) != 0);
785
0
    break;
786
787
0
    case PT_ALNUM:
788
0
    gentype = PRIV(ucp_gentype)[prop->chartype];
789
0
    set_bit = (gentype == ucp_L || gentype == ucp_N);
790
0
    break;
791
792
0
    case PT_SPACE:    /* Perl space */
793
0
    case PT_PXSPACE:  /* POSIX space */
794
0
    switch(c)
795
0
      {
796
0
      HSPACE_BYTE_CASES:
797
0
      VSPACE_BYTE_CASES:
798
0
      set_bit = TRUE;
799
0
      break;
800
801
0
      default:
802
0
      set_bit = (PRIV(ucp_gentype)[prop->chartype] == ucp_Z);
803
0
      break;
804
0
      }
805
0
    break;
806
807
29.6k
    case PT_WORD:
808
29.6k
    chartype = prop->chartype;
809
29.6k
    gentype = PRIV(ucp_gentype)[chartype];
810
29.6k
    set_bit = (gentype == ucp_L || gentype == ucp_N ||
811
14.2k
               chartype == ucp_Mn || chartype == ucp_Pc);
812
29.6k
    break;
813
814
0
    case PT_UCNC:
815
0
    set_bit = (c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
816
0
               c == CHAR_GRAVE_ACCENT || c >= 0xa0);
817
0
    break;
818
819
0
    case PT_BIDICL:
820
0
    set_bit = (UCD_BIDICLASS_PROP(prop) == pdata);
821
0
    break;
822
823
0
    case PT_BOOL:
824
0
    set_bit = MAPBIT(PRIV(ucd_boolprop_sets) +
825
0
                     UCD_BPROPS_PROP(prop), pdata) != 0;
826
0
    break;
827
828
0
    case PT_PXGRAPH:
829
0
    chartype = prop->chartype;
830
0
    gentype = PRIV(ucp_gentype)[chartype];
831
0
    set_bit = (gentype != ucp_Z && (gentype != ucp_C || chartype == ucp_Cf));
832
0
    break;
833
834
0
    case PT_PXPRINT:
835
0
    chartype = prop->chartype;
836
0
    set_bit = (chartype != ucp_Zl && chartype != ucp_Zp &&
837
0
       (PRIV(ucp_gentype)[chartype] != ucp_C || chartype == ucp_Cf));
838
0
    break;
839
840
0
    case PT_PXPUNCT:
841
0
    gentype = PRIV(ucp_gentype)[prop->chartype];
842
0
    set_bit = (gentype == ucp_P || (c < 128 && gentype == ucp_S));
843
0
    break;
844
845
0
    default:
846
0
    PCRE2_ASSERT(ptype == PT_PXXDIGIT);
847
0
    set_bit = (c >= CHAR_0 && c <= CHAR_9) ||
848
0
              (c >= CHAR_A && c <= CHAR_F) ||
849
0
              (c >= CHAR_a && c <= CHAR_f);
850
0
    break;
851
70.6k
    }
852
853
70.6k
  if (negated) set_bit = !set_bit;
854
70.6k
  if (set_bit) *classbits |= (uint8_t)(1 << (c & 0x7));
855
70.6k
  if ((c & 0x7) == 0x7) classbits++;
856
70.6k
  }
857
276
}
858
859
#endif /* SUPPORT_UNICODE */
860
861
862
863
#ifdef SUPPORT_WIDE_CHARS
864
865
/*************************************************
866
*           XClass related properties            *
867
*************************************************/
868
869
/* XClass needs to be generated. */
870
8.45k
#define XCLASS_REQUIRED 0x1
871
/* XClass has 8 bit character. */
872
120k
#define XCLASS_HAS_8BIT_CHARS 0x2
873
/* XClass has properties. */
874
1.30k
#define XCLASS_HAS_PROPS 0x4
875
/* XClass has character lists. */
876
2.06k
#define XCLASS_HAS_CHAR_LISTS 0x8
877
/* XClass matches to all >= 256 characters. */
878
2.43k
#define XCLASS_HIGH_ANY 0x10
879
880
#endif
881
882
883
/*************************************************
884
*   Internal entry point for add range to class  *
885
*************************************************/
886
887
/* This function sets the overall range for characters < 256.
888
It also handles non-utf case folding.
889
890
Arguments:
891
  options       the options bits
892
  xoptions      the extra options bits
893
  cb            compile data
894
  start         start of range character
895
  end           end of range character
896
897
Returns:        cb->classbits is updated
898
*/
899
900
static void
901
add_to_class(uint32_t options, uint32_t xoptions, compile_block *cb,
902
  uint32_t start, uint32_t end)
903
131k
{
904
131k
uint8_t *classbits = cb->classbits.classbits;
905
131k
uint32_t c, byte_start, byte_end;
906
131k
uint32_t classbits_end = (end <= 0xff ? end : 0xff);
907
908
/* If caseless matching is required, scan the range and process alternate
909
cases. In Unicode, there are 8-bit characters that have alternate cases that
910
are greater than 255 and vice-versa (though these may be ignored if caseless
911
restriction is in force). Sometimes we can just extend the original range. */
912
913
131k
if ((options & PCRE2_CASELESS) != 0)
914
60.4k
  {
915
60.4k
#ifdef SUPPORT_UNICODE
916
  /* UTF mode. This branch is taken if we don't support wide characters (e.g.
917
  8-bit library, without UTF), but we do treat those characters as Unicode
918
  (if UCP flag is set). In this case, we only need to expand the character class
919
  set to include the case pairs which are in the 0-255 codepoint range. */
920
60.4k
  if ((options & (PCRE2_UTF|PCRE2_UCP)) != 0)
921
0
    {
922
0
      BOOL turkish_i = (xoptions & (PCRE2_EXTRA_TURKISH_CASING|PCRE2_EXTRA_CASELESS_RESTRICT)) ==
923
0
        PCRE2_EXTRA_TURKISH_CASING;
924
0
      if (start < 128)
925
0
        {
926
0
        uint32_t lo_end = (classbits_end < 127 ? classbits_end : 127);
927
0
        for (c = start; c <= lo_end; c++)
928
0
          {
929
0
          if (turkish_i && UCD_ANY_I(c)) continue;
930
0
          SETBIT(classbits, cb->fcc[c]);
931
0
          }
932
0
        }
933
0
      if (classbits_end >= 128)
934
0
        {
935
0
        uint32_t hi_start = (start > 128 ? start : 128);
936
0
        for (c = hi_start; c <= classbits_end; c++)
937
0
          {
938
0
          uint32_t co = UCD_OTHERCASE(c);
939
0
          if (co <= 0xff) SETBIT(classbits, co);
940
0
          }
941
0
        }
942
0
    }
943
944
60.4k
  else
945
60.4k
#endif  /* SUPPORT_UNICODE */
946
947
  /* Not UTF mode */
948
60.4k
    {
949
130k
    for (c = start; c <= classbits_end; c++)
950
70.4k
      SETBIT(classbits, cb->fcc[c]);
951
60.4k
    }
952
60.4k
  }
953
954
/* Use the bitmap for characters < 256. Otherwise use extra data. */
955
956
131k
byte_start = (start + 7) >> 3;
957
131k
byte_end = (classbits_end + 1) >> 3;
958
959
131k
if (byte_start >= byte_end)
960
129k
  {
961
261k
  for (c = start; c <= classbits_end; c++)
962
    /* Regardless of start, c will always be <= 255. */
963
131k
    SETBIT(classbits, c);
964
129k
  return;
965
129k
  }
966
967
16.9k
for (c = byte_start; c < byte_end; c++)
968
15.3k
  classbits[c] = 0xff;
969
970
1.61k
byte_start <<= 3;
971
1.61k
byte_end <<= 3;
972
973
6.53k
for (c = start; c < byte_start; c++)
974
4.92k
  SETBIT(classbits, c);
975
976
6.08k
for (c = byte_end; c <= classbits_end; c++)
977
4.47k
  SETBIT(classbits, c);
978
1.61k
}
979
980
981
#if PCRE2_CODE_UNIT_WIDTH == 8
982
/*************************************************
983
*   Internal entry point for add list to class   *
984
*************************************************/
985
986
/* This function is used for adding a list of horizontal or vertical whitespace
987
characters to a class. The list must be in order so that ranges of characters
988
can be detected and handled appropriately. This function sets the overall range
989
so that the internal functions can try to avoid duplication when handling
990
case-independence.
991
992
Arguments:
993
  options       the options bits
994
  xoptions      the extra options bits
995
  cb            contains pointers to tables etc.
996
  p             points to row of 32-bit values, terminated by NOTACHAR
997
998
Returns:        cb->classbits is updated
999
*/
1000
1001
static void
1002
add_list_to_class(uint32_t options, uint32_t xoptions, compile_block *cb,
1003
  const uint32_t *p)
1004
0
{
1005
0
while (p[0] < 256)
1006
0
  {
1007
0
  unsigned int n = 0;
1008
1009
0
  while(p[n+1] == p[0] + n + 1) n++;
1010
0
  add_to_class(options, xoptions, cb, p[0], p[n]);
1011
1012
0
  p += n + 1;
1013
0
  }
1014
0
}
1015
1016
1017
1018
/*************************************************
1019
*    Add characters not in a list to a class     *
1020
*************************************************/
1021
1022
/* This function is used for adding the complement of a list of horizontal or
1023
vertical whitespace to a class. The list must be in order.
1024
1025
Arguments:
1026
  options       the options bits
1027
  xoptions      the extra options bits
1028
  cb            contains pointers to tables etc.
1029
  p             points to row of 32-bit values, terminated by NOTACHAR
1030
1031
Returns:        cb->classbits is updated
1032
*/
1033
1034
static void
1035
add_not_list_to_class(uint32_t options, uint32_t xoptions, compile_block *cb,
1036
  const uint32_t *p)
1037
130
{
1038
130
if (p[0] > 0)
1039
130
  add_to_class(options, xoptions, cb, 0, p[0] - 1);
1040
398
while (p[0] < 256)
1041
268
  {
1042
634
  while (p[1] == p[0] + 1) p++;
1043
268
  add_to_class(options, xoptions, cb, p[0] + 1, (p[1] > 255) ? 255 : p[1] - 1);
1044
268
  p++;
1045
268
  }
1046
130
}
1047
#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
1048
1049
1050
1051
/*************************************************
1052
*  Main entry-point to compile a character class *
1053
*************************************************/
1054
1055
/* This function consumes a "leaf", which is a set of characters that will
1056
become a single OP_CLASS OP_NCLASS, OP_XCLASS, or OP_ALLANY. */
1057
1058
uint32_t *
1059
PRIV(compile_class_not_nested)(uint32_t options, uint32_t xoptions,
1060
  uint32_t *start_ptr, PCRE2_UCHAR **pcode, BOOL negate_class, BOOL* has_bitmap,
1061
  int *errorcodeptr, compile_block *cb, PCRE2_SIZE *lengthptr)
1062
6.70k
{
1063
6.70k
uint32_t *pptr = start_ptr;
1064
6.70k
PCRE2_UCHAR *code = *pcode;
1065
6.70k
BOOL should_flip_negation;
1066
6.70k
const uint8_t *cbits = cb->cbits;
1067
/* Some functions such as add_to_class() or eclass processing
1068
expects that the bitset is stored in cb->classbits.classbits. */
1069
6.70k
uint8_t *const classbits = cb->classbits.classbits;
1070
1071
6.70k
#ifdef SUPPORT_UNICODE
1072
6.70k
BOOL utf = (options & PCRE2_UTF) != 0;
1073
#else  /* No Unicode support */
1074
BOOL utf = FALSE;
1075
#endif
1076
1077
/* Helper variables for OP_XCLASS opcode (for characters > 255). */
1078
1079
6.70k
#ifdef SUPPORT_WIDE_CHARS
1080
6.70k
uint32_t xclass_props;
1081
6.70k
PCRE2_UCHAR *class_uchardata;
1082
6.70k
class_ranges* cranges;
1083
6.70k
#endif
1084
1085
/* If an XClass contains a negative special such as \S, we need to flip the
1086
negation flag at the end, so that support for characters > 255 works correctly
1087
(they are all included in the class). An XClass may need to insert specific
1088
matching or non-matching code for wide characters.
1089
*/
1090
1091
6.70k
should_flip_negation = FALSE;
1092
1093
/* XClass will be used when characters > 255 might match. */
1094
1095
6.70k
#ifdef SUPPORT_WIDE_CHARS
1096
6.70k
xclass_props = 0;
1097
1098
6.70k
#if PCRE2_CODE_UNIT_WIDTH == 8
1099
6.70k
cranges = NULL;
1100
1101
6.70k
if (utf)
1102
2.16k
#endif
1103
2.16k
  {
1104
2.16k
  if (lengthptr != NULL)
1105
1.08k
    {
1106
1.08k
    cranges = compile_optimize_class(pptr, options, xoptions, cb);
1107
1108
1.08k
    if (cranges == NULL)
1109
0
      {
1110
0
      *errorcodeptr = ERR21;
1111
0
      return NULL;
1112
0
      }
1113
1114
    /* Caching the pre-processed character ranges. */
1115
1.08k
    if (cb->next_cranges != NULL)
1116
975
      cb->next_cranges->next = cranges;
1117
108
    else
1118
108
      cb->cranges = cranges;
1119
1120
1.08k
    cb->next_cranges = cranges;
1121
1.08k
    }
1122
1.07k
  else
1123
1.07k
    {
1124
    /* Reuse the pre-processed character ranges. */
1125
1.07k
    cranges = cb->cranges;
1126
1.07k
    PCRE2_ASSERT(cranges != NULL);
1127
1.07k
    cb->cranges = cranges->next;
1128
1.07k
    }
1129
1130
2.16k
  if (cranges->range_list_size > 0)
1131
2.16k
    {
1132
2.16k
    const uint32_t *ranges = (const uint32_t*)(cranges + 1);
1133
1134
2.16k
    if (ranges[0] <= 255)
1135
2.16k
      xclass_props |= XCLASS_HAS_8BIT_CHARS;
1136
1137
2.16k
    if (ranges[cranges->range_list_size - 1] == GET_MAX_CHAR_VALUE(utf) &&
1138
0
        ranges[cranges->range_list_size - 2] <= 256)
1139
0
      xclass_props |= XCLASS_HIGH_ANY;
1140
2.16k
    }
1141
2.16k
  }
1142
1143
6.70k
class_uchardata = code + LINK_SIZE + 2;   /* For XCLASS items */
1144
6.70k
#endif /* SUPPORT_WIDE_CHARS */
1145
1146
/* Initialize the 256-bit (32-byte) bit map to all zeros. We build the map
1147
in a temporary bit of memory, in case the class contains fewer than two
1148
8-bit characters because in that case the compiled code doesn't use the bit
1149
map. */
1150
1151
6.70k
memset(classbits, 0, 32);
1152
1153
/* Process items until end_ptr is reached. */
1154
1155
150k
while (TRUE)
1156
150k
  {
1157
150k
  uint32_t meta = *(pptr++);
1158
150k
  BOOL local_negate;
1159
150k
  int posix_class;
1160
150k
  int taboffset, tabopt;
1161
150k
  class_bits_storage pbits;
1162
150k
  uint32_t escape, c;
1163
1164
  /* Handle POSIX classes such as [:alpha:] etc. */
1165
150k
  switch (META_CODE(meta))
1166
150k
    {
1167
0
    case META_POSIX:
1168
0
    case META_POSIX_NEG:
1169
1170
0
    local_negate = (meta == META_POSIX_NEG);
1171
0
    posix_class = *(pptr++);
1172
1173
0
    if (local_negate) should_flip_negation = TRUE;  /* Note negative special */
1174
1175
    /* If matching is caseless, upper and lower are converted to alpha.
1176
    This relies on the fact that the class table starts with alpha,
1177
    lower, upper as the first 3 entries. */
1178
1179
0
    if ((options & PCRE2_CASELESS) != 0 && posix_class <= 2)
1180
0
      posix_class = 0;
1181
1182
    /* When PCRE2_UCP is set, some of the POSIX classes are converted to
1183
    different escape sequences that use Unicode properties \p or \P.
1184
    Others that are not available via \p or \P have to generate
1185
    XCL_PROP/XCL_NOTPROP directly, which is done here. */
1186
1187
0
#ifdef SUPPORT_UNICODE
1188
    /* TODO This entire block of code here appears to be unreachable!? I simply
1189
    can't see how it can be hit, given that the frontend parser doesn't emit
1190
    META_POSIX for GRAPH/PRINT/PUNCT when UCP is set. */
1191
0
    if ((options & PCRE2_UCP) != 0 &&
1192
0
        (xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0)
1193
0
      {
1194
0
      uint32_t ptype;
1195
1196
0
      switch(posix_class)
1197
0
        {
1198
0
        case PC_GRAPH:
1199
0
        case PC_PRINT:
1200
0
        case PC_PUNCT:
1201
0
        ptype = (posix_class == PC_GRAPH)? PT_PXGRAPH :
1202
0
                (posix_class == PC_PRINT)? PT_PXPRINT : PT_PXPUNCT;
1203
1204
0
        PRIV(update_classbits)(ptype, 0, local_negate, classbits);
1205
1206
0
        if ((xclass_props & XCLASS_HIGH_ANY) == 0)
1207
0
          {
1208
0
          if (lengthptr != NULL)
1209
0
            *lengthptr += 3;
1210
0
          else
1211
0
            {
1212
0
            *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
1213
0
            *class_uchardata++ = (PCRE2_UCHAR)ptype;
1214
0
            *class_uchardata++ = 0;
1215
0
            }
1216
0
          xclass_props |= XCLASS_REQUIRED | XCLASS_HAS_PROPS;
1217
0
          }
1218
0
        continue;
1219
1220
        /* For the other POSIX classes (ex: ascii) we are going to
1221
        fall through to the non-UCP case and build a bit map for
1222
        characters with code points less than 256. However, if we are in
1223
        a negated POSIX class, characters with code points greater than
1224
        255 must either all match or all not match, depending on whether
1225
        the whole class is not or is negated. For example, for
1226
        [[:^ascii:]... they must all match, whereas for [^[:^ascii:]...
1227
        they must not.
1228
1229
        In the special case where there are no xclass items, this is
1230
        automatically handled by the use of OP_CLASS or OP_NCLASS, but an
1231
        explicit range is needed for OP_XCLASS. Setting a flag here
1232
        causes the range to be generated later when it is known that
1233
        OP_XCLASS is required. In the 8-bit library this is relevant only in
1234
        utf mode, since no wide characters can exist otherwise. */
1235
1236
0
        default:
1237
0
        break;
1238
0
        }
1239
0
      }
1240
0
#endif  /* SUPPORT_UNICODE */
1241
1242
    /* In the non-UCP case, or when UCP makes no difference, we build the
1243
    bit map for the POSIX class in a chunk of local store because we may
1244
    be adding and subtracting from it, and we don't want to subtract bits
1245
    that may be in the main map already. At the end we or the result into
1246
    the bit map that is being built. */
1247
1248
0
    posix_class *= 3;
1249
1250
    /* Copy in the first table (always present) */
1251
1252
0
    memcpy(pbits.classbits, cbits + PRIV(posix_class_maps)[posix_class], 32);
1253
1254
    /* If there is a second table, add or remove it as required. */
1255
1256
0
    taboffset = PRIV(posix_class_maps)[posix_class + 1];
1257
0
    tabopt = PRIV(posix_class_maps)[posix_class + 2];
1258
1259
0
    if (taboffset >= 0)
1260
0
      {
1261
0
      if (tabopt >= 0)
1262
0
        for (int i = 0; i < 32; i++)
1263
0
          pbits.classbits[i] |= cbits[i + taboffset];
1264
0
      else
1265
0
        for (int i = 0; i < 32; i++)
1266
0
          pbits.classbits[i] &= (uint8_t)(~cbits[i + taboffset]);
1267
0
      }
1268
1269
    /* Now see if we need to remove any special characters. An option
1270
    value of 1 removes vertical space and 2 removes underscore. */
1271
1272
0
    if (tabopt < 0) tabopt = -tabopt;
1273
0
    if (tabopt == 1) pbits.classbits[1] &= ~0x3c;
1274
0
      else if (tabopt == 2) pbits.classbits[11] &= 0x7f;
1275
1276
    /* Add the POSIX table or its complement into the main table that is
1277
    being built and we are done. */
1278
1279
0
      {
1280
0
      uint32_t *classwords = cb->classbits.classwords;
1281
1282
0
      if (local_negate)
1283
0
        for (int i = 0; i < 8; i++)
1284
0
          classwords[i] |= (uint32_t)(~pbits.classwords[i]);
1285
0
      else
1286
0
        for (int i = 0; i < 8; i++)
1287
0
          classwords[i] |= pbits.classwords[i];
1288
0
      }
1289
1290
0
#ifdef SUPPORT_WIDE_CHARS
1291
    /* Every class contains at least one < 256 character. */
1292
0
    xclass_props |= XCLASS_HAS_8BIT_CHARS;
1293
0
#endif
1294
0
    continue;               /* End of POSIX handling */
1295
1296
    /* Other than POSIX classes, the only items we should encounter are
1297
    \d-type escapes and literal characters (possibly as ranges). */
1298
0
    case META_BIGVALUE:
1299
0
    meta = *(pptr++);
1300
0
    break;
1301
1302
1.55k
    case META_ESCAPE:
1303
1.55k
    escape = META_DATA(meta);
1304
1305
1.55k
    switch(escape)
1306
1.55k
      {
1307
376
      case ESC_d:
1308
12.4k
      for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_digit];
1309
376
      break;
1310
1311
42
      case ESC_D:
1312
42
      should_flip_negation = TRUE;
1313
1.38k
      for (int i = 0; i < 32; i++)
1314
1.34k
        classbits[i] |= (uint8_t)(~cbits[i+cbit_digit]);
1315
42
      break;
1316
1317
134
      case ESC_w:
1318
4.42k
      for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_word];
1319
134
      break;
1320
1321
571
      case ESC_W:
1322
571
      should_flip_negation = TRUE;
1323
18.8k
      for (int i = 0; i < 32; i++)
1324
18.2k
        classbits[i] |= (uint8_t)(~cbits[i+cbit_word]);
1325
571
      break;
1326
1327
      /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
1328
      5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
1329
      previously set by something earlier in the character class.
1330
      Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
1331
      we could just adjust the appropriate bit. From PCRE 8.34 we no
1332
      longer treat \s and \S specially. */
1333
1334
6
      case ESC_s:
1335
198
      for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_space];
1336
6
      break;
1337
1338
16
      case ESC_S:
1339
16
      should_flip_negation = TRUE;
1340
528
      for (int i = 0; i < 32; i++)
1341
512
        classbits[i] |= (uint8_t)(~cbits[i+cbit_space]);
1342
16
      break;
1343
1344
      /* When adding the horizontal or vertical space lists to a class, or
1345
      their complements, disable PCRE2_CASELESS, because it justs wastes
1346
      time, and in the "not-x" UTF cases can create unwanted duplicates in
1347
      the XCLASS list (provoked by characters that have more than one other
1348
      case and by both cases being in the same "not-x" sublist). */
1349
1350
4
      case ESC_h:
1351
4
#if PCRE2_CODE_UNIT_WIDTH == 8
1352
4
#ifdef SUPPORT_UNICODE
1353
4
      if (cranges != NULL) break;
1354
0
#endif
1355
0
      add_list_to_class(options & ~PCRE2_CASELESS, xoptions,
1356
0
        cb, PRIV(hspace_list));
1357
#else
1358
      PCRE2_ASSERT(cranges != NULL);
1359
#endif
1360
0
      break;
1361
1362
8
      case ESC_H:
1363
8
#if PCRE2_CODE_UNIT_WIDTH == 8
1364
8
#ifdef SUPPORT_UNICODE
1365
8
      if (cranges != NULL) break;
1366
8
#endif
1367
8
      add_not_list_to_class(options & ~PCRE2_CASELESS, xoptions,
1368
8
        cb, PRIV(hspace_list));
1369
#else
1370
      PCRE2_ASSERT(cranges != NULL);
1371
#endif
1372
8
      break;
1373
1374
0
      case ESC_v:
1375
0
#if PCRE2_CODE_UNIT_WIDTH == 8
1376
0
#ifdef SUPPORT_UNICODE
1377
0
      if (cranges != NULL) break;
1378
0
#endif
1379
0
      add_list_to_class(options & ~PCRE2_CASELESS, xoptions,
1380
0
        cb, PRIV(vspace_list));
1381
#else
1382
      PCRE2_ASSERT(cranges != NULL);
1383
#endif
1384
0
      break;
1385
1386
122
      case ESC_V:
1387
122
#if PCRE2_CODE_UNIT_WIDTH == 8
1388
122
#ifdef SUPPORT_UNICODE
1389
122
      if (cranges != NULL) break;
1390
122
#endif
1391
122
      add_not_list_to_class(options & ~PCRE2_CASELESS, xoptions,
1392
122
        cb, PRIV(vspace_list));
1393
#else
1394
      PCRE2_ASSERT(cranges != NULL);
1395
#endif
1396
122
      break;
1397
1398
      /* If Unicode is not supported, \P and \p are not allowed and are
1399
      faulted at parse time, so will never appear here. */
1400
1401
0
#ifdef SUPPORT_UNICODE
1402
152
      case ESC_p:
1403
276
      case ESC_P:
1404
276
        {
1405
276
        uint32_t ptype = *pptr >> 16;
1406
276
        uint32_t pdata = *(pptr++) & 0xffff;
1407
1408
        /* The "Any" is processed by PRIV(update_classbits)(). */
1409
276
        if (ptype == PT_ANY)
1410
0
          {
1411
0
#if PCRE2_CODE_UNIT_WIDTH == 8
1412
0
          if (!utf && escape == ESC_p) memset(classbits, 0xff, 32);
1413
0
#endif
1414
0
          continue;
1415
0
          }
1416
1417
276
        PRIV(update_classbits)(ptype, pdata, (escape == ESC_P), classbits);
1418
1419
276
        if ((xclass_props & XCLASS_HIGH_ANY) == 0)
1420
276
          {
1421
276
          if (lengthptr != NULL)
1422
138
            *lengthptr += 3;
1423
138
          else
1424
138
            {
1425
138
            *class_uchardata++ = (escape == ESC_p)? XCL_PROP : XCL_NOTPROP;
1426
138
            *class_uchardata++ = ptype;
1427
138
            *class_uchardata++ = pdata;
1428
138
            }
1429
276
          xclass_props |= XCLASS_REQUIRED | XCLASS_HAS_PROPS;
1430
276
          }
1431
276
        }
1432
0
      continue;
1433
1.55k
#endif
1434
1.55k
      }
1435
1436
1.27k
#ifdef SUPPORT_WIDE_CHARS
1437
    /* Every non-property class contains at least one < 256 character. */
1438
1.27k
    xclass_props |= XCLASS_HAS_8BIT_CHARS;
1439
1.27k
#endif
1440
    /* End handling \d-type escapes */
1441
1.27k
    continue;
1442
1443
148k
    CLASS_END_CASES(meta)
1444
    /* Literals. */
1445
148k
    if (meta < META_END) break;
1446
    /* Non-literals: end of class contents. */
1447
6.70k
    goto END_PROCESSING;
1448
150k
    }
1449
1450
  /* A literal character may be followed by a range meta. At parse time
1451
  there are checks for out-of-order characters, for ranges where the two
1452
  characters are equal, and for hyphens that cannot indicate a range. At
1453
  this point, therefore, no checking is needed. */
1454
1455
142k
  c = meta;
1456
1457
  /* Remember if \r or \n were explicitly used */
1458
1459
142k
  if (c == CHAR_CR || c == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
1460
1461
  /* Process a character range */
1462
1463
142k
  if (*pptr == META_RANGE_LITERAL || *pptr == META_RANGE_ESCAPED)
1464
828
    {
1465
828
    uint32_t d;
1466
1467
#ifdef EBCDIC
1468
    BOOL range_is_literal = (*pptr == META_RANGE_LITERAL);
1469
#endif
1470
828
    ++pptr;
1471
828
    d = *(pptr++);
1472
828
    if (d == META_BIGVALUE) d = *(pptr++);
1473
1474
    /* Remember an explicit \r or \n, and add the range to the class. */
1475
1476
828
    if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
1477
1478
828
#if PCRE2_CODE_UNIT_WIDTH == 8
1479
828
#ifdef SUPPORT_UNICODE
1480
828
    if (cranges != NULL) continue;
1481
370
    xclass_props |= XCLASS_HAS_8BIT_CHARS;
1482
370
#endif
1483
1484
    /* In an EBCDIC environment, Perl treats alphabetic ranges specially
1485
    because there are holes in the encoding, and simply using the range
1486
    A-Z (for example) would include the characters in the holes. This
1487
    applies only to literal ranges; [\xC1-\xE9] is different to [A-Z]. */
1488
1489
#ifdef EBCDIC
1490
    if (range_is_literal &&
1491
         (cb->ctypes[c] & ctype_letter) != 0 &&
1492
         (cb->ctypes[d] & ctype_letter) != 0 &&
1493
         (c <= CHAR_z) == (d <= CHAR_z))
1494
      {
1495
      uint32_t uc = (d <= CHAR_z)? 0 : 64;
1496
      uint32_t C = c - uc;
1497
      uint32_t D = d - uc;
1498
1499
      if (C <= CHAR_i)
1500
        {
1501
        add_to_class(options, xoptions, cb, C + uc,
1502
          ((D < CHAR_i)? D : CHAR_i) + uc);
1503
        C = CHAR_j;
1504
        }
1505
1506
      if (C <= D && C <= CHAR_r)
1507
        {
1508
        add_to_class(options, xoptions, cb, C + uc,
1509
          ((D < CHAR_r)? D : CHAR_r) + uc);
1510
        C = CHAR_s;
1511
        }
1512
1513
      if (C <= D)
1514
        add_to_class(options, xoptions, cb, C + uc, D + uc);
1515
      }
1516
    else
1517
#endif
1518
    /* Not an EBCDIC special range */
1519
1520
370
    add_to_class(options, xoptions, cb, c, d);
1521
#else
1522
    PCRE2_ASSERT(cranges != NULL);
1523
#endif
1524
370
    continue;
1525
828
    }  /* End of range handling */
1526
1527
  /* Character ranges are ignored when class_ranges is present. */
1528
141k
#if PCRE2_CODE_UNIT_WIDTH == 8
1529
141k
#ifdef SUPPORT_UNICODE
1530
141k
  if (cranges != NULL) continue;
1531
115k
  xclass_props |= XCLASS_HAS_8BIT_CHARS;
1532
115k
#endif
1533
  /* Handle a single character. */
1534
1535
115k
  add_to_class(options, xoptions, cb, meta, meta);
1536
#else
1537
  PCRE2_ASSERT(cranges != NULL);
1538
#endif
1539
115k
  }   /* End of main class-processing loop */
1540
1541
6.70k
END_PROCESSING:
1542
1543
6.70k
#ifdef SUPPORT_WIDE_CHARS
1544
6.70k
PCRE2_ASSERT((xclass_props & XCLASS_HAS_PROPS) == 0 ||
1545
6.70k
             (xclass_props & XCLASS_HIGH_ANY) == 0);
1546
1547
6.70k
if (cranges != NULL)
1548
2.16k
  {
1549
2.16k
  uint32_t *range = (uint32_t*)(cranges + 1);
1550
2.16k
  uint32_t *end = range + cranges->range_list_size;
1551
1552
17.0k
  while (range < end && range[0] < 256)
1553
14.8k
    {
1554
14.8k
    PCRE2_ASSERT((xclass_props & XCLASS_HAS_8BIT_CHARS) != 0);
1555
    /* Add range to bitset. If we are in UTF or UCP mode, then clear the
1556
    caseless bit, because the cranges handle caselessness (only) in this
1557
    condition; see the condition for PARSE_CLASS_CASELESS_UTF in
1558
    compile_optimize_class(). */
1559
14.8k
    add_to_class(((options & (PCRE2_UTF|PCRE2_UCP)) != 0)?
1560
14.8k
        (options & ~PCRE2_CASELESS) : options, xoptions, cb, range[0], range[1]);
1561
1562
14.8k
    if (range[1] > 255) break;
1563
14.8k
    range += 2;
1564
14.8k
    }
1565
1566
2.16k
  if (cranges->char_lists_size > 0)
1567
6
    {
1568
    /* The cranges structure is still used and freed later. */
1569
6
    PCRE2_ASSERT((xclass_props & XCLASS_HIGH_ANY) == 0);
1570
6
    xclass_props |= XCLASS_REQUIRED | XCLASS_HAS_CHAR_LISTS;
1571
6
    }
1572
2.15k
  else
1573
2.15k
    {
1574
2.15k
    if ((xclass_props & XCLASS_HIGH_ANY) != 0)
1575
0
      {
1576
0
      PCRE2_ASSERT(range + 2 == end && range[0] <= 256 &&
1577
0
        range[1] >= GET_MAX_CHAR_VALUE(utf));
1578
0
      should_flip_negation = TRUE;
1579
0
      range = end;
1580
0
      }
1581
1582
3.61k
    while (range < end)
1583
1.46k
      {
1584
1.46k
      uint32_t range_start = range[0];
1585
1.46k
      uint32_t range_end = range[1];
1586
1587
1.46k
      range += 2;
1588
1.46k
      xclass_props |= XCLASS_REQUIRED;
1589
1590
1.46k
      if (range_start < 256) range_start = 256;
1591
1592
1.46k
      if (lengthptr != NULL)
1593
732
        {
1594
732
#ifdef SUPPORT_UNICODE
1595
732
        if (utf)
1596
732
          {
1597
732
          *lengthptr += 1;
1598
1599
732
          if (range_start < range_end)
1600
0
            *lengthptr += PRIV(ord2utf)(range_start, class_uchardata);
1601
1602
732
          *lengthptr += PRIV(ord2utf)(range_end, class_uchardata);
1603
732
          continue;
1604
732
          }
1605
0
#endif  /* SUPPORT_UNICODE */
1606
1607
0
        *lengthptr += range_start < range_end ? 3 : 2;
1608
0
        continue;
1609
732
        }
1610
1611
732
#ifdef SUPPORT_UNICODE
1612
732
      if (utf)
1613
732
        {
1614
732
        if (range_start < range_end)
1615
0
          {
1616
0
          *class_uchardata++ = XCL_RANGE;
1617
0
          class_uchardata += PRIV(ord2utf)(range_start, class_uchardata);
1618
0
          }
1619
732
        else
1620
732
          *class_uchardata++ = XCL_SINGLE;
1621
1622
732
        class_uchardata += PRIV(ord2utf)(range_end, class_uchardata);
1623
732
        continue;
1624
732
        }
1625
732
#endif  /* SUPPORT_UNICODE */
1626
1627
      /* Without UTF support, character values are constrained
1628
      by the bit length, and can only be > 256 for 16-bit and
1629
      32-bit libraries. */
1630
#if PCRE2_CODE_UNIT_WIDTH != 8
1631
      if (range_start < range_end)
1632
        {
1633
        *class_uchardata++ = XCL_RANGE;
1634
        *class_uchardata++ = range_start;
1635
        }
1636
      else
1637
        *class_uchardata++ = XCL_SINGLE;
1638
1639
      *class_uchardata++ = range_end;
1640
#endif  /* PCRE2_CODE_UNIT_WIDTH == 8 */
1641
732
      }
1642
1643
2.15k
    if (lengthptr == NULL)
1644
1.07k
      cb->cx->memctl.free(cranges, cb->cx->memctl.memory_data);
1645
2.15k
    }
1646
2.16k
  }
1647
6.70k
#endif /* SUPPORT_WIDE_CHARS */
1648
1649
/* If there are characters with values > 255, or Unicode property settings
1650
(\p or \P), we have to compile an extended class, with its own opcode,
1651
unless there were no property settings and there was a negated special such
1652
as \S in the class, and PCRE2_UCP is not set, because in that case all
1653
characters > 255 are in or not in the class, so any that were explicitly
1654
given as well can be ignored.
1655
1656
In the UCP case, if certain negated POSIX classes (ex: [:^ascii:]) were
1657
were present in a class, we either have to match or not match all wide
1658
characters (depending on whether the whole class is or is not negated).
1659
This requirement is indicated by match_all_or_no_wide_chars being true.
1660
We do this by including an explicit range, which works in both cases.
1661
This applies only in UTF and 16-bit and 32-bit non-UTF modes, since there
1662
cannot be any wide characters in 8-bit non-UTF mode.
1663
1664
When there *are* properties in a positive UTF-8 or any 16-bit or 32_bit
1665
class where \S etc is present without PCRE2_UCP, causing an extended class
1666
to be compiled, we make sure that all characters > 255 are included by
1667
forcing match_all_or_no_wide_chars to be true.
1668
1669
If, when generating an xclass, there are no characters < 256, we can omit
1670
the bitmap in the actual compiled code. */
1671
1672
6.70k
#ifdef SUPPORT_WIDE_CHARS  /* Defined for 16/32 bits, or 8-bit with Unicode */
1673
6.70k
if ((xclass_props & XCLASS_REQUIRED) != 0)
1674
1.02k
  {
1675
1.02k
  PCRE2_UCHAR *previous = code;
1676
1677
1.02k
  if ((xclass_props & XCLASS_HAS_CHAR_LISTS) == 0)
1678
1.02k
    *class_uchardata++ = XCL_END;    /* Marks the end of extra data */
1679
1.02k
  *code++ = OP_XCLASS;
1680
1.02k
  code += LINK_SIZE;
1681
1.02k
  *code = negate_class? XCL_NOT:0;
1682
1.02k
  if ((xclass_props & XCLASS_HAS_PROPS) != 0) *code |= XCL_HASPROP;
1683
1684
  /* If the map is required, move up the extra data to make room for it;
1685
  otherwise just move the code pointer to the end of the extra data. */
1686
1687
1.02k
  if ((xclass_props & XCLASS_HAS_8BIT_CHARS) != 0 || has_bitmap != NULL)
1688
1.02k
    {
1689
1.02k
    if (negate_class)
1690
614
      {
1691
614
      uint32_t *classwords = cb->classbits.classwords;
1692
5.52k
      for (int i = 0; i < 8; i++) classwords[i] = ~classwords[i];
1693
614
      }
1694
1695
1.02k
    if (has_bitmap == NULL)
1696
1.02k
      {
1697
1.02k
      *code++ |= XCL_MAP;
1698
1.02k
      (void)memmove(code + (32 / sizeof(PCRE2_UCHAR)), code,
1699
1.02k
        CU2BYTES(class_uchardata - code));
1700
1.02k
      memcpy(code, classbits, 32);
1701
1.02k
      code = class_uchardata + (32 / sizeof(PCRE2_UCHAR));
1702
1.02k
      }
1703
0
    else
1704
0
      {
1705
0
      code = class_uchardata;
1706
0
      if ((xclass_props & XCLASS_HAS_8BIT_CHARS) != 0)
1707
0
        *has_bitmap = TRUE;
1708
0
      }
1709
1.02k
    }
1710
0
  else code = class_uchardata;
1711
1712
1.02k
  if ((xclass_props & XCLASS_HAS_CHAR_LISTS) != 0)
1713
6
    {
1714
    /* Char lists size is an even number, because all items are 16 or 32
1715
    bit values. The character list data is always aligned to 32 bits. */
1716
6
    size_t char_lists_size = cranges->char_lists_size;
1717
6
    PCRE2_ASSERT((char_lists_size & 0x1) == 0 &&
1718
6
                 (cb->char_lists_size & 0x3) == 0);
1719
1720
6
    if (lengthptr != NULL)
1721
3
      {
1722
3
      char_lists_size = CLIST_ALIGN_TO(char_lists_size, sizeof(uint32_t));
1723
1724
3
#if PCRE2_CODE_UNIT_WIDTH == 8
1725
3
      *lengthptr += 2 + LINK_SIZE;
1726
#else
1727
      *lengthptr += 1 + LINK_SIZE;
1728
#endif
1729
1730
3
      cb->char_lists_size += char_lists_size;
1731
1732
3
      char_lists_size /= sizeof(PCRE2_UCHAR);
1733
1734
      /* Storage space for character lists is included
1735
      in the maximum pattern size. */
1736
3
      if (*lengthptr > MAX_PATTERN_SIZE ||
1737
3
          MAX_PATTERN_SIZE - *lengthptr < char_lists_size)
1738
0
        {
1739
0
        *errorcodeptr = ERR20;   /* Pattern is too large */
1740
0
        return NULL;
1741
0
        }
1742
3
      }
1743
3
    else
1744
3
      {
1745
3
      uint8_t *data;
1746
1747
3
      PCRE2_ASSERT(cranges->char_lists_types <= XCL_TYPE_MASK);
1748
3
#if PCRE2_CODE_UNIT_WIDTH == 8
1749
      /* Encode as high / low bytes. */
1750
3
      code[0] = (uint8_t)(XCL_LIST |
1751
3
        (cranges->char_lists_types >> 8));
1752
3
      code[1] = (uint8_t)cranges->char_lists_types;
1753
3
      code += 2;
1754
#else
1755
      *code++ = (PCRE2_UCHAR)(XCL_LIST | cranges->char_lists_types);
1756
#endif
1757
1758
      /* Character lists are stored in backwards direction from
1759
      byte code start. The non-dfa/dfa matchers can access these
1760
      lists using the byte code start stored in match blocks.
1761
      Each list is aligned to 32 bit with an optional unused
1762
      16 bit value at the beginning of the character list. */
1763
1764
3
      cb->char_lists_size += char_lists_size;
1765
3
      data = (uint8_t*)cb->start_code - cb->char_lists_size;
1766
1767
3
      memcpy(data, (uint8_t*)(cranges + 1) + cranges->char_lists_start,
1768
3
        char_lists_size);
1769
1770
      /* Since character lists total size is less than MAX_PATTERN_SIZE,
1771
      their starting offset fits into a value which size is LINK_SIZE. */
1772
1773
3
      char_lists_size = cb->char_lists_size;
1774
3
      PUT(code, 0, (uint32_t)(char_lists_size >> 1));
1775
3
      code += LINK_SIZE;
1776
1777
#if defined PCRE2_DEBUG || defined SUPPORT_VALGRIND
1778
      if ((char_lists_size & 0x2) != 0)
1779
        {
1780
        /* In debug the unused 16 bit value is set
1781
        to a fixed value and marked unused. */
1782
        ((uint16_t*)data)[-1] = 0x5555;
1783
#ifdef SUPPORT_VALGRIND
1784
        VALGRIND_MAKE_MEM_NOACCESS(data - 2, 2);
1785
#endif
1786
        }
1787
#endif
1788
1789
3
      cb->char_lists_size =
1790
3
        CLIST_ALIGN_TO(char_lists_size, sizeof(uint32_t));
1791
1792
3
      cb->cx->memctl.free(cranges, cb->cx->memctl.memory_data);
1793
3
      }
1794
6
    }
1795
1796
  /* Now fill in the complete length of the item */
1797
1798
1.02k
  PUT(previous, 1, (int)(code - previous));
1799
1.02k
  goto DONE;   /* End of class handling */
1800
1.02k
  }
1801
5.68k
#endif  /* SUPPORT_WIDE_CHARS */
1802
1803
/* If there are no characters > 255, or they are all to be included or
1804
excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
1805
whole class was negated and whether there were negative specials such as \S
1806
(non-UCP) in the class. Then copy the 32-byte map into the code vector,
1807
negating it if necessary. */
1808
1809
5.68k
if (negate_class)
1810
1.53k
  {
1811
1.53k
  uint32_t *classwords = cb->classbits.classwords;
1812
1813
13.7k
  for (int i = 0; i < 8; i++) classwords[i] = ~classwords[i];
1814
1.53k
  }
1815
1816
5.68k
if ((SELECT_VALUE8(!utf, 0) || negate_class != should_flip_negation) &&
1817
5.16k
    cb->classbits.classwords[0] == ~(uint32_t)0)
1818
563
  {
1819
563
  const uint32_t *classwords = cb->classbits.classwords;
1820
563
  int i;
1821
1822
2.32k
  for (i = 0; i < 8; i++)
1823
2.17k
    if (classwords[i] != ~(uint32_t)0) break;
1824
1825
563
  if (i == 8)
1826
150
    {
1827
150
    *code++ = OP_ALLANY;
1828
150
    goto DONE;   /* End of class handling */
1829
150
    }
1830
563
  }
1831
1832
5.53k
*code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
1833
5.53k
memcpy(code, classbits, 32);
1834
5.53k
code += 32 / sizeof(PCRE2_UCHAR);
1835
1836
6.70k
DONE:
1837
6.70k
*pcode = code;
1838
6.70k
return pptr - 1;
1839
5.53k
}
1840
1841
1842
1843
/* ===================================================================*/
1844
/* Here follows a block of ECLASS-compiling functions. You may well want to
1845
read them from top to bottom; they are ordered from leafmost (at the top) to
1846
outermost parser (at the bottom of the file). */
1847
1848
/* This function folds one operand using the negation operator.
1849
The new, combined chunk of stack code is written out to *pop_info. */
1850
1851
static void
1852
fold_negation(eclass_op_info *pop_info, PCRE2_SIZE *lengthptr,
1853
  BOOL preserve_classbits)
1854
0
{
1855
/* If the chunk of stack code is already composed of multiple ops, we won't
1856
descend in and try and propagate the negation down the tree. (That would lead
1857
to O(n^2) compile-time, which could be exploitable with a malicious regex -
1858
although maybe that's not really too much of a worry in a library that offers
1859
an exponential-time matching function!) */
1860
1861
0
if (pop_info->op_single_type == 0)
1862
0
  {
1863
0
  if (lengthptr != NULL)
1864
0
    *lengthptr += 1;
1865
0
  else
1866
0
    pop_info->code_start[pop_info->length] = ECL_NOT;
1867
0
  pop_info->length += 1;
1868
0
  }
1869
1870
/* Otherwise, it's a nice single-op item, so we can easily fold in the negation
1871
without needing to produce an ECL_NOT. */
1872
1873
0
else if (pop_info->op_single_type == ECL_ANY ||
1874
0
         pop_info->op_single_type == ECL_NONE)
1875
0
  {
1876
0
  pop_info->op_single_type = (pop_info->op_single_type == ECL_NONE)?
1877
0
      ECL_ANY : ECL_NONE;
1878
0
  if (lengthptr == NULL)
1879
0
    *(pop_info->code_start) = pop_info->op_single_type;
1880
0
  }
1881
0
else
1882
0
  {
1883
0
  PCRE2_ASSERT(pop_info->op_single_type == ECL_XCLASS &&
1884
0
               pop_info->length >= 1 + LINK_SIZE + 1);
1885
0
  if (lengthptr == NULL)
1886
0
    pop_info->code_start[1 + LINK_SIZE] ^= XCL_NOT;
1887
0
  }
1888
1889
0
if (!preserve_classbits)
1890
0
  {
1891
0
  for (int i = 0; i < 8; i++)
1892
0
    pop_info->bits.classwords[i] = ~pop_info->bits.classwords[i];
1893
0
  }
1894
0
}
1895
1896
1897
1898
/* This function folds together two operands using a binary operator.
1899
The new, combined chunk of stack code is written out to *lhs_op_info. */
1900
1901
static void
1902
fold_binary(int op, eclass_op_info *lhs_op_info, eclass_op_info *rhs_op_info,
1903
  PCRE2_SIZE *lengthptr)
1904
0
{
1905
0
switch (op)
1906
0
  {
1907
  /* ECL_AND truth table:
1908
1909
     LHS  RHS  RESULT
1910
     ----------------
1911
     ANY  *    RHS
1912
     *    ANY  LHS
1913
     NONE *    NONE
1914
     *    NONE NONE
1915
     X    Y    X & Y
1916
  */
1917
1918
0
  case ECL_AND:
1919
0
  if (rhs_op_info->op_single_type == ECL_ANY)
1920
0
    {
1921
    /* no-op: drop the RHS */
1922
0
    }
1923
0
  else if (lhs_op_info->op_single_type == ECL_ANY)
1924
0
    {
1925
    /* no-op: drop the LHS, and memmove the RHS into its place */
1926
0
    if (lengthptr == NULL)
1927
0
      memmove(lhs_op_info->code_start, rhs_op_info->code_start,
1928
0
              CU2BYTES(rhs_op_info->length));
1929
0
    lhs_op_info->length = rhs_op_info->length;
1930
0
    lhs_op_info->op_single_type = rhs_op_info->op_single_type;
1931
0
    }
1932
0
  else if (rhs_op_info->op_single_type == ECL_NONE)
1933
0
    {
1934
    /* the result is ECL_NONE: write into the LHS */
1935
0
    if (lengthptr == NULL)
1936
0
      lhs_op_info->code_start[0] = ECL_NONE;
1937
0
    lhs_op_info->length = 1;
1938
0
    lhs_op_info->op_single_type = ECL_NONE;
1939
0
    }
1940
0
  else if (lhs_op_info->op_single_type == ECL_NONE)
1941
0
    {
1942
    /* the result is ECL_NONE: drop the RHS */
1943
0
    }
1944
0
  else
1945
0
    {
1946
    /* Both of LHS & RHS are either ECL_XCLASS, or compound operations. */
1947
0
    if (lengthptr != NULL)
1948
0
      *lengthptr += 1;
1949
0
    else
1950
0
      {
1951
0
      PCRE2_ASSERT(rhs_op_info->code_start ==
1952
0
          lhs_op_info->code_start + lhs_op_info->length);
1953
0
      rhs_op_info->code_start[rhs_op_info->length] = ECL_AND;
1954
0
      }
1955
0
    lhs_op_info->length += rhs_op_info->length + 1;
1956
0
    lhs_op_info->op_single_type = 0;
1957
0
    }
1958
1959
0
  for (int i = 0; i < 8; i++)
1960
0
    lhs_op_info->bits.classwords[i] &= rhs_op_info->bits.classwords[i];
1961
0
  break;
1962
1963
  /* ECL_OR truth table:
1964
1965
     LHS  RHS  RESULT
1966
     ----------------
1967
     ANY  *    ANY
1968
     *    ANY  ANY
1969
     NONE *    RHS
1970
     *    NONE LHS
1971
     X    Y    X | Y
1972
  */
1973
1974
0
  case ECL_OR:
1975
0
  if (rhs_op_info->op_single_type == ECL_NONE)
1976
0
    {
1977
    /* no-op: drop the RHS */
1978
0
    }
1979
0
  else if (lhs_op_info->op_single_type == ECL_NONE)
1980
0
    {
1981
    /* no-op: drop the LHS, and memmove the RHS into its place */
1982
0
    if (lengthptr == NULL)
1983
0
      memmove(lhs_op_info->code_start, rhs_op_info->code_start,
1984
0
              CU2BYTES(rhs_op_info->length));
1985
0
    lhs_op_info->length = rhs_op_info->length;
1986
0
    lhs_op_info->op_single_type = rhs_op_info->op_single_type;
1987
0
    }
1988
0
  else if (rhs_op_info->op_single_type == ECL_ANY)
1989
0
    {
1990
    /* the result is ECL_ANY: write into the LHS */
1991
0
    if (lengthptr == NULL)
1992
0
      lhs_op_info->code_start[0] = ECL_ANY;
1993
0
    lhs_op_info->length = 1;
1994
0
    lhs_op_info->op_single_type = ECL_ANY;
1995
0
    }
1996
0
  else if (lhs_op_info->op_single_type == ECL_ANY)
1997
0
    {
1998
    /* the result is ECL_ANY: drop the RHS */
1999
0
    }
2000
0
  else
2001
0
    {
2002
    /* Both of LHS & RHS are either ECL_XCLASS, or compound operations. */
2003
0
    if (lengthptr != NULL)
2004
0
      *lengthptr += 1;
2005
0
    else
2006
0
      {
2007
0
      PCRE2_ASSERT(rhs_op_info->code_start ==
2008
0
          lhs_op_info->code_start + lhs_op_info->length);
2009
0
      rhs_op_info->code_start[rhs_op_info->length] = ECL_OR;
2010
0
      }
2011
0
    lhs_op_info->length += rhs_op_info->length + 1;
2012
0
    lhs_op_info->op_single_type = 0;
2013
0
    }
2014
2015
0
  for (int i = 0; i < 8; i++)
2016
0
    lhs_op_info->bits.classwords[i] |= rhs_op_info->bits.classwords[i];
2017
0
  break;
2018
2019
  /* ECL_XOR truth table:
2020
2021
     LHS  RHS  RESULT
2022
     ----------------
2023
     ANY  *    !RHS
2024
     *    ANY  !LHS
2025
     NONE *    RHS
2026
     *    NONE LHS
2027
     X    Y    X ^ Y
2028
  */
2029
2030
0
  case ECL_XOR:
2031
0
  if (rhs_op_info->op_single_type == ECL_NONE)
2032
0
    {
2033
    /* no-op: drop the RHS */
2034
0
    }
2035
0
  else if (lhs_op_info->op_single_type == ECL_NONE)
2036
0
    {
2037
    /* no-op: drop the LHS, and memmove the RHS into its place */
2038
0
    if (lengthptr == NULL)
2039
0
      memmove(lhs_op_info->code_start, rhs_op_info->code_start,
2040
0
              CU2BYTES(rhs_op_info->length));
2041
0
    lhs_op_info->length = rhs_op_info->length;
2042
0
    lhs_op_info->op_single_type = rhs_op_info->op_single_type;
2043
0
    }
2044
0
  else if (rhs_op_info->op_single_type == ECL_ANY)
2045
0
    {
2046
    /* the result is !LHS: fold in the negation, and drop the RHS */
2047
    /* Preserve the classbits, because we promise to deal with them later. */
2048
0
    fold_negation(lhs_op_info, lengthptr, TRUE);
2049
0
    }
2050
0
  else if (lhs_op_info->op_single_type == ECL_ANY)
2051
0
    {
2052
    /* the result is !RHS: drop the LHS, memmove the RHS into its place, and
2053
    fold in the negation */
2054
0
    if (lengthptr == NULL)
2055
0
      memmove(lhs_op_info->code_start, rhs_op_info->code_start,
2056
0
              CU2BYTES(rhs_op_info->length));
2057
0
    lhs_op_info->length = rhs_op_info->length;
2058
0
    lhs_op_info->op_single_type = rhs_op_info->op_single_type;
2059
2060
    /* Preserve the classbits, because we promise to deal with them later. */
2061
0
    fold_negation(lhs_op_info, lengthptr, TRUE);
2062
0
    }
2063
0
  else
2064
0
    {
2065
    /* Both of LHS & RHS are either ECL_XCLASS, or compound operations. */
2066
0
    if (lengthptr != NULL)
2067
0
      *lengthptr += 1;
2068
0
    else
2069
0
      {
2070
0
      PCRE2_ASSERT(rhs_op_info->code_start ==
2071
0
          lhs_op_info->code_start + lhs_op_info->length);
2072
0
      rhs_op_info->code_start[rhs_op_info->length] = ECL_XOR;
2073
0
      }
2074
0
    lhs_op_info->length += rhs_op_info->length + 1;
2075
0
    lhs_op_info->op_single_type = 0;
2076
0
    }
2077
2078
0
  for (int i = 0; i < 8; i++)
2079
0
    lhs_op_info->bits.classwords[i] ^= rhs_op_info->bits.classwords[i];
2080
0
  break;
2081
2082
0
  default:
2083
0
  PCRE2_DEBUG_UNREACHABLE();
2084
0
  break;
2085
0
  }
2086
0
}
2087
2088
2089
2090
static BOOL
2091
compile_eclass_nested(eclass_context *context, BOOL negated,
2092
  uint32_t **pptr, PCRE2_UCHAR **pcode,
2093
  eclass_op_info *pop_info, PCRE2_SIZE *lengthptr);
2094
2095
/* This function consumes a group of implicitly-unioned class elements.
2096
These can be characters, ranges, properties, or nested classes, as long
2097
as they are all joined by being placed adjacently. */
2098
2099
static BOOL
2100
compile_class_operand(eclass_context *context, BOOL negated,
2101
  uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info,
2102
  PCRE2_SIZE *lengthptr)
2103
0
{
2104
0
uint32_t *ptr = *pptr;
2105
0
uint32_t *prev_ptr;
2106
0
PCRE2_UCHAR *code = *pcode;
2107
0
PCRE2_UCHAR *code_start = code;
2108
0
PCRE2_SIZE prev_length = (lengthptr != NULL)? *lengthptr : 0;
2109
0
PCRE2_SIZE extra_length;
2110
0
uint32_t meta = META_CODE(*ptr);
2111
2112
0
switch (meta)
2113
0
  {
2114
0
  case META_CLASS_EMPTY_NOT:
2115
0
  case META_CLASS_EMPTY:
2116
0
  ++ptr;
2117
0
  pop_info->length = 1;
2118
0
  if ((meta == META_CLASS_EMPTY) == negated)
2119
0
    {
2120
0
    *code++ = pop_info->op_single_type = ECL_ANY;
2121
0
    memset(pop_info->bits.classbits, 0xff, 32);
2122
0
    }
2123
0
  else
2124
0
    {
2125
0
    *code++ = pop_info->op_single_type = ECL_NONE;
2126
0
    memset(pop_info->bits.classbits, 0, 32);
2127
0
    }
2128
0
  break;
2129
2130
0
  case META_CLASS:
2131
0
  case META_CLASS_NOT:
2132
0
  if ((*ptr & CLASS_IS_ECLASS) != 0)
2133
0
    {
2134
0
    if (!compile_eclass_nested(context, negated, &ptr, &code,
2135
0
                               pop_info, lengthptr))
2136
0
      return FALSE;
2137
2138
0
    PCRE2_ASSERT(*ptr == META_CLASS_END);
2139
0
    ptr++;
2140
0
    goto DONE;
2141
0
    }
2142
2143
0
  ptr++;
2144
  /* Fall through */
2145
2146
0
  default:
2147
  /* Scan forward characters, ranges, and properties.
2148
  For example: inside [a-z_ -- m] we don't have brackets around "a-z_" but
2149
  we still need to collect that fragment up into a "leaf" OP_CLASS. */
2150
2151
0
  prev_ptr = ptr;
2152
0
  ptr = PRIV(compile_class_not_nested)(
2153
0
    context->options, context->xoptions, ptr, &code,
2154
0
    (meta != META_CLASS_NOT) == negated, &context->needs_bitmap,
2155
0
    context->errorcodeptr, context->cb, lengthptr);
2156
0
  if (ptr == NULL) return FALSE;
2157
2158
  /* We must have a 100% guarantee that ptr increases when
2159
  compile_class_operand() returns, even on Release builds, so that we can
2160
  statically prove our loops terminate. */
2161
0
  if (ptr <= prev_ptr)
2162
0
    {
2163
0
    PCRE2_DEBUG_UNREACHABLE();
2164
0
    return FALSE;
2165
0
    }
2166
2167
  /* If we fell through above, consume the closing ']'. */
2168
0
  if (meta == META_CLASS || meta == META_CLASS_NOT)
2169
0
    {
2170
0
    PCRE2_ASSERT(*ptr == META_CLASS_END);
2171
0
    ptr++;
2172
0
    }
2173
2174
  /* Regardless of whether (lengthptr == NULL), some data will still be written
2175
  out to *pcode, which we need: we have to peek at it, to transform the opcode
2176
  into the ECLASS version (since we need to hoist up the bitmaps). */
2177
0
  PCRE2_ASSERT(code > code_start);
2178
0
  extra_length = (lengthptr != NULL)? *lengthptr - prev_length : 0;
2179
2180
  /* Easiest case: convert OP_ALLANY to ECL_ANY */
2181
2182
0
  if (*code_start == OP_ALLANY)
2183
0
    {
2184
0
    PCRE2_ASSERT(code - code_start == 1 && extra_length == 0);
2185
0
    pop_info->length = 1;
2186
0
    *code_start = pop_info->op_single_type = ECL_ANY;
2187
0
    memset(pop_info->bits.classbits, 0xff, 32);
2188
0
    }
2189
2190
  /* For OP_CLASS and OP_NCLASS, we hoist out the bitmap and convert to
2191
  ECL_NONE / ECL_ANY respectively. */
2192
2193
0
  else if (*code_start == OP_CLASS || *code_start == OP_NCLASS)
2194
0
    {
2195
0
    PCRE2_ASSERT(code - code_start == 1 + 32 / sizeof(PCRE2_UCHAR) &&
2196
0
                 extra_length == 0);
2197
0
    pop_info->length = 1;
2198
0
    *code_start = pop_info->op_single_type =
2199
0
        (*code_start == OP_CLASS)? ECL_NONE : ECL_ANY;
2200
0
    memcpy(pop_info->bits.classbits, code_start + 1, 32);
2201
    /* Rewind the code pointer, but make sure we adjust *lengthptr, because we
2202
    do need to reserve that space (even though we only use it temporarily). */
2203
0
    if (lengthptr != NULL)
2204
0
      *lengthptr += code - (code_start + 1);
2205
0
    code = code_start + 1;
2206
2207
0
    if (!context->needs_bitmap && *code_start == ECL_NONE)
2208
0
      {
2209
0
      uint32_t *classwords = pop_info->bits.classwords;
2210
2211
0
      for (int i = 0; i < 8; i++)
2212
0
        if (classwords[i] != 0)
2213
0
          {
2214
0
          context->needs_bitmap = TRUE;
2215
0
          break;
2216
0
          }
2217
0
      }
2218
0
    else
2219
0
      context->needs_bitmap = TRUE;
2220
0
    }
2221
2222
  /* Finally, for OP_XCLASS we hoist out the bitmap (if any), and convert to
2223
  ECL_XCLASS. */
2224
2225
0
  else
2226
0
    {
2227
0
    PCRE2_ASSERT(*code_start == OP_XCLASS);
2228
0
    *code_start = pop_info->op_single_type = ECL_XCLASS;
2229
2230
0
    PCRE2_ASSERT(code - code_start >= 1 + LINK_SIZE + 1);
2231
2232
0
    memcpy(pop_info->bits.classbits, context->cb->classbits.classbits, 32);
2233
0
    pop_info->length = (code - code_start) + extra_length;
2234
0
    }
2235
2236
0
  break;
2237
0
  }  /* End of switch(meta) */
2238
2239
0
pop_info->code_start = (lengthptr == NULL)? code_start : NULL;
2240
2241
0
if (lengthptr != NULL)
2242
0
  {
2243
0
  *lengthptr += code - code_start;
2244
0
  code = code_start;
2245
0
  }
2246
2247
0
DONE:
2248
0
PCRE2_ASSERT(lengthptr == NULL || (code == code_start));
2249
2250
0
*pptr = ptr;
2251
0
*pcode = code;
2252
0
return TRUE;
2253
0
}
2254
2255
2256
2257
/* This function consumes a group of implicitly-unioned class elements.
2258
These can be characters, ranges, properties, or nested classes, as long
2259
as they are all joined by being placed adjacently. */
2260
2261
static BOOL
2262
compile_class_juxtaposition(eclass_context *context, BOOL negated,
2263
  uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info,
2264
  PCRE2_SIZE *lengthptr)
2265
0
{
2266
0
uint32_t *ptr = *pptr;
2267
0
PCRE2_UCHAR *code = *pcode;
2268
#ifdef PCRE2_DEBUG
2269
PCRE2_UCHAR *start_code = *pcode;
2270
#endif
2271
2272
/* See compile_class_binary_loose() for comments on compile-time folding of
2273
the "negated" flag. */
2274
2275
/* Because it's a non-empty class, there must be an operand at the start. */
2276
0
if (!compile_class_operand(context, negated, &ptr, &code, pop_info, lengthptr))
2277
0
  return FALSE;
2278
2279
0
while (*ptr != META_CLASS_END &&
2280
0
       !(*ptr >= META_ECLASS_AND && *ptr <= META_ECLASS_NOT))
2281
0
  {
2282
0
  uint32_t op;
2283
0
  BOOL rhs_negated;
2284
0
  eclass_op_info rhs_op_info;
2285
2286
0
  if (negated)
2287
0
    {
2288
    /* !(A juxtapose B)  ->  !A && !B */
2289
0
    op = ECL_AND;
2290
0
    rhs_negated = TRUE;
2291
0
    }
2292
0
  else
2293
0
    {
2294
    /* A juxtapose B  ->  A || B */
2295
0
    op = ECL_OR;
2296
0
    rhs_negated = FALSE;
2297
0
    }
2298
2299
  /* An operand must follow the operator. */
2300
0
  if (!compile_class_operand(context, rhs_negated, &ptr, &code,
2301
0
                             &rhs_op_info, lengthptr))
2302
0
    return FALSE;
2303
2304
  /* Convert infix to postfix (RPN). */
2305
0
  fold_binary(op, pop_info, &rhs_op_info, lengthptr);
2306
0
  if (lengthptr == NULL)
2307
0
    code = pop_info->code_start + pop_info->length;
2308
0
  }
2309
2310
0
PCRE2_ASSERT(lengthptr == NULL || code == start_code);
2311
2312
0
*pptr = ptr;
2313
0
*pcode = code;
2314
0
return TRUE;
2315
0
}
2316
2317
2318
2319
/* This function consumes unary prefix operators. */
2320
2321
static BOOL
2322
compile_class_unary(eclass_context *context, BOOL negated,
2323
  uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info,
2324
  PCRE2_SIZE *lengthptr)
2325
0
{
2326
0
uint32_t *ptr = *pptr;
2327
#ifdef PCRE2_DEBUG
2328
PCRE2_UCHAR *start_code = *pcode;
2329
#endif
2330
2331
0
while (*ptr == META_ECLASS_NOT)
2332
0
  {
2333
0
  ++ptr;
2334
0
  negated = !negated;
2335
0
  }
2336
2337
0
*pptr = ptr;
2338
/* Because it's a non-empty class, there must be an operand. */
2339
0
if (!compile_class_juxtaposition(context, negated, pptr, pcode,
2340
0
                                 pop_info, lengthptr))
2341
0
  return FALSE;
2342
2343
0
PCRE2_ASSERT(lengthptr == NULL || *pcode == start_code);
2344
0
return TRUE;
2345
0
}
2346
2347
2348
2349
/* This function consumes tightly-binding binary operators. */
2350
2351
static BOOL
2352
compile_class_binary_tight(eclass_context *context, BOOL negated,
2353
  uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info,
2354
  PCRE2_SIZE *lengthptr)
2355
0
{
2356
0
uint32_t *ptr = *pptr;
2357
0
PCRE2_UCHAR *code = *pcode;
2358
#ifdef PCRE2_DEBUG
2359
PCRE2_UCHAR *start_code = *pcode;
2360
#endif
2361
2362
/* See compile_class_binary_loose() for comments on compile-time folding of
2363
the "negated" flag. */
2364
2365
/* Because it's a non-empty class, there must be an operand at the start. */
2366
0
if (!compile_class_unary(context, negated, &ptr, &code, pop_info, lengthptr))
2367
0
  return FALSE;
2368
2369
0
while (*ptr == META_ECLASS_AND)
2370
0
  {
2371
0
  uint32_t op;
2372
0
  BOOL rhs_negated;
2373
0
  eclass_op_info rhs_op_info;
2374
2375
0
  if (negated)
2376
0
    {
2377
    /* !(A && B)  ->  !A || !B */
2378
0
    op = ECL_OR;
2379
0
    rhs_negated = TRUE;
2380
0
    }
2381
0
  else
2382
0
    {
2383
    /* A && B  ->  A && B */
2384
0
    op = ECL_AND;
2385
0
    rhs_negated = FALSE;
2386
0
    }
2387
2388
0
  ++ptr;
2389
2390
  /* An operand must follow the operator. */
2391
0
  if (!compile_class_unary(context, rhs_negated, &ptr, &code,
2392
0
                           &rhs_op_info, lengthptr))
2393
0
    return FALSE;
2394
2395
  /* Convert infix to postfix (RPN). */
2396
0
  fold_binary(op, pop_info, &rhs_op_info, lengthptr);
2397
0
  if (lengthptr == NULL)
2398
0
    code = pop_info->code_start + pop_info->length;
2399
0
  }
2400
2401
0
PCRE2_ASSERT(lengthptr == NULL || code == start_code);
2402
2403
0
*pptr = ptr;
2404
0
*pcode = code;
2405
0
return TRUE;
2406
0
}
2407
2408
2409
2410
/* This function consumes loosely-binding binary operators. */
2411
2412
static BOOL
2413
compile_class_binary_loose(eclass_context *context, BOOL negated,
2414
  uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info,
2415
  PCRE2_SIZE *lengthptr)
2416
0
{
2417
0
uint32_t *ptr = *pptr;
2418
0
PCRE2_UCHAR *code = *pcode;
2419
#ifdef PCRE2_DEBUG
2420
PCRE2_UCHAR *start_code = *pcode;
2421
#endif
2422
2423
/* We really want to fold the negation operator, if at all possible, so that
2424
simple cases can be reduced down. In particular, in 8-bit no-UTF mode, we want
2425
to produce a fully-folded expression, so that we can guarantee not to emit any
2426
OP_ECLASS codes (in the same way that we never emit OP_XCLASS in this mode).
2427
2428
This has the consequence that with a little ingenuity, we can in fact avoid
2429
emitting (nearly...) all cases of the "NOT" operator. Imagine that we have:
2430
    !(A ...
2431
We have parsed the preceding "!", and we are about to parse the "A" operand. We
2432
don't know yet whether there will even be a following binary operand! Both of
2433
these are possibilities for what follows:
2434
    !(A && B)
2435
    !(A)
2436
However, we can still fold the "!" into the "A" operand, because no matter what
2437
the following binary operator will be, we can produce an expression which is
2438
equivalent. */
2439
2440
/* Because it's a non-empty class, there must be an operand at the start. */
2441
0
if (!compile_class_binary_tight(context, negated, &ptr, &code,
2442
0
                                pop_info, lengthptr))
2443
0
  return FALSE;
2444
2445
0
while (*ptr >= META_ECLASS_OR && *ptr <= META_ECLASS_XOR)
2446
0
  {
2447
0
  uint32_t op;
2448
0
  BOOL op_neg;
2449
0
  BOOL rhs_negated;
2450
0
  eclass_op_info rhs_op_info;
2451
2452
0
  if (negated)
2453
0
    {
2454
    /* The whole expression is being negated; we respond by unconditionally
2455
    negating the LHS A, before seeing what follows. And hooray! We can recover,
2456
    no matter what follows. */
2457
    /* !(A || B)   ->  !A && !B                     */
2458
    /* !(A -- B)   ->  !(A && !B)    ->  !A || B    */
2459
    /* !(A XOR B)  ->  !(!A XOR !B)  ->  !A XNOR !B */
2460
0
    op = (*ptr == META_ECLASS_OR )? ECL_AND :
2461
0
         (*ptr == META_ECLASS_SUB)? ECL_OR  :
2462
0
         /*ptr == META_ECLASS_XOR*/ ECL_XOR;
2463
0
    op_neg = (*ptr == META_ECLASS_XOR);
2464
0
    rhs_negated = *ptr != META_ECLASS_SUB;
2465
0
    }
2466
0
  else
2467
0
    {
2468
    /* A || B   ->  A || B  */
2469
    /* A -- B   ->  A && !B */
2470
    /* A XOR B  ->  A XOR B */
2471
0
    op = (*ptr == META_ECLASS_OR )? ECL_OR  :
2472
0
         (*ptr == META_ECLASS_SUB)? ECL_AND :
2473
0
         /*ptr == META_ECLASS_XOR*/ ECL_XOR;
2474
0
    op_neg = FALSE;
2475
0
    rhs_negated = *ptr == META_ECLASS_SUB;
2476
0
    }
2477
2478
0
  ++ptr;
2479
2480
  /* An operand must follow the operator. */
2481
0
  if (!compile_class_binary_tight(context, rhs_negated, &ptr, &code,
2482
0
                                  &rhs_op_info, lengthptr))
2483
0
    return FALSE;
2484
2485
  /* Convert infix to postfix (RPN). */
2486
0
  fold_binary(op, pop_info, &rhs_op_info, lengthptr);
2487
0
  if (op_neg) fold_negation(pop_info, lengthptr, FALSE);
2488
0
  if (lengthptr == NULL)
2489
0
    code = pop_info->code_start + pop_info->length;
2490
0
  }
2491
2492
0
PCRE2_ASSERT(lengthptr == NULL || code == start_code);
2493
2494
0
*pptr = ptr;
2495
0
*pcode = code;
2496
0
return TRUE;
2497
0
}
2498
2499
2500
2501
/* This function converts the META codes in pptr into opcodes written to
2502
pcode. The pptr must start at a META_CLASS or META_CLASS_NOT.
2503
2504
The class is compiled as a left-associative sequence of operator
2505
applications.
2506
2507
The pptr will be left pointing at the matching META_CLASS_END. */
2508
2509
static BOOL
2510
compile_eclass_nested(eclass_context *context, BOOL negated,
2511
  uint32_t **pptr, PCRE2_UCHAR **pcode,
2512
  eclass_op_info *pop_info, PCRE2_SIZE *lengthptr)
2513
0
{
2514
0
uint32_t *ptr = *pptr;
2515
#ifdef PCRE2_DEBUG
2516
PCRE2_UCHAR *start_code = *pcode;
2517
#endif
2518
2519
/* The CLASS_IS_ECLASS bit must be set since it is a nested class. */
2520
0
PCRE2_ASSERT(*ptr == (META_CLASS | CLASS_IS_ECLASS) ||
2521
0
             *ptr == (META_CLASS_NOT | CLASS_IS_ECLASS));
2522
2523
0
if (*ptr++ == (META_CLASS_NOT | CLASS_IS_ECLASS))
2524
0
  negated = !negated;
2525
2526
0
(*pptr)++;
2527
2528
/* Because it's a non-empty class, there must be an operand at the start. */
2529
0
if (!compile_class_binary_loose(context, negated, pptr, pcode,
2530
0
                                pop_info, lengthptr))
2531
0
  return FALSE;
2532
2533
0
PCRE2_ASSERT(**pptr == META_CLASS_END);
2534
0
PCRE2_ASSERT(lengthptr == NULL || *pcode == start_code);
2535
0
return TRUE;
2536
0
}
2537
2538
BOOL
2539
PRIV(compile_class_nested)(uint32_t options, uint32_t xoptions,
2540
  uint32_t **pptr, PCRE2_UCHAR **pcode, int *errorcodeptr,
2541
  compile_block *cb, PCRE2_SIZE *lengthptr)
2542
0
{
2543
0
eclass_context context;
2544
0
eclass_op_info op_info;
2545
0
PCRE2_SIZE previous_length = (lengthptr != NULL)? *lengthptr : 0;
2546
0
PCRE2_UCHAR *code = *pcode;
2547
0
PCRE2_UCHAR *previous;
2548
0
BOOL allbitsone = TRUE;
2549
2550
0
context.needs_bitmap = FALSE;
2551
0
context.options = options;
2552
0
context.xoptions = xoptions;
2553
0
context.errorcodeptr = errorcodeptr;
2554
0
context.cb = cb;
2555
2556
0
previous = code;
2557
0
*code++ = OP_ECLASS;
2558
0
code += LINK_SIZE;
2559
0
*code++ = 0;  /* Flags, currently zero. */
2560
0
if (!compile_eclass_nested(&context, FALSE, pptr, &code, &op_info, lengthptr))
2561
0
  return FALSE;
2562
2563
0
if (lengthptr != NULL)
2564
0
  {
2565
0
  *lengthptr += code - previous;
2566
0
  code = previous;
2567
  /* (*lengthptr - previous_length) now holds the amount of buffer that
2568
  we require to make the call to compile_class_nested() with
2569
  lengthptr = NULL, and including the (1+LINK_SIZE+1) that we write out
2570
  before that call. */
2571
0
  }
2572
2573
/* Do some useful counting of what's in the bitmap. */
2574
0
for (int i = 0; i < 8; i++)
2575
0
  if (op_info.bits.classwords[i] != 0xffffffff)
2576
0
    {
2577
0
    allbitsone = FALSE;
2578
0
    break;
2579
0
    }
2580
2581
/* After constant-folding the extended class syntax, it may turn out to be
2582
a simple class after all. In that case, we can unwrap it from the
2583
OP_ECLASS container - and in fact, we must do so, because in 8-bit
2584
no-Unicode mode the matcher is compiled without support for OP_ECLASS. */
2585
2586
#ifndef SUPPORT_WIDE_CHARS
2587
PCRE2_ASSERT(op_info.op_single_type != 0);
2588
#else
2589
0
if (op_info.op_single_type != 0)
2590
0
#endif
2591
0
  {
2592
  /* Rewind back over the OP_ECLASS. */
2593
0
  code = previous;
2594
2595
  /* If the bits are all ones, and the "high characters" are all matched
2596
  too, we use a special-cased encoding of OP_ALLANY. */
2597
2598
0
  if (op_info.op_single_type == ECL_ANY && allbitsone)
2599
0
    {
2600
    /* Advancing code means rewinding lengthptr, at this point. */
2601
0
    if (lengthptr != NULL) *lengthptr -= 1;
2602
0
    *code++ = OP_ALLANY;
2603
0
    }
2604
2605
  /* If the high bits are all matched / all not-matched, then we emit an
2606
  OP_NCLASS/OP_CLASS respectively. */
2607
2608
0
  else if (op_info.op_single_type == ECL_ANY ||
2609
0
           op_info.op_single_type == ECL_NONE)
2610
0
    {
2611
0
    PCRE2_SIZE required_len = 1 + (32 / sizeof(PCRE2_UCHAR));
2612
2613
0
    if (lengthptr != NULL)
2614
0
      {
2615
0
      if (required_len > (*lengthptr - previous_length))
2616
0
      *lengthptr = previous_length + required_len;
2617
0
      }
2618
2619
    /* Advancing code means rewinding lengthptr, at this point. */
2620
0
    if (lengthptr != NULL) *lengthptr -= required_len;
2621
0
    *code++ = (op_info.op_single_type == ECL_ANY)? OP_NCLASS : OP_CLASS;
2622
0
    memcpy(code, op_info.bits.classbits, 32);
2623
0
    code += 32 / sizeof(PCRE2_UCHAR);
2624
0
    }
2625
2626
  /* Otherwise, we have an ECL_XCLASS, so we have the OP_XCLASS data
2627
  there, but, we pulled out its bitmap into op_info, so now we have to
2628
  put that back into the OP_XCLASS. */
2629
2630
0
  else
2631
0
    {
2632
#ifndef SUPPORT_WIDE_CHARS
2633
    PCRE2_DEBUG_UNREACHABLE();
2634
#else
2635
0
    BOOL need_map = context.needs_bitmap;
2636
0
    PCRE2_SIZE required_len;
2637
2638
0
    PCRE2_ASSERT(op_info.op_single_type == ECL_XCLASS);
2639
0
    required_len = op_info.length + (need_map? 32/sizeof(PCRE2_UCHAR) : 0);
2640
2641
0
    if (lengthptr != NULL)
2642
0
      {
2643
      /* Don't unconditionally request all the space we need - we may
2644
      already have asked for more during processing of the ECLASS. */
2645
0
      if (required_len > (*lengthptr - previous_length))
2646
0
        *lengthptr = previous_length + required_len;
2647
2648
      /* The code we write out here won't be ignored, even during the
2649
      (lengthptr != NULL) phase, because if there's a following quantifier
2650
      it will peek backwards. So we do have to write out a (truncated)
2651
      OP_XCLASS, even on this branch. */
2652
0
      *lengthptr -= 1 + LINK_SIZE + 1;
2653
0
      *code++ = OP_XCLASS;
2654
0
      PUT(code, 0, 1 + LINK_SIZE + 1);
2655
0
      code += LINK_SIZE;
2656
0
      *code++ = 0;
2657
0
      }
2658
0
    else
2659
0
      {
2660
0
      PCRE2_UCHAR *rest;
2661
0
      PCRE2_SIZE rest_len;
2662
0
      PCRE2_UCHAR flags;
2663
2664
      /* 1 unit: OP_XCLASS | LINK_SIZE units | 1 unit: flags | ...rest */
2665
0
      PCRE2_ASSERT(op_info.length >= 1 + LINK_SIZE + 1);
2666
0
      rest = op_info.code_start + 1 + LINK_SIZE + 1;
2667
0
      rest_len = (op_info.code_start + op_info.length) - rest;
2668
2669
      /* First read any data we use, before memmove splats it. */
2670
0
      flags = op_info.code_start[1 + LINK_SIZE];
2671
0
      PCRE2_ASSERT((flags & XCL_MAP) == 0);
2672
2673
      /* Next do the memmove before any writes. */
2674
0
      memmove(code + 1 + LINK_SIZE + 1 + (need_map? 32/sizeof(PCRE2_UCHAR) : 0),
2675
0
              rest, CU2BYTES(rest_len));
2676
2677
      /* Finally write the header data. */
2678
0
      *code++ = OP_XCLASS;
2679
0
      PUT(code, 0, (int)required_len);
2680
0
      code += LINK_SIZE;
2681
0
      *code++ = flags | (need_map? XCL_MAP : 0);
2682
0
      if (need_map)
2683
0
        {
2684
0
        memcpy(code, op_info.bits.classbits, 32);
2685
0
        code += 32 / sizeof(PCRE2_UCHAR);
2686
0
        }
2687
0
      code += rest_len;
2688
0
      }
2689
0
#endif /* SUPPORT_WIDE_CHARS */
2690
0
    }
2691
0
  }
2692
2693
/* Otherwise, we're going to keep the OP_ECLASS. However, again we need
2694
to do some adjustment to insert the bitmap if we have one. */
2695
2696
0
#ifdef SUPPORT_WIDE_CHARS
2697
0
else
2698
0
  {
2699
0
  BOOL need_map = context.needs_bitmap;
2700
0
  PCRE2_SIZE required_len =
2701
0
    1 + LINK_SIZE + 1 + (need_map? 32/sizeof(PCRE2_UCHAR) : 0) + op_info.length;
2702
2703
0
  if (lengthptr != NULL)
2704
0
    {
2705
0
    if (required_len > (*lengthptr - previous_length))
2706
0
      *lengthptr = previous_length + required_len;
2707
2708
    /* As for the XCLASS branch above, we do have to write out a dummy
2709
    OP_ECLASS, because of the backwards peek by the quantifier code. Write
2710
    out a (truncated) OP_ECLASS, even on this branch. */
2711
0
    *lengthptr -= 1 + LINK_SIZE + 1;
2712
0
    *code++ = OP_ECLASS;
2713
0
    PUT(code, 0, 1 + LINK_SIZE + 1);
2714
0
    code += LINK_SIZE;
2715
0
    *code++ = 0;
2716
0
    }
2717
0
  else
2718
0
    {
2719
0
    if (need_map)
2720
0
      {
2721
0
      PCRE2_UCHAR *map_start = previous + 1 + LINK_SIZE + 1;
2722
0
      previous[1 + LINK_SIZE] |= ECL_MAP;
2723
0
      memmove(map_start + 32/sizeof(PCRE2_UCHAR), map_start,
2724
0
              CU2BYTES(code - map_start));
2725
0
      memcpy(map_start, op_info.bits.classbits, 32);
2726
0
      code += 32 / sizeof(PCRE2_UCHAR);
2727
0
      }
2728
0
    PUT(previous, 1, (int)(code - previous));
2729
0
    }
2730
0
  }
2731
0
#endif /* SUPPORT_WIDE_CHARS */
2732
2733
0
*pcode = code;
2734
0
return TRUE;
2735
0
}
2736
2737
/* End of pcre2_compile_class.c */