Coverage Report

Created: 2025-06-13 06:43

/src/php-src/ext/pcre/pcre2lib/pcre2_substitute.c
Line
Count
Source (jump to first uncovered line)
1
/*************************************************
2
*      Perl-Compatible Regular Expressions       *
3
*************************************************/
4
5
/* PCRE is a library of functions to support regular expressions whose syntax
6
and semantics are as close as possible to those of the Perl 5 language.
7
8
                       Written by Philip Hazel
9
     Original API code Copyright (c) 1997-2012 University of Cambridge
10
          New API code Copyright (c) 2016-2024 University of Cambridge
11
12
-----------------------------------------------------------------------------
13
Redistribution and use in source and binary forms, with or without
14
modification, are permitted provided that the following conditions are met:
15
16
    * Redistributions of source code must retain the above copyright notice,
17
      this list of conditions and the following disclaimer.
18
19
    * Redistributions in binary form must reproduce the above copyright
20
      notice, this list of conditions and the following disclaimer in the
21
      documentation and/or other materials provided with the distribution.
22
23
    * Neither the name of the University of Cambridge nor the names of its
24
      contributors may be used to endorse or promote products derived from
25
      this software without specific prior written permission.
26
27
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37
POSSIBILITY OF SUCH DAMAGE.
38
-----------------------------------------------------------------------------
39
*/
40
41
42
#ifdef HAVE_CONFIG_H
43
#include "config.h"
44
#endif
45
46
#include "pcre2_internal.h"
47
48
0
/* Number of entries in the ptrstack[] array used by pcre2_substitute(). */

#define PTR_STACK_SIZE 20

/* Every option bit that is specific to pcre2_substitute(). These bits are
saved separately and masked out of the options before matching. */

#define SUBSTITUTE_OPTIONS \
  (PCRE2_SUBSTITUTE_EXTENDED         | \
   PCRE2_SUBSTITUTE_GLOBAL           | \
   PCRE2_SUBSTITUTE_LITERAL          | \
   PCRE2_SUBSTITUTE_MATCHED          | \
   PCRE2_SUBSTITUTE_OVERFLOW_LENGTH  | \
   PCRE2_SUBSTITUTE_REPLACEMENT_ONLY | \
   PCRE2_SUBSTITUTE_UNKNOWN_UNSET    | \
   PCRE2_SUBSTITUTE_UNSET_EMPTY)
55
56
57
58
/*************************************************
59
*           Find end of substitute text          *
60
*************************************************/
61
62
/* In extended mode, we recognize ${name:+set text:unset text} and similar
63
constructions. This requires the identification of unescaped : and }
64
characters. This function scans for such. It must deal with nested ${
65
constructions. The pointer to the text is updated, either to the required end
66
character, or to where an error was detected.
67
68
Arguments:
69
  code      points to the compiled expression (for options)
70
  ptrptr    points to the pointer to the start of the text (updated)
71
  ptrend    end of the whole string
72
  last      TRUE if the last expected string (only } recognized)
73
74
Returns:    0 on success
75
            negative error code on failure
76
*/
77
78
static int
79
find_text_end(const pcre2_code *code, PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend,
80
  BOOL last)
81
0
{
82
0
int rc = 0;
83
0
uint32_t nestlevel = 0;
84
0
BOOL literal = FALSE;
85
0
PCRE2_SPTR ptr = *ptrptr;
86
87
0
for (; ptr < ptrend; ptr++)
88
0
  {
89
0
  if (literal)
90
0
    {
91
0
    if (ptr[0] == CHAR_BACKSLASH && ptr < ptrend - 1 && ptr[1] == CHAR_E)
92
0
      {
93
0
      literal = FALSE;
94
0
      ptr += 1;
95
0
      }
96
0
    }
97
98
0
  else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
99
0
    {
100
0
    if (nestlevel == 0) goto EXIT;
101
0
    nestlevel--;
102
0
    }
103
104
0
  else if (*ptr == CHAR_COLON && !last && nestlevel == 0) goto EXIT;
105
106
0
  else if (*ptr == CHAR_DOLLAR_SIGN)
107
0
    {
108
0
    if (ptr < ptrend - 1 && ptr[1] == CHAR_LEFT_CURLY_BRACKET)
109
0
      {
110
0
      nestlevel++;
111
0
      ptr += 1;
112
0
      }
113
0
    }
114
115
0
  else if (*ptr == CHAR_BACKSLASH)
116
0
    {
117
0
    int erc;
118
0
    int errorcode;
119
0
    uint32_t ch;
120
121
0
    if (ptr < ptrend - 1) switch (ptr[1])
122
0
      {
123
0
      case CHAR_L:
124
0
      case CHAR_l:
125
0
      case CHAR_U:
126
0
      case CHAR_u:
127
0
      ptr += 1;
128
0
      continue;
129
0
      }
130
131
0
    ptr += 1;  /* Must point after \ */
132
0
    erc = PRIV(check_escape)(&ptr, ptrend, &ch, &errorcode,
133
0
      code->overall_options, code->extra_options, code->top_bracket, FALSE, NULL);
134
0
    ptr -= 1;  /* Back to last code unit of escape */
135
0
    if (errorcode != 0)
136
0
      {
137
      /* errorcode from check_escape is positive, so must not be returned by
138
      pcre2_substitute(). */
139
0
      rc = PCRE2_ERROR_BADREPESCAPE;
140
0
      goto EXIT;
141
0
      }
142
143
0
    switch(erc)
144
0
      {
145
0
      case 0:      /* Data character */
146
0
      case ESC_b:  /* Data character */
147
0
      case ESC_v:  /* Data character */
148
0
      case ESC_E:  /* Isolated \E is ignored */
149
0
      break;
150
151
0
      case ESC_Q:
152
0
      literal = TRUE;
153
0
      break;
154
155
0
      case ESC_g:
156
      /* The \g<name> form (\g<number> already handled by check_escape)
157
158
      Don't worry about finding the matching ">". We are super, super lenient
159
      about validating ${} replacements inside find_text_end(), so we certainly
160
      don't need to worry about other syntax. Importantly, a \g<..> or $<...>
161
      sequence can't contain a '}' character. */
162
0
      break;
163
164
0
      default:
165
0
      if (erc < 0)
166
0
          break;  /* capture group reference */
167
0
      rc = PCRE2_ERROR_BADREPESCAPE;
168
0
      goto EXIT;
169
0
      }
170
0
    }
171
0
  }
172
173
0
rc = PCRE2_ERROR_REPMISSINGBRACE;   /* Terminator not found */
174
175
0
EXIT:
176
0
*ptrptr = ptr;
177
0
return rc;
178
0
}
179
180
181
/*************************************************
182
*           Validate group name                  *
183
*************************************************/
184
185
/* This function scans for a capture group name, validating it
186
consists of legal characters, is not empty, and does not exceed
187
MAX_NAME_SIZE.
188
189
Arguments:
190
  ptrptr    points to the pointer to the start of the text (updated)
191
  ptrend    end of the whole string
192
  utf       true if the input is UTF-encoded
193
  ctypes    pointer to the character types table
194
195
Returns:    TRUE if a name was read
196
            FALSE otherwise
197
*/
198
199
static BOOL
200
read_name_subst(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, BOOL utf,
201
    const uint8_t* ctypes)
202
0
{
203
0
PCRE2_SPTR ptr = *ptrptr;
204
0
PCRE2_SPTR nameptr = ptr;
205
206
0
if (ptr >= ptrend)                 /* No characters in name */
207
0
  goto FAILED;
208
209
/* We do not need to check whether the name starts with a non-digit.
210
We are simply referencing names here, not defining them. */
211
212
/* See read_name in the pcre2_compile.c for the corresponding logic
213
restricting group names inside the pattern itself. */
214
215
0
#ifdef SUPPORT_UNICODE
216
0
if (utf)
217
0
  {
218
0
  uint32_t c, type;
219
220
0
  while (ptr < ptrend)
221
0
    {
222
0
    GETCHAR(c, ptr);
223
0
    type = UCD_CHARTYPE(c);
224
0
    if (type != ucp_Nd && PRIV(ucp_gentype)[type] != ucp_L &&
225
0
        c != CHAR_UNDERSCORE) break;
226
0
    ptr++;
227
0
    FORWARDCHARTEST(ptr, ptrend);
228
0
    }
229
0
  }
230
0
else
231
#else
232
(void)utf;  /* Avoid compiler warning */
233
#endif      /* SUPPORT_UNICODE */
234
235
/* Handle group names in non-UTF modes. */
236
237
0
  {
238
0
  while (ptr < ptrend && MAX_255(*ptr) && (ctypes[*ptr] & ctype_word) != 0)
239
0
    {
240
0
    ptr++;
241
0
    }
242
0
  }
243
244
/* Check name length */
245
246
0
if (ptr - nameptr > MAX_NAME_SIZE)
247
0
  goto FAILED;
248
249
/* Subpattern names must not be empty */
250
0
if (ptr == nameptr)
251
0
  goto FAILED;
252
253
0
*ptrptr = ptr;
254
0
return TRUE;
255
256
0
FAILED:
257
0
*ptrptr = ptr;
258
0
return FALSE;
259
0
}
260
261
262
/*************************************************
263
*              Case transformations              *
264
*************************************************/
265
266
0
#define PCRE2_SUBSTITUTE_CASE_NONE                 0
267
// 1, 2, 3 are PCRE2_SUBSTITUTE_CASE_LOWER, UPPER, TITLE_FIRST.
268
0
#define PCRE2_SUBSTITUTE_CASE_REVERSE_TITLE_FIRST  4
269
270
typedef struct {
271
  int to_case; /* One of PCRE2_SUBSTITUTE_CASE_xyz */
272
  BOOL single_char;
273
} case_state;
274
275
/* Helper to guess how much a string is likely to increase in size when
276
case-transformed. Usually, strings don't change size at all, but some rare
277
characters do grow. Estimate +12.5% (len/8), plus another ten characters.
278
279
Performing this estimation is unfortunate, but inevitable, since we can't call
280
the callout if we ran out of buffer space to prepare its input.
281
282
Because this estimate is inexact (and in pathological cases, underestimates the
283
required buffer size) we must document that when you have a
284
substitute_case_callout, and you are using PCRE2_SUBSTITUTE_OVERFLOW_LENGTH, you
285
may need more than two calls to determine the final buffer size. */
286
287
static PCRE2_SIZE
288
pessimistic_case_inflation(PCRE2_SIZE len)
289
0
{
290
0
return (len >> 3u) + 10;
291
0
}
292
293
/* Case transformation behaviour if no callout is passed. */
294
295
static PCRE2_SIZE
296
default_substitute_case_callout(
297
  PCRE2_SPTR input, PCRE2_SIZE input_len,
298
  PCRE2_UCHAR *output, PCRE2_SIZE output_cap,
299
  case_state *state, const pcre2_code *code)
300
0
{
301
0
PCRE2_SPTR input_end = input + input_len;
302
0
#ifdef SUPPORT_UNICODE
303
0
BOOL utf;
304
0
BOOL ucp;
305
0
#endif
306
0
PCRE2_UCHAR temp[6];
307
0
BOOL next_to_upper;
308
0
BOOL rest_to_upper;
309
0
BOOL single_char;
310
0
BOOL overflow = FALSE;
311
0
PCRE2_SIZE written = 0;
312
313
/* Helpful simplifying invariant: input and output are disjoint buffers.
314
I believe that this code is technically undefined behaviour, because the two
315
pointers input/output are "unrelated" pointers and hence not comparable. Casting
316
via char* bypasses some but not all of those technical rules. It is not included
317
in release builds, in any case. */
318
0
PCRE2_ASSERT((char *)(input + input_len) <= (char *)output ||
319
0
             (char *)(output + output_cap) <= (char *)input);
320
321
0
#ifdef SUPPORT_UNICODE
322
0
utf = (code->overall_options & PCRE2_UTF) != 0;
323
0
ucp = (code->overall_options & PCRE2_UCP) != 0;
324
0
#endif
325
326
0
if (input_len == 0) return 0;
327
328
0
switch (state->to_case)
329
0
  {
330
0
  default:
331
0
  PCRE2_DEBUG_UNREACHABLE();
332
0
  return 0;
333
334
0
  case PCRE2_SUBSTITUTE_CASE_LOWER: // Can be single_char TRUE or FALSE
335
0
  case PCRE2_SUBSTITUTE_CASE_UPPER: // Can only be single_char FALSE
336
0
  next_to_upper = rest_to_upper = (state->to_case == PCRE2_SUBSTITUTE_CASE_UPPER);
337
0
  break;
338
339
0
  case PCRE2_SUBSTITUTE_CASE_TITLE_FIRST: // Can be single_char TRUE or FALSE
340
0
  next_to_upper = TRUE;
341
0
  rest_to_upper = FALSE;
342
0
  state->to_case = PCRE2_SUBSTITUTE_CASE_LOWER;
343
0
  break;
344
345
0
  case PCRE2_SUBSTITUTE_CASE_REVERSE_TITLE_FIRST: // Can only be single_char FALSE
346
0
  next_to_upper = FALSE;
347
0
  rest_to_upper = TRUE;
348
0
  state->to_case = PCRE2_SUBSTITUTE_CASE_UPPER;
349
0
  break;
350
0
  }
351
352
0
single_char = state->single_char;
353
0
if (single_char)
354
0
  state->to_case = PCRE2_SUBSTITUTE_CASE_NONE;
355
356
0
while (input < input_end)
357
0
  {
358
0
  uint32_t ch;
359
0
  unsigned int chlen;
360
361
0
  GETCHARINCTEST(ch, input);
362
363
0
#ifdef SUPPORT_UNICODE
364
0
  if ((utf || ucp) && ch >= 128)
365
0
    {
366
0
    uint32_t type = UCD_CHARTYPE(ch);
367
0
    if (PRIV(ucp_gentype)[type] == ucp_L &&
368
0
        type != (next_to_upper? ucp_Lu : ucp_Ll))
369
0
      ch = UCD_OTHERCASE(ch);
370
371
    /* TODO This is far from correct... it doesn't support the SpecialCasing.txt
372
    mappings, but worse, it's not even correct for all the ordinary case
373
    mappings. We should add support for those (at least), and then add the
374
    SpecialCasing.txt mappings for Esszet and ligatures, and finally use the
375
    Turkish casing flag on the match context. */
376
0
    }
377
0
  else
378
0
#endif
379
0
  if (MAX_255(ch))
380
0
    {
381
0
    if (((code->tables + cbits_offset +
382
0
        (next_to_upper? cbit_upper:cbit_lower)
383
0
        )[ch/8] & (1u << (ch%8))) == 0)
384
0
      ch = (code->tables + fcc_offset)[ch];
385
0
    }
386
387
0
#ifdef SUPPORT_UNICODE
388
0
  if (utf) chlen = PRIV(ord2utf)(ch, temp); else
389
0
#endif
390
0
    {
391
0
    temp[0] = ch;
392
0
    chlen = 1;
393
0
    }
394
395
0
  if (!overflow && chlen <= output_cap)
396
0
    {
397
0
    memcpy(output, temp, CU2BYTES(chlen));
398
0
    output += chlen;
399
0
    output_cap -= chlen;
400
0
    }
401
0
  else
402
0
    {
403
0
    overflow = TRUE;
404
0
    }
405
406
0
  if (chlen > ~(PCRE2_SIZE)0 - written)  /* Integer overflow */
407
0
    return ~(PCRE2_SIZE)0;
408
0
  written += chlen;
409
410
0
  next_to_upper = rest_to_upper;
411
412
  /* memcpy the remainder, if only transforming a single character. */
413
414
0
  if (single_char)
415
0
    {
416
0
    PCRE2_SIZE rest_len = input_end - input;
417
418
0
    if (!overflow && rest_len <= output_cap)
419
0
      memcpy(output, input, CU2BYTES(rest_len));
420
421
0
    if (rest_len > ~(PCRE2_SIZE)0 - written)  /* Integer overflow */
422
0
      return ~(PCRE2_SIZE)0;
423
0
    written += rest_len;
424
425
0
    return written;
426
0
    }
427
0
  }
428
429
0
return written;
430
0
}
431
432
/* Helper to perform the call to the substitute_case_callout. We wrap the
433
user-provided callout because our internal arguments are slightly extended. We
434
don't want the user callout to handle the case of "\l" (first character only to
435
lowercase) or "\l\U" (first character to lowercase, rest to uppercase) because
436
those are not operations defined by Unicode. Instead the user callout simply
437
needs to provide the three Unicode primitives: lower, upper, titlecase. */
438
439
static PCRE2_SIZE
440
do_case_copy(
441
  PCRE2_UCHAR *input_output, PCRE2_SIZE input_len, PCRE2_SIZE output_cap,
442
  case_state *state, BOOL utf,
443
  PCRE2_SIZE (*substitute_case_callout)(PCRE2_SPTR, PCRE2_SIZE, PCRE2_UCHAR *,
444
                                        PCRE2_SIZE, int, void *),
445
  void *substitute_case_callout_data)
446
0
{
447
0
PCRE2_SPTR input = input_output;
448
0
PCRE2_UCHAR *output = input_output;
449
0
PCRE2_SIZE rc;
450
0
PCRE2_SIZE rc2;
451
0
int ch1_to_case;
452
0
int rest_to_case;
453
0
PCRE2_UCHAR ch1[6];
454
0
PCRE2_SIZE ch1_len;
455
0
PCRE2_SPTR rest;
456
0
PCRE2_SIZE rest_len;
457
0
BOOL ch1_overflow = FALSE;
458
0
BOOL rest_overflow = FALSE;
459
460
#if PCRE2_CODE_UNIT_WIDTH == 32 || !defined(SUPPORT_UNICODE)
461
(void)utf; /* Avoid compiler warning. */
462
#endif
463
464
0
PCRE2_ASSERT(input_len != 0);
465
466
0
switch (state->to_case)
467
0
  {
468
0
  default:
469
0
  PCRE2_DEBUG_UNREACHABLE();
470
0
  return 0;
471
472
0
  case PCRE2_SUBSTITUTE_CASE_LOWER: // Can be single_char TRUE or FALSE
473
0
  case PCRE2_SUBSTITUTE_CASE_UPPER: // Can only be single_char FALSE
474
0
  case PCRE2_SUBSTITUTE_CASE_TITLE_FIRST: // Can be single_char TRUE or FALSE
475
476
  /* The easy case, where our internal casing operations align with those of
477
  the callout. */
478
479
0
  if (state->single_char == FALSE)
480
0
    {
481
0
    rc = substitute_case_callout(input, input_len, output, output_cap,
482
0
                                 state->to_case, substitute_case_callout_data);
483
484
0
    if (state->to_case == PCRE2_SUBSTITUTE_CASE_TITLE_FIRST)
485
0
      state->to_case = PCRE2_SUBSTITUTE_CASE_LOWER;
486
487
0
    return rc;
488
0
    }
489
490
0
  ch1_to_case = state->to_case;
491
0
  rest_to_case = PCRE2_SUBSTITUTE_CASE_NONE;
492
0
  break;
493
494
0
  case PCRE2_SUBSTITUTE_CASE_REVERSE_TITLE_FIRST: // Can only be single_char FALSE
495
0
  ch1_to_case = PCRE2_SUBSTITUTE_CASE_LOWER;
496
0
  rest_to_case = PCRE2_SUBSTITUTE_CASE_UPPER;
497
0
  break;
498
0
  }
499
500
/* Identify the leading character. Take copy, because its storage overlaps with
501
`output`, and hence may be scrambled by the callout. */
502
503
0
  {
504
0
  PCRE2_SPTR ch_end = input;
505
0
  uint32_t ch;
506
507
0
  GETCHARINCTEST(ch, ch_end);
508
0
  (void) ch;
509
0
  PCRE2_ASSERT(ch_end <= input + input_len && ch_end - input <= 6);
510
0
  ch1_len = ch_end - input;
511
0
  memcpy(ch1, input, CU2BYTES(ch1_len));
512
0
  }
513
514
0
rest = input + ch1_len;
515
0
rest_len = input_len - ch1_len;
516
517
/* Transform just ch1. The buffers are always in-place (input == output). With a
518
custom callout, we need a loop to discover its required buffer size. The loop
519
wouldn't be required if the callout were well-behaved, but it might be naughty
520
and return "5" the first time, then "10" the next time we call it using the
521
exact same input! */
522
523
0
  {
524
0
  PCRE2_SIZE ch1_cap;
525
0
  PCRE2_SIZE max_ch1_cap;
526
527
0
  ch1_cap = ch1_len;  /* First attempt uses the space vacated by ch1. */
528
0
  PCRE2_ASSERT(output_cap >= input_len && input_len >= rest_len);
529
0
  max_ch1_cap = output_cap - rest_len;
530
531
0
  while (TRUE)
532
0
    {
533
0
    rc = substitute_case_callout(ch1, ch1_len, output, ch1_cap, ch1_to_case,
534
0
                                 substitute_case_callout_data);
535
0
    if (rc == ~(PCRE2_SIZE)0) return rc;
536
537
0
    if (rc <= ch1_cap) break;
538
539
0
    if (rc > max_ch1_cap)
540
0
      {
541
0
      ch1_overflow = TRUE;
542
0
      break;
543
0
      }
544
545
    /* Move the rest to the right, to make room for expanding ch1. */
546
547
0
    memmove(input_output + rc, rest, CU2BYTES(rest_len));
548
0
    rest = input + rc;
549
550
0
    ch1_cap = rc;
551
552
    /* Proof of loop termination: `ch1_cap` is growing on each iteration, but
553
    the loop ends if `rc` reaches the (unchanging) upper bound of output_cap. */
554
0
    }
555
0
  }
556
557
0
if (rest_to_case == PCRE2_SUBSTITUTE_CASE_NONE)
558
0
  {
559
0
  if (!ch1_overflow)
560
0
    {
561
0
    PCRE2_ASSERT(rest_len <= output_cap - rc);
562
0
    memmove(output + rc, rest, CU2BYTES(rest_len));
563
0
    }
564
0
  rc2 = rest_len;
565
566
0
  state->to_case = PCRE2_SUBSTITUTE_CASE_NONE;
567
0
  }
568
0
else
569
0
  {
570
0
  PCRE2_UCHAR dummy[1];
571
572
0
  rc2 = substitute_case_callout(rest, rest_len,
573
0
                                ch1_overflow? dummy : output + rc,
574
0
                                ch1_overflow? 0u : output_cap - rc,
575
0
                                rest_to_case, substitute_case_callout_data);
576
0
  if (rc2 == ~(PCRE2_SIZE)0) return rc2;
577
578
0
  if (!ch1_overflow && rc2 > output_cap - rc) rest_overflow = TRUE;
579
580
  /* If ch1 grows so that `xform(ch1)+rest` can't fit in the buffer, but then
581
  `rest` shrinks, it's actually possible for the total calculated length of
582
  `xform(ch1)+xform(rest)` to come out at less than output_cap. But we can't
583
  report that, because it would make it seem that the operation succeeded.
584
  If either of xform(ch1) or xform(rest) won't fit in the buffer, our final
585
  result must be > output_cap. */
586
0
  if (ch1_overflow && rc2 < rest_len)
587
0
    rc2 = rest_len;
588
589
0
  state->to_case = PCRE2_SUBSTITUTE_CASE_UPPER;
590
0
  }
591
592
0
if (rc2 > ~(PCRE2_SIZE)0 - rc)  /* Integer overflow */
593
0
  return ~(PCRE2_SIZE)0;
594
595
0
PCRE2_ASSERT(!(ch1_overflow || rest_overflow) || rc + rc2 > output_cap);
596
0
(void)rest_overflow;
597
598
0
return rc + rc2;
599
0
}
600
601
602
/*************************************************
603
*              Match and substitute              *
604
*************************************************/
605
606
/* This function applies a compiled re to a subject string and creates a new
607
string with substitutions. The first 7 arguments are the same as for
608
pcre2_match(). Either string length may be PCRE2_ZERO_TERMINATED.
609
610
Arguments:
611
  code            points to the compiled expression
612
  subject         points to the subject string
613
  length          length of subject string (may contain binary zeros)
614
  start_offset    where to start in the subject string
615
  options         option bits
616
  match_data      points to a match_data block, or is NULL
617
  mcontext        points to a PCRE2 match context, or is NULL
618
  replacement     points to the replacement string
619
  rlength         length of replacement string
620
  buffer          where to put the substituted string
621
  blength         points to length of buffer; updated to length of string
622
623
Returns:          >= 0 number of substitutions made
624
                  < 0 an error code
625
                  PCRE2_ERROR_BADREPLACEMENT means invalid use of $
626
*/
627
628
/* This macro checks for space in the buffer before copying into it. On
629
overflow, either give an error immediately, or keep on, accumulating the
630
length. */
631
632
#define CHECKMEMCPY(from, length_) \
  do \
    { \
    PCRE2_SIZE chkmc_count = length_; \
    if (!overflowed && lengthleft >= chkmc_count) \
      { \
      /* Enough room: copy and advance */ \
      memcpy(buffer + buff_offset, from, CU2BYTES(chkmc_count)); \
      buff_offset += chkmc_count; \
      lengthleft -= chkmc_count; \
      } \
    else if (!overflowed) \
      { \
      /* First overflow: give up or start counting the shortfall */ \
      if ((suboptions & PCRE2_SUBSTITUTE_OVERFLOW_LENGTH) == 0) goto NOROOM; \
      overflowed = TRUE; \
      extra_needed = chkmc_count - lengthleft; \
      } \
    else \
      { \
      /* Already overflowed: just accumulate the length */ \
      if (chkmc_count > ~(PCRE2_SIZE)0 - extra_needed)  /* Integer overflow */ \
        goto TOOLARGEREPLACE; \
      extra_needed += chkmc_count; \
      } \
    } \
  while (0)
655
656
/* This macro checks for space and copies characters with casing modifications.
657
On overflow, it behaves as for CHECKMEMCPY().
658
659
When substitute_case_callout is NULL, the source and destination buffers must
660
not overlap, because our default handler does not support this. */
661
662
#define CHECKCASECPY_BASE(length_, do_call) \
  do \
    { \
    PCRE2_SIZE chkcc_length = (PCRE2_SIZE)(length_); \
    PCRE2_SIZE chkcc_rc; \
    /* do_call must set chkcc_rc; it may read chkcc_length */ \
    do_call \
    if (chkcc_rc <= lengthleft) \
      { \
      buff_offset += chkcc_rc; \
      lengthleft -= chkcc_rc; \
      } \
    else \
      { \
      if ((suboptions & PCRE2_SUBSTITUTE_OVERFLOW_LENGTH) == 0) goto NOROOM; \
      overflowed = TRUE; \
      extra_needed = chkcc_rc - lengthleft; \
      } \
    } \
  while (0)
680
681
#define CHECKCASECPY_DEFAULT(from, length_) \
  CHECKCASECPY_BASE(length_, { \
    chkcc_rc = default_substitute_case_callout( \
      from, chkcc_length, \
      buffer + buff_offset, \
      overflowed? 0 : lengthleft, \
      &forcecase, code); \
    if (overflowed) \
      { \
      if (chkcc_rc > ~(PCRE2_SIZE)0 - extra_needed)  /* Integer overflow */ \
        goto TOOLARGEREPLACE; \
      extra_needed += chkcc_rc; \
      break;  /* Leaves the do-while of CHECKCASECPY_BASE */ \
      } \
  })
695
696
#define CHECKCASECPY_CALLOUT(length_) \
  CHECKCASECPY_BASE(length_, { \
    /* In-place transformation of text already in the output buffer */ \
    chkcc_rc = do_case_copy( \
      buffer + buff_offset, chkcc_length, lengthleft, \
      &forcecase, utf, \
      substitute_case_callout, substitute_case_callout_data); \
    if (chkcc_rc == ~(PCRE2_SIZE)0) goto CASEERROR; \
  })
704
705
/* This macro does a delayed case transformation, for the situation when we have
706
a case-forcing callout. */
707
708
#define DELAYEDFORCECASE() \
  do \
    { \
    PCRE2_SIZE chars_pending = (buff_offset - casestart_offset) + \
      (extra_needed - casestart_extra_needed); \
    if (chars_pending > 0) \
      { \
      if (!overflowed) \
        { \
        /* Rewind the buffer to where the case-forced text began */ \
        lengthleft += (buff_offset - casestart_offset); \
        buff_offset = casestart_offset; \
        /* Care! In-place case transformation */ \
        CHECKCASECPY_CALLOUT(chars_pending); \
        } \
      else \
        { \
        /* The callout cannot be called; estimate the growth instead */ \
        PCRE2_SIZE estimate = pessimistic_case_inflation(chars_pending); \
        if (estimate > ~(PCRE2_SIZE)0 - extra_needed)  /* Integer overflow */ \
          goto TOOLARGEREPLACE; \
        extra_needed += estimate; \
        } \
      } \
    } \
  while (0)
732
733
734
/* Here's the function */
735
736
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
737
pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
738
  PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
739
  pcre2_match_context *mcontext, PCRE2_SPTR replacement, PCRE2_SIZE rlength,
740
  PCRE2_UCHAR *buffer, PCRE2_SIZE *blength)
741
0
{
742
0
int rc;
743
0
int subs;
744
0
uint32_t ovector_count;
745
0
uint32_t goptions = 0;
746
0
uint32_t suboptions;
747
0
pcre2_match_data *internal_match_data = NULL;
748
0
BOOL escaped_literal = FALSE;
749
0
BOOL overflowed = FALSE;
750
0
BOOL use_existing_match;
751
0
BOOL replacement_only;
752
0
BOOL utf = (code->overall_options & PCRE2_UTF) != 0;
753
0
PCRE2_UCHAR temp[6];
754
0
PCRE2_SPTR ptr;
755
0
PCRE2_SPTR repend = NULL;
756
0
PCRE2_SIZE extra_needed = 0;
757
0
PCRE2_SIZE buff_offset, buff_length, lengthleft, fraglength;
758
0
PCRE2_SIZE *ovector;
759
0
PCRE2_SIZE ovecsave[3];
760
0
pcre2_substitute_callout_block scb;
761
0
PCRE2_SIZE sub_start_extra_needed;
762
0
PCRE2_SIZE (*substitute_case_callout)(PCRE2_SPTR, PCRE2_SIZE, PCRE2_UCHAR *,
763
0
                                      PCRE2_SIZE, int, void *) = NULL;
764
0
void *substitute_case_callout_data = NULL;
765
766
/* General initialization */
767
768
0
buff_offset = 0;
769
0
lengthleft = buff_length = *blength;
770
0
*blength = PCRE2_UNSET;
771
0
ovecsave[0] = ovecsave[1] = ovecsave[2] = PCRE2_UNSET;
772
773
0
if (mcontext != NULL)
774
0
  {
775
0
  substitute_case_callout = mcontext->substitute_case_callout;
776
0
  substitute_case_callout_data = mcontext->substitute_case_callout_data;
777
0
  }
778
779
/* Partial matching is not valid. This must come after setting *blength to
780
PCRE2_UNSET, so as not to imply an offset in the replacement. */
781
782
0
if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0)
783
0
  return PCRE2_ERROR_BADOPTION;
784
785
/* Validate length and find the end of the replacement. A NULL replacement of
786
zero length is interpreted as an empty string. */
787
788
0
if (replacement == NULL)
789
0
  {
790
0
  if (rlength != 0) return PCRE2_ERROR_NULL;
791
0
  replacement = (PCRE2_SPTR)"";
792
0
  }
793
794
0
if (rlength == PCRE2_ZERO_TERMINATED) rlength = PRIV(strlen)(replacement);
795
0
repend = replacement + rlength;
796
797
/* Check for using a match that has already happened. Note that the subject
798
pointer in the match data may be NULL after a no-match. */
799
800
0
use_existing_match = ((options & PCRE2_SUBSTITUTE_MATCHED) != 0);
801
0
replacement_only = ((options & PCRE2_SUBSTITUTE_REPLACEMENT_ONLY) != 0);
802
803
/* If starting from an existing match, there must be an externally provided
804
match data block. We create an internal match_data block in two cases: (a) an
805
external one is not supplied (and we are not starting from an existing match);
806
(b) an existing match is to be used for the first substitution. In the latter
807
case, we copy the existing match into the internal block, except for any cached
808
heap frame size and pointer. This ensures that no changes are made to the
809
external match data block. */
810
811
/* WARNING: In both cases below a general context is constructed "by hand"
812
because calling pcre2_general_context_create() involves a memory allocation. If
813
the contents of a general context control block are ever changed there will
814
have to be changes below. */
815
816
0
if (match_data == NULL)
817
0
  {
818
0
  pcre2_general_context gcontext;
819
0
  if (use_existing_match) return PCRE2_ERROR_NULL;
820
0
  gcontext.memctl = (mcontext == NULL)?
821
0
    ((const pcre2_real_code *)code)->memctl :
822
0
    ((pcre2_real_match_context *)mcontext)->memctl;
823
0
  match_data = internal_match_data =
824
0
    pcre2_match_data_create_from_pattern(code, &gcontext);
825
0
  if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY;
826
0
  }
827
828
0
else if (use_existing_match)
829
0
  {
830
0
  int pairs;
831
0
  pcre2_general_context gcontext;
832
0
  gcontext.memctl = (mcontext == NULL)?
833
0
    ((const pcre2_real_code *)code)->memctl :
834
0
    ((pcre2_real_match_context *)mcontext)->memctl;
835
0
  pairs = (code->top_bracket + 1 < match_data->oveccount)?
836
0
    code->top_bracket + 1 : match_data->oveccount;
837
0
  internal_match_data = pcre2_match_data_create(match_data->oveccount,
838
0
    &gcontext);
839
0
  if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY;
840
0
  memcpy(internal_match_data, match_data, offsetof(pcre2_match_data, ovector)
841
0
    + 2*pairs*sizeof(PCRE2_SIZE));
842
0
  internal_match_data->heapframes = NULL;
843
0
  internal_match_data->heapframes_size = 0;
844
0
  match_data = internal_match_data;
845
0
  }
846
847
/* Remember ovector details */
848
849
0
ovector = pcre2_get_ovector_pointer(match_data);
850
0
ovector_count = pcre2_get_ovector_count(match_data);
851
852
/* Fixed things in the callout block */
853
854
0
scb.version = 0;
855
0
scb.input = subject;
856
0
scb.output = (PCRE2_SPTR)buffer;
857
0
scb.ovector = ovector;
858
859
/* A NULL subject of zero length is treated as an empty string. */
860
861
0
if (subject == NULL)
862
0
  {
863
0
  if (length != 0) return PCRE2_ERROR_NULL;
864
0
  subject = (PCRE2_SPTR)"";
865
0
  }
866
867
/* Find length of zero-terminated subject */
868
869
0
if (length == PCRE2_ZERO_TERMINATED)
870
0
  length = subject? PRIV(strlen)(subject) : 0;
871
872
/* Check UTF replacement string if necessary. */
873
874
0
#ifdef SUPPORT_UNICODE
875
0
if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
876
0
  {
877
0
  rc = PRIV(valid_utf)(replacement, rlength, &(match_data->startchar));
878
0
  if (rc != 0)
879
0
    {
880
0
    match_data->leftchar = 0;
881
0
    goto EXIT;
882
0
    }
883
0
  }
884
0
#endif  /* SUPPORT_UNICODE */
885
886
/* Save the substitute options and remove them from the match options. */
887
888
0
suboptions = options & SUBSTITUTE_OPTIONS;
889
0
options &= ~SUBSTITUTE_OPTIONS;
890
891
/* Error if the start match offset is greater than the length of the subject. */
892
893
0
if (start_offset > length)
894
0
  {
895
0
  match_data->leftchar = 0;
896
0
  rc = PCRE2_ERROR_BADOFFSET;
897
0
  goto EXIT;
898
0
  }
899
900
/* Copy up to the start offset, unless only the replacement is required. */
901
902
0
if (!replacement_only) CHECKMEMCPY(subject, start_offset);
903
904
/* Loop for global substituting. If PCRE2_SUBSTITUTE_MATCHED is set, the first
905
match is taken from the match_data that was passed in. */
906
907
0
subs = 0;
908
0
do
909
0
  {
910
0
  PCRE2_SPTR ptrstack[PTR_STACK_SIZE];
911
0
  uint32_t ptrstackptr = 0;
912
0
  case_state forcecase = { PCRE2_SUBSTITUTE_CASE_NONE, FALSE };
913
0
  PCRE2_SIZE casestart_offset = 0;
914
0
  PCRE2_SIZE casestart_extra_needed = 0;
915
916
0
  if (use_existing_match)
917
0
    {
918
0
    rc = match_data->rc;
919
0
    use_existing_match = FALSE;
920
0
    }
921
0
  else rc = pcre2_match(code, subject, length, start_offset, options|goptions,
922
0
    match_data, mcontext);
923
924
0
#ifdef SUPPORT_UNICODE
925
0
  if (utf) options |= PCRE2_NO_UTF_CHECK;  /* Only need to check once */
926
0
#endif
927
928
  /* Any error other than no match returns the error code. No match when not
929
  doing the special after-empty-match global rematch, or when at the end of the
930
  subject, breaks the global loop. Otherwise, advance the starting point by one
931
  character, copying it to the output, and try again. */
932
933
0
  if (rc < 0)
934
0
    {
935
0
    PCRE2_SIZE save_start;
936
937
0
    if (rc != PCRE2_ERROR_NOMATCH) goto EXIT;
938
0
    if (goptions == 0 || start_offset >= length) break;
939
940
    /* Advance by one code point. Then, if CRLF is a valid newline sequence and
941
    we have advanced into the middle of it, advance one more code point. In
942
    other words, do not start in the middle of CRLF, even if CR and LF on their
943
    own are valid newlines. */
944
945
0
    save_start = start_offset++;
946
0
    if (subject[start_offset-1] == CHAR_CR &&
947
0
        (code->newline_convention == PCRE2_NEWLINE_CRLF ||
948
0
         code->newline_convention == PCRE2_NEWLINE_ANY ||
949
0
         code->newline_convention == PCRE2_NEWLINE_ANYCRLF) &&
950
0
        start_offset < length &&
951
0
        subject[start_offset] == CHAR_LF)
952
0
      start_offset++;
953
954
    /* Otherwise, in UTF mode, advance past any secondary code points. */
955
956
0
    else if ((code->overall_options & PCRE2_UTF) != 0)
957
0
      {
958
0
#if PCRE2_CODE_UNIT_WIDTH == 8
959
0
      while (start_offset < length && (subject[start_offset] & 0xc0) == 0x80)
960
0
        start_offset++;
961
#elif PCRE2_CODE_UNIT_WIDTH == 16
962
      while (start_offset < length &&
963
            (subject[start_offset] & 0xfc00) == 0xdc00)
964
        start_offset++;
965
#endif
966
0
      }
967
968
    /* Copy what we have advanced past (unless not required), reset the special
969
    global options, and continue to the next match. */
970
971
0
    fraglength = start_offset - save_start;
972
0
    if (!replacement_only) CHECKMEMCPY(subject + save_start, fraglength);
973
0
    goptions = 0;
974
0
    continue;
975
0
    }
976
977
  /* Handle a successful match. Matches that use \K to end before they start
978
  or start before the current point in the subject are not supported. */
979
980
0
  if (ovector[1] < ovector[0] || ovector[0] < start_offset)
981
0
    {
982
0
    rc = PCRE2_ERROR_BADSUBSPATTERN;
983
0
    goto EXIT;
984
0
    }
985
986
  /* Check for the same match as previous. This is legitimate after matching an
987
  empty string that starts after the initial match offset. We have tried again
988
  at the match point in case the pattern is one like /(?<=\G.)/ which can never
989
  match at its starting point, so running the match achieves the bumpalong. If
990
  we do get the same (null) match at the original match point, it isn't such a
991
  pattern, so we now do the empty string magic. In all other cases, a repeat
992
  match should never occur. */
993
994
0
  if (ovecsave[0] == ovector[0] && ovecsave[1] == ovector[1])
995
0
    {
996
0
    if (ovector[0] == ovector[1] && ovecsave[2] != start_offset)
997
0
      {
998
0
      goptions = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED;
999
0
      ovecsave[2] = start_offset;
1000
0
      continue;    /* Back to the top of the loop */
1001
0
      }
1002
0
    rc = PCRE2_ERROR_INTERNAL_DUPMATCH;
1003
0
    goto EXIT;
1004
0
    }
1005
1006
  /* Count substitutions with a paranoid check for integer overflow; surely no
1007
  real call to this function would ever hit this! */
1008
1009
0
  if (subs == INT_MAX)
1010
0
    {
1011
0
    rc = PCRE2_ERROR_TOOMANYREPLACE;
1012
0
    goto EXIT;
1013
0
    }
1014
0
  subs++;
1015
1016
  /* Copy the text leading up to the match (unless not required); remember
1017
  where the insert begins and how many ovector pairs are set; and remember how
1018
  much space we have requested in extra_needed. */
1019
1020
0
  if (rc == 0) rc = ovector_count;
1021
0
  fraglength = ovector[0] - start_offset;
1022
0
  if (!replacement_only) CHECKMEMCPY(subject + start_offset, fraglength);
1023
0
  scb.output_offsets[0] = buff_offset;
1024
0
  scb.oveccount = rc;
1025
0
  sub_start_extra_needed = extra_needed;
1026
1027
  /* Process the replacement string. If the entire replacement is literal, just
1028
  copy it with length check. */
1029
1030
0
  ptr = replacement;
1031
0
  if ((suboptions & PCRE2_SUBSTITUTE_LITERAL) != 0)
1032
0
    {
1033
0
    CHECKMEMCPY(ptr, rlength);
1034
0
    }
1035
1036
  /* Within a non-literal replacement, which must be scanned character by
1037
  character, local literal mode can be set by \Q, but only in extended mode
1038
  when backslashes are being interpreted. In extended mode we must handle
1039
  nested substrings that are to be reprocessed. */
1040
1041
0
  else for (;;)
1042
0
    {
1043
0
    uint32_t ch;
1044
0
    unsigned int chlen;
1045
0
    int group;
1046
0
    uint32_t special;
1047
0
    PCRE2_SPTR text1_start = NULL;
1048
0
    PCRE2_SPTR text1_end = NULL;
1049
0
    PCRE2_SPTR text2_start = NULL;
1050
0
    PCRE2_SPTR text2_end = NULL;
1051
0
    PCRE2_UCHAR name[MAX_NAME_SIZE + 1];
1052
1053
    /* If at the end of a nested substring, pop the stack. */
1054
1055
0
    if (ptr >= repend)
1056
0
      {
1057
0
      if (ptrstackptr == 0) break;       /* End of replacement string */
1058
0
      repend = ptrstack[--ptrstackptr];
1059
0
      ptr = ptrstack[--ptrstackptr];
1060
0
      continue;
1061
0
      }
1062
1063
    /* Handle the next character */
1064
1065
0
    if (escaped_literal)
1066
0
      {
1067
0
      if (ptr[0] == CHAR_BACKSLASH && ptr < repend - 1 && ptr[1] == CHAR_E)
1068
0
        {
1069
0
        escaped_literal = FALSE;
1070
0
        ptr += 2;
1071
0
        continue;
1072
0
        }
1073
0
      goto LOADLITERAL;
1074
0
      }
1075
1076
    /* Not in literal mode. */
1077
1078
0
    if (*ptr == CHAR_DOLLAR_SIGN)
1079
0
      {
1080
0
      BOOL inparens;
1081
0
      BOOL inangle;
1082
0
      BOOL star;
1083
0
      PCRE2_SIZE sublength;
1084
0
      PCRE2_UCHAR next;
1085
0
      PCRE2_SPTR subptr, subptrend;
1086
1087
0
      if (++ptr >= repend) goto BAD;
1088
0
      if ((next = *ptr) == CHAR_DOLLAR_SIGN) goto LOADLITERAL;
1089
1090
0
      special = 0;
1091
0
      text1_start = NULL;
1092
0
      text1_end = NULL;
1093
0
      text2_start = NULL;
1094
0
      text2_end = NULL;
1095
0
      group = -1;
1096
0
      inparens = FALSE;
1097
0
      inangle = FALSE;
1098
0
      star = FALSE;
1099
0
      subptr = NULL;
1100
0
      subptrend = NULL;
1101
1102
      /* Special $ sequences, as supported by Perl, JavaScript, .NET and others. */
1103
0
      if (next == CHAR_AMPERSAND)
1104
0
        {
1105
0
        ++ptr;
1106
0
        group = 0;
1107
0
        goto GROUP_SUBSTITUTE;
1108
0
        }
1109
0
      if (next == CHAR_GRAVE_ACCENT || next == CHAR_APOSTROPHE)
1110
0
        {
1111
0
        ++ptr;
1112
0
        rc = pcre2_substring_length_bynumber(match_data, 0, &sublength);
1113
0
        if (rc < 0) goto PTREXIT; /* (Sanity-check ovector before reading from it.) */
1114
1115
0
        if (next == CHAR_GRAVE_ACCENT)
1116
0
          {
1117
0
          subptr = subject;
1118
0
          subptrend = subject + ovector[0];
1119
0
          }
1120
0
        else
1121
0
          {
1122
0
          subptr = subject + ovector[1];
1123
0
          subptrend = subject + length;
1124
0
          }
1125
1126
0
        goto SUBPTR_SUBSTITUTE;
1127
0
        }
1128
0
      if (next == CHAR_UNDERSCORE)
1129
0
        {
1130
        /* Java, .NET support $_ for "entire input string". */
1131
0
        ++ptr;
1132
0
        subptr = subject;
1133
0
        subptrend = subject + length;
1134
0
        goto SUBPTR_SUBSTITUTE;
1135
0
        }
1136
1137
0
      if (next == CHAR_LEFT_CURLY_BRACKET)
1138
0
        {
1139
0
        if (++ptr >= repend) goto BAD;
1140
0
        next = *ptr;
1141
0
        inparens = TRUE;
1142
0
        }
1143
0
      else if (next == CHAR_LESS_THAN_SIGN)
1144
0
        {
1145
        /* JavaScript compatibility syntax, $<name>. Processes only named
1146
        groups (not numbered) and does not support extensions such as star
1147
        (you can do ${name} and ${*name}, but not $<*name>). */
1148
0
        if (++ptr >= repend) goto BAD;
1149
0
        next = *ptr;
1150
0
        inangle = TRUE;
1151
0
        }
1152
1153
0
      if (!inangle && next == CHAR_ASTERISK)
1154
0
        {
1155
0
        if (++ptr >= repend) goto BAD;
1156
0
        next = *ptr;
1157
0
        star = TRUE;
1158
0
        }
1159
1160
0
      if (!star && !inangle && next >= CHAR_0 && next <= CHAR_9)
1161
0
        {
1162
0
        group = next - CHAR_0;
1163
0
        while (++ptr < repend)
1164
0
          {
1165
0
          next = *ptr;
1166
0
          if (next < CHAR_0 || next > CHAR_9) break;
1167
0
          group = group * 10 + (next - CHAR_0);
1168
1169
          /* A check for a number greater than the highest captured group
1170
          is sufficient here; no need for a separate overflow check. If unknown
1171
          groups are to be treated as unset, just skip over any remaining
1172
          digits and carry on. */
1173
1174
0
          if (group > code->top_bracket)
1175
0
            {
1176
0
            if ((suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
1177
0
              {
1178
0
              while (++ptr < repend && *ptr >= CHAR_0 && *ptr <= CHAR_9);
1179
0
              break;
1180
0
              }
1181
0
            else
1182
0
              {
1183
0
              rc = PCRE2_ERROR_NOSUBSTRING;
1184
0
              goto PTREXIT;
1185
0
              }
1186
0
            }
1187
0
          }
1188
0
        }
1189
0
      else
1190
0
        {
1191
0
        PCRE2_SIZE name_len;
1192
0
        PCRE2_SPTR name_start = ptr;
1193
0
        if (!read_name_subst(&ptr, repend, utf, code->tables + ctypes_offset))
1194
0
          goto BAD;
1195
0
        name_len = ptr - name_start;
1196
0
        memcpy(name, name_start, CU2BYTES(name_len));
1197
0
        name[name_len] = 0;
1198
0
        }
1199
1200
0
      next = 0; /* not used or updated after this point */
1201
0
      (void)next;
1202
1203
      /* In extended mode we recognize ${name:+set text:unset text} and
1204
      ${name:-default text}. */
1205
1206
0
      if (inparens)
1207
0
        {
1208
0
        if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 &&
1209
0
             !star && ptr < repend - 2 && *ptr == CHAR_COLON)
1210
0
          {
1211
0
          special = *(++ptr);
1212
0
          if (special != CHAR_PLUS && special != CHAR_MINUS)
1213
0
            {
1214
0
            rc = PCRE2_ERROR_BADSUBSTITUTION;
1215
0
            goto PTREXIT;
1216
0
            }
1217
1218
0
          text1_start = ++ptr;
1219
0
          rc = find_text_end(code, &ptr, repend, special == CHAR_MINUS);
1220
0
          if (rc != 0) goto PTREXIT;
1221
0
          text1_end = ptr;
1222
1223
0
          if (special == CHAR_PLUS && *ptr == CHAR_COLON)
1224
0
            {
1225
0
            text2_start = ++ptr;
1226
0
            rc = find_text_end(code, &ptr, repend, TRUE);
1227
0
            if (rc != 0) goto PTREXIT;
1228
0
            text2_end = ptr;
1229
0
            }
1230
0
          }
1231
1232
0
        else
1233
0
          {
1234
0
          if (ptr >= repend || *ptr != CHAR_RIGHT_CURLY_BRACKET)
1235
0
            {
1236
0
            rc = PCRE2_ERROR_REPMISSINGBRACE;
1237
0
            goto PTREXIT;
1238
0
            }
1239
0
          }
1240
1241
0
        ptr++;
1242
0
        }
1243
1244
0
      if (inangle)
1245
0
        {
1246
0
        if (ptr >= repend || *ptr != CHAR_GREATER_THAN_SIGN)
1247
0
          goto BAD;
1248
0
        ptr++;
1249
0
        }
1250
1251
      /* Have found a syntactically correct group number or name, or *name.
1252
      Only *MARK is currently recognized. */
1253
1254
0
      if (star)
1255
0
        {
1256
0
        if (PRIV(strcmp_c8)(name, STRING_MARK) == 0)
1257
0
          {
1258
0
          PCRE2_SPTR mark = pcre2_get_mark(match_data);
1259
0
          if (mark != NULL)
1260
0
            {
1261
            /* Peek backwards one code unit to obtain the length of the mark.
1262
            It can (theoretically) contain an embedded NUL. */
1263
0
            fraglength = mark[-1];
1264
0
            if (forcecase.to_case != PCRE2_SUBSTITUTE_CASE_NONE &&
1265
0
                substitute_case_callout == NULL)
1266
0
              CHECKCASECPY_DEFAULT(mark, fraglength);
1267
0
            else
1268
0
              CHECKMEMCPY(mark, fraglength);
1269
0
            }
1270
0
          }
1271
0
        else goto BAD;
1272
0
        }
1273
1274
      /* Substitute the contents of a group. We don't use substring_copy
1275
      functions any more, in order to support case forcing. */
1276
1277
0
      else
1278
0
        {
1279
0
        GROUP_SUBSTITUTE:
1280
        /* Find a number for a named group. In case there are duplicate names,
1281
        search for the first one that is set. If the name is not found when
1282
        PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set, set the group number to a
1283
        non-existent group. */
1284
1285
0
        if (group < 0)
1286
0
          {
1287
0
          PCRE2_SPTR first, last, entry;
1288
0
          rc = pcre2_substring_nametable_scan(code, name, &first, &last);
1289
0
          if (rc == PCRE2_ERROR_NOSUBSTRING &&
1290
0
              (suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
1291
0
            {
1292
0
            group = code->top_bracket + 1;
1293
0
            }
1294
0
          else
1295
0
            {
1296
0
            if (rc < 0) goto PTREXIT;
1297
0
            for (entry = first; entry <= last; entry += rc)
1298
0
              {
1299
0
              uint32_t ng = GET2(entry, 0);
1300
0
              if (ng < ovector_count)
1301
0
                {
1302
0
                if (group < 0) group = ng;          /* First in ovector */
1303
0
                if (ovector[ng*2] != PCRE2_UNSET)
1304
0
                  {
1305
0
                  group = ng;                       /* First that is set */
1306
0
                  break;
1307
0
                  }
1308
0
                }
1309
0
              }
1310
1311
            /* If group is still negative, it means we did not find a group
1312
            that is in the ovector. Just set the first group. */
1313
1314
0
            if (group < 0) group = GET2(first, 0);
1315
0
            }
1316
0
          }
1317
1318
        /* We now have a group that is identified by number. Find the length of
1319
        the captured string. If a group in a non-special substitution is unset
1320
        when PCRE2_SUBSTITUTE_UNSET_EMPTY is set, substitute nothing. */
1321
1322
0
        rc = pcre2_substring_length_bynumber(match_data, group, &sublength);
1323
0
        if (rc < 0)
1324
0
          {
1325
0
          if (rc == PCRE2_ERROR_NOSUBSTRING &&
1326
0
              (suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
1327
0
            {
1328
0
            rc = PCRE2_ERROR_UNSET;
1329
0
            }
1330
0
          if (rc != PCRE2_ERROR_UNSET) goto PTREXIT;  /* Non-unset errors */
1331
0
          if (special == 0)                           /* Plain substitution */
1332
0
            {
1333
0
            if ((suboptions & PCRE2_SUBSTITUTE_UNSET_EMPTY) != 0) continue;
1334
0
            goto PTREXIT;                             /* Else error */
1335
0
            }
1336
0
          }
1337
1338
        /* If special is '+' we have a 'set' and possibly an 'unset' text,
1339
        both of which are reprocessed when used. If special is '-' we have a
1340
        default text for when the group is unset; it must be reprocessed. */
1341
1342
0
        if (special != 0)
1343
0
          {
1344
0
          if (special == CHAR_MINUS)
1345
0
            {
1346
0
            if (rc == 0) goto LITERAL_SUBSTITUTE;
1347
0
            text2_start = text1_start;
1348
0
            text2_end = text1_end;
1349
0
            }
1350
1351
0
          if (ptrstackptr >= PTR_STACK_SIZE) goto BAD;
1352
0
          ptrstack[ptrstackptr++] = ptr;
1353
0
          ptrstack[ptrstackptr++] = repend;
1354
1355
0
          if (rc == 0)
1356
0
            {
1357
0
            ptr = text1_start;
1358
0
            repend = text1_end;
1359
0
            }
1360
0
          else
1361
0
            {
1362
0
            ptr = text2_start;
1363
0
            repend = text2_end;
1364
0
            }
1365
0
          continue;
1366
0
          }
1367
1368
        /* Otherwise we have a literal substitution of a group's contents. */
1369
1370
0
        LITERAL_SUBSTITUTE:
1371
0
        subptr = subject + ovector[group*2];
1372
0
        subptrend = subject + ovector[group*2 + 1];
1373
1374
        /* Substitute a literal string, possibly forcing alphabetic case. */
1375
1376
0
        SUBPTR_SUBSTITUTE:
1377
0
        if (forcecase.to_case != PCRE2_SUBSTITUTE_CASE_NONE &&
1378
0
            substitute_case_callout == NULL)
1379
0
          CHECKCASECPY_DEFAULT(subptr, subptrend - subptr);
1380
0
        else
1381
0
          CHECKMEMCPY(subptr, subptrend - subptr);
1382
0
        }
1383
0
      }   /* End of $ processing */
1384
1385
    /* Handle an escape sequence in extended mode. We can use check_escape()
1386
    to process \Q, \E, \c, \o, \x and \ followed by non-alphanumerics, but
1387
    the case-forcing escapes are not supported in pcre2_compile() so must be
1388
    recognized here. */
1389
1390
0
    else if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 &&
1391
0
              *ptr == CHAR_BACKSLASH)
1392
0
      {
1393
0
      int errorcode;
1394
0
      case_state new_forcecase = { PCRE2_SUBSTITUTE_CASE_NONE, FALSE };
1395
1396
0
      if (ptr < repend - 1) switch (ptr[1])
1397
0
        {
1398
0
        case CHAR_L:
1399
0
        new_forcecase.to_case = PCRE2_SUBSTITUTE_CASE_LOWER;
1400
0
        new_forcecase.single_char = FALSE;
1401
0
        ptr += 2;
1402
0
        break;
1403
1404
0
        case CHAR_l:
1405
0
        new_forcecase.to_case = PCRE2_SUBSTITUTE_CASE_LOWER;
1406
0
        new_forcecase.single_char = TRUE;
1407
0
        ptr += 2;
1408
0
        if (ptr + 2 < repend && ptr[0] == CHAR_BACKSLASH && ptr[1] == CHAR_U)
1409
0
          {
1410
          /* Perl reverse-title-casing feature for \l\U */
1411
0
          new_forcecase.to_case = PCRE2_SUBSTITUTE_CASE_REVERSE_TITLE_FIRST;
1412
0
          new_forcecase.single_char = FALSE;
1413
0
          ptr += 2;
1414
0
          }
1415
0
        break;
1416
1417
0
        case CHAR_U:
1418
0
        new_forcecase.to_case = PCRE2_SUBSTITUTE_CASE_UPPER;
1419
0
        new_forcecase.single_char = FALSE;
1420
0
        ptr += 2;
1421
0
        break;
1422
1423
0
        case CHAR_u:
1424
0
        new_forcecase.to_case = PCRE2_SUBSTITUTE_CASE_TITLE_FIRST;
1425
0
        new_forcecase.single_char = TRUE;
1426
0
        ptr += 2;
1427
0
        if (ptr + 2 < repend && ptr[0] == CHAR_BACKSLASH && ptr[1] == CHAR_L)
1428
0
          {
1429
          /* Perl title-casing feature for \u\L */
1430
0
          new_forcecase.to_case = PCRE2_SUBSTITUTE_CASE_TITLE_FIRST;
1431
0
          new_forcecase.single_char = FALSE;
1432
0
          ptr += 2;
1433
0
          }
1434
0
        break;
1435
1436
0
        default:
1437
0
        break;
1438
0
        }
1439
1440
0
      if (new_forcecase.to_case != PCRE2_SUBSTITUTE_CASE_NONE)
1441
0
        {
1442
0
        SETFORCECASE:
1443
1444
        /* If the substitute_case_callout is unset, our case-forcing is done
1445
        immediately. If there is a callout however, then its action is delayed
1446
        until all the characters have been collected.
1447
1448
        Apply the callout now, before we set the new casing mode. */
1449
1450
0
        if (substitute_case_callout != NULL &&
1451
0
            forcecase.to_case != PCRE2_SUBSTITUTE_CASE_NONE)
1452
0
          DELAYEDFORCECASE();
1453
1454
0
        forcecase = new_forcecase;
1455
0
        casestart_offset = buff_offset;
1456
0
        casestart_extra_needed = extra_needed;
1457
0
        continue;
1458
0
        }
1459
1460
0
      ptr++;  /* Point after \ */
1461
0
      rc = PRIV(check_escape)(&ptr, repend, &ch, &errorcode,
1462
0
        code->overall_options, code->extra_options, code->top_bracket, FALSE, NULL);
1463
0
      if (errorcode != 0) goto BADESCAPE;
1464
1465
0
      switch(rc)
1466
0
        {
1467
0
        case ESC_E:
1468
0
        goto SETFORCECASE;
1469
1470
0
        case ESC_Q:
1471
0
        escaped_literal = TRUE;
1472
0
        continue;
1473
1474
0
        case 0:      /* Data character */
1475
0
        case ESC_b:  /* \b is backspace in a substitution */
1476
0
        case ESC_v:  /* \v is vertical tab in a substitution */
1477
1478
0
        if (rc == ESC_b) ch = CHAR_BS;
1479
0
        if (rc == ESC_v) ch = CHAR_VT;
1480
1481
0
#ifdef SUPPORT_UNICODE
1482
0
        if (utf) chlen = PRIV(ord2utf)(ch, temp); else
1483
0
#endif
1484
0
          {
1485
0
          temp[0] = ch;
1486
0
          chlen = 1;
1487
0
          }
1488
1489
0
        if (forcecase.to_case != PCRE2_SUBSTITUTE_CASE_NONE &&
1490
0
            substitute_case_callout == NULL)
1491
0
          CHECKCASECPY_DEFAULT(temp, chlen);
1492
0
        else
1493
0
          CHECKMEMCPY(temp, chlen);
1494
0
        continue;
1495
1496
0
        case ESC_g:
1497
0
          {
1498
0
          PCRE2_SIZE name_len;
1499
0
          PCRE2_SPTR name_start;
1500
1501
          /* Parse the \g<name> form (\g<number> already handled by check_escape) */
1502
0
          if (ptr >= repend || *ptr != CHAR_LESS_THAN_SIGN)
1503
0
            goto BADESCAPE;
1504
0
          ++ptr;
1505
1506
0
          name_start = ptr;
1507
0
          if (!read_name_subst(&ptr, repend, utf, code->tables + ctypes_offset))
1508
0
            goto BADESCAPE;
1509
0
          name_len = ptr - name_start;
1510
1511
0
          if (ptr >= repend || *ptr != CHAR_GREATER_THAN_SIGN)
1512
0
            goto BADESCAPE;
1513
0
          ++ptr;
1514
1515
0
          special = 0;
1516
0
          group = -1;
1517
0
          memcpy(name, name_start, CU2BYTES(name_len));
1518
0
          name[name_len] = 0;
1519
0
          goto GROUP_SUBSTITUTE;
1520
0
          }
1521
1522
0
        default:
1523
0
        if (rc < 0)
1524
0
          {
1525
0
          special = 0;
1526
0
          group = -rc - 1;
1527
0
          goto GROUP_SUBSTITUTE;
1528
0
          }
1529
0
        goto BADESCAPE;
1530
0
        }
1531
0
      }   /* End of backslash processing */
1532
1533
    /* Handle a literal code unit */
1534
1535
0
    else
1536
0
      {
1537
0
      PCRE2_SPTR ch_start;
1538
1539
0
      LOADLITERAL:
1540
0
      ch_start = ptr;
1541
0
      GETCHARINCTEST(ch, ptr);    /* Get character value, increment pointer */
1542
0
      (void) ch;
1543
1544
0
      if (forcecase.to_case != PCRE2_SUBSTITUTE_CASE_NONE &&
1545
0
          substitute_case_callout == NULL)
1546
0
        CHECKCASECPY_DEFAULT(ch_start, ptr - ch_start);
1547
0
      else
1548
0
        CHECKMEMCPY(ch_start, ptr - ch_start);
1549
0
      } /* End handling a literal code unit */
1550
0
    }   /* End of loop for scanning the replacement. */
1551
1552
  /* If the substitute_case_callout is unset, our case-forcing is done
1553
  immediately. If there is a callout however, then its action is delayed
1554
  until all the characters have been collected.
1555
1556
  We now clean up any trailing section of the replacement for which we deferred
1557
  the case-forcing. */
1558
1559
0
  if (substitute_case_callout != NULL &&
1560
0
      forcecase.to_case != PCRE2_SUBSTITUTE_CASE_NONE)
1561
0
    DELAYEDFORCECASE();
1562
1563
  /* The replacement has been copied to the output, or its size has been
1564
  remembered. Handle the callout if there is one. */
1565
1566
0
  if (mcontext != NULL && mcontext->substitute_callout != NULL)
1567
0
    {
1568
    /* If we have an actual (non-simulated) replacement, do the callout. */
1569
1570
0
    if (!overflowed)
1571
0
      {
1572
0
      scb.subscount = subs;
1573
0
      scb.output_offsets[1] = buff_offset;
1574
0
      rc = mcontext->substitute_callout(&scb,
1575
0
                                        mcontext->substitute_callout_data);
1576
1577
      /* A non-zero return means cancel this substitution. Instead, copy the
1578
      matched string fragment. */
1579
1580
0
      if (rc != 0)
1581
0
        {
1582
0
        PCRE2_SIZE newlength = scb.output_offsets[1] - scb.output_offsets[0];
1583
0
        PCRE2_SIZE oldlength = ovector[1] - ovector[0];
1584
1585
0
        buff_offset -= newlength;
1586
0
        lengthleft += newlength;
1587
0
        if (!replacement_only) CHECKMEMCPY(subject + ovector[0], oldlength);
1588
1589
        /* A negative return means do not do any more. */
1590
1591
0
        if (rc < 0) suboptions &= (~PCRE2_SUBSTITUTE_GLOBAL);
1592
0
        }
1593
0
      }
1594
1595
    /* In this interesting case, we cannot do the callout, so it's hard to
1596
    estimate the required buffer size. What callers want is to be able to make
1597
    two calls to pcre2_substitute(), once with PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
1598
    to discover the buffer size, and then a second and final call. Older
1599
    versions of PCRE2 violated this assumption, by proceeding as if the callout
1600
    had returned zero - but on the second call to pcre2_substitute() it could
1601
    return non-zero and then overflow the buffer again. Callers probably don't
1602
    want to keep on looping to incrementally discover the buffer size. */
1603
1604
0
    else
1605
0
      {
1606
0
      PCRE2_SIZE newlength_buf = buff_offset - scb.output_offsets[0];
1607
0
      PCRE2_SIZE newlength_extra = extra_needed - sub_start_extra_needed;
1608
0
      PCRE2_SIZE newlength =
1609
0
        (newlength_extra > ~(PCRE2_SIZE)0 - newlength_buf)?  /* Integer overflow */
1610
0
        ~(PCRE2_SIZE)0 : newlength_buf + newlength_extra;    /* Cap the addition */
1611
0
      PCRE2_SIZE oldlength = ovector[1] - ovector[0];
1612
1613
      /* Be pessimistic: request whichever buffer size is larger out of
1614
      accepting or rejecting the substitution. */
1615
1616
0
      if (oldlength > newlength)
1617
0
        {
1618
0
        PCRE2_SIZE additional = oldlength - newlength;
1619
0
        if (additional > ~(PCRE2_SIZE)0 - extra_needed)  /* Integer overflow */
1620
0
          goto TOOLARGEREPLACE;
1621
0
        extra_needed += additional;
1622
0
        }
1623
1624
      /* Proceed as if the callout did not return a negative. A negative
1625
      effectively rejects all future substitutions, but we want to examine them
1626
      pessimistically. */
1627
0
      }
1628
0
    }
1629
1630
  /* Save the details of this match. See above for how this data is used. If we
1631
  matched an empty string, do the magic for global matches. Update the start
1632
  offset to point to the rest of the subject string. If we re-used an existing
1633
  match for the first match, switch to the internal match data block. */
1634
1635
0
  ovecsave[0] = ovector[0];
1636
0
  ovecsave[1] = ovector[1];
1637
0
  ovecsave[2] = start_offset;
1638
1639
0
  goptions = (ovector[0] != ovector[1] || ovector[0] > start_offset)? 0 :
1640
0
    PCRE2_ANCHORED|PCRE2_NOTEMPTY_ATSTART;
1641
0
  start_offset = ovector[1];
1642
0
  } while ((suboptions & PCRE2_SUBSTITUTE_GLOBAL) != 0);  /* Repeat "do" loop */
1643
1644
/* Copy the rest of the subject unless not required, and terminate the output
1645
with a binary zero. */
1646
1647
0
if (!replacement_only)
1648
0
  {
1649
0
  fraglength = length - start_offset;
1650
0
  CHECKMEMCPY(subject + start_offset, fraglength);
1651
0
  }
1652
1653
0
temp[0] = 0;
1654
0
CHECKMEMCPY(temp, 1);
1655
1656
/* If overflowed is set it means the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set,
1657
and matching has carried on after a full buffer, in order to compute the length
1658
needed. Otherwise, an overflow generates an immediate error return. */
1659
1660
0
if (overflowed)
1661
0
  {
1662
0
  rc = PCRE2_ERROR_NOMEMORY;
1663
1664
0
  if (extra_needed > ~(PCRE2_SIZE)0 - buff_length)  /* Integer overflow */
1665
0
    goto TOOLARGEREPLACE;
1666
0
  *blength = buff_length + extra_needed;
1667
0
  }
1668
1669
/* After a successful execution, return the number of substitutions and set the
1670
length of buffer used, excluding the trailing zero. */
1671
1672
0
else
1673
0
  {
1674
0
  rc = subs;
1675
0
  *blength = buff_offset - 1;
1676
0
  }
1677
1678
0
EXIT:
1679
0
if (internal_match_data != NULL) pcre2_match_data_free(internal_match_data);
1680
0
  else match_data->rc = rc;
1681
0
return rc;
1682
1683
0
NOROOM:
1684
0
rc = PCRE2_ERROR_NOMEMORY;
1685
0
goto EXIT;
1686
1687
0
CASEERROR:
1688
0
rc = PCRE2_ERROR_REPLACECASE;
1689
0
goto EXIT;
1690
1691
0
TOOLARGEREPLACE:
1692
0
rc = PCRE2_ERROR_TOOLARGEREPLACE;
1693
0
goto EXIT;
1694
1695
0
BAD:
1696
0
rc = PCRE2_ERROR_BADREPLACEMENT;
1697
0
goto PTREXIT;
1698
1699
0
BADESCAPE:
1700
0
rc = PCRE2_ERROR_BADREPESCAPE;
1701
1702
0
PTREXIT:
1703
0
*blength = (PCRE2_SIZE)(ptr - replacement);
1704
0
goto EXIT;
1705
0
}
1706
1707
/* End of pcre2_substitute.c */