/src/php-src/ext/pcre/pcre2lib/pcre2_substitute.c
Line | Count | Source (jump to first uncovered line) |
1 | | /************************************************* |
2 | | * Perl-Compatible Regular Expressions * |
3 | | *************************************************/ |
4 | | |
5 | | /* PCRE is a library of functions to support regular expressions whose syntax |
6 | | and semantics are as close as possible to those of the Perl 5 language. |
7 | | |
8 | | Written by Philip Hazel |
9 | | Original API code Copyright (c) 1997-2012 University of Cambridge |
10 | | New API code Copyright (c) 2016-2024 University of Cambridge |
11 | | |
12 | | ----------------------------------------------------------------------------- |
13 | | Redistribution and use in source and binary forms, with or without |
14 | | modification, are permitted provided that the following conditions are met: |
15 | | |
16 | | * Redistributions of source code must retain the above copyright notice, |
17 | | this list of conditions and the following disclaimer. |
18 | | |
19 | | * Redistributions in binary form must reproduce the above copyright |
20 | | notice, this list of conditions and the following disclaimer in the |
21 | | documentation and/or other materials provided with the distribution. |
22 | | |
23 | | * Neither the name of the University of Cambridge nor the names of its |
24 | | contributors may be used to endorse or promote products derived from |
25 | | this software without specific prior written permission. |
26 | | |
27 | | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
28 | | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
29 | | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
30 | | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
31 | | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
32 | | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
33 | | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
34 | | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
35 | | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
36 | | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
37 | | POSSIBILITY OF SUCH DAMAGE. |
38 | | ----------------------------------------------------------------------------- |
39 | | */ |
40 | | |
41 | | |
42 | | #ifdef HAVE_CONFIG_H |
43 | | #include "config.h" |
44 | | #endif |
45 | | |
46 | | #include "pcre2_internal.h" |
47 | | |
/* Fixed stack depth. NOTE(review): the code that uses this constant lies
outside this excerpt - confirm what is being stacked before changing it. */

#define PTR_STACK_SIZE 20

/* Mask built from the PCRE2_SUBSTITUTE_xxx option bits named below. */

#define SUBSTITUTE_OPTIONS \
  ( PCRE2_SUBSTITUTE_EXTENDED         \
  | PCRE2_SUBSTITUTE_GLOBAL           \
  | PCRE2_SUBSTITUTE_LITERAL          \
  | PCRE2_SUBSTITUTE_MATCHED          \
  | PCRE2_SUBSTITUTE_OVERFLOW_LENGTH  \
  | PCRE2_SUBSTITUTE_REPLACEMENT_ONLY \
  | PCRE2_SUBSTITUTE_UNKNOWN_UNSET    \
  | PCRE2_SUBSTITUTE_UNSET_EMPTY      )
55 | | |
56 | | |
57 | | |
58 | | /************************************************* |
59 | | * Find end of substitute text * |
60 | | *************************************************/ |
61 | | |
62 | | /* In extended mode, we recognize ${name:+set text:unset text} and similar |
63 | | constructions. This requires the identification of unescaped : and } |
64 | | characters. This function scans for such. It must deal with nested ${ |
65 | | constructions. The pointer to the text is updated, either to the required end |
66 | | character, or to where an error was detected. |
67 | | |
68 | | Arguments: |
69 | | code points to the compiled expression (for options) |
70 | | ptrptr points to the pointer to the start of the text (updated) |
71 | | ptrend end of the whole string |
72 | | last TRUE if the last expected string (only } recognized) |
73 | | |
74 | | Returns: 0 on success |
75 | | negative error code on failure |
76 | | */ |
77 | | |
78 | | static int |
79 | | find_text_end(const pcre2_code *code, PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, |
80 | | BOOL last) |
81 | 0 | { |
82 | 0 | int rc = 0; |
83 | 0 | uint32_t nestlevel = 0; |
84 | 0 | BOOL literal = FALSE; |
85 | 0 | PCRE2_SPTR ptr = *ptrptr; |
86 | |
|
87 | 0 | for (; ptr < ptrend; ptr++) |
88 | 0 | { |
89 | 0 | if (literal) |
90 | 0 | { |
91 | 0 | if (ptr[0] == CHAR_BACKSLASH && ptr < ptrend - 1 && ptr[1] == CHAR_E) |
92 | 0 | { |
93 | 0 | literal = FALSE; |
94 | 0 | ptr += 1; |
95 | 0 | } |
96 | 0 | } |
97 | | |
98 | 0 | else if (*ptr == CHAR_RIGHT_CURLY_BRACKET) |
99 | 0 | { |
100 | 0 | if (nestlevel == 0) goto EXIT; |
101 | 0 | nestlevel--; |
102 | 0 | } |
103 | | |
104 | 0 | else if (*ptr == CHAR_COLON && !last && nestlevel == 0) goto EXIT; |
105 | | |
106 | 0 | else if (*ptr == CHAR_DOLLAR_SIGN) |
107 | 0 | { |
108 | 0 | if (ptr < ptrend - 1 && ptr[1] == CHAR_LEFT_CURLY_BRACKET) |
109 | 0 | { |
110 | 0 | nestlevel++; |
111 | 0 | ptr += 1; |
112 | 0 | } |
113 | 0 | } |
114 | | |
115 | 0 | else if (*ptr == CHAR_BACKSLASH) |
116 | 0 | { |
117 | 0 | int erc; |
118 | 0 | int errorcode; |
119 | 0 | uint32_t ch; |
120 | |
|
121 | 0 | if (ptr < ptrend - 1) switch (ptr[1]) |
122 | 0 | { |
123 | 0 | case CHAR_L: |
124 | 0 | case CHAR_l: |
125 | 0 | case CHAR_U: |
126 | 0 | case CHAR_u: |
127 | 0 | ptr += 1; |
128 | 0 | continue; |
129 | 0 | } |
130 | | |
131 | 0 | ptr += 1; /* Must point after \ */ |
132 | 0 | erc = PRIV(check_escape)(&ptr, ptrend, &ch, &errorcode, |
133 | 0 | code->overall_options, code->extra_options, code->top_bracket, FALSE, NULL); |
134 | 0 | ptr -= 1; /* Back to last code unit of escape */ |
135 | 0 | if (errorcode != 0) |
136 | 0 | { |
137 | | /* errorcode from check_escape is positive, so must not be returned by |
138 | | pcre2_substitute(). */ |
139 | 0 | rc = PCRE2_ERROR_BADREPESCAPE; |
140 | 0 | goto EXIT; |
141 | 0 | } |
142 | | |
143 | 0 | switch(erc) |
144 | 0 | { |
145 | 0 | case 0: /* Data character */ |
146 | 0 | case ESC_b: /* Data character */ |
147 | 0 | case ESC_v: /* Data character */ |
148 | 0 | case ESC_E: /* Isolated \E is ignored */ |
149 | 0 | break; |
150 | | |
151 | 0 | case ESC_Q: |
152 | 0 | literal = TRUE; |
153 | 0 | break; |
154 | | |
155 | 0 | case ESC_g: |
156 | | /* The \g<name> form (\g<number> already handled by check_escape) |
157 | | |
158 | | Don't worry about finding the matching ">". We are super, super lenient |
159 | | about validating ${} replacements inside find_text_end(), so we certainly |
160 | | don't need to worry about other syntax. Importantly, a \g<..> or $<...> |
161 | | sequence can't contain a '}' character. */ |
162 | 0 | break; |
163 | | |
164 | 0 | default: |
165 | 0 | if (erc < 0) |
166 | 0 | break; /* capture group reference */ |
167 | 0 | rc = PCRE2_ERROR_BADREPESCAPE; |
168 | 0 | goto EXIT; |
169 | 0 | } |
170 | 0 | } |
171 | 0 | } |
172 | | |
173 | 0 | rc = PCRE2_ERROR_REPMISSINGBRACE; /* Terminator not found */ |
174 | |
|
175 | 0 | EXIT: |
176 | 0 | *ptrptr = ptr; |
177 | 0 | return rc; |
178 | 0 | } |
179 | | |
180 | | |
181 | | /************************************************* |
182 | | * Validate group name * |
183 | | *************************************************/ |
184 | | |
185 | | /* This function scans for a capture group name, validating it |
186 | | consists of legal characters, is not empty, and does not exceed |
187 | | MAX_NAME_SIZE. |
188 | | |
189 | | Arguments: |
190 | | ptrptr points to the pointer to the start of the text (updated) |
191 | | ptrend end of the whole string |
192 | | utf true if the input is UTF-encoded |
193 | | ctypes pointer to the character types table |
194 | | |
195 | | Returns: TRUE if a name was read |
196 | | FALSE otherwise |
197 | | */ |
198 | | |
199 | | static BOOL |
200 | | read_name_subst(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, BOOL utf, |
201 | | const uint8_t* ctypes) |
202 | 0 | { |
203 | 0 | PCRE2_SPTR ptr = *ptrptr; |
204 | 0 | PCRE2_SPTR nameptr = ptr; |
205 | |
|
206 | 0 | if (ptr >= ptrend) /* No characters in name */ |
207 | 0 | goto FAILED; |
208 | | |
209 | | /* We do not need to check whether the name starts with a non-digit. |
210 | | We are simply referencing names here, not defining them. */ |
211 | | |
212 | | /* See read_name in the pcre2_compile.c for the corresponding logic |
213 | | restricting group names inside the pattern itself. */ |
214 | | |
215 | 0 | #ifdef SUPPORT_UNICODE |
216 | 0 | if (utf) |
217 | 0 | { |
218 | 0 | uint32_t c, type; |
219 | |
|
220 | 0 | while (ptr < ptrend) |
221 | 0 | { |
222 | 0 | GETCHAR(c, ptr); |
223 | 0 | type = UCD_CHARTYPE(c); |
224 | 0 | if (type != ucp_Nd && PRIV(ucp_gentype)[type] != ucp_L && |
225 | 0 | c != CHAR_UNDERSCORE) break; |
226 | 0 | ptr++; |
227 | 0 | FORWARDCHARTEST(ptr, ptrend); |
228 | 0 | } |
229 | 0 | } |
230 | 0 | else |
231 | | #else |
232 | | (void)utf; /* Avoid compiler warning */ |
233 | | #endif /* SUPPORT_UNICODE */ |
234 | | |
235 | | /* Handle group names in non-UTF modes. */ |
236 | | |
237 | 0 | { |
238 | 0 | while (ptr < ptrend && MAX_255(*ptr) && (ctypes[*ptr] & ctype_word) != 0) |
239 | 0 | { |
240 | 0 | ptr++; |
241 | 0 | } |
242 | 0 | } |
243 | | |
244 | | /* Check name length */ |
245 | |
|
246 | 0 | if (ptr - nameptr > MAX_NAME_SIZE) |
247 | 0 | goto FAILED; |
248 | | |
249 | | /* Subpattern names must not be empty */ |
250 | 0 | if (ptr == nameptr) |
251 | 0 | goto FAILED; |
252 | | |
253 | 0 | *ptrptr = ptr; |
254 | 0 | return TRUE; |
255 | | |
256 | 0 | FAILED: |
257 | 0 | *ptrptr = ptr; |
258 | 0 | return FALSE; |
259 | 0 | } |
260 | | |
261 | | |
262 | | /************************************************* |
263 | | * Case transformations * |
264 | | *************************************************/ |
265 | | |
266 | 0 | #define PCRE2_SUBSTITUTE_CASE_NONE 0 |
267 | | // 1, 2, 3 are PCRE2_SUBSTITUTE_CASE_LOWER, UPPER, TITLE_FIRST. |
268 | 0 | #define PCRE2_SUBSTITUTE_CASE_REVERSE_TITLE_FIRST 4 |
269 | | |
270 | | typedef struct { |
271 | | int to_case; /* One of PCRE2_SUBSTITUTE_CASE_xyz */ |
272 | | BOOL single_char; |
273 | | } case_state; |
274 | | |
275 | | /* Helper to guess how much a string is likely to increase in size when |
276 | | case-transformed. Usually, strings don't change size at all, but some rare |
277 | | characters do grow. Estimate +10%, plus another few characters. |
278 | | |
279 | | Performing this estimation is unfortunate, but inevitable, since we can't call |
280 | | the callout if we ran out of buffer space to prepare its input. |
281 | | |
282 | | Because this estimate is inexact (and in pathological cases, underestimates the |
283 | | required buffer size) we must document that when you have a |
284 | | substitute_case_callout, and you are using PCRE2_SUBSTITUTE_OVERFLOW_LENGTH, you |
285 | | may need more than two calls to determine the final buffer size. */ |
286 | | |
287 | | static PCRE2_SIZE |
288 | | pessimistic_case_inflation(PCRE2_SIZE len) |
289 | 0 | { |
290 | 0 | return (len >> 3u) + 10; |
291 | 0 | } |
292 | | |
293 | | /* Case transformation behaviour if no callout is passed. */ |
294 | | |
295 | | static PCRE2_SIZE |
296 | | default_substitute_case_callout( |
297 | | PCRE2_SPTR input, PCRE2_SIZE input_len, |
298 | | PCRE2_UCHAR *output, PCRE2_SIZE output_cap, |
299 | | case_state *state, const pcre2_code *code) |
300 | 0 | { |
301 | 0 | PCRE2_SPTR input_end = input + input_len; |
302 | 0 | #ifdef SUPPORT_UNICODE |
303 | 0 | BOOL utf; |
304 | 0 | BOOL ucp; |
305 | 0 | #endif |
306 | 0 | PCRE2_UCHAR temp[6]; |
307 | 0 | BOOL next_to_upper; |
308 | 0 | BOOL rest_to_upper; |
309 | 0 | BOOL single_char; |
310 | 0 | BOOL overflow = FALSE; |
311 | 0 | PCRE2_SIZE written = 0; |
312 | | |
313 | | /* Helpful simplifying invariant: input and output are disjoint buffers. |
314 | | I believe that this code is technically undefined behaviour, because the two |
315 | | pointers input/output are "unrelated" pointers and hence not comparable. Casting |
316 | | via char* bypasses some but not all of those technical rules. It is not included |
317 | | in release builds, in any case. */ |
318 | 0 | PCRE2_ASSERT((char *)(input + input_len) <= (char *)output || |
319 | 0 | (char *)(output + output_cap) <= (char *)input); |
320 | |
|
321 | 0 | #ifdef SUPPORT_UNICODE |
322 | 0 | utf = (code->overall_options & PCRE2_UTF) != 0; |
323 | 0 | ucp = (code->overall_options & PCRE2_UCP) != 0; |
324 | 0 | #endif |
325 | |
|
326 | 0 | if (input_len == 0) return 0; |
327 | | |
328 | 0 | switch (state->to_case) |
329 | 0 | { |
330 | 0 | default: |
331 | 0 | PCRE2_DEBUG_UNREACHABLE(); |
332 | 0 | return 0; |
333 | | |
334 | 0 | case PCRE2_SUBSTITUTE_CASE_LOWER: // Can be single_char TRUE or FALSE |
335 | 0 | case PCRE2_SUBSTITUTE_CASE_UPPER: // Can only be single_char FALSE |
336 | 0 | next_to_upper = rest_to_upper = (state->to_case == PCRE2_SUBSTITUTE_CASE_UPPER); |
337 | 0 | break; |
338 | | |
339 | 0 | case PCRE2_SUBSTITUTE_CASE_TITLE_FIRST: // Can be single_char TRUE or FALSE |
340 | 0 | next_to_upper = TRUE; |
341 | 0 | rest_to_upper = FALSE; |
342 | 0 | state->to_case = PCRE2_SUBSTITUTE_CASE_LOWER; |
343 | 0 | break; |
344 | | |
345 | 0 | case PCRE2_SUBSTITUTE_CASE_REVERSE_TITLE_FIRST: // Can only be single_char FALSE |
346 | 0 | next_to_upper = FALSE; |
347 | 0 | rest_to_upper = TRUE; |
348 | 0 | state->to_case = PCRE2_SUBSTITUTE_CASE_UPPER; |
349 | 0 | break; |
350 | 0 | } |
351 | | |
352 | 0 | single_char = state->single_char; |
353 | 0 | if (single_char) |
354 | 0 | state->to_case = PCRE2_SUBSTITUTE_CASE_NONE; |
355 | |
|
356 | 0 | while (input < input_end) |
357 | 0 | { |
358 | 0 | uint32_t ch; |
359 | 0 | unsigned int chlen; |
360 | |
|
361 | 0 | GETCHARINCTEST(ch, input); |
362 | |
|
363 | 0 | #ifdef SUPPORT_UNICODE |
364 | 0 | if ((utf || ucp) && ch >= 128) |
365 | 0 | { |
366 | 0 | uint32_t type = UCD_CHARTYPE(ch); |
367 | 0 | if (PRIV(ucp_gentype)[type] == ucp_L && |
368 | 0 | type != (next_to_upper? ucp_Lu : ucp_Ll)) |
369 | 0 | ch = UCD_OTHERCASE(ch); |
370 | | |
371 | | /* TODO This is far from correct... it doesn't support the SpecialCasing.txt |
372 | | mappings, but worse, it's not even correct for all the ordinary case |
373 | | mappings. We should add support for those (at least), and then add the |
374 | | SpecialCasing.txt mappings for Esszet and ligatures, and finally use the |
375 | | Turkish casing flag on the match context. */ |
376 | 0 | } |
377 | 0 | else |
378 | 0 | #endif |
379 | 0 | if (MAX_255(ch)) |
380 | 0 | { |
381 | 0 | if (((code->tables + cbits_offset + |
382 | 0 | (next_to_upper? cbit_upper:cbit_lower) |
383 | 0 | )[ch/8] & (1u << (ch%8))) == 0) |
384 | 0 | ch = (code->tables + fcc_offset)[ch]; |
385 | 0 | } |
386 | |
|
387 | 0 | #ifdef SUPPORT_UNICODE |
388 | 0 | if (utf) chlen = PRIV(ord2utf)(ch, temp); else |
389 | 0 | #endif |
390 | 0 | { |
391 | 0 | temp[0] = ch; |
392 | 0 | chlen = 1; |
393 | 0 | } |
394 | |
|
395 | 0 | if (!overflow && chlen <= output_cap) |
396 | 0 | { |
397 | 0 | memcpy(output, temp, CU2BYTES(chlen)); |
398 | 0 | output += chlen; |
399 | 0 | output_cap -= chlen; |
400 | 0 | } |
401 | 0 | else |
402 | 0 | { |
403 | 0 | overflow = TRUE; |
404 | 0 | } |
405 | |
|
406 | 0 | if (chlen > ~(PCRE2_SIZE)0 - written) /* Integer overflow */ |
407 | 0 | return ~(PCRE2_SIZE)0; |
408 | 0 | written += chlen; |
409 | |
|
410 | 0 | next_to_upper = rest_to_upper; |
411 | | |
412 | | /* memcpy the remainder, if only transforming a single character. */ |
413 | |
|
414 | 0 | if (single_char) |
415 | 0 | { |
416 | 0 | PCRE2_SIZE rest_len = input_end - input; |
417 | |
|
418 | 0 | if (!overflow && rest_len <= output_cap) |
419 | 0 | memcpy(output, input, CU2BYTES(rest_len)); |
420 | |
|
421 | 0 | if (rest_len > ~(PCRE2_SIZE)0 - written) /* Integer overflow */ |
422 | 0 | return ~(PCRE2_SIZE)0; |
423 | 0 | written += rest_len; |
424 | |
|
425 | 0 | return written; |
426 | 0 | } |
427 | 0 | } |
428 | | |
429 | 0 | return written; |
430 | 0 | } |
431 | | |
432 | | /* Helper to perform the call to the substitute_case_callout. We wrap the |
433 | | user-provided callout because our internal arguments are slightly extended. We |
434 | | don't want the user callout to handle the case of "\l" (first character only to |
435 | | lowercase) or "\l\U" (first character to lowercase, rest to uppercase) because |
436 | | those are not operations defined by Unicode. Instead the user callout simply |
437 | | needs to provide the three Unicode primitives: lower, upper, titlecase. */ |
438 | | |
439 | | static PCRE2_SIZE |
440 | | do_case_copy( |
441 | | PCRE2_UCHAR *input_output, PCRE2_SIZE input_len, PCRE2_SIZE output_cap, |
442 | | case_state *state, BOOL utf, |
443 | | PCRE2_SIZE (*substitute_case_callout)(PCRE2_SPTR, PCRE2_SIZE, PCRE2_UCHAR *, |
444 | | PCRE2_SIZE, int, void *), |
445 | | void *substitute_case_callout_data) |
446 | 0 | { |
447 | 0 | PCRE2_SPTR input = input_output; |
448 | 0 | PCRE2_UCHAR *output = input_output; |
449 | 0 | PCRE2_SIZE rc; |
450 | 0 | PCRE2_SIZE rc2; |
451 | 0 | int ch1_to_case; |
452 | 0 | int rest_to_case; |
453 | 0 | PCRE2_UCHAR ch1[6]; |
454 | 0 | PCRE2_SIZE ch1_len; |
455 | 0 | PCRE2_SPTR rest; |
456 | 0 | PCRE2_SIZE rest_len; |
457 | 0 | BOOL ch1_overflow = FALSE; |
458 | 0 | BOOL rest_overflow = FALSE; |
459 | |
|
460 | | #if PCRE2_CODE_UNIT_WIDTH == 32 || !defined(SUPPORT_UNICODE) |
461 | | (void)utf; /* Avoid compiler warning. */ |
462 | | #endif |
463 | |
|
464 | 0 | PCRE2_ASSERT(input_len != 0); |
465 | |
|
466 | 0 | switch (state->to_case) |
467 | 0 | { |
468 | 0 | default: |
469 | 0 | PCRE2_DEBUG_UNREACHABLE(); |
470 | 0 | return 0; |
471 | | |
472 | 0 | case PCRE2_SUBSTITUTE_CASE_LOWER: // Can be single_char TRUE or FALSE |
473 | 0 | case PCRE2_SUBSTITUTE_CASE_UPPER: // Can only be single_char FALSE |
474 | 0 | case PCRE2_SUBSTITUTE_CASE_TITLE_FIRST: // Can be single_char TRUE or FALSE |
475 | | |
476 | | /* The easy case, where our internal casing operations align with those of |
477 | | the callout. */ |
478 | |
|
479 | 0 | if (state->single_char == FALSE) |
480 | 0 | { |
481 | 0 | rc = substitute_case_callout(input, input_len, output, output_cap, |
482 | 0 | state->to_case, substitute_case_callout_data); |
483 | |
|
484 | 0 | if (state->to_case == PCRE2_SUBSTITUTE_CASE_TITLE_FIRST) |
485 | 0 | state->to_case = PCRE2_SUBSTITUTE_CASE_LOWER; |
486 | |
|
487 | 0 | return rc; |
488 | 0 | } |
489 | | |
490 | 0 | ch1_to_case = state->to_case; |
491 | 0 | rest_to_case = PCRE2_SUBSTITUTE_CASE_NONE; |
492 | 0 | break; |
493 | | |
494 | 0 | case PCRE2_SUBSTITUTE_CASE_REVERSE_TITLE_FIRST: // Can only be single_char FALSE |
495 | 0 | ch1_to_case = PCRE2_SUBSTITUTE_CASE_LOWER; |
496 | 0 | rest_to_case = PCRE2_SUBSTITUTE_CASE_UPPER; |
497 | 0 | break; |
498 | 0 | } |
499 | | |
500 | | /* Identify the leading character. Take copy, because its storage overlaps with |
501 | | `output`, and hence may be scrambled by the callout. */ |
502 | | |
503 | 0 | { |
504 | 0 | PCRE2_SPTR ch_end = input; |
505 | 0 | uint32_t ch; |
506 | |
|
507 | 0 | GETCHARINCTEST(ch, ch_end); |
508 | 0 | (void) ch; |
509 | 0 | PCRE2_ASSERT(ch_end <= input + input_len && ch_end - input <= 6); |
510 | 0 | ch1_len = ch_end - input; |
511 | 0 | memcpy(ch1, input, CU2BYTES(ch1_len)); |
512 | 0 | } |
513 | |
|
514 | 0 | rest = input + ch1_len; |
515 | 0 | rest_len = input_len - ch1_len; |
516 | | |
517 | | /* Transform just ch1. The buffers are always in-place (input == output). With a |
518 | | custom callout, we need a loop to discover its required buffer size. The loop |
519 | | wouldn't be required if the callout were well-behaved, but it might be naughty |
520 | | and return "5" the first time, then "10" the next time we call it using the |
521 | | exact same input! */ |
522 | |
|
523 | 0 | { |
524 | 0 | PCRE2_SIZE ch1_cap; |
525 | 0 | PCRE2_SIZE max_ch1_cap; |
526 | |
|
527 | 0 | ch1_cap = ch1_len; /* First attempt uses the space vacated by ch1. */ |
528 | 0 | PCRE2_ASSERT(output_cap >= input_len && input_len >= rest_len); |
529 | 0 | max_ch1_cap = output_cap - rest_len; |
530 | |
|
531 | 0 | while (TRUE) |
532 | 0 | { |
533 | 0 | rc = substitute_case_callout(ch1, ch1_len, output, ch1_cap, ch1_to_case, |
534 | 0 | substitute_case_callout_data); |
535 | 0 | if (rc == ~(PCRE2_SIZE)0) return rc; |
536 | | |
537 | 0 | if (rc <= ch1_cap) break; |
538 | | |
539 | 0 | if (rc > max_ch1_cap) |
540 | 0 | { |
541 | 0 | ch1_overflow = TRUE; |
542 | 0 | break; |
543 | 0 | } |
544 | | |
545 | | /* Move the rest to the right, to make room for expanding ch1. */ |
546 | | |
547 | 0 | memmove(input_output + rc, rest, CU2BYTES(rest_len)); |
548 | 0 | rest = input + rc; |
549 | |
|
550 | 0 | ch1_cap = rc; |
551 | | |
552 | | /* Proof of loop termination: `ch1_cap` is growing on each iteration, but |
553 | | the loop ends if `rc` reaches the (unchanging) upper bound of output_cap. */ |
554 | 0 | } |
555 | 0 | } |
556 | | |
557 | 0 | if (rest_to_case == PCRE2_SUBSTITUTE_CASE_NONE) |
558 | 0 | { |
559 | 0 | if (!ch1_overflow) |
560 | 0 | { |
561 | 0 | PCRE2_ASSERT(rest_len <= output_cap - rc); |
562 | 0 | memmove(output + rc, rest, CU2BYTES(rest_len)); |
563 | 0 | } |
564 | 0 | rc2 = rest_len; |
565 | |
|
566 | 0 | state->to_case = PCRE2_SUBSTITUTE_CASE_NONE; |
567 | 0 | } |
568 | 0 | else |
569 | 0 | { |
570 | 0 | PCRE2_UCHAR dummy[1]; |
571 | |
|
572 | 0 | rc2 = substitute_case_callout(rest, rest_len, |
573 | 0 | ch1_overflow? dummy : output + rc, |
574 | 0 | ch1_overflow? 0u : output_cap - rc, |
575 | 0 | rest_to_case, substitute_case_callout_data); |
576 | 0 | if (rc2 == ~(PCRE2_SIZE)0) return rc2; |
577 | | |
578 | 0 | if (!ch1_overflow && rc2 > output_cap - rc) rest_overflow = TRUE; |
579 | | |
580 | | /* If ch1 grows so that `xform(ch1)+rest` can't fit in the buffer, but then |
581 | | `rest` shrinks, it's actually possible for the total calculated length of |
582 | | `xform(ch1)+xform(rest)` to come out at less than output_cap. But we can't |
583 | | report that, because it would make it seem that the operation succeeded. |
584 | | If either of xform(ch1) or xform(rest) won't fit in the buffer, our final |
585 | | result must be > output_cap. */ |
586 | 0 | if (ch1_overflow && rc2 < rest_len) |
587 | 0 | rc2 = rest_len; |
588 | |
|
589 | 0 | state->to_case = PCRE2_SUBSTITUTE_CASE_UPPER; |
590 | 0 | } |
591 | | |
592 | 0 | if (rc2 > ~(PCRE2_SIZE)0 - rc) /* Integer overflow */ |
593 | 0 | return ~(PCRE2_SIZE)0; |
594 | | |
595 | 0 | PCRE2_ASSERT(!(ch1_overflow || rest_overflow) || rc + rc2 > output_cap); |
596 | 0 | (void)rest_overflow; |
597 | |
|
598 | 0 | return rc + rc2; |
599 | 0 | } |
600 | | |
601 | | |
602 | | /************************************************* |
603 | | * Match and substitute * |
604 | | *************************************************/ |
605 | | |
606 | | /* This function applies a compiled re to a subject string and creates a new |
607 | | string with substitutions. The first 7 arguments are the same as for |
608 | | pcre2_match(). Either string length may be PCRE2_ZERO_TERMINATED. |
609 | | |
610 | | Arguments: |
611 | | code points to the compiled expression |
612 | | subject points to the subject string |
613 | | length length of subject string (may contain binary zeros) |
614 | | start_offset where to start in the subject string |
615 | | options option bits |
616 | | match_data points to a match_data block, or is NULL |
617 | | context points to a PCRE2 context |
618 | | replacement points to the replacement string |
619 | | rlength length of replacement string |
620 | | buffer where to put the substituted string |
621 | | blength points to length of buffer; updated to length of string |
622 | | |
623 | | Returns: >= 0 number of substitutions made |
624 | | < 0 an error code |
625 | | PCRE2_ERROR_BADREPLACEMENT means invalid use of $ |
626 | | */ |
627 | | |
/* Append length_ code units from "from" to the output buffer when there is
room for them. When there is not, either fail at once (goto NOROOM) or, if
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set, switch to overflow mode and from then
on only add up the extra space that would have been needed. The macro uses the
enclosing function's buffer, buff_offset, lengthleft, overflowed, extra_needed
and suboptions variables. */

#define CHECKMEMCPY(from, length_) \
  do { \
     PCRE2_SIZE chkmc_length = length_; \
     if (!overflowed && chkmc_length <= lengthleft) \
       { \
       memcpy(buffer + buff_offset, from, CU2BYTES(chkmc_length)); \
       buff_offset += chkmc_length; \
       lengthleft -= chkmc_length; \
       } \
     else if (!overflowed) \
       { \
       /* First shortfall */ \
       if ((suboptions & PCRE2_SUBSTITUTE_OVERFLOW_LENGTH) == 0) goto NOROOM; \
       overflowed = TRUE; \
       extra_needed = chkmc_length - lengthleft; \
       } \
     else \
       { \
       /* Already overflowed: just count */ \
       if (chkmc_length > ~(PCRE2_SIZE)0 - extra_needed) /* Integer overflow */ \
         goto TOOLARGEREPLACE; \
       extra_needed += chkmc_length; \
       } \
     } \
  while (0)
655 | | |
/* Copying with case modification. CHECKCASECPY_BASE holds the shared
bookkeeping: do_call is a statement block that must set chkcc_rc to the number
of code units the transformed text needs; the buffer position is then advanced
if that fits, and otherwise overflow is handled in the same way as in
CHECKMEMCPY(). A "break" inside do_call leaves the macro without running the
bookkeeping.

With no substitute_case_callout, the source and destination buffers must not
overlap, because the default handler does not support that. */

#define CHECKCASECPY_BASE(length_, do_call) \
  do { \
     PCRE2_SIZE chkcc_length = (PCRE2_SIZE)(length_); \
     PCRE2_SIZE chkcc_rc; \
     do_call \
     if (chkcc_rc <= lengthleft) \
       { \
       buff_offset += chkcc_rc; \
       lengthleft -= chkcc_rc; \
       } \
     else \
       { \
       if ((suboptions & PCRE2_SUBSTITUTE_OVERFLOW_LENGTH) == 0) goto NOROOM; \
       overflowed = TRUE; \
       extra_needed = chkcc_rc - lengthleft; \
       } \
     } \
  while (0)

/* Transform "from" with the built-in handler, writing at the current buffer
position. Once in overflow mode the handler is given zero capacity and its
result is only added to extra_needed. */

#define CHECKCASECPY_DEFAULT(from, length_) \
  CHECKCASECPY_BASE(length_, { \
    chkcc_rc = default_substitute_case_callout(from, chkcc_length,            \
                                               buffer + buff_offset,          \
                                               overflowed? 0 : lengthleft,    \
                                               &forcecase, code);             \
    if (overflowed) \
      { \
      if (chkcc_rc > ~(PCRE2_SIZE)0 - extra_needed) /* Integer overflow */ \
        goto TOOLARGEREPLACE; \
      extra_needed += chkcc_rc; \
      break; /* Skip the bookkeeping in CHECKCASECPY_BASE */ \
      } \
  })

/* Transform, in place, the text already sitting at the current buffer position
by way of the application's callout. A failure reported by do_case_copy() is
sent to CASEERROR. */

#define CHECKCASECPY_CALLOUT(length_) \
  CHECKCASECPY_BASE(length_, { \
    chkcc_rc = do_case_copy(buffer + buff_offset, chkcc_length,               \
                            lengthleft, &forcecase, utf,                      \
                            substitute_case_callout,                          \
                            substitute_case_callout_data);                    \
    if (chkcc_rc == ~(PCRE2_SIZE)0) goto CASEERROR; \
  })
704 | | |
/* Delayed case transformation, for use when there is a case-forcing callout.
The text in question is whatever has been added since the casestart_offset /
casestart_extra_needed marks. If it is all in the buffer, the buffer is wound
back to the mark and the text is transformed in place. If the buffer has
already overflowed, the callout cannot be run, so an estimate of the growth is
added to extra_needed instead. */

#define DELAYEDFORCECASE() \
  do { \
     PCRE2_SIZE chars_outstanding = (buff_offset - casestart_offset) + \
                                    (extra_needed - casestart_extra_needed); \
     if (chars_outstanding != 0) \
       { \
       if (!overflowed) \
         { \
         /* Rewind the buffer */ \
         lengthleft += (buff_offset - casestart_offset); \
         buff_offset = casestart_offset; \
         /* Care! In-place case transformation */ \
         CHECKCASECPY_CALLOUT(chars_outstanding); \
         } \
       else \
         { \
         PCRE2_SIZE guess = pessimistic_case_inflation(chars_outstanding); \
         if (guess > ~(PCRE2_SIZE)0 - extra_needed) /* Integer overflow */ \
           goto TOOLARGEREPLACE; \
         extra_needed += guess; \
         } \
       } \
     } \
  while (0)
732 | | |
733 | | |
734 | | /* Here's the function */ |
735 | | |
736 | | PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION |
737 | | pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, |
738 | | PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data, |
739 | | pcre2_match_context *mcontext, PCRE2_SPTR replacement, PCRE2_SIZE rlength, |
740 | | PCRE2_UCHAR *buffer, PCRE2_SIZE *blength) |
741 | 0 | { |
742 | 0 | int rc; |
743 | 0 | int subs; |
744 | 0 | uint32_t ovector_count; |
745 | 0 | uint32_t goptions = 0; |
746 | 0 | uint32_t suboptions; |
747 | 0 | pcre2_match_data *internal_match_data = NULL; |
748 | 0 | BOOL escaped_literal = FALSE; |
749 | 0 | BOOL overflowed = FALSE; |
750 | 0 | BOOL use_existing_match; |
751 | 0 | BOOL replacement_only; |
752 | 0 | BOOL utf = (code->overall_options & PCRE2_UTF) != 0; |
753 | 0 | PCRE2_UCHAR temp[6]; |
754 | 0 | PCRE2_SPTR ptr; |
755 | 0 | PCRE2_SPTR repend = NULL; |
756 | 0 | PCRE2_SIZE extra_needed = 0; |
757 | 0 | PCRE2_SIZE buff_offset, buff_length, lengthleft, fraglength; |
758 | 0 | PCRE2_SIZE *ovector; |
759 | 0 | PCRE2_SIZE ovecsave[3]; |
760 | 0 | pcre2_substitute_callout_block scb; |
761 | 0 | PCRE2_SIZE sub_start_extra_needed; |
762 | 0 | PCRE2_SIZE (*substitute_case_callout)(PCRE2_SPTR, PCRE2_SIZE, PCRE2_UCHAR *, |
763 | 0 | PCRE2_SIZE, int, void *) = NULL; |
764 | 0 | void *substitute_case_callout_data = NULL; |
765 | | |
766 | | /* General initialization */ |
767 | |
|
768 | 0 | buff_offset = 0; |
769 | 0 | lengthleft = buff_length = *blength; |
770 | 0 | *blength = PCRE2_UNSET; |
771 | 0 | ovecsave[0] = ovecsave[1] = ovecsave[2] = PCRE2_UNSET; |
772 | |
|
773 | 0 | if (mcontext != NULL) |
774 | 0 | { |
775 | 0 | substitute_case_callout = mcontext->substitute_case_callout; |
776 | 0 | substitute_case_callout_data = mcontext->substitute_case_callout_data; |
777 | 0 | } |
778 | | |
779 | | /* Partial matching is not valid. This must come after setting *blength to |
780 | | PCRE2_UNSET, so as not to imply an offset in the replacement. */ |
781 | |
|
782 | 0 | if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0) |
783 | 0 | return PCRE2_ERROR_BADOPTION; |
784 | | |
785 | | /* Validate length and find the end of the replacement. A NULL replacement of |
786 | | zero length is interpreted as an empty string. */ |
787 | | |
788 | 0 | if (replacement == NULL) |
789 | 0 | { |
790 | 0 | if (rlength != 0) return PCRE2_ERROR_NULL; |
791 | 0 | replacement = (PCRE2_SPTR)""; |
792 | 0 | } |
793 | | |
794 | 0 | if (rlength == PCRE2_ZERO_TERMINATED) rlength = PRIV(strlen)(replacement); |
795 | 0 | repend = replacement + rlength; |
796 | | |
797 | | /* Check for using a match that has already happened. Note that the subject |
798 | | pointer in the match data may be NULL after a no-match. */ |
799 | |
|
800 | 0 | use_existing_match = ((options & PCRE2_SUBSTITUTE_MATCHED) != 0); |
801 | 0 | replacement_only = ((options & PCRE2_SUBSTITUTE_REPLACEMENT_ONLY) != 0); |
802 | | |
803 | | /* If starting from an existing match, there must be an externally provided |
804 | | match data block. We create an internal match_data block in two cases: (a) an |
805 | | external one is not supplied (and we are not starting from an existing match); |
806 | | (b) an existing match is to be used for the first substitution. In the latter |
807 | | case, we copy the existing match into the internal block, except for any cached |
808 | | heap frame size and pointer. This ensures that no changes are made to the |
809 | | external match data block. */ |
810 | | |
811 | | /* WARNING: In both cases below a general context is constructed "by hand" |
812 | | because calling pcre2_general_context_create() involves a memory allocation. If |
813 | | the contents of a general context control block are ever changed there will |
814 | | have to be changes below. */ |
815 | |
|
816 | 0 | if (match_data == NULL) |
817 | 0 | { |
818 | 0 | pcre2_general_context gcontext; |
819 | 0 | if (use_existing_match) return PCRE2_ERROR_NULL; |
820 | 0 | gcontext.memctl = (mcontext == NULL)? |
821 | 0 | ((const pcre2_real_code *)code)->memctl : |
822 | 0 | ((pcre2_real_match_context *)mcontext)->memctl; |
823 | 0 | match_data = internal_match_data = |
824 | 0 | pcre2_match_data_create_from_pattern(code, &gcontext); |
825 | 0 | if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY; |
826 | 0 | } |
827 | | |
828 | 0 | else if (use_existing_match) |
829 | 0 | { |
830 | 0 | int pairs; |
831 | 0 | pcre2_general_context gcontext; |
832 | 0 | gcontext.memctl = (mcontext == NULL)? |
833 | 0 | ((const pcre2_real_code *)code)->memctl : |
834 | 0 | ((pcre2_real_match_context *)mcontext)->memctl; |
835 | 0 | pairs = (code->top_bracket + 1 < match_data->oveccount)? |
836 | 0 | code->top_bracket + 1 : match_data->oveccount; |
837 | 0 | internal_match_data = pcre2_match_data_create(match_data->oveccount, |
838 | 0 | &gcontext); |
839 | 0 | if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY; |
840 | 0 | memcpy(internal_match_data, match_data, offsetof(pcre2_match_data, ovector) |
841 | 0 | + 2*pairs*sizeof(PCRE2_SIZE)); |
842 | 0 | internal_match_data->heapframes = NULL; |
843 | 0 | internal_match_data->heapframes_size = 0; |
844 | 0 | match_data = internal_match_data; |
845 | 0 | } |
846 | | |
847 | | /* Remember ovector details */ |
848 | | |
849 | 0 | ovector = pcre2_get_ovector_pointer(match_data); |
850 | 0 | ovector_count = pcre2_get_ovector_count(match_data); |
851 | | |
852 | | /* Fixed things in the callout block */ |
853 | |
|
854 | 0 | scb.version = 0; |
855 | 0 | scb.input = subject; |
856 | 0 | scb.output = (PCRE2_SPTR)buffer; |
857 | 0 | scb.ovector = ovector; |
858 | | |
859 | | /* A NULL subject of zero length is treated as an empty string. */ |
860 | |
|
861 | 0 | if (subject == NULL) |
862 | 0 | { |
863 | 0 | if (length != 0) return PCRE2_ERROR_NULL; |
864 | 0 | subject = (PCRE2_SPTR)""; |
865 | 0 | } |
866 | | |
867 | | /* Find length of zero-terminated subject */ |
868 | | |
869 | 0 | if (length == PCRE2_ZERO_TERMINATED) |
870 | 0 | length = subject? PRIV(strlen)(subject) : 0; |
871 | | |
872 | | /* Check UTF replacement string if necessary. */ |
873 | |
|
874 | 0 | #ifdef SUPPORT_UNICODE |
875 | 0 | if (utf && (options & PCRE2_NO_UTF_CHECK) == 0) |
876 | 0 | { |
877 | 0 | rc = PRIV(valid_utf)(replacement, rlength, &(match_data->startchar)); |
878 | 0 | if (rc != 0) |
879 | 0 | { |
880 | 0 | match_data->leftchar = 0; |
881 | 0 | goto EXIT; |
882 | 0 | } |
883 | 0 | } |
884 | 0 | #endif /* SUPPORT_UNICODE */ |
885 | | |
886 | | /* Save the substitute options and remove them from the match options. */ |
887 | | |
888 | 0 | suboptions = options & SUBSTITUTE_OPTIONS; |
889 | 0 | options &= ~SUBSTITUTE_OPTIONS; |
890 | | |
891 | | /* Error if the start match offset is greater than the length of the subject. */ |
892 | |
|
893 | 0 | if (start_offset > length) |
894 | 0 | { |
895 | 0 | match_data->leftchar = 0; |
896 | 0 | rc = PCRE2_ERROR_BADOFFSET; |
897 | 0 | goto EXIT; |
898 | 0 | } |
899 | | |
900 | | /* Copy up to the start offset, unless only the replacement is required. */ |
901 | | |
902 | 0 | if (!replacement_only) CHECKMEMCPY(subject, start_offset); |
903 | | |
904 | | /* Loop for global substituting. If PCRE2_SUBSTITUTE_MATCHED is set, the first |
905 | | match is taken from the match_data that was passed in. */ |
906 | | |
907 | 0 | subs = 0; |
908 | 0 | do |
909 | 0 | { |
910 | 0 | PCRE2_SPTR ptrstack[PTR_STACK_SIZE]; |
911 | 0 | uint32_t ptrstackptr = 0; |
912 | 0 | case_state forcecase = { PCRE2_SUBSTITUTE_CASE_NONE, FALSE }; |
913 | 0 | PCRE2_SIZE casestart_offset = 0; |
914 | 0 | PCRE2_SIZE casestart_extra_needed = 0; |
915 | |
|
916 | 0 | if (use_existing_match) |
917 | 0 | { |
918 | 0 | rc = match_data->rc; |
919 | 0 | use_existing_match = FALSE; |
920 | 0 | } |
921 | 0 | else rc = pcre2_match(code, subject, length, start_offset, options|goptions, |
922 | 0 | match_data, mcontext); |
923 | |
|
924 | 0 | #ifdef SUPPORT_UNICODE |
925 | 0 | if (utf) options |= PCRE2_NO_UTF_CHECK; /* Only need to check once */ |
926 | 0 | #endif |
927 | | |
928 | | /* Any error other than no match returns the error code. No match when not |
929 | | doing the special after-empty-match global rematch, or when at the end of the |
930 | | subject, breaks the global loop. Otherwise, advance the starting point by one |
931 | | character, copying it to the output, and try again. */ |
932 | |
|
933 | 0 | if (rc < 0) |
934 | 0 | { |
935 | 0 | PCRE2_SIZE save_start; |
936 | |
|
937 | 0 | if (rc != PCRE2_ERROR_NOMATCH) goto EXIT; |
938 | 0 | if (goptions == 0 || start_offset >= length) break; |
939 | | |
940 | | /* Advance by one code point. Then, if CRLF is a valid newline sequence and |
941 | | we have advanced into the middle of it, advance one more code point. In |
942 | | other words, do not start in the middle of CRLF, even if CR and LF on their |
943 | | own are valid newlines. */ |
944 | | |
945 | 0 | save_start = start_offset++; |
946 | 0 | if (subject[start_offset-1] == CHAR_CR && |
947 | 0 | (code->newline_convention == PCRE2_NEWLINE_CRLF || |
948 | 0 | code->newline_convention == PCRE2_NEWLINE_ANY || |
949 | 0 | code->newline_convention == PCRE2_NEWLINE_ANYCRLF) && |
950 | 0 | start_offset < length && |
951 | 0 | subject[start_offset] == CHAR_LF) |
952 | 0 | start_offset++; |
953 | | |
954 | | /* Otherwise, in UTF mode, advance past any secondary code units. */ |
955 | | |
956 | 0 | else if ((code->overall_options & PCRE2_UTF) != 0) |
957 | 0 | { |
958 | 0 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
959 | 0 | while (start_offset < length && (subject[start_offset] & 0xc0) == 0x80) |
960 | 0 | start_offset++; |
961 | | #elif PCRE2_CODE_UNIT_WIDTH == 16 |
962 | | while (start_offset < length && |
963 | | (subject[start_offset] & 0xfc00) == 0xdc00) |
964 | | start_offset++; |
965 | | #endif |
966 | 0 | } |
967 | | |
968 | | /* Copy what we have advanced past (unless not required), reset the special |
969 | | global options, and continue to the next match. */ |
970 | |
|
971 | 0 | fraglength = start_offset - save_start; |
972 | 0 | if (!replacement_only) CHECKMEMCPY(subject + save_start, fraglength); |
973 | 0 | goptions = 0; |
974 | 0 | continue; |
975 | 0 | } |
976 | | |
977 | | /* Handle a successful match. Matches that use \K to end before they start |
978 | | or start before the current point in the subject are not supported. */ |
979 | | |
980 | 0 | if (ovector[1] < ovector[0] || ovector[0] < start_offset) |
981 | 0 | { |
982 | 0 | rc = PCRE2_ERROR_BADSUBSPATTERN; |
983 | 0 | goto EXIT; |
984 | 0 | } |
985 | | |
986 | | /* Check for the same match as previous. This is legitimate after matching an |
987 | | empty string that starts after the initial match offset. We have tried again |
988 | | at the match point in case the pattern is one like /(?<=\G.)/ which can never |
989 | | match at its starting point, so running the match achieves the bumpalong. If |
990 | | we do get the same (null) match at the original match point, it isn't such a |
991 | | pattern, so we now do the empty string magic. In all other cases, a repeat |
992 | | match should never occur. */ |
993 | | |
994 | 0 | if (ovecsave[0] == ovector[0] && ovecsave[1] == ovector[1]) |
995 | 0 | { |
996 | 0 | if (ovector[0] == ovector[1] && ovecsave[2] != start_offset) |
997 | 0 | { |
998 | 0 | goptions = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED; |
999 | 0 | ovecsave[2] = start_offset; |
1000 | 0 | continue; /* Back to the top of the loop */ |
1001 | 0 | } |
1002 | 0 | rc = PCRE2_ERROR_INTERNAL_DUPMATCH; |
1003 | 0 | goto EXIT; |
1004 | 0 | } |
1005 | | |
1006 | | /* Count substitutions with a paranoid check for integer overflow; surely no |
1007 | | real call to this function would ever hit this! */ |
1008 | | |
1009 | 0 | if (subs == INT_MAX) |
1010 | 0 | { |
1011 | 0 | rc = PCRE2_ERROR_TOOMANYREPLACE; |
1012 | 0 | goto EXIT; |
1013 | 0 | } |
1014 | 0 | subs++; |
1015 | | |
1016 | | /* Copy the text leading up to the match (unless not required); remember |
1017 | | where the insert begins and how many ovector pairs are set; and remember how |
1018 | | much space we have requested in extra_needed. */ |
1019 | |
|
1020 | 0 | if (rc == 0) rc = ovector_count; |
1021 | 0 | fraglength = ovector[0] - start_offset; |
1022 | 0 | if (!replacement_only) CHECKMEMCPY(subject + start_offset, fraglength); |
1023 | 0 | scb.output_offsets[0] = buff_offset; |
1024 | 0 | scb.oveccount = rc; |
1025 | 0 | sub_start_extra_needed = extra_needed; |
1026 | | |
1027 | | /* Process the replacement string. If the entire replacement is literal, just |
1028 | | copy it with length check. */ |
1029 | |
|
1030 | 0 | ptr = replacement; |
1031 | 0 | if ((suboptions & PCRE2_SUBSTITUTE_LITERAL) != 0) |
1032 | 0 | { |
1033 | 0 | CHECKMEMCPY(ptr, rlength); |
1034 | 0 | } |
1035 | | |
1036 | | /* Within a non-literal replacement, which must be scanned character by |
1037 | | character, local literal mode can be set by \Q, but only in extended mode |
1038 | | when backslashes are being interpreted. In extended mode we must handle |
1039 | | nested substrings that are to be reprocessed. */ |
1040 | | |
1041 | 0 | else for (;;) |
1042 | 0 | { |
1043 | 0 | uint32_t ch; |
1044 | 0 | unsigned int chlen; |
1045 | 0 | int group; |
1046 | 0 | uint32_t special; |
1047 | 0 | PCRE2_SPTR text1_start = NULL; |
1048 | 0 | PCRE2_SPTR text1_end = NULL; |
1049 | 0 | PCRE2_SPTR text2_start = NULL; |
1050 | 0 | PCRE2_SPTR text2_end = NULL; |
1051 | 0 | PCRE2_UCHAR name[MAX_NAME_SIZE + 1]; |
1052 | | |
1053 | | /* If at the end of a nested substring, pop the stack. */ |
1054 | |
|
1055 | 0 | if (ptr >= repend) |
1056 | 0 | { |
1057 | 0 | if (ptrstackptr == 0) break; /* End of replacement string */ |
1058 | 0 | repend = ptrstack[--ptrstackptr]; |
1059 | 0 | ptr = ptrstack[--ptrstackptr]; |
1060 | 0 | continue; |
1061 | 0 | } |
1062 | | |
1063 | | /* Handle the next character */ |
1064 | | |
1065 | 0 | if (escaped_literal) |
1066 | 0 | { |
1067 | 0 | if (ptr[0] == CHAR_BACKSLASH && ptr < repend - 1 && ptr[1] == CHAR_E) |
1068 | 0 | { |
1069 | 0 | escaped_literal = FALSE; |
1070 | 0 | ptr += 2; |
1071 | 0 | continue; |
1072 | 0 | } |
1073 | 0 | goto LOADLITERAL; |
1074 | 0 | } |
1075 | | |
1076 | | /* Not in literal mode. */ |
1077 | | |
1078 | 0 | if (*ptr == CHAR_DOLLAR_SIGN) |
1079 | 0 | { |
1080 | 0 | BOOL inparens; |
1081 | 0 | BOOL inangle; |
1082 | 0 | BOOL star; |
1083 | 0 | PCRE2_SIZE sublength; |
1084 | 0 | PCRE2_UCHAR next; |
1085 | 0 | PCRE2_SPTR subptr, subptrend; |
1086 | |
|
1087 | 0 | if (++ptr >= repend) goto BAD; |
1088 | 0 | if ((next = *ptr) == CHAR_DOLLAR_SIGN) goto LOADLITERAL; |
1089 | | |
1090 | 0 | special = 0; |
1091 | 0 | text1_start = NULL; |
1092 | 0 | text1_end = NULL; |
1093 | 0 | text2_start = NULL; |
1094 | 0 | text2_end = NULL; |
1095 | 0 | group = -1; |
1096 | 0 | inparens = FALSE; |
1097 | 0 | inangle = FALSE; |
1098 | 0 | star = FALSE; |
1099 | 0 | subptr = NULL; |
1100 | 0 | subptrend = NULL; |
1101 | | |
1102 | | /* Special $ sequences, as supported by Perl, JavaScript, .NET and others. */ |
1103 | 0 | if (next == CHAR_AMPERSAND) |
1104 | 0 | { |
1105 | 0 | ++ptr; |
1106 | 0 | group = 0; |
1107 | 0 | goto GROUP_SUBSTITUTE; |
1108 | 0 | } |
1109 | 0 | if (next == CHAR_GRAVE_ACCENT || next == CHAR_APOSTROPHE) |
1110 | 0 | { |
1111 | 0 | ++ptr; |
1112 | 0 | rc = pcre2_substring_length_bynumber(match_data, 0, &sublength); |
1113 | 0 | if (rc < 0) goto PTREXIT; /* (Sanity-check ovector before reading from it.) */ |
1114 | | |
1115 | 0 | if (next == CHAR_GRAVE_ACCENT) |
1116 | 0 | { |
1117 | 0 | subptr = subject; |
1118 | 0 | subptrend = subject + ovector[0]; |
1119 | 0 | } |
1120 | 0 | else |
1121 | 0 | { |
1122 | 0 | subptr = subject + ovector[1]; |
1123 | 0 | subptrend = subject + length; |
1124 | 0 | } |
1125 | |
|
1126 | 0 | goto SUBPTR_SUBSTITUTE; |
1127 | 0 | } |
1128 | 0 | if (next == CHAR_UNDERSCORE) |
1129 | 0 | { |
1130 | | /* Java, .NET support $_ for "entire input string". */ |
1131 | 0 | ++ptr; |
1132 | 0 | subptr = subject; |
1133 | 0 | subptrend = subject + length; |
1134 | 0 | goto SUBPTR_SUBSTITUTE; |
1135 | 0 | } |
1136 | | |
1137 | 0 | if (next == CHAR_LEFT_CURLY_BRACKET) |
1138 | 0 | { |
1139 | 0 | if (++ptr >= repend) goto BAD; |
1140 | 0 | next = *ptr; |
1141 | 0 | inparens = TRUE; |
1142 | 0 | } |
1143 | 0 | else if (next == CHAR_LESS_THAN_SIGN) |
1144 | 0 | { |
1145 | | /* JavaScript compatibility syntax, $<name>. Processes only named |
1146 | | groups (not numbered) and does not support extensions such as star |
1147 | | (you can do ${name} and ${*name}, but not $<*name>). */ |
1148 | 0 | if (++ptr >= repend) goto BAD; |
1149 | 0 | next = *ptr; |
1150 | 0 | inangle = TRUE; |
1151 | 0 | } |
1152 | | |
1153 | 0 | if (!inangle && next == CHAR_ASTERISK) |
1154 | 0 | { |
1155 | 0 | if (++ptr >= repend) goto BAD; |
1156 | 0 | next = *ptr; |
1157 | 0 | star = TRUE; |
1158 | 0 | } |
1159 | | |
1160 | 0 | if (!star && !inangle && next >= CHAR_0 && next <= CHAR_9) |
1161 | 0 | { |
1162 | 0 | group = next - CHAR_0; |
1163 | 0 | while (++ptr < repend) |
1164 | 0 | { |
1165 | 0 | next = *ptr; |
1166 | 0 | if (next < CHAR_0 || next > CHAR_9) break; |
1167 | 0 | group = group * 10 + (next - CHAR_0); |
1168 | | |
1169 | | /* A check for a number greater than the highest captured group |
1170 | | is sufficient here; no need for a separate overflow check. If unknown |
1171 | | groups are to be treated as unset, just skip over any remaining |
1172 | | digits and carry on. */ |
1173 | |
|
1174 | 0 | if (group > code->top_bracket) |
1175 | 0 | { |
1176 | 0 | if ((suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0) |
1177 | 0 | { |
1178 | 0 | while (++ptr < repend && *ptr >= CHAR_0 && *ptr <= CHAR_9); |
1179 | 0 | break; |
1180 | 0 | } |
1181 | 0 | else |
1182 | 0 | { |
1183 | 0 | rc = PCRE2_ERROR_NOSUBSTRING; |
1184 | 0 | goto PTREXIT; |
1185 | 0 | } |
1186 | 0 | } |
1187 | 0 | } |
1188 | 0 | } |
1189 | 0 | else |
1190 | 0 | { |
1191 | 0 | PCRE2_SIZE name_len; |
1192 | 0 | PCRE2_SPTR name_start = ptr; |
1193 | 0 | if (!read_name_subst(&ptr, repend, utf, code->tables + ctypes_offset)) |
1194 | 0 | goto BAD; |
1195 | 0 | name_len = ptr - name_start; |
1196 | 0 | memcpy(name, name_start, CU2BYTES(name_len)); |
1197 | 0 | name[name_len] = 0; |
1198 | 0 | } |
1199 | | |
1200 | 0 | next = 0; /* not used or updated after this point */ |
1201 | 0 | (void)next; |
1202 | | |
1203 | | /* In extended mode we recognize ${name:+set text:unset text} and |
1204 | | ${name:-default text}. */ |
1205 | |
|
1206 | 0 | if (inparens) |
1207 | 0 | { |
1208 | 0 | if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 && |
1209 | 0 | !star && ptr < repend - 2 && *ptr == CHAR_COLON) |
1210 | 0 | { |
1211 | 0 | special = *(++ptr); |
1212 | 0 | if (special != CHAR_PLUS && special != CHAR_MINUS) |
1213 | 0 | { |
1214 | 0 | rc = PCRE2_ERROR_BADSUBSTITUTION; |
1215 | 0 | goto PTREXIT; |
1216 | 0 | } |
1217 | | |
1218 | 0 | text1_start = ++ptr; |
1219 | 0 | rc = find_text_end(code, &ptr, repend, special == CHAR_MINUS); |
1220 | 0 | if (rc != 0) goto PTREXIT; |
1221 | 0 | text1_end = ptr; |
1222 | |
|
1223 | 0 | if (special == CHAR_PLUS && *ptr == CHAR_COLON) |
1224 | 0 | { |
1225 | 0 | text2_start = ++ptr; |
1226 | 0 | rc = find_text_end(code, &ptr, repend, TRUE); |
1227 | 0 | if (rc != 0) goto PTREXIT; |
1228 | 0 | text2_end = ptr; |
1229 | 0 | } |
1230 | 0 | } |
1231 | | |
1232 | 0 | else |
1233 | 0 | { |
1234 | 0 | if (ptr >= repend || *ptr != CHAR_RIGHT_CURLY_BRACKET) |
1235 | 0 | { |
1236 | 0 | rc = PCRE2_ERROR_REPMISSINGBRACE; |
1237 | 0 | goto PTREXIT; |
1238 | 0 | } |
1239 | 0 | } |
1240 | | |
1241 | 0 | ptr++; |
1242 | 0 | } |
1243 | | |
1244 | 0 | if (inangle) |
1245 | 0 | { |
1246 | 0 | if (ptr >= repend || *ptr != CHAR_GREATER_THAN_SIGN) |
1247 | 0 | goto BAD; |
1248 | 0 | ptr++; |
1249 | 0 | } |
1250 | | |
1251 | | /* Have found a syntactically correct group number or name, or *name. |
1252 | | Only *MARK is currently recognized. */ |
1253 | | |
1254 | 0 | if (star) |
1255 | 0 | { |
1256 | 0 | if (PRIV(strcmp_c8)(name, STRING_MARK) == 0) |
1257 | 0 | { |
1258 | 0 | PCRE2_SPTR mark = pcre2_get_mark(match_data); |
1259 | 0 | if (mark != NULL) |
1260 | 0 | { |
1261 | | /* Peek backwards one code unit to obtain the length of the mark. |
1262 | | It can (theoretically) contain an embedded NUL. */ |
1263 | 0 | fraglength = mark[-1]; |
1264 | 0 | if (forcecase.to_case != PCRE2_SUBSTITUTE_CASE_NONE && |
1265 | 0 | substitute_case_callout == NULL) |
1266 | 0 | CHECKCASECPY_DEFAULT(mark, fraglength); |
1267 | 0 | else |
1268 | 0 | CHECKMEMCPY(mark, fraglength); |
1269 | 0 | } |
1270 | 0 | } |
1271 | 0 | else goto BAD; |
1272 | 0 | } |
1273 | | |
1274 | | /* Substitute the contents of a group. We don't use substring_copy |
1275 | | functions any more, in order to support case forcing. */ |
1276 | | |
1277 | 0 | else |
1278 | 0 | { |
1279 | 0 | GROUP_SUBSTITUTE: |
1280 | | /* Find a number for a named group. In case there are duplicate names, |
1281 | | search for the first one that is set. If the name is not found when |
1282 | | PCRE2_SUBSTITUTE_UNKNOWN_EMPTY is set, set the group number to a |
1283 | | non-existent group. */ |
1284 | |
|
1285 | 0 | if (group < 0) |
1286 | 0 | { |
1287 | 0 | PCRE2_SPTR first, last, entry; |
1288 | 0 | rc = pcre2_substring_nametable_scan(code, name, &first, &last); |
1289 | 0 | if (rc == PCRE2_ERROR_NOSUBSTRING && |
1290 | 0 | (suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0) |
1291 | 0 | { |
1292 | 0 | group = code->top_bracket + 1; |
1293 | 0 | } |
1294 | 0 | else |
1295 | 0 | { |
1296 | 0 | if (rc < 0) goto PTREXIT; |
1297 | 0 | for (entry = first; entry <= last; entry += rc) |
1298 | 0 | { |
1299 | 0 | uint32_t ng = GET2(entry, 0); |
1300 | 0 | if (ng < ovector_count) |
1301 | 0 | { |
1302 | 0 | if (group < 0) group = ng; /* First in ovector */ |
1303 | 0 | if (ovector[ng*2] != PCRE2_UNSET) |
1304 | 0 | { |
1305 | 0 | group = ng; /* First that is set */ |
1306 | 0 | break; |
1307 | 0 | } |
1308 | 0 | } |
1309 | 0 | } |
1310 | | |
1311 | | /* If group is still negative, it means we did not find a group |
1312 | | that is in the ovector. Just set the first group. */ |
1313 | |
|
1314 | 0 | if (group < 0) group = GET2(first, 0); |
1315 | 0 | } |
1316 | 0 | } |
1317 | | |
1318 | | /* We now have a group that is identified by number. Find the length of |
1319 | | the captured string. If a group in a non-special substitution is unset |
1320 | | when PCRE2_SUBSTITUTE_UNSET_EMPTY is set, substitute nothing. */ |
1321 | | |
1322 | 0 | rc = pcre2_substring_length_bynumber(match_data, group, &sublength); |
1323 | 0 | if (rc < 0) |
1324 | 0 | { |
1325 | 0 | if (rc == PCRE2_ERROR_NOSUBSTRING && |
1326 | 0 | (suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0) |
1327 | 0 | { |
1328 | 0 | rc = PCRE2_ERROR_UNSET; |
1329 | 0 | } |
1330 | 0 | if (rc != PCRE2_ERROR_UNSET) goto PTREXIT; /* Non-unset errors */ |
1331 | 0 | if (special == 0) /* Plain substitution */ |
1332 | 0 | { |
1333 | 0 | if ((suboptions & PCRE2_SUBSTITUTE_UNSET_EMPTY) != 0) continue; |
1334 | 0 | goto PTREXIT; /* Else error */ |
1335 | 0 | } |
1336 | 0 | } |
1337 | | |
1338 | | /* If special is '+' we have a 'set' and possibly an 'unset' text, |
1339 | | both of which are reprocessed when used. If special is '-' we have a |
1340 | | default text for when the group is unset; it must be reprocessed. */ |
1341 | | |
1342 | 0 | if (special != 0) |
1343 | 0 | { |
1344 | 0 | if (special == CHAR_MINUS) |
1345 | 0 | { |
1346 | 0 | if (rc == 0) goto LITERAL_SUBSTITUTE; |
1347 | 0 | text2_start = text1_start; |
1348 | 0 | text2_end = text1_end; |
1349 | 0 | } |
1350 | | |
1351 | 0 | if (ptrstackptr >= PTR_STACK_SIZE) goto BAD; |
1352 | 0 | ptrstack[ptrstackptr++] = ptr; |
1353 | 0 | ptrstack[ptrstackptr++] = repend; |
1354 | |
|
1355 | 0 | if (rc == 0) |
1356 | 0 | { |
1357 | 0 | ptr = text1_start; |
1358 | 0 | repend = text1_end; |
1359 | 0 | } |
1360 | 0 | else |
1361 | 0 | { |
1362 | 0 | ptr = text2_start; |
1363 | 0 | repend = text2_end; |
1364 | 0 | } |
1365 | 0 | continue; |
1366 | 0 | } |
1367 | | |
1368 | | /* Otherwise we have a literal substitution of a group's contents. */ |
1369 | | |
1370 | 0 | LITERAL_SUBSTITUTE: |
1371 | 0 | subptr = subject + ovector[group*2]; |
1372 | 0 | subptrend = subject + ovector[group*2 + 1]; |
1373 | | |
1374 | | /* Substitute a literal string, possibly forcing alphabetic case. */ |
1375 | |
|
1376 | 0 | SUBPTR_SUBSTITUTE: |
1377 | 0 | if (forcecase.to_case != PCRE2_SUBSTITUTE_CASE_NONE && |
1378 | 0 | substitute_case_callout == NULL) |
1379 | 0 | CHECKCASECPY_DEFAULT(subptr, subptrend - subptr); |
1380 | 0 | else |
1381 | 0 | CHECKMEMCPY(subptr, subptrend - subptr); |
1382 | 0 | } |
1383 | 0 | } /* End of $ processing */ |
1384 | | |
1385 | | /* Handle an escape sequence in extended mode. We can use check_escape() |
1386 | | to process \Q, \E, \c, \o, \x and \ followed by non-alphanumerics, but |
1387 | | the case-forcing escapes are not supported in pcre2_compile() so must be |
1388 | | recognized here. */ |
1389 | | |
1390 | 0 | else if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 && |
1391 | 0 | *ptr == CHAR_BACKSLASH) |
1392 | 0 | { |
1393 | 0 | int errorcode; |
1394 | 0 | case_state new_forcecase = { PCRE2_SUBSTITUTE_CASE_NONE, FALSE }; |
1395 | |
|
1396 | 0 | if (ptr < repend - 1) switch (ptr[1]) |
1397 | 0 | { |
1398 | 0 | case CHAR_L: |
1399 | 0 | new_forcecase.to_case = PCRE2_SUBSTITUTE_CASE_LOWER; |
1400 | 0 | new_forcecase.single_char = FALSE; |
1401 | 0 | ptr += 2; |
1402 | 0 | break; |
1403 | | |
1404 | 0 | case CHAR_l: |
1405 | 0 | new_forcecase.to_case = PCRE2_SUBSTITUTE_CASE_LOWER; |
1406 | 0 | new_forcecase.single_char = TRUE; |
1407 | 0 | ptr += 2; |
1408 | 0 | if (ptr + 2 < repend && ptr[0] == CHAR_BACKSLASH && ptr[1] == CHAR_U) |
1409 | 0 | { |
1410 | | /* Perl reverse-title-casing feature for \l\U */ |
1411 | 0 | new_forcecase.to_case = PCRE2_SUBSTITUTE_CASE_REVERSE_TITLE_FIRST; |
1412 | 0 | new_forcecase.single_char = FALSE; |
1413 | 0 | ptr += 2; |
1414 | 0 | } |
1415 | 0 | break; |
1416 | | |
1417 | 0 | case CHAR_U: |
1418 | 0 | new_forcecase.to_case = PCRE2_SUBSTITUTE_CASE_UPPER; |
1419 | 0 | new_forcecase.single_char = FALSE; |
1420 | 0 | ptr += 2; |
1421 | 0 | break; |
1422 | | |
1423 | 0 | case CHAR_u: |
1424 | 0 | new_forcecase.to_case = PCRE2_SUBSTITUTE_CASE_TITLE_FIRST; |
1425 | 0 | new_forcecase.single_char = TRUE; |
1426 | 0 | ptr += 2; |
1427 | 0 | if (ptr + 2 < repend && ptr[0] == CHAR_BACKSLASH && ptr[1] == CHAR_L) |
1428 | 0 | { |
1429 | | /* Perl title-casing feature for \u\L */ |
1430 | 0 | new_forcecase.to_case = PCRE2_SUBSTITUTE_CASE_TITLE_FIRST; |
1431 | 0 | new_forcecase.single_char = FALSE; |
1432 | 0 | ptr += 2; |
1433 | 0 | } |
1434 | 0 | break; |
1435 | | |
1436 | 0 | default: |
1437 | 0 | break; |
1438 | 0 | } |
1439 | | |
1440 | 0 | if (new_forcecase.to_case != PCRE2_SUBSTITUTE_CASE_NONE) |
1441 | 0 | { |
1442 | 0 | SETFORCECASE: |
1443 | | |
1444 | | /* If the substitute_case_callout is unset, our case-forcing is done |
1445 | | immediately. If there is a callout however, then its action is delayed |
1446 | | until all the characters have been collected. |
1447 | | |
1448 | | Apply the callout now, before we set the new casing mode. */ |
1449 | |
|
1450 | 0 | if (substitute_case_callout != NULL && |
1451 | 0 | forcecase.to_case != PCRE2_SUBSTITUTE_CASE_NONE) |
1452 | 0 | DELAYEDFORCECASE(); |
1453 | | |
1454 | 0 | forcecase = new_forcecase; |
1455 | 0 | casestart_offset = buff_offset; |
1456 | 0 | casestart_extra_needed = extra_needed; |
1457 | 0 | continue; |
1458 | 0 | } |
1459 | | |
1460 | 0 | ptr++; /* Point after \ */ |
1461 | 0 | rc = PRIV(check_escape)(&ptr, repend, &ch, &errorcode, |
1462 | 0 | code->overall_options, code->extra_options, code->top_bracket, FALSE, NULL); |
1463 | 0 | if (errorcode != 0) goto BADESCAPE; |
1464 | | |
1465 | 0 | switch(rc) |
1466 | 0 | { |
1467 | 0 | case ESC_E: |
1468 | 0 | goto SETFORCECASE; |
1469 | | |
1470 | 0 | case ESC_Q: |
1471 | 0 | escaped_literal = TRUE; |
1472 | 0 | continue; |
1473 | | |
1474 | 0 | case 0: /* Data character */ |
1475 | 0 | case ESC_b: /* \b is backspace in a substitution */ |
1476 | 0 | case ESC_v: /* \v is vertical tab in a substitution */ |
1477 | |
|
1478 | 0 | if (rc == ESC_b) ch = CHAR_BS; |
1479 | 0 | if (rc == ESC_v) ch = CHAR_VT; |
1480 | |
|
1481 | 0 | #ifdef SUPPORT_UNICODE |
1482 | 0 | if (utf) chlen = PRIV(ord2utf)(ch, temp); else |
1483 | 0 | #endif |
1484 | 0 | { |
1485 | 0 | temp[0] = ch; |
1486 | 0 | chlen = 1; |
1487 | 0 | } |
1488 | |
|
1489 | 0 | if (forcecase.to_case != PCRE2_SUBSTITUTE_CASE_NONE && |
1490 | 0 | substitute_case_callout == NULL) |
1491 | 0 | CHECKCASECPY_DEFAULT(temp, chlen); |
1492 | 0 | else |
1493 | 0 | CHECKMEMCPY(temp, chlen); |
1494 | 0 | continue; |
1495 | | |
1496 | 0 | case ESC_g: |
1497 | 0 | { |
1498 | 0 | PCRE2_SIZE name_len; |
1499 | 0 | PCRE2_SPTR name_start; |
1500 | | |
1501 | | /* Parse the \g<name> form (\g<number> already handled by check_escape) */ |
1502 | 0 | if (ptr >= repend || *ptr != CHAR_LESS_THAN_SIGN) |
1503 | 0 | goto BADESCAPE; |
1504 | 0 | ++ptr; |
1505 | |
|
1506 | 0 | name_start = ptr; |
1507 | 0 | if (!read_name_subst(&ptr, repend, utf, code->tables + ctypes_offset)) |
1508 | 0 | goto BADESCAPE; |
1509 | 0 | name_len = ptr - name_start; |
1510 | |
|
1511 | 0 | if (ptr >= repend || *ptr != CHAR_GREATER_THAN_SIGN) |
1512 | 0 | goto BADESCAPE; |
1513 | 0 | ++ptr; |
1514 | |
|
1515 | 0 | special = 0; |
1516 | 0 | group = -1; |
1517 | 0 | memcpy(name, name_start, CU2BYTES(name_len)); |
1518 | 0 | name[name_len] = 0; |
1519 | 0 | goto GROUP_SUBSTITUTE; |
1520 | 0 | } |
1521 | | |
1522 | 0 | default: |
1523 | 0 | if (rc < 0) |
1524 | 0 | { |
1525 | 0 | special = 0; |
1526 | 0 | group = -rc - 1; |
1527 | 0 | goto GROUP_SUBSTITUTE; |
1528 | 0 | } |
1529 | 0 | goto BADESCAPE; |
1530 | 0 | } |
1531 | 0 | } /* End of backslash processing */ |
1532 | | |
1533 | | /* Handle a literal code unit */ |
1534 | | |
1535 | 0 | else |
1536 | 0 | { |
1537 | 0 | PCRE2_SPTR ch_start; |
1538 | |
|
1539 | 0 | LOADLITERAL: |
1540 | 0 | ch_start = ptr; |
1541 | 0 | GETCHARINCTEST(ch, ptr); /* Get character value, increment pointer */ |
1542 | 0 | (void) ch; |
1543 | |
|
1544 | 0 | if (forcecase.to_case != PCRE2_SUBSTITUTE_CASE_NONE && |
1545 | 0 | substitute_case_callout == NULL) |
1546 | 0 | CHECKCASECPY_DEFAULT(ch_start, ptr - ch_start); |
1547 | 0 | else |
1548 | 0 | CHECKMEMCPY(ch_start, ptr - ch_start); |
1549 | 0 | } /* End handling a literal code unit */ |
1550 | 0 | } /* End of loop for scanning the replacement. */ |
1551 | | |
1552 | | /* If the substitute_case_callout is unset, our case-forcing is done |
1553 | | immediately. If there is a callout however, then its action is delayed |
1554 | | until all the characters have been collected. |
1555 | | |
1556 | | We now clean up any trailing section of the replacement for which we deferred |
1557 | | the case-forcing. */ |
1558 | | |
1559 | 0 | if (substitute_case_callout != NULL && |
1560 | 0 | forcecase.to_case != PCRE2_SUBSTITUTE_CASE_NONE) |
1561 | 0 | DELAYEDFORCECASE(); |
1562 | | |
1563 | | /* The replacement has been copied to the output, or its size has been |
1564 | | remembered. Handle the callout if there is one. */ |
1565 | | |
1566 | 0 | if (mcontext != NULL && mcontext->substitute_callout != NULL) |
1567 | 0 | { |
1568 | | /* If we have an actual (non-simulated) replacement, do the callout. */ |
1569 | |
|
1570 | 0 | if (!overflowed) |
1571 | 0 | { |
1572 | 0 | scb.subscount = subs; |
1573 | 0 | scb.output_offsets[1] = buff_offset; |
1574 | 0 | rc = mcontext->substitute_callout(&scb, |
1575 | 0 | mcontext->substitute_callout_data); |
1576 | | |
1577 | | /* A non-zero return means cancel this substitution. Instead, copy the |
1578 | | matched string fragment. */ |
1579 | |
|
1580 | 0 | if (rc != 0) |
1581 | 0 | { |
1582 | 0 | PCRE2_SIZE newlength = scb.output_offsets[1] - scb.output_offsets[0]; |
1583 | 0 | PCRE2_SIZE oldlength = ovector[1] - ovector[0]; |
1584 | |
|
1585 | 0 | buff_offset -= newlength; |
1586 | 0 | lengthleft += newlength; |
1587 | 0 | if (!replacement_only) CHECKMEMCPY(subject + ovector[0], oldlength); |
1588 | | |
1589 | | /* A negative return means do not do any more. */ |
1590 | | |
1591 | 0 | if (rc < 0) suboptions &= (~PCRE2_SUBSTITUTE_GLOBAL); |
1592 | 0 | } |
1593 | 0 | } |
1594 | | |
1595 | | /* In this interesting case, we cannot do the callout, so it's hard to |
1596 | | estimate the required buffer size. What callers want is to be able to make |
1597 | | two calls to pcre2_substitute(), once with PCRE2_SUBSTITUTE_OVERFLOW_LENGTH |
1598 | | to discover the buffer size, and then a second and final call. Older |
1599 | | versions of PCRE2 violated this assumption, by proceeding as if the callout |
1600 | | had returned zero - but on the second call to pcre2_substitute() it could |
1601 | | return non-zero and then overflow the buffer again. Callers probably don't |
1602 | | want to keep on looping to incrementally discover the buffer size. */ |
1603 | | |
1604 | 0 | else |
1605 | 0 | { |
1606 | 0 | PCRE2_SIZE newlength_buf = buff_offset - scb.output_offsets[0]; |
1607 | 0 | PCRE2_SIZE newlength_extra = extra_needed - sub_start_extra_needed; |
1608 | 0 | PCRE2_SIZE newlength = |
1609 | 0 | (newlength_extra > ~(PCRE2_SIZE)0 - newlength_buf)? /* Integer overflow */ |
1610 | 0 | ~(PCRE2_SIZE)0 : newlength_buf + newlength_extra; /* Cap the addition */ |
1611 | 0 | PCRE2_SIZE oldlength = ovector[1] - ovector[0]; |
1612 | | |
1613 | | /* Be pessimistic: request whichever buffer size is larger out of |
1614 | | accepting or rejecting the substitution. */ |
1615 | |
|
1616 | 0 | if (oldlength > newlength) |
1617 | 0 | { |
1618 | 0 | PCRE2_SIZE additional = oldlength - newlength; |
1619 | 0 | if (additional > ~(PCRE2_SIZE)0 - extra_needed) /* Integer overflow */ |
1620 | 0 | goto TOOLARGEREPLACE; |
1621 | 0 | extra_needed += additional; |
1622 | 0 | } |
1623 | | |
1624 | | /* Proceed as if the callout did not return a negative. A negative |
1625 | | effectively rejects all future substitutions, but we want to examine them |
1626 | | pessimistically. */ |
1627 | 0 | } |
1628 | 0 | } |
1629 | | |
1630 | | /* Save the details of this match. See above for how this data is used. If we |
1631 | | matched an empty string, do the magic for global matches. Update the start |
1632 | | offset to point to the rest of the subject string. If we re-used an existing |
1633 | | match for the first match, switch to the internal match data block. */ |
1634 | | |
1635 | 0 | ovecsave[0] = ovector[0]; |
1636 | 0 | ovecsave[1] = ovector[1]; |
1637 | 0 | ovecsave[2] = start_offset; |
1638 | |
|
1639 | 0 | goptions = (ovector[0] != ovector[1] || ovector[0] > start_offset)? 0 : |
1640 | 0 | PCRE2_ANCHORED|PCRE2_NOTEMPTY_ATSTART; |
1641 | 0 | start_offset = ovector[1]; |
1642 | 0 | } while ((suboptions & PCRE2_SUBSTITUTE_GLOBAL) != 0); /* Repeat "do" loop */ |
1643 | | |
1644 | | /* Copy the rest of the subject unless not required, and terminate the output |
1645 | | with a binary zero. */ |
1646 | | |
1647 | 0 | if (!replacement_only) |
1648 | 0 | { |
1649 | 0 | fraglength = length - start_offset; |
1650 | 0 | CHECKMEMCPY(subject + start_offset, fraglength); |
1651 | 0 | } |
1652 | | |
1653 | 0 | temp[0] = 0; |
1654 | 0 | CHECKMEMCPY(temp, 1); |
1655 | | |
1656 | | /* If overflowed is set it means the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set, |
1657 | | and matching has carried on after a full buffer, in order to compute the length |
1658 | | needed. Otherwise, an overflow generates an immediate error return. */ |
1659 | | |
1660 | 0 | if (overflowed) |
1661 | 0 | { |
1662 | 0 | rc = PCRE2_ERROR_NOMEMORY; |
1663 | |
|
1664 | 0 | if (extra_needed > ~(PCRE2_SIZE)0 - buff_length) /* Integer overflow */ |
1665 | 0 | goto TOOLARGEREPLACE; |
1666 | 0 | *blength = buff_length + extra_needed; |
1667 | 0 | } |
1668 | | |
1669 | | /* After a successful execution, return the number of substitutions and set the |
1670 | | length of buffer used, excluding the trailing zero. */ |
1671 | | |
1672 | 0 | else |
1673 | 0 | { |
1674 | 0 | rc = subs; |
1675 | 0 | *blength = buff_offset - 1; |
1676 | 0 | } |
1677 | | |
1678 | 0 | EXIT: |
1679 | 0 | if (internal_match_data != NULL) pcre2_match_data_free(internal_match_data); |
1680 | 0 | else match_data->rc = rc; |
1681 | 0 | return rc; |
1682 | | |
1683 | 0 | NOROOM: |
1684 | 0 | rc = PCRE2_ERROR_NOMEMORY; |
1685 | 0 | goto EXIT; |
1686 | | |
1687 | 0 | CASEERROR: |
1688 | 0 | rc = PCRE2_ERROR_REPLACECASE; |
1689 | 0 | goto EXIT; |
1690 | | |
1691 | 0 | TOOLARGEREPLACE: |
1692 | 0 | rc = PCRE2_ERROR_TOOLARGEREPLACE; |
1693 | 0 | goto EXIT; |
1694 | | |
1695 | 0 | BAD: |
1696 | 0 | rc = PCRE2_ERROR_BADREPLACEMENT; |
1697 | 0 | goto PTREXIT; |
1698 | | |
1699 | 0 | BADESCAPE: |
1700 | 0 | rc = PCRE2_ERROR_BADREPESCAPE; |
1701 | |
|
1702 | 0 | PTREXIT: |
1703 | 0 | *blength = (PCRE2_SIZE)(ptr - replacement); |
1704 | 0 | goto EXIT; |
1705 | 0 | } |
1706 | | |
1707 | | /* End of pcre2_substitute.c */ |